hakmem/core/hakmem_tiny_superslab.c
Moe Charm (CI) 6b86c60a20 P1.3: Add meta->active for TLS SLL tracking
Add active field to TinySlabMeta to track blocks currently held by
users (not in TLS SLL or freelist caches). This enables accurate
empty slab detection that accounts for TLS SLL cached blocks.

Changes:
- superslab_types.h: Add _Atomic uint16_t active field
- ss_allocation_box.c, hakmem_tiny_superslab.c: Initialize active=0
- tiny_free_fast_v2.inc.h: Decrement active on TLS SLL push
- tiny_alloc_fast.inc.h: Add tiny_active_track_alloc() helper,
  increment active on TLS SLL pop (all code paths)
- ss_hot_cold_box.h: ss_is_slab_empty() uses active when enabled

All tracking is ENV-gated: HAKMEM_TINY_ACTIVE_TRACK=1 to enable.
Default is off for zero performance impact.

Invariant: active = used - tls_cached (active <= used)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-28 13:53:45 +09:00
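
A minimal sketch of the ENV-gated active tracking described above, for orientation only: TinySlabMeta.active, tiny_active_track_alloc() and HAKMEM_TINY_ACTIVE_TRACK are named in the commit message, but the bodies below, the cached-getenv pattern, and the decrement helper's name are illustrative assumptions, not the actual implementation in tiny_alloc_fast.inc.h / tiny_free_fast_v2.inc.h.

#include <stdatomic.h>
#include <stdint.h>
#include <stdlib.h>

typedef struct {
    _Atomic uint16_t active;  /* blocks currently held by users (P1.3) */
    /* ... used, capacity, freelist, etc. omitted in this sketch ... */
} TinySlabMeta;

/* Parse HAKMEM_TINY_ACTIVE_TRACK once; default off => no hot-path work. */
static inline int tiny_active_track_enabled(void) {
    static int g_en = -1;
    if (g_en == -1) {
        const char* e = getenv("HAKMEM_TINY_ACTIVE_TRACK");
        g_en = (e && *e && *e != '0') ? 1 : 0;
    }
    return g_en;
}

/* Alloc path: a block leaves the TLS SLL / freelist and goes to the user. */
static inline void tiny_active_track_alloc(TinySlabMeta* meta) {
    if (tiny_active_track_enabled())
        atomic_fetch_add_explicit(&meta->active, 1, memory_order_relaxed);
}

/* Free path: the user's block is pushed onto the TLS SLL cache.
 * Invariant maintained: active = used - tls_cached, so active <= used. */
static inline void tiny_active_track_dec(TinySlabMeta* meta) {
    if (tiny_active_track_enabled())
        atomic_fetch_sub_explicit(&meta->active, 1, memory_order_relaxed);
}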

// hakmem_tiny_superslab.c - SuperSlab allocator implementation (Phase 6.22)
// Purpose: 2MB aligned slab allocation with fast pointer→slab lookup
// License: MIT
// Date: 2025-10-24
#include "hakmem_tiny_superslab.h"
#include "box/ss_hot_cold_box.h" // Phase 3d-C: Hot/Cold Split
#include "hakmem_super_registry.h" // Phase 1: Registry integration
#include "hakmem_tiny.h" // For tiny_self_u32
#include "hakmem_tiny_config.h" // For extern g_tiny_class_sizes
#include "hakmem_shared_pool.h" // Phase 12: Shared SuperSlab pool backend (skeleton)
#include <sys/mman.h>
#include <sys/resource.h> // getrlimit for OOM diagnostics
#include <errno.h>
#include <string.h>
#include <stdio.h>
#include <stdlib.h> // getenv, atoi
#include <pthread.h>
#include <unistd.h>
#include "hakmem_internal.h" // HAKMEM_LOG for release-silent logging
#include "tiny_region_id.h" // For HEADER_MAGIC / HEADER_CLASS_MASK (restore header on remote-drain)
#include "hakmem_tiny_integrity.h" // HAK_CHECK_CLASS_IDX
#include "box/tiny_next_ptr_box.h" // For tiny_next_write
#include "box/slab_freelist_atomic.h" // Phase 1: Atomic freelist accessor
static int g_ss_force_lg = -1;
static _Atomic int g_ss_populate_once = 0;
// Forward: decide next SuperSlab lg for a class (ACE-aware, clamped)
static inline uint8_t hak_tiny_superslab_next_lg(int class_idx)
{
if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
return SUPERSLAB_LG_DEFAULT;
}
// Prefer ACE target if within allowed range
uint8_t t = atomic_load_explicit((_Atomic uint8_t*)&g_ss_ace[class_idx].target_lg,
memory_order_relaxed);
if (t < SUPERSLAB_LG_MIN || t > SUPERSLAB_LG_MAX) {
return SUPERSLAB_LG_DEFAULT;
}
return t;
}
// ============================================================================
// Global Statistics
// ============================================================================
static pthread_mutex_t g_superslab_lock = PTHREAD_MUTEX_INITIALIZER;
uint64_t g_superslabs_allocated = 0; // Non-static for debugging
uint64_t g_superslabs_freed = 0; // Phase 7.6: Non-static for test access
uint64_t g_bytes_allocated = 0; // Non-static for debugging
// ============================================================================
// Phase 2a: Dynamic Expansion - Global per-class SuperSlabHeads
// ============================================================================
SuperSlabHead* g_superslab_heads[TINY_NUM_CLASSES_SS] = {NULL};
// Debug counters
_Atomic uint64_t g_ss_active_dec_calls = 0;
_Atomic uint64_t g_hak_tiny_free_calls = 0;
_Atomic uint64_t g_ss_remote_push_calls = 0;
// Free path instrumentation (lightweight, for OOM/route diagnosis)
_Atomic uint64_t g_free_ss_enter = 0; // hak_tiny_free_superslab() entries
_Atomic uint64_t g_free_local_box_calls = 0; // same-thread freelist pushes
_Atomic uint64_t g_free_remote_box_calls = 0; // cross-thread remote pushes
// Per-class counters for gating/metrics (Tiny classes = 8)
uint64_t g_ss_alloc_by_class[8] = {0};
uint64_t g_ss_freed_by_class[8] = {0};
typedef struct SuperslabCacheEntry {
struct SuperslabCacheEntry* next;
} SuperslabCacheEntry;
static SuperslabCacheEntry* g_ss_cache_head[8] = {0};
static size_t g_ss_cache_count[8] = {0};
static size_t g_ss_cache_cap[8] = {0};
static size_t g_ss_precharge_target[8] = {0};
static _Atomic int g_ss_precharge_done[8] = {0};
static int g_ss_cache_enabled = 0;
static pthread_once_t g_ss_cache_once = PTHREAD_ONCE_INIT;
static pthread_mutex_t g_ss_cache_lock[8];
uint64_t g_ss_cache_hits[8] = {0};
uint64_t g_ss_cache_misses[8] = {0};
uint64_t g_ss_cache_puts[8] = {0};
uint64_t g_ss_cache_drops[8] = {0};
uint64_t g_ss_cache_precharged[8] = {0};
uint64_t g_superslabs_reused = 0;
uint64_t g_superslabs_cached = 0;
static void ss_cache_global_init(void) {
for (int i = 0; i < 8; i++) {
pthread_mutex_init(&g_ss_cache_lock[i], NULL);
}
}
static inline void ss_cache_ensure_init(void) {
pthread_once(&g_ss_cache_once, ss_cache_global_init);
}
static void* ss_os_acquire(uint8_t size_class, size_t ss_size, uintptr_t ss_mask, int populate);
static void ss_cache_precharge(uint8_t size_class, size_t ss_size, uintptr_t ss_mask);
static SuperslabCacheEntry* ss_cache_pop(uint8_t size_class);
static int ss_cache_push(uint8_t size_class, SuperSlab* ss);
// Drain remote MPSC stack into freelist (ownership already verified by caller)
void _ss_remote_drain_to_freelist_unsafe(SuperSlab* ss, int slab_idx, TinySlabMeta* meta)
{
if (!ss || slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss) || !meta) return;
static _Atomic uint32_t g_remote_drain_diag_once = 0;
static int g_remote_drain_diag_en = -1;
// Atomically take the whole remote list
uintptr_t head = atomic_exchange_explicit(&ss->remote_heads[slab_idx], 0,
memory_order_acq_rel);
if (head == 0) return;
// Convert remote stack (offset 0 next) into freelist encoding via Box API
// and splice in front of current freelist preserving relative order.
void* prev = meta->freelist;
int cls = (int)meta->class_idx;
HAK_CHECK_CLASS_IDX(cls, "_ss_remote_drain_to_freelist_unsafe");
if (__builtin_expect(cls < 0 || cls >= TINY_NUM_CLASSES, 0)) {
static _Atomic int g_remote_drain_cls_oob = 0;
if (atomic_fetch_add_explicit(&g_remote_drain_cls_oob, 1, memory_order_relaxed) == 0) {
fprintf(stderr,
"[REMOTE_DRAIN_CLASS_OOB] ss=%p slab_idx=%d meta=%p cls=%d head=%#lx\n",
(void*)ss, slab_idx, (void*)meta, cls, (unsigned long)head);
}
return;
}
uintptr_t cur = head;
while (cur != 0) {
uintptr_t next = *(uintptr_t*)cur; // remote-next stored at offset 0
#if !HAKMEM_BUILD_RELEASE
if (__builtin_expect(g_remote_drain_diag_en == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_SLL_DIAG");
g_remote_drain_diag_en = (e && *e && *e != '0') ? 1 : 0;
}
#else
if (__builtin_expect(g_remote_drain_diag_en == -1, 0)) {
g_remote_drain_diag_en = 0;
}
#endif
if (__builtin_expect(g_remote_drain_diag_en, 0)) {
uintptr_t addr = (uintptr_t)next;
if (addr != 0 && (addr < 4096 || addr > 0x00007fffffffffffULL)) {
uint32_t shot = atomic_fetch_add_explicit(&g_remote_drain_diag_once, 1, memory_order_relaxed);
if (shot < 8) {
fprintf(stderr,
"[REMOTE_DRAIN_NEXT_INVALID] cls=%d slab=%d cur=%p next=%p head=%#lx prev=%p count=%u\n",
cls,
slab_idx,
(void*)cur,
(void*)next,
(unsigned long)head,
prev,
(unsigned)meta->used);
}
}
#if HAKMEM_TINY_HEADER_CLASSIDX
int hdr_cls = tiny_region_id_read_header((uint8_t*)cur + 1);
if (hdr_cls >= 0 && hdr_cls != cls) {
uint32_t shot = atomic_fetch_add_explicit(&g_remote_drain_diag_once, 1, memory_order_relaxed);
if (shot < 8) {
fprintf(stderr,
"[REMOTE_DRAIN_HDR_MISMATCH] cls=%d slab=%d cur=%p hdr_cls=%d meta_cls=%d head=%#lx\n",
cls, slab_idx, (void*)cur, hdr_cls, (int)meta->class_idx, (unsigned long)head);
}
}
#endif
}
#if HAKMEM_TINY_HEADER_CLASSIDX
// Cross-check header vs meta before writing next (even if diag is off)
{
int hdr_cls_pre = tiny_region_id_read_header((uint8_t*)cur + 1);
if (hdr_cls_pre >= 0 && hdr_cls_pre != cls) {
static _Atomic uint32_t g_hdr_meta_mismatch_rd = 0;
uint32_t n = atomic_fetch_add_explicit(&g_hdr_meta_mismatch_rd, 1, memory_order_relaxed);
if (n < 16) {
fprintf(stderr,
"[REMOTE_DRAIN_HDR_META_MISMATCH] cls=%d slab=%d cur=%p hdr_cls=%d meta_cls=%d\n",
cls, slab_idx, (void*)cur, hdr_cls_pre, (int)meta->class_idx);
}
}
}
#endif
// Restore header for header-classes (class 1-6) which were clobbered by remote push
#if HAKMEM_TINY_HEADER_CLASSIDX
if (cls != 0) {
uint8_t expected = (uint8_t)(HEADER_MAGIC | (cls & HEADER_CLASS_MASK));
*(uint8_t*)(uintptr_t)cur = expected;
}
#endif
// Rewrite next pointer to Box representation for this class
tiny_next_write(cls, (void*)cur, prev);
prev = (void*)cur;
cur = next;
}
meta->freelist = prev;
// Reset remote count after full drain
atomic_store_explicit(&ss->remote_counts[slab_idx], 0, memory_order_release);
// Update freelist/nonempty visibility bits
uint32_t bit = (1u << slab_idx);
atomic_fetch_or_explicit(&ss->freelist_mask, bit, memory_order_release);
atomic_fetch_or_explicit(&ss->nonempty_mask, bit, memory_order_release);
}
static inline void ss_stats_os_alloc(uint8_t size_class, size_t ss_size) {
pthread_mutex_lock(&g_superslab_lock);
g_superslabs_allocated++;
if (size_class < 8) {
g_ss_alloc_by_class[size_class]++;
}
g_bytes_allocated += ss_size;
pthread_mutex_unlock(&g_superslab_lock);
}
static inline void ss_stats_cache_reuse(void) {
pthread_mutex_lock(&g_superslab_lock);
g_superslabs_reused++;
pthread_mutex_unlock(&g_superslab_lock);
}
static inline void ss_stats_cache_store(void) {
pthread_mutex_lock(&g_superslab_lock);
g_superslabs_cached++;
pthread_mutex_unlock(&g_superslab_lock);
}
// ============================================================================
// Phase 8.3: ACE (Adaptive Cache Engine) State
// ============================================================================
SuperSlabACEState g_ss_ace[TINY_NUM_CLASSES_SS] = {{0}};
// Phase 8.3: hak_now_ns() is now defined in hakmem_tiny_superslab.h as static inline
// ============================================================================
// Diagnostics
// ============================================================================
static void log_superslab_oom_once(size_t ss_size, size_t alloc_size, int err) {
static int logged = 0;
if (logged) return;
logged = 1;
// CRITICAL FIX: Increment lock depth FIRST before any LIBC calls
// fopen/fclose/getrlimit/fprintf all may call malloc internally
// Must bypass HAKMEM wrapper to avoid header mismatch crash
extern __thread int g_hakmem_lock_depth;
g_hakmem_lock_depth++; // Force wrapper to use __libc_malloc
struct rlimit rl = {0};
if (getrlimit(RLIMIT_AS, &rl) != 0) {
rl.rlim_cur = RLIM_INFINITY;
rl.rlim_max = RLIM_INFINITY;
}
unsigned long vm_size_kb = 0;
unsigned long vm_rss_kb = 0;
FILE* status = fopen("/proc/self/status", "r");
if (status) {
char line[256];
while (fgets(line, sizeof(line), status)) {
if (strncmp(line, "VmSize:", 7) == 0) {
(void)sscanf(line + 7, "%lu", &vm_size_kb);
} else if (strncmp(line, "VmRSS:", 6) == 0) {
(void)sscanf(line + 6, "%lu", &vm_rss_kb);
}
}
fclose(status);
}
// CRITICAL FIX: Do NOT decrement lock_depth yet!
// fprintf() below may call malloc for buffering
char rl_cur_buf[32];
char rl_max_buf[32];
if (rl.rlim_cur == RLIM_INFINITY) {
strcpy(rl_cur_buf, "inf");
} else {
snprintf(rl_cur_buf, sizeof(rl_cur_buf), "%llu", (unsigned long long)rl.rlim_cur);
}
if (rl.rlim_max == RLIM_INFINITY) {
strcpy(rl_max_buf, "inf");
} else {
snprintf(rl_max_buf, sizeof(rl_max_buf), "%llu", (unsigned long long)rl.rlim_max);
}
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr,
"[SS OOM] mmap failed: err=%d ss_size=%zu alloc_size=%zu "
"alloc=%llu freed=%llu bytes=%llu "
"RLIMIT_AS(cur=%s max=%s) VmSize=%lu kB VmRSS=%lu kB\n",
err,
ss_size,
alloc_size,
(unsigned long long)g_superslabs_allocated,
(unsigned long long)g_superslabs_freed,
(unsigned long long)g_bytes_allocated,
rl_cur_buf,
rl_max_buf,
vm_size_kb,
vm_rss_kb);
#endif
g_hakmem_lock_depth--; // Now safe to restore (all libc calls complete)
}
// Global counters for debugging (non-static for external access)
_Atomic uint64_t g_ss_mmap_count = 0;
_Atomic uint64_t g_final_fallback_mmap_count = 0;
static void* ss_os_acquire(uint8_t size_class, size_t ss_size, uintptr_t ss_mask, int populate) {
void* ptr = NULL;
static int log_count = 0;
#ifdef MAP_ALIGNED_SUPER
int map_flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_ALIGNED_SUPER;
#ifdef MAP_POPULATE
if (populate) {
map_flags |= MAP_POPULATE;
}
#endif
ptr = mmap(NULL, ss_size,
PROT_READ | PROT_WRITE,
map_flags,
-1, 0);
if (ptr != MAP_FAILED) {
atomic_fetch_add(&g_ss_mmap_count, 1);
if (((uintptr_t)ptr & ss_mask) == 0) {
ss_stats_os_alloc(size_class, ss_size);
return ptr;
}
munmap(ptr, ss_size);
ptr = NULL;
} else {
log_superslab_oom_once(ss_size, ss_size, errno);
}
#endif
size_t alloc_size = ss_size * 2;
int flags = MAP_PRIVATE | MAP_ANONYMOUS;
#ifdef MAP_POPULATE
if (populate) {
flags |= MAP_POPULATE;
}
#endif
void* raw = mmap(NULL, alloc_size,
PROT_READ | PROT_WRITE,
flags,
-1, 0);
if (raw != MAP_FAILED) {
uint64_t count = atomic_fetch_add(&g_ss_mmap_count, 1) + 1;
#if !HAKMEM_BUILD_RELEASE
if (log_count < 10) {
fprintf(stderr, "[SUPERSLAB_MMAP] #%lu: class=%d size=%zu (total SuperSlab mmaps so far)\n",
(unsigned long)count, size_class, ss_size);
log_count++;
}
#endif
}
if (raw == MAP_FAILED) {
log_superslab_oom_once(ss_size, alloc_size, errno);
return NULL;
}
uintptr_t raw_addr = (uintptr_t)raw;
uintptr_t aligned_addr = (raw_addr + ss_mask) & ~ss_mask;
ptr = (void*)aligned_addr;
size_t prefix_size = aligned_addr - raw_addr;
if (prefix_size > 0) {
munmap(raw, prefix_size);
}
size_t suffix_size = alloc_size - prefix_size - ss_size;
if (suffix_size > 0) {
if (populate) {
#ifdef MADV_DONTNEED
madvise((char*)ptr + ss_size, suffix_size, MADV_DONTNEED);
#endif
} else {
munmap((char*)ptr + ss_size, suffix_size);
}
}
ss_stats_os_alloc(size_class, ss_size);
return ptr;
}
static void ss_cache_precharge(uint8_t size_class, size_t ss_size, uintptr_t ss_mask) {
if (!g_ss_cache_enabled) return;
if (size_class >= 8) return;
if (g_ss_precharge_target[size_class] == 0) return;
if (atomic_load_explicit(&g_ss_precharge_done[size_class], memory_order_acquire)) return;
ss_cache_ensure_init();
pthread_mutex_lock(&g_ss_cache_lock[size_class]);
size_t target = g_ss_precharge_target[size_class];
size_t cap = g_ss_cache_cap[size_class];
size_t desired = target;
if (cap != 0 && desired > cap) {
desired = cap;
}
while (g_ss_cache_count[size_class] < desired) {
void* raw = ss_os_acquire(size_class, ss_size, ss_mask, 1);
if (!raw) {
break;
}
SuperslabCacheEntry* entry = (SuperslabCacheEntry*)raw;
entry->next = g_ss_cache_head[size_class];
g_ss_cache_head[size_class] = entry;
g_ss_cache_count[size_class]++;
g_ss_cache_precharged[size_class]++;
}
atomic_store_explicit(&g_ss_precharge_done[size_class], 1, memory_order_release);
pthread_mutex_unlock(&g_ss_cache_lock[size_class]);
}
static SuperslabCacheEntry* ss_cache_pop(uint8_t size_class) {
if (!g_ss_cache_enabled) return NULL;
if (size_class >= 8) return NULL;
ss_cache_ensure_init();
pthread_mutex_lock(&g_ss_cache_lock[size_class]);
SuperslabCacheEntry* entry = g_ss_cache_head[size_class];
if (entry) {
g_ss_cache_head[size_class] = entry->next;
if (g_ss_cache_count[size_class] > 0) {
g_ss_cache_count[size_class]--;
}
entry->next = NULL;
g_ss_cache_hits[size_class]++;
} else {
g_ss_cache_misses[size_class]++;
}
pthread_mutex_unlock(&g_ss_cache_lock[size_class]);
return entry;
}
static int ss_cache_push(uint8_t size_class, SuperSlab* ss) {
if (!g_ss_cache_enabled) return 0;
if (size_class >= 8) return 0;
ss_cache_ensure_init();
pthread_mutex_lock(&g_ss_cache_lock[size_class]);
size_t cap = g_ss_cache_cap[size_class];
if (cap != 0 && g_ss_cache_count[size_class] >= cap) {
g_ss_cache_drops[size_class]++;
pthread_mutex_unlock(&g_ss_cache_lock[size_class]);
return 0;
}
SuperslabCacheEntry* entry = (SuperslabCacheEntry*)ss;
entry->next = g_ss_cache_head[size_class];
g_ss_cache_head[size_class] = entry;
g_ss_cache_count[size_class]++;
g_ss_cache_puts[size_class]++;
pthread_mutex_unlock(&g_ss_cache_lock[size_class]);
return 1;
}
/*
* Legacy backend for hak_tiny_alloc_superslab_box().
*
* Phase 12 Stage A/B:
* - Uses per-class SuperSlabHead (g_superslab_heads) as the implementation.
* - Callers MUST use hak_tiny_alloc_superslab_box() and never touch this directly.
* - Later Stage C: this function will be replaced by a shared_pool backend.
*/
static SuperSlabHead* init_superslab_head(int class_idx);
static int expand_superslab_head(SuperSlabHead* head);
static void* hak_tiny_alloc_superslab_backend_legacy(int class_idx)
{
if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
return NULL;
}
SuperSlabHead* head = g_superslab_heads[class_idx];
if (!head) {
head = init_superslab_head(class_idx);
if (!head) {
return NULL;
}
g_superslab_heads[class_idx] = head;
}
SuperSlab* chunk = head->current_chunk ? head->current_chunk : head->first_chunk;
while (chunk) {
int cap = ss_slabs_capacity(chunk);
for (int slab_idx = 0; slab_idx < cap; slab_idx++) {
TinySlabMeta* meta = &chunk->slabs[slab_idx];
// Skip slabs that belong to a different class (or are uninitialized).
if (meta->class_idx != (uint8_t)class_idx && meta->class_idx != 255) {
continue;
}
// P1.2 FIX: Initialize slab on first use (like shared backend does)
// This ensures class_map is populated for all slabs, not just slab 0
if (meta->capacity == 0) {
size_t block_size = g_tiny_class_sizes[class_idx];
uint32_t owner_tid = (uint32_t)(uintptr_t)pthread_self();
superslab_init_slab(chunk, slab_idx, block_size, owner_tid);
meta = &chunk->slabs[slab_idx]; // Refresh pointer after init
meta->class_idx = (uint8_t)class_idx;
// P1.2: Update class_map for dynamic slab initialization
chunk->class_map[slab_idx] = (uint8_t)class_idx;
}
if (meta->used < meta->capacity) {
size_t stride = tiny_block_stride_for_class(class_idx);
size_t offset = (size_t)meta->used * stride;
uint8_t* base = (uint8_t*)chunk
+ SUPERSLAB_SLAB0_DATA_OFFSET
+ (size_t)slab_idx * SUPERSLAB_SLAB_USABLE_SIZE
+ offset;
meta->used++;
atomic_fetch_add_explicit(&chunk->total_active_blocks, 1, memory_order_relaxed);
return (void*)base;
}
}
chunk = chunk->next_chunk;
}
if (expand_superslab_head(head) < 0) {
return NULL;
}
SuperSlab* new_chunk = head->current_chunk;
if (!new_chunk) {
return NULL;
}
int cap2 = ss_slabs_capacity(new_chunk);
for (int slab_idx = 0; slab_idx < cap2; slab_idx++) {
TinySlabMeta* meta = &new_chunk->slabs[slab_idx];
// P1.2 FIX: Initialize slab on first use (like shared backend does)
if (meta->capacity == 0) {
size_t block_size = g_tiny_class_sizes[class_idx];
uint32_t owner_tid = (uint32_t)(uintptr_t)pthread_self();
superslab_init_slab(new_chunk, slab_idx, block_size, owner_tid);
meta = &new_chunk->slabs[slab_idx]; // Refresh pointer after init
meta->class_idx = (uint8_t)class_idx;
// P1.2: Update class_map for dynamic slab initialization
new_chunk->class_map[slab_idx] = (uint8_t)class_idx;
}
if (meta->used < meta->capacity) {
size_t stride = tiny_block_stride_for_class(class_idx);
size_t offset = (size_t)meta->used * stride;
uint8_t* base = (uint8_t*)new_chunk
+ SUPERSLAB_SLAB0_DATA_OFFSET
+ (size_t)slab_idx * SUPERSLAB_SLAB_USABLE_SIZE
+ offset;
meta->used++;
atomic_fetch_add_explicit(&new_chunk->total_active_blocks, 1, memory_order_relaxed);
return (void*)base;
}
}
return NULL;
}
/*
* Shared pool backend for hak_tiny_alloc_superslab_box().
*
* Phase 12-2:
* - Uses SharedSuperSlabPool (g_shared_pool) to obtain a SuperSlab/slab
* for the requested class_idx.
* - This backend EXPRESSLY owns only:
* - choosing (ss, slab_idx) via shared_pool_acquire_slab()
* - initializing that slab's TinySlabMeta via superslab_init_slab()
* and nothing else; all callers must go through hak_tiny_alloc_superslab_box().
*
* - For now this is a minimal, conservative implementation:
* - One linear bump-run is carved from the acquired slab using tiny_block_stride_for_class().
* - No complex per-slab freelist or refill policy yet (Phase 12-3+).
* - If shared_pool_acquire_slab() fails, we fall back to legacy backend.
*/
static void* hak_tiny_alloc_superslab_backend_shared(int class_idx)
{
if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
return NULL;
}
SuperSlab* ss = NULL;
int slab_idx = -1;
if (shared_pool_acquire_slab(class_idx, &ss, &slab_idx) != 0 || !ss) {
// Shared pool could not provide a slab; caller may choose to fall back.
return NULL;
}
TinySlabMeta* meta = &ss->slabs[slab_idx];
// Defensive: shared_pool must either hand us an UNASSIGNED slab or one
// already bound to this class. Anything else is a hard bug.
if (meta->class_idx != 255 && meta->class_idx != (uint8_t)class_idx) {
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr,
"[HAKMEM][SS_SHARED] BUG: acquire_slab mismatch: cls=%d meta->class_idx=%u slab_idx=%d ss=%p\n",
class_idx, (unsigned)meta->class_idx, slab_idx, (void*)ss);
#endif
return NULL;
}
// Initialize slab geometry once for this class.
if (meta->capacity == 0) {
size_t block_size = g_tiny_class_sizes[class_idx];
// LARSON FIX: Pass actual thread ID for cross-thread free detection
uint32_t my_tid = (uint32_t)(uintptr_t)pthread_self();
superslab_init_slab(ss, slab_idx, block_size, my_tid);
meta = &ss->slabs[slab_idx];
// CRITICAL FIX: Always set class_idx after init to avoid C0/C7 confusion.
// New SuperSlabs start with meta->class_idx=0 (mmap zero-init).
// Must explicitly set to requested class, not just when class_idx==255.
meta->class_idx = (uint8_t)class_idx;
// P1.1: Update class_map in shared acquire path
ss->class_map[slab_idx] = (uint8_t)class_idx;
}
// Final contract check before computing addresses.
if (meta->class_idx != (uint8_t)class_idx ||
meta->capacity == 0 ||
meta->used > meta->capacity) {
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr,
"[HAKMEM][SS_SHARED] BUG: invalid slab meta before alloc: "
"cls=%d slab_idx=%d meta_cls=%u used=%u cap=%u ss=%p\n",
class_idx, slab_idx,
(unsigned)meta->class_idx,
(unsigned)meta->used,
(unsigned)meta->capacity,
(void*)ss);
#endif
return NULL;
}
// Simple bump allocation within this slab.
if (meta->used >= meta->capacity) {
// Slab exhausted: in minimal Phase12-2 backend we do not loop;
// caller or future logic must acquire another slab.
return NULL;
}
size_t stride = tiny_block_stride_for_class(class_idx);
size_t offset = (size_t)meta->used * stride;
// Phase 12-2 minimal geometry:
// - slab 0 data offset via SUPERSLAB_SLAB0_DATA_OFFSET
// - subsequent slabs at fixed SUPERSLAB_SLAB_USABLE_SIZE strides.
size_t slab_base_off = SUPERSLAB_SLAB0_DATA_OFFSET
+ (size_t)slab_idx * SUPERSLAB_SLAB_USABLE_SIZE;
uint8_t* base = (uint8_t*)ss + slab_base_off + offset;
meta->used++;
atomic_fetch_add_explicit(&ss->total_active_blocks, 1, memory_order_relaxed);
return (void*)base;
}
/*
* Box API entry:
* - Single front-door for tiny-side Superslab allocations.
*
* Phase 12 policy:
* - HAKMEM_TINY_SS_SHARED=0 → legacy backend only (for regression checks)
* - HAKMEM_TINY_SS_SHARED=1 → prefer the shared backend, falling back to legacy only on failure
*/
void* hak_tiny_alloc_superslab_box(int class_idx)
{
static int g_ss_shared_mode = -1;
static _Atomic uint32_t g_ss_backend_log = 0;
if (__builtin_expect(g_ss_shared_mode == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_SS_SHARED");
if (!e || !*e) {
g_ss_shared_mode = 1; // default: shared backend enabled
} else {
int v = atoi(e);
g_ss_shared_mode = (v != 0) ? 1 : 0;
}
}
if (g_ss_shared_mode == 1) {
void* p = hak_tiny_alloc_superslab_backend_shared(class_idx);
if (p != NULL) {
uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed);
if (n < 4) {
fprintf(stderr, "[SS_BACKEND] shared cls=%d ptr=%p\n", class_idx, p);
}
return p;
}
// If the shared backend fails, fall back to the legacy backend on the safe side
uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed);
if (n < 4) {
fprintf(stderr, "[SS_BACKEND] shared_fail→legacy cls=%d\n", class_idx);
}
return hak_tiny_alloc_superslab_backend_legacy(class_idx);
}
// With shared mode OFF, use the legacy backend only
uint32_t n = atomic_fetch_add_explicit(&g_ss_backend_log, 1, memory_order_relaxed);
if (n < 4) {
fprintf(stderr, "[SS_BACKEND] legacy cls=%d\n", class_idx);
}
return hak_tiny_alloc_superslab_backend_legacy(class_idx);
}
// ============================================================================
// SuperSlab Allocation (2MB aligned)
// ============================================================================
SuperSlab* superslab_allocate(uint8_t size_class) {
// Optional fault injection for testing: HAKMEM_TINY_SS_FAULT_RATE=N → fail 1 in every N allocations
static int fault_rate = -1; // -1=unparsed, 0=disabled, >0=rate
static __thread unsigned long fault_tick = 0;
if (__builtin_expect(fault_rate == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_SS_FAULT_RATE");
if (e && *e) {
int v = atoi(e); if (v < 0) v = 0; fault_rate = v;
} else {
fault_rate = 0;
}
}
if (fault_rate > 0) {
unsigned long t = ++fault_tick;
if ((t % (unsigned long)fault_rate) == 0ul) {
return NULL; // simulate OOM
}
}
// Optional env clamp for SuperSlab size
static int env_parsed = 0;
static uint8_t g_ss_min_lg_env = SUPERSLAB_LG_DEFAULT; // Start with default (2MB)
static uint8_t g_ss_max_lg_env = SUPERSLAB_LG_MAX;
if (!env_parsed) {
char* maxmb = getenv("HAKMEM_TINY_SS_MAX_MB");
if (maxmb) {
int m = atoi(maxmb); if (m == 1) g_ss_max_lg_env = 20; else if (m == 2) g_ss_max_lg_env = 21;
}
char* minmb = getenv("HAKMEM_TINY_SS_MIN_MB");
if (minmb) {
int m = atoi(minmb); if (m == 1) g_ss_min_lg_env = 20; else if (m == 2) g_ss_min_lg_env = 21;
}
if (g_ss_min_lg_env > g_ss_max_lg_env) g_ss_min_lg_env = g_ss_max_lg_env;
const char* force_lg_env = getenv("HAKMEM_TINY_SS_FORCE_LG");
if (force_lg_env && *force_lg_env) {
int v = atoi(force_lg_env);
if (v >= SUPERSLAB_LG_MIN && v <= SUPERSLAB_LG_MAX) {
g_ss_force_lg = v;
g_ss_min_lg_env = g_ss_max_lg_env = v;
}
}
size_t precharge_default = 0;
const char* precharge_env = getenv("HAKMEM_TINY_SS_PRECHARGE");
if (precharge_env && *precharge_env) {
long v = atol(precharge_env);
if (v < 0) v = 0;
precharge_default = (size_t)v;
if (v > 0) {
atomic_store_explicit(&g_ss_populate_once, 1, memory_order_relaxed);
}
}
size_t cache_default = 0;
const char* cache_env = getenv("HAKMEM_TINY_SS_CACHE");
if (cache_env && *cache_env) {
long v = atol(cache_env);
if (v < 0) v = 0;
cache_default = (size_t)v;
}
for (int i = 0; i < 8; i++) {
g_ss_cache_cap[i] = cache_default;
g_ss_precharge_target[i] = precharge_default;
}
for (int i = 0; i < 8; i++) {
char name[64];
snprintf(name, sizeof(name), "HAKMEM_TINY_SS_CACHE_C%d", i);
char* cap_env = getenv(name);
if (cap_env && *cap_env) {
long v = atol(cap_env);
if (v < 0) v = 0;
g_ss_cache_cap[i] = (size_t)v;
}
snprintf(name, sizeof(name), "HAKMEM_TINY_SS_PRECHARGE_C%d", i);
char* pre_env = getenv(name);
if (pre_env && *pre_env) {
long v = atol(pre_env);
if (v < 0) v = 0;
g_ss_precharge_target[i] = (size_t)v;
if (v > 0) {
atomic_store_explicit(&g_ss_populate_once, 1, memory_order_relaxed);
}
}
if (g_ss_cache_cap[i] > 0 || g_ss_precharge_target[i] > 0) {
g_ss_cache_enabled = 1;
}
}
const char* populate_env = getenv("HAKMEM_TINY_SS_POPULATE_ONCE");
if (populate_env && atoi(populate_env) != 0) {
atomic_store_explicit(&g_ss_populate_once, 1, memory_order_relaxed);
}
env_parsed = 1;
}
uint8_t lg = (g_ss_force_lg >= 0) ? (uint8_t)g_ss_force_lg : hak_tiny_superslab_next_lg(size_class);
if (lg < g_ss_min_lg_env) lg = g_ss_min_lg_env;
if (lg > g_ss_max_lg_env) lg = g_ss_max_lg_env;
size_t ss_size = (size_t)1 << lg; // 2^20 = 1MB, 2^21 = 2MB
uintptr_t ss_mask = ss_size - 1;
int from_cache = 0;
void* ptr = NULL;
// Debug logging flag (lazy init)
static __thread int dbg = -1;
#if HAKMEM_BUILD_RELEASE
dbg = 0;
#else
if (__builtin_expect(dbg == -1, 0)) {
const char* e = getenv("HAKMEM_SS_PREWARM_DEBUG");
dbg = (e && *e && *e != '0') ? 1 : 0;
}
#endif
// Phase 9: Try LRU cache first (lazy deallocation)
SuperSlab* cached_ss = hak_ss_lru_pop(size_class);
if (cached_ss) {
ptr = (void*)cached_ss;
from_cache = 1;
// Debug logging for REFILL from LRU
if (dbg == 1) {
fprintf(stderr, "[REFILL] class=%d from_lru=1 ss=%p\n",
size_class, (void*)cached_ss);
}
// Skip old cache path - LRU cache takes priority
} else if (g_ss_cache_enabled && size_class < 8) {
// Fallback to old cache (will be deprecated)
ss_cache_precharge(size_class, ss_size, ss_mask);
SuperslabCacheEntry* old_cached = ss_cache_pop(size_class);
if (old_cached) {
ptr = (void*)old_cached;
from_cache = 1;
// Debug logging for REFILL from prewarm (old cache is essentially prewarm)
if (dbg == 1) {
fprintf(stderr, "[REFILL] class=%d from_prewarm=1 ss=%p\n",
size_class, (void*)old_cached);
}
}
}
if (!ptr) {
int populate = atomic_exchange_explicit(&g_ss_populate_once, 0, memory_order_acq_rel);
ptr = ss_os_acquire(size_class, ss_size, ss_mask, populate);
if (!ptr) {
return NULL;
}
// Debug logging for REFILL with new allocation
if (dbg == 1) {
fprintf(stderr, "[REFILL] class=%d new_alloc=1 ss=%p\n",
size_class, (void*)ptr);
}
}
// Initialize SuperSlab header (Phase 12: no global size_class field)
SuperSlab* ss = (SuperSlab*)ptr;
ss->magic = SUPERSLAB_MAGIC;
ss->active_slabs = 0;
ss->lg_size = lg; // Phase 8.3: Use ACE-determined lg_size (20=1MB, 21=2MB)
ss->slab_bitmap = 0;
ss->nonempty_mask = 0; // Phase 6-2.1: ChatGPT Pro P0 - init nonempty mask
ss->partial_epoch = 0;
ss->publish_hint = 0xFF;
// Initialize atomics explicitly
atomic_store_explicit(&ss->total_active_blocks, 0, memory_order_relaxed);
atomic_store_explicit(&ss->refcount, 0, memory_order_relaxed);
atomic_store_explicit(&ss->listed, 0, memory_order_relaxed);
ss->partial_next = NULL;
// Phase 9: Initialize LRU fields
ss->last_used_ns = 0;
ss->generation = 0;
ss->lru_prev = NULL;
ss->lru_next = NULL;
// Phase 3d-C: Initialize Hot/Cold Split fields
ss->hot_count = 0;
ss->cold_count = 0;
for (int i = 0; i < 16; i++) {
ss->hot_indices[i] = 0;
ss->cold_indices[i] = 0;
}
// Initialize all slab metadata (only up to max slabs for this size)
int max_slabs = (int)(ss_size / SLAB_SIZE);
// DEFENSIVE FIX: Zero all slab metadata arrays to prevent ANY uninitialized pointers
// This catches the 0xa2a2a2a2a2a2a2a2 pattern bug (ASan/debug fill pattern)
// Even though mmap should return zeroed pages, sanitizers may fill with debug patterns
memset(ss->slabs, 0, max_slabs * sizeof(TinySlabMeta));
memset(ss->remote_heads, 0, max_slabs * sizeof(uintptr_t));
memset(ss->remote_counts, 0, max_slabs * sizeof(uint32_t));
memset(ss->slab_listed, 0, max_slabs * sizeof(uint32_t));
for (int i = 0; i < max_slabs; i++) {
// Phase 1: Atomic initialization (freelist + used are now _Atomic)
slab_freelist_store_relaxed(&ss->slabs[i], NULL); // Explicit NULL (redundant after memset, but clear intent)
atomic_store_explicit(&ss->slabs[i].used, 0, memory_order_relaxed);
ss->slabs[i].capacity = 0;
ss->slabs[i].owner_tid_low = 0;
// Initialize remote queue atomics (memset already zeroed, but use proper atomic init)
atomic_store_explicit(&ss->remote_heads[i], 0, memory_order_relaxed);
atomic_store_explicit(&ss->remote_counts[i], 0, memory_order_relaxed);
atomic_store_explicit(&ss->slab_listed[i], 0, memory_order_relaxed);
}
if (from_cache) {
ss_stats_cache_reuse();
}
// Phase 8.3: Update ACE current_lg to match allocated size
g_ss_ace[size_class].current_lg = lg;
// Phase 1: Register SuperSlab in global registry for fast lookup
// CRITICAL: Register AFTER full initialization (ss structure is ready)
uintptr_t base = (uintptr_t)ss;
if (!hak_super_register(base, ss)) {
// Registry full - this is a fatal error
fprintf(stderr, "HAKMEM FATAL: SuperSlab registry full, cannot register %p\n", ss);
// Still return ss to avoid memory leak, but lookups may fail
}
return ss;
}
// ============================================================================
// Phase 2a: Dynamic Expansion - Chunk Management Functions
// ============================================================================
// Initialize SuperSlabHead for a class
SuperSlabHead* init_superslab_head(int class_idx) {
if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
return NULL;
}
// Allocate SuperSlabHead structure
SuperSlabHead* head = (SuperSlabHead*)calloc(1, sizeof(SuperSlabHead));
if (!head) {
extern __thread int g_hakmem_lock_depth;
g_hakmem_lock_depth++;
fprintf(stderr, "[HAKMEM] CRITICAL: Failed to allocate SuperSlabHead for class %d\n", class_idx);
g_hakmem_lock_depth--;
return NULL;
}
head->class_idx = (uint8_t)class_idx;
atomic_store_explicit(&head->total_chunks, 0, memory_order_relaxed);
head->first_chunk = NULL;
head->current_chunk = NULL;
pthread_mutex_init(&head->expansion_lock, NULL);
// Allocate initial chunk(s)
// Phase 2a: Start with 1 chunk for all classes (expansion will handle growth).
// This reduces startup memory overhead while still allowing unlimited growth.
// (An earlier design gave hot classes 1, 4, 6 two initial chunks to reduce contention.)
int initial_chunks = 1;
for (int i = 0; i < initial_chunks; i++) {
if (expand_superslab_head(head) < 0) {
extern __thread int g_hakmem_lock_depth;
g_hakmem_lock_depth++;
fprintf(stderr, "[HAKMEM] CRITICAL: Failed to allocate initial chunk %d for class %d\n",
i, class_idx);
g_hakmem_lock_depth--;
// Cleanup on failure
SuperSlab* chunk = head->first_chunk;
while (chunk) {
SuperSlab* next = chunk->next_chunk;
superslab_free(chunk);
chunk = next;
}
pthread_mutex_destroy(&head->expansion_lock);
free(head);
return NULL;
}
}
extern __thread int g_hakmem_lock_depth;
g_hakmem_lock_depth++;
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "[HAKMEM] Initialized SuperSlabHead for class %d: %zu initial chunks\n",
class_idx, atomic_load_explicit(&head->total_chunks, memory_order_relaxed));
#endif
g_hakmem_lock_depth--;
return head;
}
// Expand SuperSlabHead by allocating and linking a new chunk
int expand_superslab_head(SuperSlabHead* head) {
if (!head) {
return -1;
}
// Allocate new chunk via existing superslab_allocate
SuperSlab* new_chunk = superslab_allocate(head->class_idx);
if (!new_chunk) {
#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)
extern __thread int g_hakmem_lock_depth;
g_hakmem_lock_depth++;
fprintf(stderr, "[HAKMEM] CRITICAL: Failed to allocate new chunk for class %d (system OOM)\n",
head->class_idx);
g_hakmem_lock_depth--;
#endif
return -1; // True OOM (system out of memory)
}
// CRITICAL FIX: Initialize slab 0 so bitmap != 0x00000000
// Phase 2a chunks must have at least one usable slab after allocation
size_t block_size = g_tiny_class_sizes[head->class_idx];
// Use pthread_self() directly since tiny_self_u32() is static inline in hakmem_tiny.c
uint32_t owner_tid = (uint32_t)(uintptr_t)pthread_self();
superslab_init_slab(new_chunk, 0, block_size, owner_tid);
// Initialize the next_chunk link to NULL
new_chunk->next_chunk = NULL;
// Thread-safe linking
pthread_mutex_lock(&head->expansion_lock);
if (head->current_chunk) {
// Find the tail of the list (optimization: could cache tail pointer)
SuperSlab* tail = head->current_chunk;
while (tail->next_chunk) {
tail = tail->next_chunk;
}
tail->next_chunk = new_chunk;
} else {
// First chunk
head->first_chunk = new_chunk;
}
// Update current chunk to new chunk (for fast allocation)
head->current_chunk = new_chunk;
// Increment total chunks atomically
size_t old_count = atomic_fetch_add_explicit(&head->total_chunks, 1, memory_order_relaxed);
size_t new_count = old_count + 1;
pthread_mutex_unlock(&head->expansion_lock);
#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)
extern __thread int g_hakmem_lock_depth;
g_hakmem_lock_depth++;
fprintf(stderr, "[HAKMEM] Expanded SuperSlabHead for class %d: %zu chunks now (bitmap=0x%08x)\n",
head->class_idx, new_count, new_chunk->slab_bitmap);
g_hakmem_lock_depth--;
#endif
return 0;
}
// Find which chunk a pointer belongs to
SuperSlab* find_chunk_for_ptr(void* ptr, int class_idx) {
if (!ptr || class_idx < 0 || class_idx >= TINY_NUM_CLASSES_SS) {
return NULL;
}
SuperSlabHead* head = g_superslab_heads[class_idx];
if (!head) {
return NULL;
}
uintptr_t ptr_addr = (uintptr_t)ptr;
// Walk the chunk list
SuperSlab* chunk = head->first_chunk;
while (chunk) {
// Check if ptr is within this chunk's memory range
// Each chunk is aligned to SUPERSLAB_SIZE (1MB or 2MB)
uintptr_t chunk_start = (uintptr_t)chunk;
size_t chunk_size = (size_t)1 << chunk->lg_size; // Use actual chunk size
uintptr_t chunk_end = chunk_start + chunk_size;
if (ptr_addr >= chunk_start && ptr_addr < chunk_end) {
// Found the chunk
return chunk;
}
chunk = chunk->next_chunk;
}
return NULL; // Not found in any chunk
}
// ============================================================================
// SuperSlab Deallocation
// ============================================================================
void superslab_free(SuperSlab* ss) {
if (!ss || ss->magic != SUPERSLAB_MAGIC) {
return; // Invalid SuperSlab
}
// ADD DEBUG LOGGING
static __thread int dbg = -1;
#if HAKMEM_BUILD_RELEASE
dbg = 0;
#else
if (__builtin_expect(dbg == -1, 0)) {
const char* e = getenv("HAKMEM_SS_FREE_DEBUG");
dbg = (e && *e && *e != '0') ? 1 : 0;
}
#endif
if (dbg == 1) {
fprintf(stderr, "[SS_FREE] CALLED: ss=%p lg_size=%d active_slabs=%u\n",
(void*)ss, ss->lg_size, ss->active_slabs);
}
// Phase 9: Lazy Deallocation - try to cache in LRU instead of munmap
size_t ss_size = (size_t)1 << ss->lg_size;
// Phase 1: Unregister SuperSlab from registry FIRST
// CRITICAL: Must unregister BEFORE adding to LRU cache
// Reason: Cached SuperSlabs should NOT be found by lookups
uintptr_t base = (uintptr_t)ss;
hak_super_unregister(base);
// Memory fence to ensure unregister is visible
atomic_thread_fence(memory_order_release);
// Phase 9: Try LRU cache first (lazy deallocation)
// NOTE: LRU cache keeps magic=SUPERSLAB_MAGIC for validation
// Magic will be cleared on eviction or reuse
int lru_cached = hak_ss_lru_push(ss);
if (dbg == 1) {
fprintf(stderr, "[SS_FREE] hak_ss_lru_push() returned %d\n", lru_cached);
}
if (lru_cached) {
// Successfully cached in LRU - defer munmap
return;
}
// LRU cache full or disabled - try the old per-class cache.
// NOTE: Phase 12 removed per-SS size_class from the header, so this push always uses bucket 0.
int old_cached = ss_cache_push(0, ss);
if (old_cached) {
ss_stats_cache_store();
return;
}
// Both caches full - immediately free to OS (eager deallocation)
// Clear magic to prevent use-after-free
ss->magic = 0;
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "[DEBUG ss_os_release] Freeing SuperSlab ss=%p size=%zu active=%u (LRU full)\n",
(void*)ss, ss_size,
atomic_load_explicit(&ss->total_active_blocks, memory_order_relaxed));
#endif
munmap(ss, ss_size);
// Update statistics for actual release to OS
pthread_mutex_lock(&g_superslab_lock);
g_superslabs_freed++;
// Phase 12: we no longer track per-SS size_class on header; skip g_ss_freed_by_class here
g_bytes_allocated -= ss_size;
pthread_mutex_unlock(&g_superslab_lock);
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "[DEBUG ss_os_release] g_superslabs_freed now = %llu\n",
(unsigned long long)g_superslabs_freed);
#endif
}
// ============================================================================
// Slab Initialization within SuperSlab
// ============================================================================
void superslab_init_slab(SuperSlab* ss, int slab_idx, size_t block_size, uint32_t owner_tid)
{
if (!ss || slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) {
return;
}
// Phase E1-CORRECT unified geometry:
// - block_size is the TOTAL stride for this class (g_tiny_class_sizes[cls])
// - usable bytes are determined by slab index (slab0 vs others)
// - capacity = usable / stride for ALL classes (including former C7)
size_t usable_size = (slab_idx == 0)
? SUPERSLAB_SLAB0_USABLE_SIZE
: SUPERSLAB_SLAB_USABLE_SIZE;
size_t stride = block_size;
uint16_t capacity = (uint16_t)(usable_size / stride);
#if !HAKMEM_BUILD_RELEASE
if (slab_idx == 0) {
fprintf(stderr,
"[SUPERSLAB_INIT] slab 0: usable_size=%zu stride=%zu capacity=%u\n",
usable_size, stride, (unsigned)capacity);
}
#endif
TinySlabMeta* meta = &ss->slabs[slab_idx];
meta->freelist = NULL; // NULL = linear allocation mode
meta->used = 0;
meta->active = 0; // P1.3: blocks in use by user (starts at 0)
meta->capacity = capacity;
meta->carved = 0;
// LARSON FIX: Use bits 8-15 instead of 0-7 since pthread TIDs are aligned to 256 bytes
meta->owner_tid_low = (uint8_t)((owner_tid >> 8) & 0xFFu);
// Fail-safe: stamp class_idx from geometry (stride → class).
// This ensures legacy/shared/legacy-refill paths all end with a correct class.
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
if (g_tiny_class_sizes[i] == stride) {
meta->class_idx = (uint8_t)i;
// P1.1: Update class_map for out-of-band lookup on free path
ss->class_map[slab_idx] = (uint8_t)i;
break;
}
}
superslab_activate_slab(ss, slab_idx);
}
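/*
 * Illustrative capacity arithmetic for the geometry above (hypothetical
 * numbers, not the real constants): with usable_size = 65536 bytes and a
 * 32-byte stride, capacity = 65536 / 32 = 2048 blocks. Slab 0 uses the
 * smaller SUPERSLAB_SLAB0_USABLE_SIZE, presumably because its data region
 * starts at SUPERSLAB_SLAB0_DATA_OFFSET, past the chunk's header/metadata.
 */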
// ============================================================================
// Slab Bitmap Management
// ============================================================================
void superslab_activate_slab(SuperSlab* ss, int slab_idx) {
if (!ss || slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) {
return;
}
uint32_t mask = 1u << slab_idx;
if ((ss->slab_bitmap & mask) == 0) {
ss->slab_bitmap |= mask;
ss->active_slabs++;
// Phase 3d-C: Update hot/cold indices after activating new slab
ss_update_hot_cold_indices(ss);
}
}
void superslab_deactivate_slab(SuperSlab* ss, int slab_idx) {
if (!ss || slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) {
return;
}
uint32_t mask = 1u << slab_idx;
if (ss->slab_bitmap & mask) {
ss->slab_bitmap &= ~mask;
ss->active_slabs--;
}
}
int superslab_find_free_slab(SuperSlab* ss) {
if (!ss) return -1;
if ((int)ss->active_slabs >= ss_slabs_capacity(ss)) {
return -1; // No free slabs
}
// Find first 0 bit in bitmap
int cap = ss_slabs_capacity(ss);
for (int i = 0; i < cap; i++) {
if ((ss->slab_bitmap & (1u << i)) == 0) {
return i;
}
}
return -1;
}
// ============================================================================
// Statistics / Debugging
// ============================================================================
void superslab_print_stats(SuperSlab* ss) {
if (!ss || ss->magic != SUPERSLAB_MAGIC) {
printf("Invalid SuperSlab\n");
return;
}
printf("=== SuperSlab Stats ===\n");
printf("Address: %p\n", (void*)ss);
// Phase 12: per-SS size_class removed; classes are per-slab via meta->class_idx.
printf("Active slabs: %u / %d\n", ss->active_slabs, ss_slabs_capacity(ss));
printf("Bitmap: 0x%08X\n", ss->slab_bitmap);
printf("\nPer-slab details:\n");
for (int i = 0; i < ss_slabs_capacity(ss); i++) {
if (ss->slab_bitmap & (1u << i)) {
TinySlabMeta* meta = &ss->slabs[i];
printf(" Slab %2d: used=%u/%u freelist=%p class=%u owner_tid_low=%u\n",
i, meta->used, meta->capacity, meta->freelist,
(unsigned)meta->class_idx, (unsigned)meta->owner_tid_low);
}
}
printf("\n");
}
// Global statistics
void superslab_print_global_stats(void) {
pthread_mutex_lock(&g_superslab_lock);
printf("=== Global SuperSlab Stats ===\n");
printf("SuperSlabs allocated: %lu\n", g_superslabs_allocated);
printf("SuperSlabs freed: %lu\n", g_superslabs_freed);
printf("SuperSlabs active: %lu\n", g_superslabs_allocated - g_superslabs_freed);
printf("Total bytes allocated: %lu MB\n", g_bytes_allocated / (1024 * 1024));
pthread_mutex_unlock(&g_superslab_lock);
}
// ============================================================================
// Phase 8.3: ACE Statistics / Debugging
// ============================================================================
void superslab_ace_print_stats(void) {
printf("=== ACE (Adaptive Cache Engine) Stats ===\n");
const char* class_names[8] = {"8B", "16B", "24B", "32B", "40B", "48B", "56B", "64B"};
printf("Class Curr Targ Hot Allocs Refills Spills LiveBlks\n");
printf("--------------------------------------------------------------\n");
for (int i = 0; i < TINY_NUM_CLASSES_SS; i++) {
SuperSlabACEState* c = &g_ss_ace[i];
printf("%-6s %2uMB %2uMB %4u %7u %8u %7u %9u\n",
class_names[i],
(1u << c->current_lg) / (1024 * 1024),
(1u << c->target_lg) / (1024 * 1024),
c->hot_score,
c->alloc_count,
c->refill_count,
c->spill_count,
c->live_blocks);
}
printf("\n");
}
// ============================================================================
// Phase 8.3: ACE Tick Function (Promotion/Demotion Logic)
// ============================================================================
#define ACE_TICK_NS (150ULL * 1000 * 1000) // 150ms tick interval
#define ACE_COOLDOWN_NS (800ULL * 1000 * 1000) // 0.8s cooldown (anti-oscillation)
// Simplified thresholds for refill activity
#define HI_REFILL(k) (g_ss_ace[k].refill_count > 64) // High refill rate
#define MID_REFILL(k) (g_ss_ace[k].refill_count > 16) // Medium refill rate
// Object sizes per class (for capacity calculation)
// Must match TINY size classes: 8, 16, 24, 32, 40, 48, 56, 64 bytes
static const int g_tiny_obj_sizes[TINY_NUM_CLASSES_SS] = {8, 16, 24, 32, 40, 48, 56, 64};
void hak_tiny_superslab_ace_tick(int k, uint64_t now) {
if (k < 0 || k >= TINY_NUM_CLASSES_SS) return;
SuperSlabACEState* c = &g_ss_ace[k];
// Rate limiting: only tick every ACE_TICK_NS (~150ms)
if (now - c->last_tick_ns < ACE_TICK_NS) return;
// Calculate capacity for 1MB and 2MB SuperSlabs
int obj_size = g_tiny_obj_sizes[k];
double cap1MB = (double)((1U << 20) / obj_size); // 1MB capacity
double cap2MB = (double)((1U << 21) / obj_size); // 2MB capacity
// Calculate hotness score (weighted: 60% live blocks, 40% refill rate)
double hot = 0.6 * (double)c->live_blocks + 0.4 * (double)c->refill_count;
if (hot < 0) hot = 0;
if (hot > 1000) hot = 1000;
c->hot_score = (uint16_t)hot;
// Cooldown mechanism: prevent size changes within 0.8s of last change
static uint64_t last_switch_ns[TINY_NUM_CLASSES_SS] = {0};
if (now - last_switch_ns[k] >= ACE_COOLDOWN_NS) {
if (c->current_lg <= 20) {
// Promotion condition: 1MB → 2MB
// High demand (live > 75% capacity) AND high refill rate
if (c->live_blocks > 0.75 * cap1MB && HI_REFILL(k)) {
c->target_lg = 21; // Promote to 2MB
last_switch_ns[k] = now;
}
} else {
// Demotion condition: 2MB → 1MB
// Low demand (live < 35% capacity) AND low refill rate
if (c->live_blocks < 0.35 * cap2MB && !MID_REFILL(k)) {
c->target_lg = 20; // Demote to 1MB
last_switch_ns[k] = now;
}
}
}
// EMA-style decay for counters (reduce by 75% each tick)
c->alloc_count = c->alloc_count / 4;
c->refill_count = c->refill_count / 4;
c->spill_count = c->spill_count / 4;
// live_blocks is updated incrementally by alloc/free, not decayed here
c->last_tick_ns = now;
}
// ============================================================================
// Phase 8.4: ACE Observer (Registry-based, zero hot-path overhead)
// ============================================================================
// Global debug flag (set once at initialization)
static int g_ace_debug = 0;
// Registry-based observation: scan all SuperSlabs for usage stats
static void ace_observe_and_decide(int k) {
if (k < 0 || k >= TINY_NUM_CLASSES_SS) return;
SuperSlabACEState* c = &g_ss_ace[k];
// Scan Registry to count SuperSlabs and total live blocks
int ss_count = 0;
uint32_t total_live = 0;
for (int i = 0; i < SUPER_REG_SIZE; i++) {
SuperRegEntry* e = &g_super_reg[i];
// Atomic read (thread-safe)
uintptr_t base = atomic_load_explicit(
(_Atomic uintptr_t*)&e->base,
memory_order_acquire);
if (base == 0) continue; // Empty slot
// Phase 8.4: Safety check - skip if ss pointer is invalid
if (!e->ss) continue;
// Phase 12: per-SS size_class removed; registry entries are per-class by construction.
ss_count++;
// Phase 8.4: Scan all slabs to count used blocks (zero hot-path overhead)
uint32_t ss_live = 0;
int cap_scan = ss_slabs_capacity(e->ss);
for (int slab_idx = 0; slab_idx < cap_scan; slab_idx++) {
TinySlabMeta* meta = &e->ss->slabs[slab_idx];
// Relaxed read is OK (stats only, no hot-path impact)
ss_live += meta->used;
}
total_live += ss_live;
}
// Calculate utilization
int obj_size = g_tiny_obj_sizes[k];
uint8_t current_lg = atomic_load_explicit(
(_Atomic uint8_t*)&c->current_lg,
memory_order_relaxed);
uint32_t capacity = (ss_count > 0) ? ss_count * ((1U << current_lg) / obj_size) : 1;
double util = (double)total_live / capacity;
// Update hot_score (for debugging/visualization)
c->hot_score = (uint16_t)(util * 1000);
if (c->hot_score > 1000) c->hot_score = 1000;
// Promotion/Demotion decision
uint8_t new_target = current_lg;
if (current_lg <= 20) {
// Promotion: 1MB → 2MB
if (util > 0.75) {
new_target = 21;
}
} else {
// Demotion: 2MB → 1MB
if (util < 0.35) {
new_target = 20;
}
}
// Debug output (if enabled)
if (g_ace_debug && ss_count > 0) {
fprintf(stderr, "[ACE] Class %d (%dB): ss=%d live=%u cap=%u util=%.2f%% lg=%d->%d hot=%d\n",
k, obj_size, ss_count, total_live, capacity, util * 100.0,
current_lg, new_target, c->hot_score);
}
// Atomic write (thread-safe)
if (new_target != current_lg) {
atomic_store_explicit(
(_Atomic uint8_t*)&c->target_lg,
new_target,
memory_order_release);
if (g_ace_debug) {
fprintf(stderr, "[ACE] *** Class %d: SIZE CHANGE %dMB -> %dMB (util=%.2f%%)\n",
k, 1 << (current_lg - 20), 1 << (new_target - 20), util * 100.0);
}
}
}
// Called from Learner thread (background observation)
void hak_tiny_superslab_ace_observe_all(void) {
// Initialize debug flag once
static int initialized = 0;
if (!initialized) {
const char* ace_debug = getenv("HAKMEM_ACE_DEBUG");
g_ace_debug = (ace_debug && atoi(ace_debug) != 0) ? 1 : 0;
initialized = 1;
}
for (int k = 0; k < TINY_NUM_CLASSES_SS; k++) {
ace_observe_and_decide(k);
}
}