// tiny_superslab_alloc.inc.h - SuperSlab Allocation Layer
// Purpose: Slab allocation, refill, and adoption logic
// Extracted from: hakmem_tiny_free.inc lines 626-1170
// Box Theory: Box 4 (Refill/Adoption) integration
//
// Public functions:
//   - superslab_alloc_from_slab(): Allocate from a specific slab (linear or freelist)
//   - superslab_refill():          Refill the TLS slab (adoption, registry scan, fresh alloc)
//   - hak_tiny_alloc_superslab():  Main SuperSlab allocation entry point

// ============================================================================
// Phase 6.23: SuperSlab Allocation Helpers
// ============================================================================

// Phase 6.24: Allocate from a SuperSlab slab (lazy freelist + linear allocation)
#include "hakmem_tiny_superslab_constants.h"

static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx) {
    TinySlabMeta* meta = &ss->slabs[slab_idx];

    // Ensure the remote queue is drained before handing blocks back to TLS (unlikely in 1T)
    if (__builtin_expect(atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0, 0)) {
        uint32_t self_tid = tiny_self_u32();
        SlabHandle h = slab_try_acquire(ss, slab_idx, self_tid);
        if (slab_is_valid(&h)) {
            slab_drain_remote_full(&h);
            int pending = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0;
            if (__builtin_expect(pending, 0)) {
                if (__builtin_expect(g_debug_remote_guard, 0)) {
                    uintptr_t head = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_relaxed);
                    tiny_remote_watch_note("alloc_pending_remote", ss, slab_idx, (void*)head, 0xA243u, self_tid, 0);
                }
                slab_release(&h);
                return NULL;
            }
            slab_release(&h);
        } else {
            if (__builtin_expect(g_debug_remote_guard, 0)) {
                tiny_remote_watch_note("alloc_acquire_fail", ss, slab_idx, meta, 0xA244u, self_tid, 0);
            }
            return NULL;
        }
    }
    if (__builtin_expect(g_debug_remote_guard, 0)) {
        uintptr_t head_pending = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire);
        if (head_pending != 0) {
            tiny_remote_watch_note("alloc_remote_pending", ss, slab_idx, (void*)head_pending, 0xA247u, tiny_self_u32(), 1);
            return NULL;
        }
    }

    // Phase 6.24: Linear allocation mode (freelist == NULL).
    // This avoids the 4000-8000 cycle cost of building the freelist on init.
    if (__builtin_expect(meta->freelist == NULL && meta->used < meta->capacity, 1)) {
        // Linear allocation: use the canonical tiny_slab_base_for() only
        size_t unit_sz = g_tiny_class_sizes[ss->size_class]
#if HAKMEM_TINY_HEADER_CLASSIDX
                         + ((ss->size_class != 7) ? 1 : 0)
#endif
                         ;
        uint8_t* base = tiny_slab_base_for(ss, slab_idx);
        void* block_base = (void*)(base + ((size_t)meta->used * unit_sz));
#if !HAKMEM_BUILD_RELEASE
        // Debug safety: never carve past the slab's usable region (capacity mismatch guard)
        size_t dbg_usable = (slab_idx == 0) ? SUPERSLAB_SLAB0_USABLE_SIZE : SUPERSLAB_SLAB_USABLE_SIZE;
        uintptr_t dbg_off = (uintptr_t)((uint8_t*)block_base - base);
        if (__builtin_expect(dbg_off + unit_sz > dbg_usable, 0)) {
            fprintf(stderr, "[TINY_ALLOC_BOUNDS] cls=%u slab=%d used=%u cap=%u unit=%zu off=%lu usable=%zu\n",
                    ss->size_class, slab_idx, meta->used, meta->capacity, unit_sz,
                    (unsigned long)dbg_off, dbg_usable);
            return NULL;
        }
#endif
        meta->used++;
        void* user =
#if HAKMEM_TINY_HEADER_CLASSIDX
            tiny_region_id_write_header(block_base, ss->size_class);
#else
            block_base;
#endif
        if (__builtin_expect(g_debug_remote_guard, 0)) {
            tiny_remote_track_on_alloc(ss, slab_idx, user, "linear_alloc", 0);
            tiny_remote_assert_not_remote(ss, slab_idx, user, "linear_alloc_ret", 0);
        }
        return user;  // Fast path: O(1) pointer arithmetic
    }
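    // Worked example of the linear carve above (illustrative numbers only; real values come
    // from g_tiny_class_sizes[] and the HAKMEM_TINY_HEADER_CLASSIDX setting): assuming a
    // 16-byte class with a 1-byte class-index header, unit_sz = 17, so the N-th carve
    // returns base + N*17 and tiny_region_id_write_header() places the header byte before
    // the user pointer is handed out.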
    // Freelist mode (after the first free())
    if (__builtin_expect(meta->freelist != NULL, 0)) {
        void* block = meta->freelist;

        // CORRUPTION DEBUG: Validate the freelist head before popping
        if (__builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
            size_t blk = g_tiny_class_sizes[ss->size_class];
            uint8_t* slab_base = tiny_slab_base_for(ss, slab_idx);
            uintptr_t block_addr = (uintptr_t)block;
            uintptr_t slab_addr = (uintptr_t)slab_base;
            uintptr_t offset = block_addr - slab_addr;
            fprintf(stderr, "[ALLOC_POP] cls=%u slab=%d block=%p offset=%zu (used=%u cap=%u)\n",
                    ss->size_class, slab_idx, block, (size_t)offset, meta->used, meta->capacity);
            if (offset % blk != 0) {
                fprintf(stderr, "[ALLOC_CORRUPT] Freelist head is misaligned! block=%p offset=%zu blk=%zu\n",
                        block, (size_t)offset, blk);
                fprintf(stderr, "[ALLOC_CORRUPT] Expected alignment: %zu, actual: %zu\n", blk, (size_t)(offset % blk));
                tiny_failfast_abort_ptr("alloc_pop_misalign", ss, slab_idx, block, "freelist_head_corrupt");
            }
            size_t index = offset / blk;
            if (index >= meta->capacity) {
                fprintf(stderr, "[ALLOC_CORRUPT] Freelist head out of bounds! block=%p index=%zu cap=%u\n",
                        block, index, meta->capacity);
                tiny_failfast_abort_ptr("alloc_pop_oob", ss, slab_idx, block, "freelist_head_oob");
            }
        }

        meta->freelist = *(void**)block;  // Pop from the freelist
        meta->used++;
        if (__builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
            if (__builtin_expect(meta->used > meta->capacity, 0)) {
                fprintf(stderr, "[ALLOC_CORRUPT] meta->used overflow on freelist alloc: used=%u cap=%u cls=%u slab=%d\n",
                        meta->used, meta->capacity, ss->size_class, slab_idx);
                tiny_failfast_abort_ptr("alloc_used_overflow", ss, slab_idx, block, "freelist_used_over_capacity");
            }
        }
        if (__builtin_expect(g_debug_remote_guard, 0)) {
            tiny_remote_track_on_alloc(ss, slab_idx, block, "freelist_alloc", 0);
            tiny_remote_assert_not_remote(ss, slab_idx, block, "freelist_alloc_ret", 0);
        }
        return block;
    }

    return NULL;  // Slab is full
}

// Adopt helper: acquire → drain → bind (single boundary); returns 1 on success
static inline int adopt_bind_if_safe(TinyTLSSlab* tls, SuperSlab* ss, int slab_idx, int class_idx) {
    uint32_t self_tid = tiny_self_u32();
    SlabHandle h = slab_try_acquire(ss, slab_idx, self_tid);
    if (!slab_is_valid(&h)) return 0;
    slab_drain_remote_full(&h);
    if (__builtin_expect(slab_is_safe_to_bind(&h), 1)) {
        // Optional: move a few nodes to the front SLL to boost the next hits
        tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx);
        tiny_tls_bind_slab(tls, h.ss, h.slab_idx);
        // Ownership is now associated with the TLS slab; release the handle bookkeeping
        slab_release(&h);
        return 1;
    }
    slab_release(&h);
    return 0;
}
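// adopt_bind_if_safe() is the single ownership boundary used by every adoption path in this
// file. A minimal caller sketch (mirroring the registry scan in superslab_refill(); the
// surrounding variables are assumed to be in scope, and the block is not compiled):
#if 0
    for (int s = 0; s < ss_slabs_capacity(ss); s++) {
        if (!ss->slabs[s].freelist) continue;           // only slabs with freed blocks are worth adopting
        if (adopt_bind_if_safe(tls, ss, s, class_idx)) {
            return ss;                                  // TLS now owns slab s; remote frees already drained
        }
    }
#endif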
// Phase 6.24 & 7.6: Refill the TLS SuperSlab (unified TLS cache + deferred allocation)
static SuperSlab* superslab_refill(int class_idx) {
#if HAKMEM_DEBUG_COUNTERS
    g_superslab_refill_calls_dbg[class_idx]++;
#endif
    TinyTLSSlab* tls = &g_tls_slabs[class_idx];

    // ========================================================================
    // Phase 2a: Dynamic Expansion - Initialize the SuperSlabHead if needed
    // ========================================================================
    extern SuperSlabHead* g_superslab_heads[TINY_NUM_CLASSES_SS];
    extern SuperSlabHead* init_superslab_head(int class_idx);
    extern int expand_superslab_head(SuperSlabHead* head);

    SuperSlabHead* head = g_superslab_heads[class_idx];
    if (!head) {
        // First-time initialization for this class
        head = init_superslab_head(class_idx);
        if (!head) {
            extern __thread int g_hakmem_lock_depth;
            g_hakmem_lock_depth++;
            fprintf(stderr, "[DEBUG] superslab_refill: Failed to init SuperSlabHead for class %d\n", class_idx);
            g_hakmem_lock_depth--;
            return NULL;  // Critical failure
        }
        g_superslab_heads[class_idx] = head;
    }

    // Try the current chunk first (fast path)
    SuperSlab* current_chunk = head->current_chunk;
    if (current_chunk) {
        // Check whether the current chunk has available slabs.
        // Bitmap semantics: 0 = FREE, 1 = OCCUPIED
        //   - 0x00000000 = all free (32 available)
        //   - 0xFFFFFFFF = all occupied (0 available)
        int chunk_cap = ss_slabs_capacity(current_chunk);
        uint32_t full_mask = (chunk_cap >= 32) ? 0xFFFFFFFF : ((1U << chunk_cap) - 1);
        if (current_chunk->slab_bitmap != full_mask) {
            // The current chunk has free slabs; the normal refill logic below
            // (which checks tls->ss) will use it.
            if (tls->ss != current_chunk) {
                // Update TLS to point at the current chunk
                tls->ss = current_chunk;
            }
        } else {
            // Current chunk exhausted (all slabs occupied): try to expand
#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)
            extern __thread int g_hakmem_lock_depth;
            g_hakmem_lock_depth++;
            fprintf(stderr, "[HAKMEM] SuperSlab chunk exhausted for class %d (bitmap=0x%08x), expanding...\n",
                    class_idx, current_chunk->slab_bitmap);
            g_hakmem_lock_depth--;
#endif
            // Protect expansion with a global lock (race condition fix)
            static pthread_mutex_t expand_lock = PTHREAD_MUTEX_INITIALIZER;
            pthread_mutex_lock(&expand_lock);

            // Re-check after acquiring the lock (another thread may have expanded)
            current_chunk = head->current_chunk;
            uint32_t recheck_mask = (ss_slabs_capacity(current_chunk) >= 32)
                                        ? 0xFFFFFFFF
                                        : ((1U << ss_slabs_capacity(current_chunk)) - 1);
            if (current_chunk->slab_bitmap == recheck_mask) {
                // Still exhausted: expand now
                if (expand_superslab_head(head) < 0) {
                    pthread_mutex_unlock(&expand_lock);
#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)
                    g_hakmem_lock_depth++;
                    fprintf(stderr, "[HAKMEM] CRITICAL: Failed to expand SuperSlabHead for class %d (system OOM)\n", class_idx);
                    g_hakmem_lock_depth--;
#endif
                    return NULL;  // True system OOM
                }
#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)
                g_hakmem_lock_depth++;
                fprintf(stderr, "[HAKMEM] Successfully expanded SuperSlabHead for class %d\n", class_idx);
                g_hakmem_lock_depth--;
#endif
            }

            // Update current_chunk and tls->ss to point at the (potentially new) chunk
            current_chunk = head->current_chunk;
            tls->ss = current_chunk;
            pthread_mutex_unlock(&expand_lock);

            // Verify the chunk exists and has free slabs (NULL check before dereferencing)
            full_mask = current_chunk
                            ? ((ss_slabs_capacity(current_chunk) >= 32)
                                   ? 0xFFFFFFFF
                                   : ((1U << ss_slabs_capacity(current_chunk)) - 1))
                            : 0;
            if (!current_chunk || current_chunk->slab_bitmap == full_mask) {
#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)
                g_hakmem_lock_depth++;
                fprintf(stderr, "[HAKMEM] CRITICAL: Chunk still has no free slabs for class %d after expansion\n", class_idx);
                g_hakmem_lock_depth--;
#endif
                return NULL;
            }
        }
    }

    // ========================================================================
    // Continue with the existing refill logic
    // ========================================================================
    static int g_ss_adopt_en = -1;  // env: HAKMEM_TINY_SS_ADOPT=1; default: auto-on if remote frees were seen
    if (g_ss_adopt_en == -1) {
        char* e = getenv("HAKMEM_TINY_SS_ADOPT");
        if (e) {
            g_ss_adopt_en = (*e != '0') ? 1 : 0;
        } else {
            extern _Atomic int g_ss_remote_seen;
            g_ss_adopt_en = (atomic_load_explicit(&g_ss_remote_seen, memory_order_relaxed) != 0) ? 1 : 0;
        }
    }
    extern int g_adopt_cool_period;
    extern __thread int g_tls_adopt_cd[];
    if (g_adopt_cool_period == -1) {
        char* cd = getenv("HAKMEM_TINY_SS_ADOPT_COOLDOWN");
        int v = (cd ? atoi(cd) : 0);
        if (v < 0) v = 0;
        if (v > 1024) v = 1024;
        g_adopt_cool_period = v;
    }
    static int g_superslab_refill_debug_once = 0;
    SuperSlab* prev_ss = tls->ss;
    TinySlabMeta* prev_meta = tls->meta;
    uint8_t prev_slab_idx = tls->slab_idx;
    uint8_t prev_active = prev_ss ? prev_ss->active_slabs : 0;
    uint32_t prev_bitmap = prev_ss ? prev_ss->slab_bitmap : 0;
    uint32_t prev_meta_used = prev_meta ? prev_meta->used : 0;
    uint32_t prev_meta_cap = prev_meta ? prev_meta->capacity : 0;
    int free_idx_attempted = -2;  // -2 = not evaluated, -1 = none found, >= 0 = chosen index
    int reused_slabs = 0;

    // Optional: mid-size simple refill to avoid multi-layer scans (class >= 4)
    do {
        static int g_mid_simple_warn = 0;
        if (class_idx >= 4 && tiny_mid_refill_simple_enabled()) {
            // If the current TLS has a SuperSlab, prefer taking a virgin slab directly
            if (tls->ss) {
                int tls_cap = ss_slabs_capacity(tls->ss);
                if (tls->ss->active_slabs < tls_cap) {
                    int free_idx = superslab_find_free_slab(tls->ss);
                    if (free_idx >= 0) {
                        uint32_t my_tid = tiny_self_u32();
                        superslab_init_slab(tls->ss, free_idx, g_tiny_class_sizes[class_idx], my_tid);
                        tiny_tls_bind_slab(tls, tls->ss, free_idx);
                        return tls->ss;
                    }
                }
            }
            // Otherwise allocate a fresh SuperSlab and bind its first slab
            SuperSlab* ssn = superslab_allocate((uint8_t)class_idx);
            if (!ssn) {
                if (!g_superslab_refill_debug_once && g_mid_simple_warn < 2) {
                    g_mid_simple_warn++;
                    int err = errno;
                    fprintf(stderr, "[DEBUG] mid_simple_refill OOM class=%d errno=%d\n", class_idx, err);
                }
                return NULL;
            }
            uint32_t my_tid = tiny_self_u32();
            superslab_init_slab(ssn, 0, g_tiny_class_sizes[class_idx], my_tid);
            SuperSlab* old = tls->ss;
            tiny_tls_bind_slab(tls, ssn, 0);
            superslab_ref_inc(ssn);
            if (old && old != ssn) {
                superslab_ref_dec(old);
            }
            return ssn;
        }
    } while (0);

    // First, try to adopt a published partial SuperSlab for this class
    if (g_ss_adopt_en) {
        if (g_adopt_cool_period > 0 && g_tls_adopt_cd[class_idx] > 0) {
            g_tls_adopt_cd[class_idx]--;  // still cooling down; adoption becomes eligible at 0
        }
        if (g_adopt_cool_period == 0 || g_tls_adopt_cd[class_idx] == 0) {
            SuperSlab* adopt = ss_partial_adopt(class_idx);
            if (adopt && adopt->magic == SUPERSLAB_MAGIC) {
                // ================================================================
                // Quick Win #2: First-Fit Adopt (vs. Best-Fit scoring of all 32 slabs)
                // For Larson, any slab with a freelist works; there is no need to score all 32.
                // Expected improvement: -3,000 cycles (from 32 atomic loads + 32 scores)
                // ================================================================
                int adopt_cap = ss_slabs_capacity(adopt);
                int best = -1;
                for (int s = 0; s < adopt_cap; s++) {
                    TinySlabMeta* m = &adopt->slabs[s];
                    // Quick check: does this slab have a freelist?
                    if (m->freelist) {
                        best = s;
                        break;  // First-fit: stop at the first slab with a freelist
                    }
                    // Optional: also check remote_heads to prioritize those slabs
                    // (for Larson the freelist check is sufficient)
                }
                if (best >= 0) {
                    if (adopt_bind_if_safe(tls, adopt, best, class_idx)) {
                        if (g_adopt_cool_period > 0) g_tls_adopt_cd[class_idx] = g_adopt_cool_period;
                        return adopt;
                    }
                }
                // If no freelist was found, ignore and continue (optional: republish)
            }
        }
    }

    // Phase 7.6 Step 4: Check the existing SuperSlab in priority order
    if (tls->ss) {
        // Priority 1: Reuse slabs with a freelist (already-freed blocks)
        int tls_cap = ss_slabs_capacity(tls->ss);
        uint32_t nonempty_mask = 0;
        do {
            static int g_mask_en = -1;
            if (__builtin_expect(g_mask_en == -1, 0)) {
                const char* e = getenv("HAKMEM_TINY_FREELIST_MASK");
                g_mask_en = (e && *e && *e != '0') ? 1 : 0;
            }
            if (__builtin_expect(g_mask_en, 0)) {
                nonempty_mask = atomic_load_explicit(&tls->ss->freelist_mask, memory_order_acquire);
                break;
            }
            for (int i = 0; i < tls_cap; i++) {
                if (tls->ss->slabs[i].freelist) nonempty_mask |= (1u << i);
            }
        } while (0);

        // O(1) lookup: scan the mask with ctz (a single instruction per candidate)
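        // Worked example (illustrative): nonempty_mask = 0b00100100 means slabs 2 and 5
        // currently have freelists; __builtin_ctz() yields 2 first, that bit is cleared,
        // and the next iteration yields 5. Each candidate costs one ctz plus one adopt attempt.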
        while (__builtin_expect(nonempty_mask != 0, 1)) {
            int i = __builtin_ctz(nonempty_mask);  // First non-empty slab (O(1))
            nonempty_mask &= ~(1u << i);           // Clear the bit for the next iteration

            // FIX #1 DELETED (race condition fix):
            // The previous drain without ownership caused concurrent freelist corruption.
            // Ownership protocol: MUST bind + owner_cas BEFORE draining (see Fix #3 in tiny_refill.h).
            // Remote frees are drained when the slab is adopted (see the tiny_refill.h paths).

            if (adopt_bind_if_safe(tls, tls->ss, i, class_idx)) {
                reused_slabs = 1;
                return tls->ss;
            }
        }

        // Priority 2: Use unused (virgin) slabs
        if (tls->ss->active_slabs < tls_cap) {
            // Find the next free slab
            int free_idx = superslab_find_free_slab(tls->ss);
            free_idx_attempted = free_idx;
            if (free_idx >= 0) {
                // Initialize this slab
                uint32_t my_tid = tiny_self_u32();
                superslab_init_slab(tls->ss, free_idx, g_tiny_class_sizes[class_idx], my_tid);
                // Update the TLS cache (unified update)
                tiny_tls_bind_slab(tls, tls->ss, free_idx);
                return tls->ss;
            }
        }
    }

    // Try to adopt a partial SuperSlab from the registry (one-shot, cheap scan).
    // This reduces the pressure to allocate a new SS when other threads have freed blocks.
    // Phase 6: Registry Optimization - use the per-class registry for an O(class_size) scan
    if (!tls->ss) {
        // Phase 6: Per-class registry (262K entries -> roughly 10-100 entries per class)
        extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS];
        extern int g_super_reg_class_size[TINY_NUM_CLASSES];
        const int scan_max = tiny_reg_scan_max();
        int reg_size = g_super_reg_class_size[class_idx];
        int scan_limit = (scan_max < reg_size) ? scan_max : reg_size;
        for (int i = 0; i < scan_limit; i++) {
            SuperSlab* ss = g_super_reg_by_class[class_idx][i];
            if (!ss || ss->magic != SUPERSLAB_MAGIC) continue;
            // Note: no class_idx check is needed (per-class registry)
            // Pick the first slab with a freelist (Box 4: adopt boundary helper)
            int reg_cap = ss_slabs_capacity(ss);
            for (int s = 0; s < reg_cap; s++) {
                if (ss->slabs[s].freelist) {
                    if (adopt_bind_if_safe(tls, ss, s, class_idx)) return ss;
                }
            }
        }
    }

    // Must-adopt-before-mmap gate: try sticky/hot/bench/mailbox/registry in a small window
    {
        SuperSlab* gate_ss = tiny_must_adopt_gate(class_idx, tls);
        if (gate_ss) return gate_ss;
    }

    // Allocate a new SuperSlab
    SuperSlab* ss = superslab_allocate((uint8_t)class_idx);
    if (!ss) {
        if (!g_superslab_refill_debug_once) {
            g_superslab_refill_debug_once = 1;
            int err = errno;
            // CRITICAL FIX (BUG #11): Protect fprintf() with lock_depth.
            // fprintf() can call malloc for buffering, so it must go through libc malloc.
            extern __thread int g_hakmem_lock_depth;
            g_hakmem_lock_depth++;
            fprintf(stderr,
                    "[DEBUG] superslab_refill returned NULL (OOM) detail: class=%d prev_ss=%p active=%u bitmap=0x%08x "
                    "prev_meta=%p used=%u cap=%u slab_idx=%u reused_freelist=%d free_idx=%d errno=%d\n",
                    class_idx, (void*)prev_ss, (unsigned)prev_active, prev_bitmap, (void*)prev_meta,
                    (unsigned)prev_meta_used, (unsigned)prev_meta_cap, (unsigned)prev_slab_idx,
                    reused_slabs, free_idx_attempted, err);
            g_hakmem_lock_depth--;
        }
        // Clear errno to avoid confusion in fallback paths
        errno = 0;
        return NULL;  // OOM
    }

    // Initialize the first slab
    uint32_t my_tid = tiny_self_u32();
    superslab_init_slab(ss, 0, g_tiny_class_sizes[class_idx], my_tid);

    // Cache in the unified TLS (releasing the previous SS reference)
    SuperSlab* old = tls->ss;
    tiny_tls_bind_slab(tls, ss, 0);
    // Maintain the refcount (count the TLS reference in preparation for future empty-SS reclamation)
    superslab_ref_inc(ss);
    if (old && old != ss) {
        superslab_ref_dec(old);
    }
    return ss;
}
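// Refill priority order (summary of superslab_refill() above):
//   1. Mid-size simple refill (class >= 4, gated by tiny_mid_refill_simple_enabled())
//   2. Adopt a published partial SuperSlab (HAKMEM_TINY_SS_ADOPT, cooldown-gated)
//   3. Current TLS SuperSlab: slabs with freelists first, then virgin slabs
//   4. Per-class registry scan (bounded by tiny_reg_scan_max())
//   5. Must-adopt gate (tiny_must_adopt_gate())
//   6. Fresh superslab_allocate() as the last resort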
// Phase 6.24: SuperSlab-based allocation (TLS unified, Medium fix)
static inline void* hak_tiny_alloc_superslab(int class_idx) {
    // DEBUG: Function entry trace (gated to avoid ring spam)
    do {
        static int g_alloc_ring = -1;
        if (__builtin_expect(g_alloc_ring == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_ALLOC_RING");
            g_alloc_ring = (e && *e && *e != '0') ? 1 : 0;
        }
        if (g_alloc_ring) {
            tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_ENTER, 0x01, (void*)(uintptr_t)class_idx, 0);
        }
    } while (0);

    // MidTC fast path: for 128..1024B (class >= 4), prefer the TLS tcache first
    do {
        void* mp = midtc_pop(class_idx);
        if (mp) {
            HAK_RET_ALLOC(class_idx, mp);
        }
    } while (0);

    // Phase 6.24: 1 TLS read (down from 3)
    TinyTLSSlab* tls = &g_tls_slabs[class_idx];
    TinySlabMeta* meta = tls->meta;
    int slab_idx = tls->slab_idx;

    if (meta && slab_idx >= 0 && tls->ss) {
        // CRITICAL: Verify class consistency BEFORE using tls->ss.
        // If tls->ss->size_class != class_idx, unbind and refill.
        if (tls->ss->size_class != class_idx) {
            // Class mismatch: TLS is bound to the wrong SuperSlab.
            // This happens when TLS was previously bound to a different class.
            tls->ss = NULL;
            tls->meta = NULL;
            tls->slab_idx = -1;
            tls->slab_base = NULL;
            meta = NULL;  // Force the refill path below
        } else {
            // Ensure the TLS view is consistent with the canonical slab_base
            uint8_t* canonical = tiny_slab_base_for(tls->ss, slab_idx);
            if (tls->slab_base != canonical) {
                tls->slab_base = canonical;
            }
        }

        // A/B: Relaxed read for the remote-head presence check.
        // Guarded: tls->ss is NULL here if the class-mismatch path above unbound the TLS slab.
        if (tls->ss) {
            static int g_alloc_remote_relax = -1;  // env: HAKMEM_TINY_ALLOC_REMOTE_RELAX=1 -> relaxed
            if (__builtin_expect(g_alloc_remote_relax == -1, 0)) {
                const char* e = getenv("HAKMEM_TINY_ALLOC_REMOTE_RELAX");
                g_alloc_remote_relax = (e && *e && *e != '0') ? 1 : 0;
            }
            uintptr_t pending = atomic_load_explicit(&tls->ss->remote_heads[slab_idx],
                                                     g_alloc_remote_relax ? memory_order_relaxed
                                                                          : memory_order_acquire);
            if (__builtin_expect(pending != 0, 0)) {
                uint32_t self_tid = tiny_self_u32();
                if (ss_owner_try_acquire(meta, self_tid)) {
                    _ss_remote_drain_to_freelist_unsafe(tls->ss, slab_idx, meta);
                }
            }
        }
    }

    // FIX #2 DELETED (race condition fix):
    // The previous drain-all-slabs without ownership caused concurrent freelist corruption.
    // Problem: thread A owns slab 5, thread B drains all slabs including 5 -> both modify the freelist -> crash.
    // Ownership protocol: MUST bind + owner_cas BEFORE draining (see Fix #3 in tiny_refill.h).
    // Remote frees are drained when the slab is adopted via the refill paths.

    // Fast path: direct metadata access (no repeated TLS reads)
    if (meta && meta->freelist == NULL && meta->used < meta->capacity && tls->slab_base) {
        // Linear allocation (lazy init)
        size_t block_size = g_tiny_class_sizes[tls->ss->size_class];
        uint8_t* base = tls->slab_base;  // tls->slab_base comes from tiny_slab_base_for(ss, slab_idx) (single source of truth)

        // ULTRATHINK DEBUG: Capture the 53-byte mystery
        if (tiny_refill_failfast_level() >= 3 && tls->ss->size_class == 7 && slab_idx == 0) {
            fprintf(stderr, "[ULTRA_53_DEBUG] === Before allocation ===\n");
            fprintf(stderr, "[ULTRA_53_DEBUG] ss=%p, slab_idx=%d, class=%d\n", (void*)tls->ss, slab_idx, tls->ss->size_class);
            fprintf(stderr, "[ULTRA_53_DEBUG] block_size=%zu, meta->used=%u, meta->capacity=%u\n",
                    block_size, (unsigned)meta->used, (unsigned)meta->capacity);
            fprintf(stderr, "[ULTRA_53_DEBUG] tls->slab_base=%p\n", (void*)base);
            fprintf(stderr, "[ULTRA_53_DEBUG] tiny_slab_base_for(ss,%d)=%p\n", slab_idx, (void*)tiny_slab_base_for(tls->ss, slab_idx));
            fprintf(stderr, "[ULTRA_53_DEBUG] sizeof(SuperSlab)=%zu\n", sizeof(SuperSlab));
            fprintf(stderr, "[ULTRA_53_DEBUG] Expected base should be: ss + %zu\n", sizeof(SuperSlab));
            fprintf(stderr, "[ULTRA_53_DEBUG] Actual base is: ss + 1024\n");
            fprintf(stderr, "[ULTRA_53_DEBUG] Base error: %zu - 1024 = %zu bytes\n",
                    sizeof(SuperSlab), sizeof(SuperSlab) - 1024);
        }

        void* block = (void*)(base + ((size_t)meta->used * block_size));

        // ULTRATHINK DEBUG: After calculation
        if (tiny_refill_failfast_level() >= 3 && tls->ss->size_class == 7 && slab_idx == 0) {
            size_t offset_from_ss = (uintptr_t)block - (uintptr_t)tls->ss;
            size_t expected_offset = 1024 + ((size_t)meta->used * block_size);
            fprintf(stderr, "[ULTRA_53_DEBUG] === Calculated block address ===\n");
            fprintf(stderr, "[ULTRA_53_DEBUG] block=%p\n", block);
            fprintf(stderr, "[ULTRA_53_DEBUG] offset from ss=%zu (0x%zx)\n", offset_from_ss, offset_from_ss);
            fprintf(stderr, "[ULTRA_53_DEBUG] expected offset=%zu (0x%zx)\n", expected_offset, expected_offset);
            fprintf(stderr, "[ULTRA_53_DEBUG] difference=%zd bytes\n", (ssize_t)offset_from_ss - (ssize_t)expected_offset);
        }

        meta->used++;

        // Fail-fast self-check (only active at fail-fast level >= 2)
        if (__builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
            uintptr_t base_ss = (uintptr_t)tls->ss;
            size_t ss_size = (size_t)1ULL << tls->ss->lg_size;
            uintptr_t limit_ss = base_ss + ss_size;
            uintptr_t p = (uintptr_t)block;
            size_t off = (p >= base_ss) ? (size_t)(p - base_ss) : 0;
            int in_range = (p >= base_ss) && (p < limit_ss);
            int aligned = ((p - (uintptr_t)base) % block_size) == 0;
            int idx_ok = (tls->slab_idx >= 0) && (tls->slab_idx < ss_slabs_capacity(tls->ss));
            if (!in_range || !aligned || !idx_ok || meta->used > (uint32_t)meta->capacity) {
                // Diagnostic log before abort
                fprintf(stderr, "[ALLOC_CARVE_BUG] cls=%u slab=%d used=%u cap=%u base=%p bs=%zu ptr=%p offset=%zu\n",
                        tls->ss->size_class, tls->slab_idx, meta->used, meta->capacity,
                        (void*)base, block_size, block, off);
                fprintf(stderr, "[ALLOC_CARVE_BUG] in_range=%d aligned=%d idx_ok=%d used_check=%d\n",
                        in_range, aligned, idx_ok, meta->used > (uint32_t)meta->capacity);
                fflush(stderr);
                tiny_failfast_abort_ptr("alloc_ret_align", tls->ss, tls->slab_idx, block,
                                        !in_range ? "out_of_range"
                                                  : (!aligned ? "misaligned"
                                                              : (!idx_ok ? "bad_slab_idx" : "over_capacity")));
            }
        }

        // Track active blocks in the SuperSlab for conservative reclamation
        ss_active_inc(tls->ss);
        // Route: slab linear
        ROUTE_MARK(11);
        ROUTE_COMMIT(class_idx, 0x60);
        HAK_RET_ALLOC(class_idx, block);  // Phase 8.4: Zero hot-path overhead
    }

    if (meta && meta->freelist) {
        // Freelist allocation
        void* block = meta->freelist;
        // Safety: bounds/alignment check (debug)
        if (__builtin_expect(g_tiny_safe_free, 0)) {
            size_t blk = g_tiny_class_sizes[tls->ss->size_class];
            uint8_t* base = tiny_slab_base_for(tls->ss, tls->slab_idx);
            uintptr_t delta = (uintptr_t)block - (uintptr_t)base;
            int align_ok = ((delta % blk) == 0);
            int range_ok = (delta / blk) < meta->capacity;
            if (!align_ok || !range_ok) {
                uintptr_t info = ((uintptr_t)(align_ok ? 1u : 0u) << 32) | (uint32_t)(range_ok ? 1u : 0u);
                tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)tls->ss->size_class, block, info | 0xA100u);
                if (g_tiny_safe_free_strict) {
                    raise(SIGUSR2);
                    return NULL;
                }
                return NULL;
            }
        }
        void* next = *(void**)block;
        meta->freelist = next;
        meta->used++;
        // Optional: clear the freelist bit when the list becomes empty
        do {
            static int g_mask_en = -1;
            if (__builtin_expect(g_mask_en == -1, 0)) {
                const char* e = getenv("HAKMEM_TINY_FREELIST_MASK");
                g_mask_en = (e && *e && *e != '0') ? 1 : 0;
            }
            if (__builtin_expect(g_mask_en, 0) && next == NULL) {
                uint32_t bit = (1u << slab_idx);
                atomic_fetch_and_explicit(&tls->ss->freelist_mask, ~bit, memory_order_release);
            }
        } while (0);
        // Track active blocks in the SuperSlab for conservative reclamation
        ss_active_inc(tls->ss);
        // Route: slab freelist
        ROUTE_MARK(12);
        ROUTE_COMMIT(class_idx, 0x61);
        HAK_RET_ALLOC(class_idx, block);  // Phase 8.4: Zero hot-path overhead
    }
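    // Slow-path contract: superslab_refill() rebinds tls->ss / tls->meta / tls->slab_base via
    // tiny_tls_bind_slab() on every success path. The retry below re-carves linearly from the
    // freshly bound slab and returns NULL if that slab offers no linear room.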
    // Slow path: Refill the TLS slab
    SuperSlab* ss = superslab_refill(class_idx);
    if (!ss) {
        static int log_oom = 0;
        if (log_oom < 2) {
            fprintf(stderr, "[DEBUG] superslab_refill returned NULL (OOM)\n");
            log_oom++;
        }
        return NULL;  // OOM
    }

    // Retry the allocation (metadata was already cached by superslab_refill)
    meta = tls->meta;

    // DEBUG: Check each condition (disabled for benchmarks)
    // static int log_retry = 0;
    // if (log_retry < 2) {
    //     fprintf(stderr, "[DEBUG] Retry alloc: meta=%p, freelist=%p, used=%u, capacity=%u, slab_base=%p\n",
    //             (void*)meta, meta ? meta->freelist : NULL,
    //             meta ? meta->used : 0, meta ? meta->capacity : 0,
    //             (void*)tls->slab_base);
    //     log_retry++;
    // }

    if (meta && meta->freelist == NULL && meta->used < meta->capacity && tls->slab_base) {
        size_t block_size = g_tiny_class_sizes[ss->size_class];
        void* block = (void*)(tls->slab_base + ((size_t)meta->used * block_size));
        // Disabled for benchmarks
        // static int log_success = 0;
        // if (log_success < 2) {
        //     fprintf(stderr, "[DEBUG] Superslab alloc SUCCESS: ptr=%p, class=%d, used=%u->%u\n",
        //             block, class_idx, meta->used, meta->used + 1);
        //     log_success++;
        // }
        meta->used++;
        // Track active blocks in the SuperSlab for conservative reclamation
        ss_active_inc(ss);
        HAK_RET_ALLOC(class_idx, block);  // Phase 8.4: Zero hot-path overhead
    }

    // Disabled for benchmarks
    // static int log_fail = 0;
    // if (log_fail < 2) {
    //     fprintf(stderr, "[DEBUG] Retry alloc FAILED - returning NULL\n");
    //     log_fail++;
    // }
    return NULL;
}
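// Usage sketch (illustrative only; the class-index lookup below is a hypothetical helper,
// not part of this file - real callers map the request size to a tiny class in the front
// end before calling hak_tiny_alloc_superslab()):
#if 0
    int cls = hypothetical_tiny_class_for_size(24);   // e.g. a 24-byte request -> some tiny class
    void* p = hak_tiny_alloc_superslab(cls);          // NULL when refill fails (e.g. true OOM)
    if (p) {
        /* ... use p ..., then release it through the matching tiny free path */
    }
#endif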