#include "tiny_remote.h"
#include "slab_handle.h"
#include "tiny_refill.h"
#include "tiny_tls_guard.h"
#include "mid_tcache.h"

extern __thread void* g_tls_sll_head[TINY_NUM_CLASSES];
extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];

#if !HAKMEM_BUILD_RELEASE
#include "hakmem_tiny_magazine.h"
#endif

extern int g_tiny_force_remote;

// ENV: HAKMEM_TINY_DRAIN_TO_SLL (0=off): at the adopt/bind boundary, splice up to N freelist nodes into the TLS SLL
static inline int tiny_drain_to_sll_budget(void) {
    static int v = -1;
    if (__builtin_expect(v == -1, 0)) {
        const char* s = getenv("HAKMEM_TINY_DRAIN_TO_SLL");
        int parsed = (s && *s) ? atoi(s) : 0;
        if (parsed < 0) parsed = 0;
        if (parsed > 256) parsed = 256;
        v = parsed;
    }
    return v;
}

static inline void tiny_drain_freelist_to_sll_once(SuperSlab* ss, int slab_idx, int class_idx) {
    int budget = tiny_drain_to_sll_budget();
    if (__builtin_expect(budget <= 0, 1)) return;
    if (!(ss && ss->magic == SUPERSLAB_MAGIC)) return;
    if (slab_idx < 0) return;
    TinySlabMeta* m = &ss->slabs[slab_idx];
    int moved = 0;
    while (m->freelist && moved < budget) {
        void* p = m->freelist;
        m->freelist = *(void**)p;
        *(void**)p = g_tls_sll_head[class_idx];
        g_tls_sll_head[class_idx] = p;
        g_tls_sll_count[class_idx]++;
        moved++;
    }
}

static inline int tiny_remote_queue_contains_guard(SuperSlab* ss, int slab_idx, void* target) {
    if (!ss || slab_idx < 0) return 0;
    uintptr_t cur = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire);
    int limit = 8192;
    while (cur && limit-- > 0) {
        if ((void*)cur == target) { return 1; }
        uintptr_t next;
        if (__builtin_expect(g_remote_side_enable, 0)) {
            next = tiny_remote_side_get(ss, slab_idx, (void*)cur);
        } else {
            next = atomic_load_explicit((_Atomic uintptr_t*)cur, memory_order_relaxed);
        }
        cur = next;
    }
    if (limit <= 0) {
        return 1; // fail-safe: treat unbounded traversal as duplicate
    }
    return 0;
}

// Phase 6.12.1: Free with pre-calculated slab (Option C - avoids duplicate lookup)
void hak_tiny_free_with_slab(void* ptr, TinySlab* slab) {
    // Phase 7.6: slab == NULL means SuperSlab mode (Magazine integration)
    if (!slab) {
        // SuperSlab path: Get class_idx from SuperSlab
        SuperSlab* ss = hak_super_lookup(ptr);
        if (!ss || ss->magic != SUPERSLAB_MAGIC) return;
        int class_idx = ss->size_class;
        size_t ss_size = (size_t)1ULL << ss->lg_size;
        uintptr_t ss_base = (uintptr_t)ss;
        if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) {
            tiny_debug_ring_record(TINY_RING_EVENT_SUPERSLAB_ADOPT_FAIL, (uint16_t)0xFFu, ss, (uintptr_t)ss->size_class);
            return;
        }
        // Optional: cross-lookup TinySlab owner and detect class mismatch early
        if (__builtin_expect(g_tiny_safe_free, 0)) {
            TinySlab* ts = hak_tiny_owner_slab(ptr);
            if (ts) {
                int ts_cls = ts->class_idx;
                if (ts_cls >= 0 && ts_cls < TINY_NUM_CLASSES && ts_cls != class_idx) {
                    uint32_t code = 0xAA00u | ((uint32_t)ts_cls & 0xFFu);
                    uintptr_t aux = tiny_remote_pack_diag(code, ss_base, ss_size, (uintptr_t)ptr);
                    tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)class_idx, ptr, aux);
                    if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; }
                }
            }
        }
        tiny_debug_ring_record(TINY_RING_EVENT_FREE_ENTER, (uint16_t)class_idx, ptr, 0);
        // Detect cross-thread: cross-thread free MUST go via superslab path
        int slab_idx = slab_index_for(ss, ptr);
        int ss_cap = ss_slabs_capacity(ss);
        if (__builtin_expect(slab_idx < 0 || slab_idx >= ss_cap, 0)) {
            tiny_debug_ring_record(TINY_RING_EVENT_SUPERSLAB_ADOPT_FAIL, (uint16_t)0xFEu, ss, (uintptr_t)slab_idx);
            return;
        }
        TinySlabMeta* meta = &ss->slabs[slab_idx];
        if
(__builtin_expect(g_tiny_safe_free, 0)) { size_t blk = g_tiny_class_sizes[class_idx]; uint8_t* base = tiny_slab_base_for(ss, slab_idx); uintptr_t delta = (uintptr_t)ptr - (uintptr_t)base; int cap_ok = (meta->capacity > 0) ? 1 : 0; int align_ok = (delta % blk) == 0; int range_ok = cap_ok && (delta / blk) < meta->capacity; if (!align_ok || !range_ok) { uint32_t code = 0xA104u; if (align_ok) code |= 0x2u; if (range_ok) code |= 0x1u; uintptr_t aux = tiny_remote_pack_diag(code, ss_base, ss_size, (uintptr_t)ptr); tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)class_idx, ptr, aux); if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } return; } } uint32_t self_tid = tiny_self_u32(); if (__builtin_expect(meta->owner_tid != self_tid, 0)) { // route directly to superslab (remote queue / freelist) uintptr_t ptr_val = (uintptr_t)ptr; uintptr_t ss_base = (uintptr_t)ss; size_t ss_size = (size_t)1ULL << ss->lg_size; if (__builtin_expect(ptr_val < ss_base || ptr_val >= ss_base + ss_size, 0)) { tiny_debug_ring_record(TINY_RING_EVENT_SUPERSLAB_ADOPT_FAIL, (uint16_t)0xFDu, ss, ptr_val); return; } tiny_debug_ring_record(TINY_RING_EVENT_FREE_REMOTE, (uint16_t)class_idx, ss, (uintptr_t)ptr); hak_tiny_free_superslab(ptr, ss); HAK_STAT_FREE(class_idx); return; } if (__builtin_expect(g_debug_fast0, 0)) { tiny_debug_ring_record(TINY_RING_EVENT_FRONT_BYPASS, (uint16_t)class_idx, ptr, (uintptr_t)slab_idx); void* prev = meta->freelist; *(void**)ptr = prev; meta->freelist = ptr; meta->used--; ss_active_dec_one(ss); if (prev == NULL) { ss_partial_publish((int)ss->size_class, ss); } tiny_debug_ring_record(TINY_RING_EVENT_FREE_LOCAL, (uint16_t)class_idx, ptr, (uintptr_t)slab_idx); HAK_STAT_FREE(class_idx); return; } if (g_fast_enable && g_fast_cap[class_idx] != 0) { if (tiny_fast_push(class_idx, ptr)) { tiny_debug_ring_record(TINY_RING_EVENT_FREE_FAST, (uint16_t)class_idx, ptr, slab_idx); HAK_STAT_FREE(class_idx); return; } } if (g_tls_list_enable) { TinyTLSList* tls = &g_tls_lists[class_idx]; uint32_t seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_relaxed); if (__builtin_expect(seq != g_tls_param_seen[class_idx], 0)) { tiny_tls_refresh_params(class_idx, tls); } // TinyHotMag front push(8/16/32B, A/B) if (__builtin_expect(g_hotmag_enable && class_idx <= 2, 1)) { if (hotmag_push(class_idx, ptr)) { tiny_debug_ring_record(TINY_RING_EVENT_FREE_RETURN_MAG, (uint16_t)class_idx, ptr, 1); HAK_STAT_FREE(class_idx); return; } } if (tls->count < tls->cap) { tiny_tls_list_guard_push(class_idx, tls, ptr); tls_list_push(tls, ptr); tiny_debug_ring_record(TINY_RING_EVENT_FREE_LOCAL, (uint16_t)class_idx, ptr, 0); HAK_STAT_FREE(class_idx); return; } seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_relaxed); if (__builtin_expect(seq != g_tls_param_seen[class_idx], 0)) { tiny_tls_refresh_params(class_idx, tls); } tiny_tls_list_guard_push(class_idx, tls, ptr); tls_list_push(tls, ptr); if (tls_list_should_spill(tls)) { tls_list_spill_excess(class_idx, tls); } tiny_debug_ring_record(TINY_RING_EVENT_FREE_LOCAL, (uint16_t)class_idx, ptr, 2); HAK_STAT_FREE(class_idx); return; } #if !HAKMEM_BUILD_RELEASE // SuperSlab uses Magazine for TLS caching (same as TinySlab) tiny_small_mags_init_once(); if (class_idx > 3) tiny_mag_init_if_needed(class_idx); TinyTLSMag* mag = &g_tls_mags[class_idx]; int cap = mag->cap; // 32/64B: SLL優先(mag優先は無効化) // Prefer TinyQuickSlot (compile-out if HAKMEM_TINY_NO_QUICK) #if !defined(HAKMEM_TINY_NO_QUICK) if (g_quick_enable && class_idx <= 4) { TinyQuickSlot* 
qs = &g_tls_quick[class_idx]; if (__builtin_expect(qs->top < QUICK_CAP, 1)) { qs->items[qs->top++] = ptr; HAK_STAT_FREE(class_idx); return; } } #endif // Fast path: TLS SLL push for hottest classes if (!g_tls_list_enable && g_tls_sll_enable && g_tls_sll_count[class_idx] < sll_cap_for_class(class_idx, (uint32_t)cap)) { *(void**)ptr = g_tls_sll_head[class_idx]; g_tls_sll_head[class_idx] = ptr; g_tls_sll_count[class_idx]++; // Active → Inactive: count down immediately (TLS保管中は"使用中"ではない) ss_active_dec_one(ss); HAK_TP1(sll_push, class_idx); tiny_debug_ring_record(TINY_RING_EVENT_FREE_LOCAL, (uint16_t)class_idx, ptr, 3); HAK_STAT_FREE(class_idx); return; } // Next: Magazine push(必要ならmag→SLLへバルク転送で空きを作る) // Hysteresis: allow slight overfill before deciding to spill under lock if (mag->top >= cap && g_spill_hyst > 0) { (void)bulk_mag_to_sll_if_room(class_idx, mag, cap / 2); } if (mag->top < cap + g_spill_hyst) { mag->items[mag->top].ptr = ptr; #if HAKMEM_TINY_MAG_OWNER mag->items[mag->top].owner = NULL; // SuperSlab owner not a TinySlab; leave NULL #endif mag->top++; #if HAKMEM_DEBUG_COUNTERS g_magazine_push_count++; // Phase 7.6: Track pushes #endif // Active → Inactive: decrement now(アプリ解放時に非アクティブ扱い) ss_active_dec_one(ss); HAK_TP1(mag_push, class_idx); tiny_debug_ring_record(TINY_RING_EVENT_FREE_RETURN_MAG, (uint16_t)class_idx, ptr, 2); HAK_STAT_FREE(class_idx); return; } // Background spill: queue to BG thread instead of locking (when enabled) if (g_bg_spill_enable) { uint32_t qlen = atomic_load_explicit(&g_bg_spill_len[class_idx], memory_order_relaxed); if ((int)qlen < g_bg_spill_target) { // Build a small chain: include current ptr and pop from mag up to limit int limit = g_bg_spill_max_batch; if (limit > cap/2) limit = cap/2; if (limit > 32) limit = 32; // keep free-path bounded void* head = ptr; *(void**)head = NULL; void* tail = head; // current tail int taken = 1; while (taken < limit && mag->top > 0) { void* p2 = mag->items[--mag->top].ptr; *(void**)p2 = head; head = p2; taken++; } // Push chain to spill queue (single CAS) bg_spill_push_chain(class_idx, head, tail, taken); tiny_debug_ring_record(TINY_RING_EVENT_FREE_RETURN_MAG, (uint16_t)class_idx, ptr, 3); HAK_STAT_FREE(class_idx); return; } } // Spill half (SuperSlab version - simpler than TinySlab) pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; hkm_prof_begin(NULL); pthread_mutex_lock(lock); // Batch spill: reduce lock frequency and work per call int spill = cap / 2; int over = mag->top - (cap + g_spill_hyst); if (over > 0 && over < spill) spill = over; for (int i = 0; i < spill && mag->top > 0; i++) { TinyMagItem it = mag->items[--mag->top]; // Phase 7.6: SuperSlab spill - return to freelist SuperSlab* owner_ss = hak_super_lookup(it.ptr); if (owner_ss && owner_ss->magic == SUPERSLAB_MAGIC) { // Direct freelist push (same as old hak_tiny_free_superslab) int slab_idx = slab_index_for(owner_ss, it.ptr); TinySlabMeta* meta = &owner_ss->slabs[slab_idx]; *(void**)it.ptr = meta->freelist; meta->freelist = it.ptr; meta->used--; // Decrement SuperSlab active counter (spill returns blocks to SS) ss_active_dec_one(owner_ss); // Phase 8.4: Empty SuperSlab detection (will use meta->used scan) // TODO: Implement scan-based empty detection // Empty SuperSlab detection/munmapは別途フラッシュAPIで実施(ホットパスから除外) } } pthread_mutex_unlock(lock); hkm_prof_end(ss_time, HKP_TINY_SPILL, &tss); // Adaptive increase of cap after spill int max_cap = tiny_cap_max_for_class(class_idx); if (mag->cap < max_cap) { int new_cap = mag->cap + (mag->cap / 2); if 
(new_cap > max_cap) new_cap = max_cap; if (new_cap > TINY_TLS_MAG_CAP) new_cap = TINY_TLS_MAG_CAP; mag->cap = new_cap; } // Finally, try FastCache push first (≤128B) — compile-out if HAKMEM_TINY_NO_FRONT_CACHE #if !defined(HAKMEM_TINY_NO_FRONT_CACHE) if (g_fastcache_enable && class_idx <= 4) { if (fastcache_push(class_idx, ptr)) { HAK_TP1(front_push, class_idx); HAK_STAT_FREE(class_idx); return; } } #endif // Then TLS SLL if room, else magazine if (g_tls_sll_enable && g_tls_sll_count[class_idx] < sll_cap_for_class(class_idx, (uint32_t)mag->cap)) { *(void**)ptr = g_tls_sll_head[class_idx]; g_tls_sll_head[class_idx] = ptr; g_tls_sll_count[class_idx]++; } else { mag->items[mag->top].ptr = ptr; #if HAKMEM_TINY_MAG_OWNER mag->items[mag->top].owner = slab; #endif mag->top++; } #if HAKMEM_DEBUG_COUNTERS g_magazine_push_count++; // Phase 7.6: Track pushes #endif HAK_STAT_FREE(class_idx); return; #endif // HAKMEM_BUILD_RELEASE } // Phase 7.6: TinySlab path (original) //g_tiny_free_with_slab_count++; // Phase 7.6: Track calls - DISABLED due to segfault // Same-thread → TLS magazine; remote-thread → MPSC stack if (pthread_equal(slab->owner_tid, tiny_self_pt())) { int class_idx = slab->class_idx; if (g_tls_list_enable) { TinyTLSList* tls = &g_tls_lists[class_idx]; uint32_t seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_relaxed); if (__builtin_expect(seq != g_tls_param_seen[class_idx], 0)) { tiny_tls_refresh_params(class_idx, tls); } // TinyHotMag front push(8/16/32B, A/B) if (__builtin_expect(g_hotmag_enable && class_idx <= 2, 1)) { if (hotmag_push(class_idx, ptr)) { HAK_STAT_FREE(class_idx); return; } } if (tls->count < tls->cap) { tiny_tls_list_guard_push(class_idx, tls, ptr); tls_list_push(tls, ptr); HAK_STAT_FREE(class_idx); return; } seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_relaxed); if (__builtin_expect(seq != g_tls_param_seen[class_idx], 0)) { tiny_tls_refresh_params(class_idx, tls); } tiny_tls_list_guard_push(class_idx, tls, ptr); tls_list_push(tls, ptr); if (tls_list_should_spill(tls)) { tls_list_spill_excess(class_idx, tls); } HAK_STAT_FREE(class_idx); return; } tiny_mag_init_if_needed(class_idx); TinyTLSMag* mag = &g_tls_mags[class_idx]; int cap = mag->cap; // 32/64B: SLL優先(mag優先は無効化) // Fast path: FastCache push (preferred for ≤128B), then TLS SLL if (g_fastcache_enable && class_idx <= 4) { if (fastcache_push(class_idx, ptr)) { HAK_STAT_FREE(class_idx); return; } } // Fast path: TLS SLL push (preferred) if (!g_tls_list_enable && g_tls_sll_enable && class_idx <= 5) { uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)cap); if (g_tls_sll_count[class_idx] < sll_cap) { *(void**)ptr = g_tls_sll_head[class_idx]; g_tls_sll_head[class_idx] = ptr; g_tls_sll_count[class_idx]++; HAK_STAT_FREE(class_idx); return; } } // Next: if magazine has room, push immediately and return(満杯ならmag→SLLへバルク) if (mag->top >= cap) { (void)bulk_mag_to_sll_if_room(class_idx, mag, cap / 2); } // Remote-drain can be handled opportunistically on future calls. if (mag->top < cap) { mag->items[mag->top].ptr = ptr; #if HAKMEM_TINY_MAG_OWNER mag->items[mag->top].owner = slab; #endif mag->top++; #if HAKMEM_DEBUG_COUNTERS g_magazine_push_count++; // Phase 7.6: Track pushes #endif // Note: SuperSlab uses separate path (slab == NULL branch above) HAK_STAT_FREE(class_idx); // Phase 3 return; } // Magazine full: before spilling, opportunistically drain remotes once under lock. 
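// Spill sizing note (worked example; the cap value is illustrative, not a default
// taken from this file): with cap == 64, the spill loop below pops spill == cap/2
// == 32 items per lock acquisition, and high_water == (cap*3)/4 == 48, so the
// mini-magazine fast return is only attempted while mag->top is below 48; at or
// above that, spilled items go straight to the owner slab's bitmap path.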
if (atomic_load_explicit(&slab->remote_count, memory_order_relaxed) >= (unsigned)g_remote_drain_thresh_per_class[class_idx] || atomic_load_explicit(&slab->remote_head, memory_order_acquire)) { pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; pthread_mutex_lock(lock); HAK_TP1(remote_drain, class_idx); tiny_remote_drain_locked(slab); pthread_mutex_unlock(lock); } // Spill half under class lock pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; pthread_mutex_lock(lock); int spill = cap / 2; // Phase 4.2: High-water threshold for gating Phase 4 logic int high_water = (cap * 3) / 4; // 75% of capacity for (int i = 0; i < spill && mag->top > 0; i++) { TinyMagItem it = mag->items[--mag->top]; // Phase 7.6: Check for SuperSlab first (mixed Magazine support) SuperSlab* ss_owner = hak_super_lookup(it.ptr); if (ss_owner && ss_owner->magic == SUPERSLAB_MAGIC) { // SuperSlab spill - return to freelist int slab_idx = slab_index_for(ss_owner, it.ptr); TinySlabMeta* meta = &ss_owner->slabs[slab_idx]; *(void**)it.ptr = meta->freelist; meta->freelist = it.ptr; meta->used--; // 空SuperSlab処理はフラッシュ/バックグラウンドで対応(ホットパス除外) HAK_STAT_FREE(class_idx); continue; // Skip TinySlab processing } TinySlab* owner = #if HAKMEM_TINY_MAG_OWNER it.owner; #else NULL; #endif if (!owner) { owner = tls_active_owner_for_ptr(class_idx, it.ptr); } if (!owner) { owner = hak_tiny_owner_slab(it.ptr); } if (!owner) continue; // Phase 4.2: Adaptive gating - skip Phase 4 when TLS Magazine is high-water // Rationale: When mag->top >= 75%, next alloc will come from TLS anyway // so pushing to mini-mag is wasted work int is_high_water = (mag->top >= high_water); if (!is_high_water) { // Low-water: Phase 4.1 logic (try mini-magazine first) uint8_t cidx = owner->class_idx; // Option A: 1回だけ読む TinySlab* tls_a = g_tls_active_slab_a[cidx]; TinySlab* tls_b = g_tls_active_slab_b[cidx]; // Option B: Branch prediction hint (spill → TLS-active への戻りが likely) if (__builtin_expect((owner == tls_a || owner == tls_b) && !mini_mag_is_full(&owner->mini_mag), 1)) { // Fast path: mini-magazineに戻す(bitmap触らない) mini_mag_push(&owner->mini_mag, it.ptr); HAK_TP1(spill_tiny, cidx); HAK_STAT_FREE(cidx); continue; // bitmap操作スキップ } } // High-water or Phase 4.1 mini-mag full: fall through to bitmap // Slow path: bitmap直接書き込み(既存ロジック) size_t bs = g_tiny_class_sizes[owner->class_idx]; int idx = ((uintptr_t)it.ptr - (uintptr_t)owner->base) / bs; if (hak_tiny_is_used(owner, idx)) { hak_tiny_set_free(owner, idx); int was_full = (owner->free_count == 0); owner->free_count++; if (was_full) move_to_free_list(owner->class_idx, owner); if (owner->free_count == owner->total_count) { // If this slab is TLS-active for this thread, clear the pointer before releasing if (g_tls_active_slab_a[owner->class_idx] == owner) g_tls_active_slab_a[owner->class_idx] = NULL; if (g_tls_active_slab_b[owner->class_idx] == owner) g_tls_active_slab_b[owner->class_idx] = NULL; TinySlab** headp = &g_tiny_pool.free_slabs[owner->class_idx]; TinySlab* prev = NULL; for (TinySlab* s = *headp; s; prev = s, s = s->next) { if (s == owner) { if (prev) prev->next = s->next; else *headp = s->next; break; } } release_slab(owner); } HAK_TP1(spill_tiny, owner->class_idx); HAK_STAT_FREE(owner->class_idx); } } pthread_mutex_unlock(lock); hkm_prof_end(ss, HKP_TINY_SPILL, &tss); // Adaptive increase of cap after spill int max_cap = tiny_cap_max_for_class(class_idx); if (mag->cap < max_cap) { int new_cap = mag->cap + (mag->cap / 2); if (new_cap > max_cap) new_cap = max_cap; if (new_cap > TINY_TLS_MAG_CAP) 
new_cap = TINY_TLS_MAG_CAP; mag->cap = new_cap; } // Finally: prefer TinyQuickSlot → SLL → UltraFront → HotMag → Magazine(順序で局所性を確保) #if !HAKMEM_BUILD_RELEASE && !defined(HAKMEM_TINY_NO_QUICK) if (g_quick_enable && class_idx <= 4) { TinyQuickSlot* qs = &g_tls_quick[class_idx]; if (__builtin_expect(qs->top < QUICK_CAP, 1)) { qs->items[qs->top++] = ptr; } else if (g_tls_sll_enable) { uint32_t sll_cap2 = sll_cap_for_class(class_idx, (uint32_t)mag->cap); if (g_tls_sll_count[class_idx] < sll_cap2) { *(void**)ptr = g_tls_sll_head[class_idx]; g_tls_sll_head[class_idx] = ptr; g_tls_sll_count[class_idx]++; } else if (!tiny_optional_push(class_idx, ptr)) { mag->items[mag->top].ptr = ptr; #if HAKMEM_TINY_MAG_OWNER mag->items[mag->top].owner = slab; #endif mag->top++; } } else { if (!tiny_optional_push(class_idx, ptr)) { mag->items[mag->top].ptr = ptr; #if HAKMEM_TINY_MAG_OWNER mag->items[mag->top].owner = slab; #endif mag->top++; } } } else #endif { if (g_tls_sll_enable && class_idx <= 5) { uint32_t sll_cap2 = sll_cap_for_class(class_idx, (uint32_t)mag->cap); if (g_tls_sll_count[class_idx] < sll_cap2) { *(void**)ptr = g_tls_sll_head[class_idx]; g_tls_sll_head[class_idx] = ptr; g_tls_sll_count[class_idx]++; } else if (!tiny_optional_push(class_idx, ptr)) { mag->items[mag->top].ptr = ptr; #if HAKMEM_TINY_MAG_OWNER mag->items[mag->top].owner = slab; #endif mag->top++; } } else { if (!tiny_optional_push(class_idx, ptr)) { mag->items[mag->top].ptr = ptr; #if HAKMEM_TINY_MAG_OWNER mag->items[mag->top].owner = slab; #endif mag->top++; } } } #if HAKMEM_DEBUG_COUNTERS g_magazine_push_count++; // Phase 7.6: Track pushes #endif // Note: SuperSlab uses separate path (slab == NULL branch above) HAK_STAT_FREE(class_idx); // Phase 3 return; } else { tiny_remote_push(slab, ptr); } } // ============================================================================ // Phase 6.23: SuperSlab Allocation Helpers // ============================================================================ // Phase 6.24: Allocate from SuperSlab slab (lazy freelist + linear allocation) static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx) { TinySlabMeta* meta = &ss->slabs[slab_idx]; // Ensure remote queue is drained before handing blocks back to TLS if (atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0) { uint32_t self_tid = tiny_self_u32(); SlabHandle h = slab_try_acquire(ss, slab_idx, self_tid); if (slab_is_valid(&h)) { slab_drain_remote_full(&h); int pending = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0; if (__builtin_expect(pending, 0)) { if (__builtin_expect(g_debug_remote_guard, 0)) { uintptr_t head = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_relaxed); tiny_remote_watch_note("alloc_pending_remote", ss, slab_idx, (void*)head, 0xA243u, self_tid, 0); } slab_release(&h); return NULL; } slab_release(&h); } else { if (__builtin_expect(g_debug_remote_guard, 0)) { tiny_remote_watch_note("alloc_acquire_fail", ss, slab_idx, meta, 0xA244u, self_tid, 0); } return NULL; } } if (__builtin_expect(g_debug_remote_guard, 0)) { uintptr_t head_pending = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire); if (head_pending != 0) { tiny_remote_watch_note("alloc_remote_pending", ss, slab_idx, (void*)head_pending, 0xA247u, tiny_self_u32(), 1); return NULL; } } // Phase 6.24: Linear allocation mode (freelist == NULL) // This avoids the 4000-8000 cycle cost of building freelist on init if (meta->freelist == NULL && meta->used < 
meta->capacity) { // Linear allocation: sequential memory access (cache-friendly!) size_t block_size = g_tiny_class_sizes[ss->size_class]; void* slab_start = slab_data_start(ss, slab_idx); // First slab: skip SuperSlab header if (slab_idx == 0) { slab_start = (char*)slab_start + 1024; } void* block = (char*)slab_start + (meta->used * block_size); meta->used++; tiny_remote_track_on_alloc(ss, slab_idx, block, "linear_alloc", 0); tiny_remote_assert_not_remote(ss, slab_idx, block, "linear_alloc_ret", 0); return block; // Fast path: O(1) pointer arithmetic } // Freelist mode (after first free()) if (meta->freelist) { void* block = meta->freelist; meta->freelist = *(void**)block; // Pop from freelist meta->used++; tiny_remote_track_on_alloc(ss, slab_idx, block, "freelist_alloc", 0); tiny_remote_assert_not_remote(ss, slab_idx, block, "freelist_alloc_ret", 0); return block; } return NULL; // Slab is full } // Phase 6.24 & 7.6: Refill TLS SuperSlab (with unified TLS cache + deferred allocation) static SuperSlab* superslab_refill(int class_idx) { #if HAKMEM_DEBUG_COUNTERS g_superslab_refill_calls_dbg[class_idx]++; #endif TinyTLSSlab* tls = &g_tls_slabs[class_idx]; static int g_ss_adopt_en = -1; // env: HAKMEM_TINY_SS_ADOPT=1; default auto-on if remote seen if (g_ss_adopt_en == -1) { char* e = getenv("HAKMEM_TINY_SS_ADOPT"); if (e) { g_ss_adopt_en = (*e != '0') ? 1 : 0; } else { extern _Atomic int g_ss_remote_seen; g_ss_adopt_en = (atomic_load_explicit(&g_ss_remote_seen, memory_order_relaxed) != 0) ? 1 : 0; } } extern int g_adopt_cool_period; extern __thread int g_tls_adopt_cd[]; if (g_adopt_cool_period == -1) { char* cd = getenv("HAKMEM_TINY_SS_ADOPT_COOLDOWN"); int v = (cd ? atoi(cd) : 0); if (v < 0) v = 0; if (v > 1024) v = 1024; g_adopt_cool_period = v; } static int g_superslab_refill_debug_once = 0; SuperSlab* prev_ss = tls->ss; TinySlabMeta* prev_meta = tls->meta; uint8_t prev_slab_idx = tls->slab_idx; uint8_t prev_active = prev_ss ? prev_ss->active_slabs : 0; uint32_t prev_bitmap = prev_ss ? prev_ss->slab_bitmap : 0; uint32_t prev_meta_used = prev_meta ? prev_meta->used : 0; uint32_t prev_meta_cap = prev_meta ? 
prev_meta->capacity : 0; int free_idx_attempted = -2; // -2 = not evaluated, -1 = none, >=0 = chosen int reused_slabs = 0; // Optional: Mid-size simple refill to avoid multi-layer scans (class>=4) do { static int g_mid_simple_warn = 0; if (class_idx >= 4 && tiny_mid_refill_simple_enabled()) { // If current TLS has a SuperSlab, prefer taking a virgin slab directly if (tls->ss) { int tls_cap = ss_slabs_capacity(tls->ss); if (tls->ss->active_slabs < tls_cap) { int free_idx = superslab_find_free_slab(tls->ss); if (free_idx >= 0) { uint32_t my_tid = tiny_self_u32(); superslab_init_slab(tls->ss, free_idx, g_tiny_class_sizes[class_idx], my_tid); tiny_tls_bind_slab(tls, tls->ss, free_idx); return tls->ss; } } } // Otherwise allocate a fresh SuperSlab and bind first slab SuperSlab* ssn = superslab_allocate((uint8_t)class_idx); if (!ssn) { if (!g_superslab_refill_debug_once && g_mid_simple_warn < 2) { g_mid_simple_warn++; int err = errno; fprintf(stderr, "[DEBUG] mid_simple_refill OOM class=%d errno=%d\n", class_idx, err); } return NULL; } uint32_t my_tid = tiny_self_u32(); superslab_init_slab(ssn, 0, g_tiny_class_sizes[class_idx], my_tid); SuperSlab* old = tls->ss; tiny_tls_bind_slab(tls, ssn, 0); superslab_ref_inc(ssn); if (old && old != ssn) { superslab_ref_dec(old); } return ssn; } } while (0); // First, try to adopt a published partial SuperSlab for this class if (g_ss_adopt_en) { if (g_adopt_cool_period > 0) { if (g_tls_adopt_cd[class_idx] > 0) { g_tls_adopt_cd[class_idx]--; } else { // eligible to adopt } } if (g_adopt_cool_period == 0 || g_tls_adopt_cd[class_idx] == 0) { SuperSlab* adopt = ss_partial_adopt(class_idx); if (adopt && adopt->magic == SUPERSLAB_MAGIC) { int best = -1; uint32_t best_score = 0; int adopt_cap = ss_slabs_capacity(adopt); for (int s = 0; s < adopt_cap; s++) { TinySlabMeta* m = &adopt->slabs[s]; uint32_t rc = atomic_load_explicit(&adopt->remote_counts[s], memory_order_relaxed); int has_remote = (atomic_load_explicit(&adopt->remote_heads[s], memory_order_acquire) != 0); uint32_t score = rc + (m->freelist ? (1u<<30) : 0u) + (has_remote ? 1u : 0u); if (score > best_score) { best_score = score; best = s; } } if (best >= 0) { // Box: Try to acquire ownership atomically uint32_t self = tiny_self_u32(); SlabHandle h = slab_try_acquire(adopt, best, self); if (slab_is_valid(&h)) { slab_drain_remote_full(&h); if (slab_remote_pending(&h)) { if (__builtin_expect(g_debug_remote_guard, 0)) { uintptr_t head = atomic_load_explicit(&h.ss->remote_heads[h.slab_idx], memory_order_relaxed); tiny_remote_watch_note("adopt_remote_pending", h.ss, h.slab_idx, (void*)head, 0xA255u, self, 0); } // Remote still pending; give up adopt path and fall through to normal refill. 
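// Note: slab_drain_remote_full() and the pending check above are not atomic as a
// pair, so a remote free may have been pushed in between. The Box 4 boundary
// requires remote_head == 0 before binding, so the handle is released here and
// the normal refill path (virgin slab / registry adopt / fresh SuperSlab) runs
// instead; slab_is_safe_to_bind() below re-checks the same invariant in a
// TOCTOU-safe way before any bind.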
slab_release(&h); } // Box 4 Boundary: bind は remote_head==0 を保証する必要がある // slab_is_safe_to_bind() で TOCTOU-safe にチェック if (slab_is_safe_to_bind(&h)) { // Optional: move a few nodes to Front SLL to boost next hits tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx); // 安全に bind 可能(freelist 存在 && remote_head==0 保証) tiny_tls_bind_slab(tls, h.ss, h.slab_idx); if (g_adopt_cool_period > 0) { g_tls_adopt_cd[class_idx] = g_adopt_cool_period; } return h.ss; } // Safe to bind 失敗(freelist なしor remote pending)→ adopt 中止 slab_release(&h); } // Failed to acquire or no freelist - continue searching } // If no freelist found, ignore and continue (optional: republish) } } } // Phase 7.6 Step 4: Check existing SuperSlab with priority order if (tls->ss) { // Priority 1: Reuse slabs with freelist (already freed blocks) int tls_cap = ss_slabs_capacity(tls->ss); uint32_t nonempty_mask = 0; do { static int g_mask_en = -1; if (__builtin_expect(g_mask_en == -1, 0)) { const char* e = getenv("HAKMEM_TINY_FREELIST_MASK"); g_mask_en = (e && *e && *e != '0') ? 1 : 0; } if (__builtin_expect(g_mask_en, 0)) { nonempty_mask = atomic_load_explicit(&tls->ss->freelist_mask, memory_order_acquire); break; } for (int i = 0; i < tls_cap; i++) { if (tls->ss->slabs[i].freelist) nonempty_mask |= (1u << i); } } while (0); // O(1) lookup: scan mask with ctz (1 instruction!) while (__builtin_expect(nonempty_mask != 0, 1)) { int i = __builtin_ctz(nonempty_mask); // Find first non-empty slab (O(1)) nonempty_mask &= ~(1u << i); // Clear bit for next iteration // FIX #1 DELETED (Race condition fix): // Previous drain without ownership caused concurrent freelist corruption. // Ownership protocol: MUST bind+owner_cas BEFORE drain (see Fix #3 in tiny_refill.h). // Remote frees will be drained when the slab is adopted (see tiny_refill.h paths). uint32_t self_tid = tiny_self_u32(); SlabHandle h = slab_try_acquire(tls->ss, i, self_tid); if (slab_is_valid(&h)) { if (slab_remote_pending(&h)) { slab_drain_remote_full(&h); if (__builtin_expect(g_debug_remote_guard, 0)) { uintptr_t head = atomic_load_explicit(&h.ss->remote_heads[h.slab_idx], memory_order_relaxed); tiny_remote_watch_note("reuse_remote_pending", h.ss, h.slab_idx, (void*)head, 0xA254u, self_tid, 0); } slab_release(&h); continue; } // Box 4 Boundary: bind は remote_head==0 を保証する必要がある if (slab_is_safe_to_bind(&h)) { // Optional: move a few nodes to Front SLL to boost next hits tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx); reused_slabs = 1; tiny_tls_bind_slab(tls, h.ss, h.slab_idx); return h.ss; } // Safe to bind 失敗 → 次の slab を試す slab_release(&h); } } // Priority 2: Use unused slabs (virgin slabs) if (tls->ss->active_slabs < tls_cap) { // Find next free slab int free_idx = superslab_find_free_slab(tls->ss); free_idx_attempted = free_idx; if (free_idx >= 0) { // Initialize this slab uint32_t my_tid = tiny_self_u32(); superslab_init_slab(tls->ss, free_idx, g_tiny_class_sizes[class_idx], my_tid); // Update TLS cache (unified update) tiny_tls_bind_slab(tls, tls->ss, free_idx); return tls->ss; } } } // Try to adopt a partial SuperSlab from registry (one-shot, cheap scan) // This reduces pressure to allocate new SS when other threads freed blocks. 
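// If none of the reuse paths above returned a slab, the remaining order is:
//   1. when no SuperSlab is bound to this thread (tls->ss == NULL), scan a small
//      window of the SuperSlab registry for a partial SS of this class and try to
//      acquire/bind one of its freelist slabs,
//   2. tiny_must_adopt_gate() (sticky/hot/bench/mailbox/registry small-window),
//   3. superslab_allocate() a fresh SuperSlab and bind its first slab.
// The registry walk is capped by tiny_reg_scan_max(), so this stays a bounded
// scan rather than a full SUPER_REG_SIZE sweep.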
if (!tls->ss) { // Best-effort: scan a small window of registry for our class extern SuperRegEntry g_super_reg[]; int scanned = 0; const int scan_max = tiny_reg_scan_max(); for (int i = 0; i < SUPER_REG_SIZE && scanned < scan_max; i++) { SuperRegEntry* e = &g_super_reg[i]; uintptr_t base = atomic_load_explicit((_Atomic uintptr_t*)&e->base, memory_order_acquire); if (base == 0) continue; SuperSlab* ss = atomic_load_explicit(&e->ss, memory_order_acquire); if (!ss || ss->magic != SUPERSLAB_MAGIC) continue; if ((int)ss->size_class != class_idx) { scanned++; continue; } // Pick first slab with freelist (Box 4: 所有権取得 + remote check) int reg_cap = ss_slabs_capacity(ss); uint32_t self_tid = tiny_self_u32(); for (int s = 0; s < reg_cap; s++) { if (ss->slabs[s].freelist) { SlabHandle h = slab_try_acquire(ss, s, self_tid); if (slab_is_valid(&h)) { slab_drain_remote_full(&h); if (slab_is_safe_to_bind(&h)) { tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx); tiny_tls_bind_slab(tls, ss, s); return ss; } slab_release(&h); } } } scanned++; } } // Must-adopt-before-mmap gate: attempt sticky/hot/bench/mailbox/registry small-window { SuperSlab* gate_ss = tiny_must_adopt_gate(class_idx, tls); if (gate_ss) return gate_ss; } // Allocate new SuperSlab SuperSlab* ss = superslab_allocate((uint8_t)class_idx); if (!ss) { if (!g_superslab_refill_debug_once) { g_superslab_refill_debug_once = 1; int err = errno; fprintf(stderr, "[DEBUG] superslab_refill NULL detail: class=%d prev_ss=%p active=%u bitmap=0x%08x prev_meta=%p used=%u cap=%u slab_idx=%u reused_freelist=%d free_idx=%d errno=%d\n", class_idx, (void*)prev_ss, (unsigned)prev_active, prev_bitmap, (void*)prev_meta, (unsigned)prev_meta_used, (unsigned)prev_meta_cap, (unsigned)prev_slab_idx, reused_slabs, free_idx_attempted, err); } return NULL; // OOM } // Initialize first slab uint32_t my_tid = tiny_self_u32(); superslab_init_slab(ss, 0, g_tiny_class_sizes[class_idx], my_tid); // Cache in unified TLS(前のSS参照を解放) SuperSlab* old = tls->ss; tiny_tls_bind_slab(tls, ss, 0); // Maintain refcount(将来の空回収に備え、TLS参照をカウント) superslab_ref_inc(ss); if (old && old != ss) { superslab_ref_dec(old); } return ss; } // Phase 6.24: SuperSlab-based allocation (TLS unified, Medium fix) static inline void* hak_tiny_alloc_superslab(int class_idx) { // DEBUG: Function entry trace tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_ENTER, 0x01, (void*)(uintptr_t)class_idx, 0); // MidTC fast path: 128..1024B(class>=4)はTLS tcacheを最優先 do { void* mp = midtc_pop(class_idx); if (mp) { HAK_RET_ALLOC(class_idx, mp); } } while (0); // Phase 6.24: 1 TLS read (down from 3) TinyTLSSlab* tls = &g_tls_slabs[class_idx]; TinySlabMeta* meta = tls->meta; int slab_idx = tls->slab_idx; if (meta && slab_idx >= 0 && tls->ss) { // A/B: Relaxed read for remote head presence check static int g_alloc_remote_relax = -1; // env: HAKMEM_TINY_ALLOC_REMOTE_RELAX=1 → relaxed if (__builtin_expect(g_alloc_remote_relax == -1, 0)) { const char* e = getenv("HAKMEM_TINY_ALLOC_REMOTE_RELAX"); g_alloc_remote_relax = (e && *e && *e != '0') ? 1 : 0; } uintptr_t pending = atomic_load_explicit(&tls->ss->remote_heads[slab_idx], g_alloc_remote_relax ? memory_order_relaxed : memory_order_acquire); if (__builtin_expect(pending != 0, 0)) { uint32_t self_tid = tiny_self_u32(); if (ss_owner_try_acquire(meta, self_tid)) { _ss_remote_drain_to_freelist_unsafe(tls->ss, slab_idx, meta); } } } // FIX #2 DELETED (Race condition fix): // Previous drain-all-slabs without ownership caused concurrent freelist corruption. 
// Problem: Thread A owns slab 5, Thread B drains all slabs including 5 → both modify freelist → crash. // Ownership protocol: MUST bind+owner_cas BEFORE drain (see Fix #3 in tiny_refill.h). // Remote frees will be drained when the slab is adopted via refill paths. // Fast path: Direct metadata access (no repeated TLS reads!) if (meta && meta->freelist == NULL && meta->used < meta->capacity && tls->slab_base) { // Linear allocation (lazy init) size_t block_size = g_tiny_class_sizes[tls->ss->size_class]; void* block = (void*)(tls->slab_base + ((size_t)meta->used * block_size)); meta->used++; // Track active blocks in SuperSlab for conservative reclamation ss_active_inc(tls->ss); HAK_RET_ALLOC(class_idx, block); // Phase 8.4: Zero hot-path overhead } if (meta && meta->freelist) { // Freelist allocation void* block = meta->freelist; // Safety: bounds/alignment check (debug) if (__builtin_expect(g_tiny_safe_free, 0)) { size_t blk = g_tiny_class_sizes[tls->ss->size_class]; uint8_t* base = tiny_slab_base_for(tls->ss, tls->slab_idx); uintptr_t delta = (uintptr_t)block - (uintptr_t)base; int align_ok = ((delta % blk) == 0); int range_ok = (delta / blk) < meta->capacity; if (!align_ok || !range_ok) { uintptr_t info = ((uintptr_t)(align_ok ? 1u : 0u) << 32) | (uint32_t)(range_ok ? 1u : 0u); tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)tls->ss->size_class, block, info | 0xA100u); if (g_tiny_safe_free_strict) { raise(SIGUSR2); return NULL; } return NULL; } } void* next = *(void**)block; meta->freelist = next; meta->used++; // Optional: clear freelist bit when becomes empty do { static int g_mask_en = -1; if (__builtin_expect(g_mask_en == -1, 0)) { const char* e = getenv("HAKMEM_TINY_FREELIST_MASK"); g_mask_en = (e && *e && *e != '0') ? 1 : 0; } if (__builtin_expect(g_mask_en, 0) && next == NULL) { uint32_t bit = (1u << slab_idx); atomic_fetch_and_explicit(&tls->ss->freelist_mask, ~bit, memory_order_release); } } while (0); // Track active blocks in SuperSlab for conservative reclamation ss_active_inc(tls->ss); HAK_RET_ALLOC(class_idx, block); // Phase 8.4: Zero hot-path overhead } // Slow path: Refill TLS slab SuperSlab* ss = superslab_refill(class_idx); if (!ss) { static int log_oom = 0; if (log_oom < 2) { fprintf(stderr, "[DEBUG] superslab_refill returned NULL (OOM)\n"); log_oom++; } return NULL; // OOM } // Retry allocation (metadata already cached in superslab_refill) meta = tls->meta; // DEBUG: Check each condition (disabled for benchmarks) // static int log_retry = 0; // if (log_retry < 2) { // fprintf(stderr, "[DEBUG] Retry alloc: meta=%p, freelist=%p, used=%u, capacity=%u, slab_base=%p\n", // (void*)meta, meta ? meta->freelist : NULL, // meta ? meta->used : 0, meta ? 
meta->capacity : 0, // (void*)tls->slab_base); // log_retry++; // } if (meta && meta->freelist == NULL && meta->used < meta->capacity && tls->slab_base) { size_t block_size = g_tiny_class_sizes[ss->size_class]; void* block = (void*)(tls->slab_base + ((size_t)meta->used * block_size)); // Disabled for benchmarks // static int log_success = 0; // if (log_success < 2) { // fprintf(stderr, "[DEBUG] Superslab alloc SUCCESS: ptr=%p, class=%d, used=%u->%u\n", // block, class_idx, meta->used, meta->used + 1); // log_success++; // } meta->used++; // Track active blocks in SuperSlab for conservative reclamation ss_active_inc(ss); HAK_RET_ALLOC(class_idx, block); // Phase 8.4: Zero hot-path overhead } // Disabled for benchmarks // static int log_fail = 0; // if (log_fail < 2) { // fprintf(stderr, "[DEBUG] Retry alloc FAILED - returning NULL\n"); // log_fail++; // } return NULL; } // Phase 6.22-B: SuperSlab fast free path static inline void hak_tiny_free_superslab(void* ptr, SuperSlab* ss) { HAK_DBG_INC(g_superslab_free_count); // Phase 7.6: Track SuperSlab frees // Get slab index (supports 1MB/2MB SuperSlabs) int slab_idx = slab_index_for(ss, ptr); size_t ss_size = (size_t)1ULL << ss->lg_size; uintptr_t ss_base = (uintptr_t)ss; if (__builtin_expect(slab_idx < 0, 0)) { uintptr_t aux = tiny_remote_pack_diag(0xBAD1u, ss_base, ss_size, (uintptr_t)ptr); tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } return; } TinySlabMeta* meta = &ss->slabs[slab_idx]; if (__builtin_expect(tiny_remote_watch_is(ptr), 0)) { tiny_remote_watch_note("free_enter", ss, slab_idx, ptr, 0xA240u, tiny_self_u32(), 0); extern __thread TinyTLSSlab g_tls_slabs[]; tiny_alloc_dump_tls_state(ss->size_class, "watch_free_enter", &g_tls_slabs[ss->size_class]); #if !HAKMEM_BUILD_RELEASE extern __thread TinyTLSMag g_tls_mags[]; TinyTLSMag* watch_mag = &g_tls_mags[ss->size_class]; fprintf(stderr, "[REMOTE_WATCH_MAG] cls=%u mag_top=%d cap=%d\n", ss->size_class, watch_mag->top, watch_mag->cap); #endif } if (__builtin_expect(g_tiny_safe_free, 0)) { size_t blk = g_tiny_class_sizes[ss->size_class]; uint8_t* base = tiny_slab_base_for(ss, slab_idx); uintptr_t delta = (uintptr_t)ptr - (uintptr_t)base; int cap_ok = (meta->capacity > 0) ? 
1 : 0; int align_ok = (delta % blk) == 0; int range_ok = cap_ok && (delta / blk) < meta->capacity; if (!align_ok || !range_ok) { uint32_t code = 0xA100u; if (align_ok) code |= 0x2u; if (range_ok) code |= 0x1u; uintptr_t aux = tiny_remote_pack_diag(code, ss_base, ss_size, (uintptr_t)ptr); tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } return; } // Duplicate in freelist (best-effort scan up to 64) void* scan = meta->freelist; int scanned = 0; int dup = 0; while (scan && scanned < 64) { if (scan == ptr) { dup = 1; break; } scan = *(void**)scan; scanned++; } if (dup) { uintptr_t aux = tiny_remote_pack_diag(0xDFu, ss_base, ss_size, (uintptr_t)ptr); tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } return; } } // Phase 6.23: Same-thread check uint32_t my_tid = tiny_self_u32(); const int debug_guard = g_debug_remote_guard; static __thread int g_debug_free_count = 0; if (!g_tiny_force_remote && meta->owner_tid != 0 && meta->owner_tid == my_tid) { // Fast path: Direct freelist push (same-thread) if (g_debug_free_count < 1) { fprintf(stderr, "[FREE_SS] SAME-THREAD: owner=%u my=%u\n", meta->owner_tid, my_tid); g_debug_free_count++; } if (__builtin_expect(meta->used == 0, 0)) { uintptr_t aux = tiny_remote_pack_diag(0x00u, ss_base, ss_size, (uintptr_t)ptr); tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } return; } tiny_remote_track_expect_alloc(ss, slab_idx, ptr, "local_free_enter", my_tid); if (!tiny_remote_guard_allow_local_push(ss, slab_idx, meta, ptr, "local_free", my_tid)) { int transitioned = ss_remote_push(ss, slab_idx, ptr); meta->used--; ss_active_dec_one(ss); if (transitioned) { ss_partial_publish((int)ss->size_class, ss); } return; } // Optional: MidTC (TLS tcache for 128..1024B) do { int cls = (int)ss->size_class; if (midtc_enabled() && cls >= 4) { if (midtc_push(cls, ptr)) { // Treat as returned to TLS cache (not SS freelist) meta->used--; ss_active_dec_one(ss); return; } } } while (0); void* prev = meta->freelist; *(void**)ptr = prev; meta->freelist = ptr; do { static int g_mask_en = -1; if (__builtin_expect(g_mask_en == -1, 0)) { const char* e = getenv("HAKMEM_TINY_FREELIST_MASK"); g_mask_en = (e && *e && *e != '0') ? 
1 : 0; } if (__builtin_expect(g_mask_en, 0) && prev == NULL) { uint32_t bit = (1u << slab_idx); atomic_fetch_or_explicit(&ss->freelist_mask, bit, memory_order_release); } } while (0); tiny_remote_track_on_local_free(ss, slab_idx, ptr, "local_free", my_tid); meta->used--; // Decrement SuperSlab active counter (actual return to SS) ss_active_dec_one(ss); if (prev == NULL) { ss_partial_publish((int)ss->size_class, ss); } if (__builtin_expect(debug_guard, 0)) { fprintf(stderr, "[REMOTE_LOCAL] cls=%u slab=%d owner=%u my=%u ptr=%p prev=%p used=%u\n", ss->size_class, slab_idx, meta->owner_tid, my_tid, ptr, prev, meta->used); } // 空検出は別途(ホットパス除外) } else { if (__builtin_expect(meta->owner_tid == my_tid && meta->owner_tid == 0, 0)) { uintptr_t aux = tiny_remote_pack_diag(0xA300u, ss_base, ss_size, (uintptr_t)ptr); tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); if (debug_guard) { fprintf(stderr, "[REMOTE_OWNER_ZERO] cls=%u slab=%d ptr=%p my=%u used=%u\n", ss->size_class, slab_idx, ptr, my_tid, (unsigned)meta->used); } } tiny_remote_track_expect_alloc(ss, slab_idx, ptr, "remote_free_enter", my_tid); // Slow path: Remote free (cross-thread) if (g_debug_free_count < 5) { fprintf(stderr, "[FREE_SS] CROSS-THREAD: owner=%u my=%u slab_idx=%d\n", meta->owner_tid, my_tid, slab_idx); g_debug_free_count++; } if (__builtin_expect(g_tiny_safe_free, 0)) { // Best-effort duplicate scan in remote stack (up to 64 nodes) uintptr_t head = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire); uintptr_t base = ss_base; int scanned = 0; int dup = 0; uintptr_t cur = head; while (cur && scanned < 64) { if ((cur < base) || (cur >= base + ss_size)) { uintptr_t aux = tiny_remote_pack_diag(0xA200u, base, ss_size, cur); tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)cur, aux); if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } break; } if ((void*)cur == ptr) { dup = 1; break; } if (__builtin_expect(g_remote_side_enable, 0)) { if (!tiny_remote_sentinel_ok((void*)cur)) { uintptr_t aux = tiny_remote_pack_diag(0xA202u, base, ss_size, cur); tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)cur, aux); uintptr_t observed = atomic_load_explicit((_Atomic uintptr_t*)(void*)cur, memory_order_relaxed); tiny_remote_report_corruption("scan", (void*)cur, observed); fprintf(stderr, "[REMOTE_SENTINEL] cls=%u slab=%d cur=%p head=%p ptr=%p scanned=%d observed=0x%016" PRIxPTR " owner=%u used=%u freelist=%p remote_head=%p\n", ss->size_class, slab_idx, (void*)cur, (void*)head, ptr, scanned, observed, meta->owner_tid, (unsigned)meta->used, meta->freelist, (void*)atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_relaxed)); if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } break; } cur = tiny_remote_side_get(ss, slab_idx, (void*)cur); } else { if ((cur & (uintptr_t)(sizeof(void*) - 1)) != 0) { uintptr_t aux = tiny_remote_pack_diag(0xA201u, base, ss_size, cur); tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)cur, aux); if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } break; } cur = (uintptr_t)(*(void**)(void*)cur); } scanned++; } if (dup) { uintptr_t aux = tiny_remote_pack_diag(0xD1u, ss_base, ss_size, (uintptr_t)ptr); tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } return; } } if (__builtin_expect(meta->used == 0, 0)) { uintptr_t aux = 
tiny_remote_pack_diag(0x01u, ss_base, ss_size, (uintptr_t)ptr); tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } return; } static int g_ss_adopt_en2 = -1; // env cached if (g_ss_adopt_en2 == -1) { char* e = getenv("HAKMEM_TINY_SS_ADOPT"); // 既定: Remote Queueを使う(1)。env指定時のみ上書き。 g_ss_adopt_en2 = (e == NULL) ? 1 : ((*e != '0') ? 1 : 0); if (__builtin_expect(debug_guard, 0)) { fprintf(stderr, "[FREE_SS] g_ss_adopt_en2=%d (env='%s')\n", g_ss_adopt_en2, e ? e : "(null)"); } } if (g_ss_adopt_en2) { // Use remote queue uintptr_t head_word = __atomic_load_n((uintptr_t*)ptr, __ATOMIC_RELAXED); fprintf(stderr, "[REMOTE_PUSH_CALL] cls=%u slab=%d owner=%u my=%u ptr=%p used=%u remote_count=%u head=%p word=0x%016" PRIxPTR "\n", ss->size_class, slab_idx, meta->owner_tid, my_tid, ptr, (unsigned)meta->used, atomic_load_explicit(&ss->remote_counts[slab_idx], memory_order_relaxed), (void*)atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_relaxed), head_word); int dup_remote = tiny_remote_queue_contains_guard(ss, slab_idx, ptr); if (!dup_remote && __builtin_expect(g_remote_side_enable, 0)) { dup_remote = (head_word == TINY_REMOTE_SENTINEL) || tiny_remote_side_contains(ss, slab_idx, ptr); } if (__builtin_expect(head_word == TINY_REMOTE_SENTINEL && !dup_remote && g_debug_remote_guard, 0)) { tiny_remote_watch_note("dup_scan_miss", ss, slab_idx, ptr, 0xA215u, my_tid, 0); } if (dup_remote) { uintptr_t aux = tiny_remote_pack_diag(0xA214u, ss_base, ss_size, (uintptr_t)ptr); tiny_remote_watch_mark(ptr, "dup_prevent", my_tid); tiny_remote_watch_note("dup_prevent", ss, slab_idx, ptr, 0xA214u, my_tid, 0); tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } return; } if (__builtin_expect(g_remote_side_enable && (head_word & 0xFFFFu) == 0x6261u, 0)) { // TLS guard scribble detected on the node's first word → same-pointer double free across routes uintptr_t aux = tiny_remote_pack_diag(0xA213u, ss_base, ss_size, (uintptr_t)ptr); tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); tiny_remote_watch_mark(ptr, "pre_push", my_tid); tiny_remote_watch_note("pre_push", ss, slab_idx, ptr, 0xA231u, my_tid, 0); tiny_remote_report_corruption("pre_push", ptr, head_word); if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } return; } if (__builtin_expect(tiny_remote_watch_is(ptr), 0)) { tiny_remote_watch_note("free_remote", ss, slab_idx, ptr, 0xA232u, my_tid, 0); } int was_empty = ss_remote_push(ss, slab_idx, ptr); meta->used--; ss_active_dec_one(ss); if (was_empty) { ss_partial_publish((int)ss->size_class, ss); } } else { // Fallback: direct freelist push (legacy) fprintf(stderr, "[FREE_SS] Using LEGACY freelist push (not remote queue)\n"); void* prev = meta->freelist; *(void**)ptr = prev; meta->freelist = ptr; do { static int g_mask_en = -1; if (__builtin_expect(g_mask_en == -1, 0)) { const char* e = getenv("HAKMEM_TINY_FREELIST_MASK"); g_mask_en = (e && *e && *e != '0') ? 
1 : 0; }
        if (__builtin_expect(g_mask_en, 0) && prev == NULL) { uint32_t bit = (1u << slab_idx); atomic_fetch_or_explicit(&ss->freelist_mask, bit, memory_order_release); }
    } while (0);
    meta->used--;
    ss_active_dec_one(ss);
    if (prev == NULL) { ss_partial_publish((int)ss->size_class, ss); }
    }
    // Empty-slab detection is handled separately (kept out of the hot path)
    }
}

void hak_tiny_free(void* ptr) {
    if (!ptr || !g_tiny_initialized) return;
    hak_tiny_stats_poll();
    tiny_debug_ring_record(TINY_RING_EVENT_FREE_ENTER, 0, ptr, 0);
#ifdef HAKMEM_TINY_BENCH_SLL_ONLY
    // Bench-only SLL-only free: push to TLS SLL for ≤64B when possible
    {
        int class_idx = -1;
        if (g_use_superslab) {
            // FIXED: use registry-backed hak_super_lookup() to avoid false positives
            SuperSlab* ss = hak_super_lookup(ptr);
            if (ss && ss->magic == SUPERSLAB_MAGIC) class_idx = ss->size_class;
        }
        if (class_idx < 0) { TinySlab* slab = hak_tiny_owner_slab(ptr); if (slab) class_idx = slab->class_idx; }
        if (class_idx >= 0 && class_idx <= 3) {
            uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP);
            if ((int)g_tls_sll_count[class_idx] < (int)sll_cap) {
                *(void**)ptr = g_tls_sll_head[class_idx];
                g_tls_sll_head[class_idx] = ptr;
                g_tls_sll_count[class_idx]++;
                return;
            }
        }
    }
#endif
    if (g_tiny_ultra) {
        int class_idx = -1;
        if (g_use_superslab) {
            // FIXED: use registry-backed hak_super_lookup() to avoid false positives
            SuperSlab* ss = hak_super_lookup(ptr);
            if (ss && ss->magic == SUPERSLAB_MAGIC) class_idx = ss->size_class;
        }
        if (class_idx < 0) { TinySlab* slab = hak_tiny_owner_slab(ptr); if (slab) class_idx = slab->class_idx; }
        if (class_idx >= 0) {
            // Ultra free: push directly to TLS SLL without magazine init
            int sll_cap = ultra_sll_cap_for_class(class_idx);
            if ((int)g_tls_sll_count[class_idx] < sll_cap) {
                *(void**)ptr = g_tls_sll_head[class_idx];
                g_tls_sll_head[class_idx] = ptr;
                g_tls_sll_count[class_idx]++;
                return;
            }
        }
        // Fallback to existing path if class resolution fails
    }
    SuperSlab* fast_ss = NULL;
    TinySlab* fast_slab = NULL;
    int fast_class_idx = -1;
    if (g_use_superslab) {
        fast_ss = hak_super_lookup(ptr);
        if (fast_ss && fast_ss->magic == SUPERSLAB_MAGIC) { fast_class_idx = fast_ss->size_class; } else { fast_ss = NULL; }
    }
    if (fast_class_idx < 0) { fast_slab = hak_tiny_owner_slab(ptr); if (fast_slab) fast_class_idx = fast_slab->class_idx; }
    // Safety: detect class mismatch (SS vs TinySlab) early
    if (__builtin_expect(g_tiny_safe_free && fast_class_idx >= 0, 0)) {
        int ss_cls = -1, ts_cls = -1;
        SuperSlab* chk_ss = fast_ss ? fast_ss : (g_use_superslab ? hak_super_lookup(ptr) : NULL);
        if (chk_ss && chk_ss->magic == SUPERSLAB_MAGIC) ss_cls = chk_ss->size_class;
        TinySlab* chk_slab = fast_slab ?
fast_slab : hak_tiny_owner_slab(ptr); if (chk_slab) ts_cls = chk_slab->class_idx; if (ss_cls >= 0 && ts_cls >= 0 && ss_cls != ts_cls) { uintptr_t packed = ((uintptr_t)(uint16_t)ss_cls << 16) | (uint16_t)ts_cls; tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)fast_class_idx, ptr, packed); if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } } } if (fast_class_idx >= 0) { tiny_debug_ring_record(TINY_RING_EVENT_FREE_ENTER, (uint16_t)fast_class_idx, ptr, 1); } if (fast_class_idx >= 0 && g_fast_enable && g_fast_cap[fast_class_idx] != 0) { if (tiny_fast_push(fast_class_idx, ptr)) { tiny_debug_ring_record(TINY_RING_EVENT_FREE_FAST, (uint16_t)fast_class_idx, ptr, 0); HAK_STAT_FREE(fast_class_idx); return; } } // SuperSlab detection: prefer fast mask-based check when available SuperSlab* ss = fast_ss; if (!ss && g_use_superslab) { ss = hak_super_lookup(ptr); if (!(ss && ss->magic == SUPERSLAB_MAGIC)) { ss = NULL; } } if (ss && ss->magic == SUPERSLAB_MAGIC) { // Direct SuperSlab free (avoid second lookup TOCTOU) hak_tiny_free_superslab(ptr, ss); HAK_STAT_FREE(ss->size_class); return; } // Fallback to TinySlab only when SuperSlab is not in use TinySlab* slab = fast_slab; if (!slab) slab = hak_tiny_owner_slab(ptr); if (!slab) return; // Not managed by Tiny Pool if (__builtin_expect(g_use_superslab, 0)) { // In SS mode, a pointer that resolves only to TinySlab is suspicious → treat as invalid free tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, 0xEE, ptr, 0xF1u); if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } return; } hak_tiny_free_with_slab(ptr, slab); } // ============================================================================ // EXTRACTED TO hakmem_tiny_query.c (Phase 2B-1) // ============================================================================ // EXTRACTED: int hak_tiny_is_managed(void* ptr) { // EXTRACTED: if (!ptr || !g_tiny_initialized) return 0; // EXTRACTED: // Phase 6.12.1: O(1) slab lookup via registry/list // EXTRACTED: return hak_tiny_owner_slab(ptr) != NULL || hak_super_lookup(ptr) != NULL; // EXTRACTED: } // Phase 7.6: Check if pointer is managed by Tiny Pool (TinySlab OR SuperSlab) // EXTRACTED: int hak_tiny_is_managed_superslab(void* ptr) { // EXTRACTED: if (!ptr || !g_tiny_initialized) return 0; // EXTRACTED: // EXTRACTED: // Safety: Only check if g_use_superslab is enabled // EXTRACTED: if (g_use_superslab) { // EXTRACTED: SuperSlab* ss = hak_super_lookup(ptr); // EXTRACTED: // Phase 8.2 optimization: Use alignment check instead of mincore() // EXTRACTED: // SuperSlabs are always SUPERSLAB_SIZE-aligned (2MB) // EXTRACTED: if (ss && ((uintptr_t)ss & (SUPERSLAB_SIZE - 1)) == 0) { // EXTRACTED: if (ss->magic == SUPERSLAB_MAGIC) { // EXTRACTED: return 1; // Valid SuperSlab pointer // EXTRACTED: } // EXTRACTED: } // EXTRACTED: } // EXTRACTED: // EXTRACTED: // Fallback to TinySlab check // EXTRACTED: return hak_tiny_owner_slab(ptr) != NULL; // EXTRACTED: } // Return the usable size for a Tiny-managed pointer (0 if unknown/not tiny). // Prefer SuperSlab metadata when available; otherwise use TinySlab owner class. 
// EXTRACTED: size_t hak_tiny_usable_size(void* ptr) { // EXTRACTED: if (!ptr || !g_tiny_initialized) return 0; // EXTRACTED: // EXTRACTED: // Check SuperSlab first via registry (safe under direct link and LD) // EXTRACTED: if (g_use_superslab) { // EXTRACTED: SuperSlab* ss = hak_super_lookup(ptr); // EXTRACTED: if (ss && ss->magic == SUPERSLAB_MAGIC) { // EXTRACTED: int k = (int)ss->size_class; // EXTRACTED: if (k >= 0 && k < TINY_NUM_CLASSES) { // EXTRACTED: return g_tiny_class_sizes[k]; // EXTRACTED: } // EXTRACTED: } // EXTRACTED: } // EXTRACTED: // EXTRACTED: // Fallback: TinySlab owner lookup // EXTRACTED: TinySlab* slab = hak_tiny_owner_slab(ptr); // EXTRACTED: if (slab) { // EXTRACTED: int k = slab->class_idx; // EXTRACTED: if (k >= 0 && k < TINY_NUM_CLASSES) { // EXTRACTED: return g_tiny_class_sizes[k]; // EXTRACTED: } // EXTRACTED: } // EXTRACTED: return 0; // EXTRACTED: } // ============================================================================ // Statistics and Debug Functions - Extracted to hakmem_tiny_stats.c // ============================================================================ // (Phase 2B API headers moved to top of file) // Optional shutdown hook to stop background components (e.g., Intelligence Engine) void hak_tiny_shutdown(void) { // Release TLS SuperSlab references (dec refcount) before stopping BG/INT for (int k = 0; k < TINY_NUM_CLASSES; k++) { TinyTLSSlab* tls = &g_tls_slabs[k]; if (tls->ss) { superslab_ref_dec(tls->ss); tls->ss = NULL; tls->meta = NULL; tls->slab_base = NULL; } } if (g_bg_bin_started) { g_bg_bin_stop = 1; if (!pthread_equal(tiny_self_pt(), g_bg_bin_thread)) { pthread_join(g_bg_bin_thread, NULL); } g_bg_bin_started = 0; g_bg_bin_enable = 0; } tiny_obs_shutdown(); if (g_int_engine && g_int_started) { g_int_stop = 1; // Best-effort join; avoid deadlock if called from within the thread if (!pthread_equal(tiny_self_pt(), g_int_thread)) { pthread_join(g_int_thread, NULL); } g_int_started = 0; g_int_engine = 0; } } // Always-available: Trim empty slabs (release fully-free slabs)