diff --git a/core/box/pool_mf2_adoption.inc.h b/core/box/pool_mf2_adoption.inc.h new file mode 100644 index 00000000..56ab27f7 --- /dev/null +++ b/core/box/pool_mf2_adoption.inc.h @@ -0,0 +1,129 @@ +// Consumer-Driven Adoption: Try to adopt a page from ANY thread's pending queue +// Returns true if a page was successfully adopted and activated +// Called from alloc_slow when allocating thread needs memory +static bool mf2_try_adopt_pending(MF2_ThreadPages* me, int class_idx) { + if (!me) return false; + + // IMMEDIATE FIX #1: Early return if no adoptable pages (O(1) gating) + // Avoids scanning empty queues (major performance win!) + int adoptable = atomic_load_explicit(&g_adoptable_count[class_idx], memory_order_relaxed); + if (adoptable == 0) return false; // All queues empty, no scan needed + + // Get global thread registry + int num_tp = atomic_load_explicit(&g_num_thread_pages, memory_order_acquire); + if (num_tp == 0) return false; + + // IMMEDIATE FIX #2: Limit scan to MAX_QUEUES threads (configurable via HAKMEM_MF2_MAX_QUEUES) + // Prevents excessive scanning overhead (2-8 threads is usually enough) + int scan_limit = (num_tp < g_mf2_max_queues) ? num_tp : g_mf2_max_queues; + + // Round-robin scan (limited number of threads, not ALL!) + static _Atomic uint64_t adopt_counter = 0; + uint64_t start_idx = atomic_fetch_add_explicit(&adopt_counter, 1, memory_order_relaxed); + + for (int i = 0; i < scan_limit; i++) { + int tp_idx = (start_idx + i) % num_tp; + MF2_ThreadPages* other_tp = (MF2_ThreadPages*)atomic_load_explicit( + (atomic_uintptr_t*)&g_all_thread_pages[tp_idx], memory_order_acquire); + + if (!other_tp) continue; + + // Route P: Idle Detection - Only adopt from idle owners + // Check if owner is still actively allocating (threshold configurable via env var) + uint64_t now_tsc = mf2_rdtsc(); + uint64_t owner_last_alloc = atomic_load_explicit(&other_tp->last_alloc_tsc, memory_order_relaxed); + uint64_t idle_threshold_cycles = (uint64_t)g_mf2_idle_threshold_us * MF2_TSC_CYCLES_PER_US; + + if ((now_tsc - owner_last_alloc) < idle_threshold_cycles) { + continue; // Owner still active, skip adoption + } + + // IMMEDIATE FIX #3: Claim exclusive access (prevent multi-consumer CAS thrashing!) + // Only one thread scans each queue at a time → eliminates CAS contention + if (atomic_flag_test_and_set_explicit(&other_tp->pending_claim[class_idx], memory_order_acquire)) { + continue; // Another thread is already scanning this queue, skip + } + + // Try to dequeue a pending page from this thread + MidPage* page = mf2_dequeue_pending(other_tp, class_idx); + if (!page) { + // Queue empty, release claim and try next thread + atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); + continue; + } + + // Clear pending flag (no longer in queue) + atomic_store_explicit(&page->in_remote_pending, false, memory_order_release); + + // Check lease: Has enough time passed since last transfer? 
(configurable via HAKMEM_MF2_LEASE_MS) + // 0ms = disabled (no lease check), >0 = lease period in milliseconds + uint64_t now = mf2_rdtsc(); + uint64_t last_transfer = page->last_transfer_time; + if (g_mf2_lease_ms > 0 && last_transfer != 0) { + // Calculate lease cycles from ms (approx 3GHz CPU) + uint64_t lease_cycles = (uint64_t)g_mf2_lease_ms * (MF2_TSC_CYCLES_PER_US * 1000ULL); + if ((now - last_transfer) < lease_cycles) { + // Lease still active, return page to full_pages (don't thrash ownership) + page->next_page = other_tp->full_pages[class_idx]; + other_tp->full_pages[class_idx] = page; + // Release claim before continuing + atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); + continue; // Try next thread + } + } + + // Try to transfer ownership using CAS + pthread_t old_owner = page->owner_tid; + pthread_t new_owner = pthread_self(); + + // Note: pthread_t may not be atomic-compatible on all platforms + // For now, we'll use a simple write (ownership transfer is rare) + // TODO: If thrashing is observed, add atomic CAS with serialization + page->owner_tid = new_owner; + page->owner_tp = me; + page->last_transfer_time = now; + + // DEBUG: Log drain state + static _Atomic int adopt_samples = 0; + int sample_idx = atomic_fetch_add_explicit(&adopt_samples, 1, memory_order_relaxed); + unsigned int pre_remote = atomic_load_explicit(&page->remote_count, memory_order_relaxed); + unsigned int pre_free = page->free_count; + PoolBlock* pre_freelist = page->freelist; + + // Drain remote frees + int drained = mf2_drain_remote_frees(page); + + // DEBUG: Log result (first 10 samples) + if (sample_idx < 10) { + MF2_DEBUG_LOG("ADOPT_DRAIN %d: class=%d, remote_cnt=%u, drained=%d, pre_free=%u, post_free=%u, pre_freelist=%p, post_freelist=%p", + sample_idx, class_idx, pre_remote, drained, + pre_free, page->free_count, pre_freelist, page->freelist); + } + + // Make adopted page ACTIVE immediately (not partial!) + // Adoption needs immediate activation for caller's mf2_alloc_fast() + // Partial list is only for own pending queue drains + if (page->freelist) { + atomic_fetch_add(&g_mf2_page_reuse_count, 1); + atomic_fetch_add(&g_mf2_pending_drained, 1); + atomic_fetch_add(&g_mf2_drain_success, 1); + + // Make it active (move old active to full_pages) + mf2_make_page_active(me, class_idx, page); + + // Release claim before returning SUCCESS + atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); + return true; // SUCCESS! Page adopted and activated + } + + // No freelist after drain, return to MY full_pages (I'm the new owner!) 
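+        // Keeping the page under the new owner is still useful even without a
+        // usable freelist: owner_tp now points at this thread, so future remote
+        // frees should land in OUR pending queue and the page can be reused
+        // later without another adoption scan.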
+ page->next_page = me->full_pages[class_idx]; + me->full_pages[class_idx] = page; + // Release claim before continuing search + atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); + // Continue searching for a better page + } + + return false; // No adoptable pages found +} + diff --git a/core/box/pool_mf2_helpers.inc.h b/core/box/pool_mf2_helpers.inc.h new file mode 100644 index 00000000..37367082 --- /dev/null +++ b/core/box/pool_mf2_helpers.inc.h @@ -0,0 +1,158 @@ +// Forward declarations +static void* mf2_alloc_slow(int class_idx, size_t size, uintptr_t site_id); + +// =========================================================================== +// Helper Functions (Clean & Modular) +// =========================================================================== + +// Helper: Make page active (move old active to full_pages) +static inline void mf2_make_page_active(MF2_ThreadPages* tp, int class_idx, MidPage* page) { + if (!tp || !page) return; + + // Move old active page to full_pages (if any) + if (tp->active_page[class_idx]) { + MidPage* old_active = tp->active_page[class_idx]; + old_active->next_page = tp->full_pages[class_idx]; + tp->full_pages[class_idx] = old_active; + } + + // Set new page as active + tp->active_page[class_idx] = page; + page->next_page = NULL; +} + +// Helper: Drain page and add to partial list (LIFO for cache locality) +// Returns true if page has free blocks after drain +static inline bool mf2_try_drain_to_partial(MF2_ThreadPages* tp, int class_idx, MidPage* page) { + if (!tp || !page) return false; + + // Drain remote frees + int drained = mf2_drain_remote_frees(page); + + // If page has freelist after drain, add to partial list (LIFO) + if (page->freelist) { + atomic_fetch_add(&g_mf2_page_reuse_count, 1); + page->next_page = tp->partial_pages[class_idx]; + tp->partial_pages[class_idx] = page; + return true; + } + + // No freelist, return to full_pages + page->next_page = tp->full_pages[class_idx]; + tp->full_pages[class_idx] = page; + return false; +} + +// Helper: Drain page and activate if successful (Direct Handoff - backward compat) +// Returns true if page was activated +static inline bool mf2_try_drain_and_activate(MF2_ThreadPages* tp, int class_idx, MidPage* page) { + if (!tp || !page) return false; + + // Drain remote frees + int drained = mf2_drain_remote_frees(page); + + // If page has freelist after drain, make it active immediately + if (page->freelist) { + atomic_fetch_add(&g_mf2_page_reuse_count, 1); + mf2_make_page_active(tp, class_idx, page); + return true; + } + + // No freelist, return to full_pages + page->next_page = tp->full_pages[class_idx]; + tp->full_pages[class_idx] = page; + return false; +} + +// Helper: Try to reuse pages from own pending queue (must-reuse gate part 1) +// Returns true if a page was successfully drained and activated +static bool mf2_try_reuse_own_pending(MF2_ThreadPages* tp, int class_idx) { + if (!tp) return false; + + // Budget: Process up to N pages to avoid blocking + for (int budget = 0; budget < MF2_PENDING_QUEUE_BUDGET; budget++) { + MidPage* pending_page = mf2_dequeue_pending(tp, class_idx); + if (!pending_page) break; // Queue empty + + atomic_fetch_add(&g_mf2_pending_drained, 1); + + // Clear pending flag (no longer in queue) + atomic_store_explicit(&pending_page->in_remote_pending, false, memory_order_release); + + // DIRECT HANDOFF: Drain and activate if successful + if (mf2_try_drain_and_activate(tp, class_idx, pending_page)) { + return true; // Success! 
Page is now active + } + // No freelist after drain, page returned to full_pages by helper + } + return false; // No pages available for reuse +} + +// Helper: Try to drain remotes from active page (must-reuse gate part 2) +// Returns true if active page has freelist after drain +static bool mf2_try_drain_active_remotes(MF2_ThreadPages* tp, int class_idx) { + if (!tp) return false; + + MidPage* page = tp->active_page[class_idx]; + if (!page) return false; + + atomic_fetch_add(&g_mf2_slow_checked_drain, 1); + unsigned int remote_cnt = atomic_load_explicit(&page->remote_count, memory_order_seq_cst); + + if (remote_cnt > 0) { + atomic_fetch_add(&g_mf2_slow_found_remote, 1); + int drained = mf2_drain_remote_frees(page); + if (drained > 0 && page->freelist) { + atomic_fetch_add(&g_mf2_drain_success, 1); + return true; // Success! Active page now has freelist + } + } + return false; // No remotes or drain failed +} + +// Helper: Allocate new page and make it active +// Returns the newly allocated page (or NULL on OOM) +static MidPage* mf2_alloc_and_activate_new_page(MF2_ThreadPages* tp, int class_idx) { + if (!tp) return NULL; + + atomic_fetch_add(&g_mf2_new_page_count, 1); + + // DEBUG: Log why we're allocating new page (first N samples) + static _Atomic int new_page_samples = 0; + int sample_idx = atomic_fetch_add_explicit(&new_page_samples, 1, memory_order_relaxed); + if (sample_idx < MF2_DEBUG_SAMPLE_COUNT) { + // Count adoptable pages across all threads + int total_adoptable = 0; + for (int i = 0; i < POOL_NUM_CLASSES; i++) { + total_adoptable += atomic_load_explicit(&g_adoptable_count[i], memory_order_relaxed); + } + MF2_DEBUG_LOG("NEW_PAGE %d: class=%d, own_pending=%p, adoptable_total=%d, active=%p, full=%p", + sample_idx, class_idx, + (void*)atomic_load_explicit(&tp->pages_remote_pending[class_idx], memory_order_relaxed), + total_adoptable, + tp->active_page[class_idx], + tp->full_pages[class_idx]); + } + + MidPage* page = mf2_alloc_new_page(class_idx); + if (!page) { + return NULL; // OOM + } + + // Move current active page to full list (if any) + if (tp->active_page[class_idx]) { + MidPage* old_page = tp->active_page[class_idx]; + old_page->next_page = tp->full_pages[class_idx]; + tp->full_pages[class_idx] = old_page; + } + + // Set new page as active + tp->active_page[class_idx] = page; + tp->page_count[class_idx]++; + + return page; +} + +// =========================================================================== +// End of Helper Functions +// =========================================================================== diff --git a/core/box/pool_mf2_types.inc.h b/core/box/pool_mf2_types.inc.h new file mode 100644 index 00000000..203a6a9c --- /dev/null +++ b/core/box/pool_mf2_types.inc.h @@ -0,0 +1,266 @@ +// =========================================================================== +// MF2 Per-Page Sharding: Mimalloc-Inspired Architecture +// =========================================================================== +// +// Key idea: Each 64KB page has independent freelist (no sharing!) 
+// - O(1) page lookup from block address: (addr & ~0xFFFF) +// - Owner thread: fast path (no locks, no atomics) +// - Cross-thread free: lock-free remote stack +// - Expected: +50% (13.78 → 20.7 M/s, 60-75% of mimalloc) + +// MF2 Configuration Constants (Quick Win #5) +#define MF2_PENDING_QUEUE_BUDGET 4 // Max pages to drain from pending queue +#define MF2_DEBUG_SAMPLE_COUNT 20 // Number of debug samples to log +#define MF2_TSC_CYCLES_PER_US 3000 // Estimated TSC cycles per microsecond +#define MF2_PAGE_SIZE_SHIFT 16 // log2(64KB) for fast division +#define MF2_PAGE_ALIGNMENT 65536 // 64KB alignment for mmap + +// Debug Logging Macros (Quick Win #6) +// Conditional compilation for debug logs - set HAKMEM_DEBUG_MF2=1 to enable +#ifdef HAKMEM_DEBUG_MF2 + #define MF2_DEBUG_LOG(fmt, ...) fprintf(stderr, "[MF2] " fmt "\n", ##__VA_ARGS__) + #define MF2_ERROR_LOG(fmt, ...) fprintf(stderr, "[MF2 ERROR] " fmt "\n", ##__VA_ARGS__) +#else + #define MF2_DEBUG_LOG(fmt, ...) ((void)0) + #define MF2_ERROR_LOG(fmt, ...) fprintf(stderr, "[MF2 ERROR] " fmt "\n", ##__VA_ARGS__) +#endif + +// Forward declarations +static size_t g_class_sizes[POOL_NUM_CLASSES]; + +// MF2 Page descriptor: per-page metadata (one per 64KB page) +typedef struct MidPage { + // Page identity + void* base; // Page base address (64KB aligned) + uint8_t class_idx; // Size class index (0-6) + uint8_t flags; // Page flags (reserved for future use) + uint16_t _pad0; + + // Ownership + pthread_t owner_tid; // Owner thread ID (for fast-path check) + struct MF2_ThreadPages* owner_tp; // Owner thread's heap (for pending queue access) + uint64_t last_transfer_time; // rdtsc() timestamp of last ownership transfer (lease mechanism) + + // Page-local freelist (owner-only, NO LOCK!) + PoolBlock* freelist; // Local freelist head + uint16_t free_count; // Number of free blocks + uint16_t capacity; // Total blocks per page + + // Remote frees (cross-thread, lock-free MPSC stack) + atomic_uintptr_t remote_head; // Lock-free remote free stack + atomic_uint remote_count; // Remote free count (for quick check) + + // Lifecycle + atomic_int in_use; // Live allocations on this page + atomic_int pending_dn; // DONTNEED enqueued flag + + // Linkage (thread-local page lists) + struct MidPage* next_page; // Next page in thread's list + struct MidPage* prev_page; // Previous page in thread's list + + // Pending queue (remote drain notification) + _Atomic(_Bool) in_remote_pending; // Is this page in pending queue? + struct MidPage* next_pending; // Next page in pending queue + + // Padding to cache line boundary (avoid false sharing) + char _pad[64 - ((sizeof(void*) * 5 + sizeof(PoolBlock*) + sizeof(uint16_t) * 2 + + sizeof(atomic_uintptr_t) + sizeof(atomic_uint) + + sizeof(atomic_int) * 2 + sizeof(pthread_t) + + sizeof(_Atomic(_Bool)) + 4) % 64)]; +} MidPage; + +// Page registry: O(1) lookup from block address +// Use direct indexing: (addr >> 16) & MASK +#define MF2_PAGE_REGISTRY_BITS 16 // 64K entries (4GB address space with 64KB pages) +#define MF2_PAGE_REGISTRY_SIZE (1 << MF2_PAGE_REGISTRY_BITS) +#define MF2_PAGE_REGISTRY_MASK (MF2_PAGE_REGISTRY_SIZE - 1) + +typedef struct { + // Direct-mapped page table (no hash collisions!) 
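+    // Note: with MF2_PAGE_REGISTRY_BITS = 16 the index is (addr >> 16) & MASK,
+    // so "no collisions" only holds within the 4GB window assumed above; two
+    // 64KB pages whose addresses differ by a multiple of 4 GiB map to the same slot.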
+ MidPage* pages[MF2_PAGE_REGISTRY_SIZE]; + + // Coarse-grained locks for rare updates (page alloc/free) + // 256 locks = 256-way parallelism for page registration + pthread_mutex_t locks[256]; + + // Statistics + atomic_uint_fast64_t total_pages; // Total pages allocated + atomic_uint_fast64_t active_pages; // Pages with live allocations +} MF2_PageRegistry; + +// Thread-local page lists (one list per size class) +typedef struct MF2_ThreadPages { + // Active pages (have free blocks) + MidPage* active_page[POOL_NUM_CLASSES]; + + // Partial pages (drained pages with free blocks, LIFO for cache locality) + // Checked before allocating new pages (fast reuse path) + MidPage* partial_pages[POOL_NUM_CLASSES]; + + // Full pages (no free blocks, but may receive remote frees) + // TODO: Gradually deprecate in favor of partial_pages + MidPage* full_pages[POOL_NUM_CLASSES]; + + // Pending queue (pages with remote frees, MPSC lock-free stack) + atomic_uintptr_t pages_remote_pending[POOL_NUM_CLASSES]; + + // Pending claim flags (prevent multi-consumer CAS thrashing) + // One adopter at a time per queue (test_and_set to claim, clear to release) + atomic_flag pending_claim[POOL_NUM_CLASSES]; + + // Page ownership count (for statistics) + uint32_t page_count[POOL_NUM_CLASSES]; + + // Thread identity (cached for fast comparison) + pthread_t my_tid; + + // Route P: Activity tracking for idle-based adoption + // Updated on every allocation (mf2_alloc_fast) + // Read by adopters to check if owner is idle + atomic_uint_fast64_t last_alloc_tsc; +} MF2_ThreadPages; + +// Global page registry (shared, rarely accessed) +static MF2_PageRegistry g_mf2_page_registry; + +// Thread-local page lists (hot path, no sharing!) +static __thread MF2_ThreadPages* t_mf2_pages = NULL; + +// =========================================================================== +// MF2 Global State (Quick Win #3b - Structured Globals) +// =========================================================================== +// Individual globals replaced with structured state below. +// Old declarations removed, replaced with macro-mapped struct instances. 
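+// As a concrete illustration of the macro mapping (using the stats macros
+// defined at the end of this file), a call such as
+//     atomic_fetch_add(&g_mf2_new_page_count, 1);
+// expands to
+//     atomic_fetch_add(&g_mf2_stats.new_page_count, 1);
+// so existing call sites compile unchanged.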
+// +// Benefits: +// - Logical grouping (config, registry, stats) +// - Better documentation +// - Easier to extend or refactor +// - Single source of truth for each category + +#define MF2_MAX_THREADS 256 + +// MF2 Configuration (environment variables) +typedef struct { + int enabled; // HAKMEM_MF2_ENABLE (0=disabled, 1=enabled) + int max_queues; // HAKMEM_MF2_MAX_QUEUES (default: 2) + int lease_ms; // HAKMEM_MF2_LEASE_MS (default: 10ms, 0=disabled) + int idle_threshold_us; // HAKMEM_MF2_IDLE_THRESHOLD_US (default: 150µs) +} MF2_Config; + +// MF2 Thread Registry (cross-thread coordination) +typedef struct { + MF2_ThreadPages* all_thread_pages[MF2_MAX_THREADS]; // Global registry + _Atomic int num_thread_pages; // Active thread count + _Atomic int adoptable_count[POOL_NUM_CLASSES]; // Non-empty pending queues + pthread_key_t tls_key; // Thread-local storage key + pthread_once_t key_once; // TLS initialization guard +} MF2_Registry; + +// MF2 Statistics (debug instrumentation) +typedef struct { + // Allocation path + atomic_uint_fast64_t alloc_fast_hit; + atomic_uint_fast64_t alloc_slow_hit; + atomic_uint_fast64_t page_reuse_count; + atomic_uint_fast64_t new_page_count; + + // Free path + atomic_uint_fast64_t free_owner_count; + atomic_uint_fast64_t free_remote_count; + + // Drain operations + atomic_uint_fast64_t drain_count; + atomic_uint_fast64_t drain_blocks; + atomic_uint_fast64_t drain_attempts; + atomic_uint_fast64_t drain_success; + atomic_uint_fast64_t slow_checked_drain; + atomic_uint_fast64_t slow_found_remote; + + // Full page scan (obsolete, kept for historical tracking) + atomic_uint_fast64_t full_scan_checked; + atomic_uint_fast64_t full_scan_found_remote; + atomic_uint_fast64_t eager_drain_scanned; + atomic_uint_fast64_t eager_drain_found; + + // Pending queue + atomic_uint_fast64_t pending_enqueued; + atomic_uint_fast64_t pending_drained; + atomic_uint_fast64_t pending_requeued; +} MF2_Stats; + +// Instantiate structured global state (Quick Win #3b) +static MF2_Config g_mf2_config = { + .enabled = 0, // Will be set by env var + .max_queues = 2, + .lease_ms = 10, + .idle_threshold_us = 150 +}; + +static MF2_Registry g_mf2_registry = { + .all_thread_pages = {0}, + .num_thread_pages = 0, + .adoptable_count = {0}, + .tls_key = 0, + .key_once = PTHREAD_ONCE_INIT +}; + +static MF2_Stats g_mf2_stats = { + // All fields initialized to 0 (atomic zero-initialization is valid) + .alloc_fast_hit = 0, + .alloc_slow_hit = 0, + .page_reuse_count = 0, + .new_page_count = 0, + .free_owner_count = 0, + .free_remote_count = 0, + .drain_count = 0, + .drain_blocks = 0, + .drain_attempts = 0, + .drain_success = 0, + .slow_checked_drain = 0, + .slow_found_remote = 0, + .full_scan_checked = 0, + .full_scan_found_remote = 0, + .eager_drain_scanned = 0, + .eager_drain_found = 0, + .pending_enqueued = 0, + .pending_drained = 0, + .pending_requeued = 0 +}; + +// Compatibility macros: Map old global names to struct fields +// This allows existing code to work unchanged while using structured state +#define g_mf2_enabled (g_mf2_config.enabled) +#define g_mf2_max_queues (g_mf2_config.max_queues) +#define g_mf2_lease_ms (g_mf2_config.lease_ms) +#define g_mf2_idle_threshold_us (g_mf2_config.idle_threshold_us) + +#define g_all_thread_pages (g_mf2_registry.all_thread_pages) +#define g_num_thread_pages (g_mf2_registry.num_thread_pages) +#define g_adoptable_count (g_mf2_registry.adoptable_count) +#define g_mf2_tls_key (g_mf2_registry.tls_key) +#define g_mf2_key_once (g_mf2_registry.key_once) + +#define 
g_mf2_alloc_fast_hit (g_mf2_stats.alloc_fast_hit) +#define g_mf2_alloc_slow_hit (g_mf2_stats.alloc_slow_hit) +#define g_mf2_page_reuse_count (g_mf2_stats.page_reuse_count) +#define g_mf2_new_page_count (g_mf2_stats.new_page_count) +#define g_mf2_free_owner_count (g_mf2_stats.free_owner_count) +#define g_mf2_free_remote_count (g_mf2_stats.free_remote_count) +#define g_mf2_drain_count (g_mf2_stats.drain_count) +#define g_mf2_drain_blocks (g_mf2_stats.drain_blocks) +#define g_mf2_drain_attempts (g_mf2_stats.drain_attempts) +#define g_mf2_drain_success (g_mf2_stats.drain_success) +#define g_mf2_slow_checked_drain (g_mf2_stats.slow_checked_drain) +#define g_mf2_slow_found_remote (g_mf2_stats.slow_found_remote) +#define g_mf2_full_scan_checked (g_mf2_stats.full_scan_checked) +#define g_mf2_full_scan_found_remote (g_mf2_stats.full_scan_found_remote) +#define g_mf2_eager_drain_scanned (g_mf2_stats.eager_drain_scanned) +#define g_mf2_eager_drain_found (g_mf2_stats.eager_drain_found) +#define g_mf2_pending_enqueued (g_mf2_stats.pending_enqueued) +#define g_mf2_pending_drained (g_mf2_stats.pending_drained) +#define g_mf2_pending_requeued (g_mf2_stats.pending_requeued) + +// =========================================================================== +// End of MF2 Data Structures +// =========================================================================== diff --git a/core/box/pool_tls_types.inc.h b/core/box/pool_tls_types.inc.h new file mode 100644 index 00000000..c27d32c4 --- /dev/null +++ b/core/box/pool_tls_types.inc.h @@ -0,0 +1,32 @@ +// =========================================================================== +// Internal Data Structures +// =========================================================================== + +// Freelist block header (embedded in allocated block) +typedef struct PoolBlock { + struct PoolBlock* next; // Next free block in freelist +} PoolBlock; + +// TLS cache: one block per class to avoid frequent locks (legacy single-slot) +__thread PoolBlock* tls_pool_cache[POOL_NUM_CLASSES] = {NULL}; + +// TLS ring buffer to further reduce lock traffic (configurable capacity) +// Separate ring size for L2 Pool (mid/large allocations: 8-32KB) +#ifndef POOL_L2_RING_CAP +#define POOL_L2_RING_CAP 48 // Optimized for L1 cache efficiency (384B, 6 cache lines) +#endif +typedef struct { PoolBlock* items[POOL_L2_RING_CAP]; int top; } PoolTLSRing; +typedef struct { PoolTLSRing ring; PoolBlock* lo_head; size_t lo_count; } PoolTLSBin; +static __thread PoolTLSBin g_tls_bin[POOL_NUM_CLASSES]; + +// TLS active pages (per class): bump-run (no per-block links) from privately owned pages (max 3) +typedef struct { + void* page; // page base + char* bump; // next raw allocation (header start) + char* end; // page end (bump-run limit) + int count; // remaining blocks (for quick checks) +} PoolTLSPage; +static __thread PoolTLSPage g_tls_active_page_a[POOL_NUM_CLASSES]; +static __thread PoolTLSPage g_tls_active_page_b[POOL_NUM_CLASSES]; +static __thread PoolTLSPage g_tls_active_page_c[POOL_NUM_CLASSES]; // QW2-adjusted: 3 slots (was 4) + diff --git a/core/hakmem_pool.c b/core/hakmem_pool.c index 696e3edb..92148e9b 100644 --- a/core/hakmem_pool.c +++ b/core/hakmem_pool.c @@ -64,489 +64,16 @@ typedef struct { pthread_mutex_t m; char _pad[64 - (sizeof(pthread_mutex_t) % 64 // =========================================================================== // Internal Data Structures // =========================================================================== - -// Freelist block header (embedded in 
allocated block) -typedef struct PoolBlock { - struct PoolBlock* next; // Next free block in freelist -} PoolBlock; - -// TLS cache: one block per class to avoid frequent locks (legacy single-slot) -__thread PoolBlock* tls_pool_cache[POOL_NUM_CLASSES] = {NULL}; - -// TLS ring buffer to further reduce lock traffic (configurable capacity) -// Separate ring size for L2 Pool (mid/large allocations: 8-32KB) -#ifndef POOL_L2_RING_CAP -#define POOL_L2_RING_CAP 48 // Optimized for L1 cache efficiency (384B, 6 cache lines) -#endif -typedef struct { PoolBlock* items[POOL_L2_RING_CAP]; int top; } PoolTLSRing; -typedef struct { PoolTLSRing ring; PoolBlock* lo_head; size_t lo_count; } PoolTLSBin; -static __thread PoolTLSBin g_tls_bin[POOL_NUM_CLASSES]; - -// TLS active pages (per class): bump-run (no per-block links) from privately owned pages (max 3) -typedef struct { - void* page; // page base - char* bump; // next raw allocation (header start) - char* end; // page end (bump-run limit) - int count; // remaining blocks (for quick checks) -} PoolTLSPage; -static __thread PoolTLSPage g_tls_active_page_a[POOL_NUM_CLASSES]; -static __thread PoolTLSPage g_tls_active_page_b[POOL_NUM_CLASSES]; -static __thread PoolTLSPage g_tls_active_page_c[POOL_NUM_CLASSES]; // QW2-adjusted: 3 slots (was 4) +#include "box/pool_tls_types.inc.h" // Mid page descriptor registry (64KiB pages → {class_idx, owner_tid}) -#define MID_DESC_BUCKETS 2048 -typedef struct MidPageDesc { - void* page; - uint8_t class_idx; - uint8_t _pad0; - uint16_t _pad1; - uint64_t owner_tid; - atomic_int in_use; // live allocations on this page - int blocks_per_page; // total blocks on this page - atomic_int pending_dn; // background DONTNEED enqueued - struct MidPageDesc* next; -} MidPageDesc; -static pthread_mutex_t g_mid_desc_mu[MID_DESC_BUCKETS]; -static MidPageDesc* g_mid_desc_head[MID_DESC_BUCKETS]; - -static inline uint32_t mid_desc_hash(void* page) { - uintptr_t x = (uintptr_t)page >> 16; // 64KiB alignment granularity - // mix - x ^= x >> 33; x *= 0xff51afd7ed558ccdULL; x ^= x >> 33; x *= 0xc4ceb9fe1a85ec53ULL; x ^= x >> 33; - return (uint32_t)(x & (MID_DESC_BUCKETS - 1)); -} - -// Thread-safe initialization using pthread_once -static pthread_once_t mid_desc_init_once_control = PTHREAD_ONCE_INIT; -static void mid_desc_init_impl(void) { - for (int i = 0; i < MID_DESC_BUCKETS; i++) { - pthread_mutex_init(&g_mid_desc_mu[i], NULL); - g_mid_desc_head[i] = NULL; - } -} -static void mid_desc_init_once(void) { - pthread_once(&mid_desc_init_once_control, mid_desc_init_impl); -} - -static void mid_desc_register(void* page, int class_idx, uint64_t owner_tid) { - mid_desc_init_once(); - uint32_t h = mid_desc_hash(page); - pthread_mutex_lock(&g_mid_desc_mu[h]); - MidPageDesc* d = (MidPageDesc*)hkm_libc_malloc(sizeof(MidPageDesc)); // P0 Fix: Use libc malloc - if (d) { - d->page = page; d->class_idx = (uint8_t)class_idx; d->owner_tid = owner_tid; d->next = g_mid_desc_head[h]; - atomic_store(&d->in_use, 0); - d->blocks_per_page = 0; // optional; not used for emptiness in P0 - atomic_store(&d->pending_dn, 0); - g_mid_desc_head[h] = d; - } - pthread_mutex_unlock(&g_mid_desc_mu[h]); -} - -static MidPageDesc* mid_desc_lookup(void* addr) { - mid_desc_init_once(); - void* page = (void*)((uintptr_t)addr & ~((uintptr_t)POOL_PAGE_SIZE - 1)); - uint32_t h = mid_desc_hash(page); - for (MidPageDesc* d = g_mid_desc_head[h]; d; d = d->next) { - if (d->page == page) return d; - } - return NULL; -} - -static void mid_desc_adopt(void* addr, int class_idx, uint64_t 
owner_tid) { - if (owner_tid == 0) return; - void* page = (void*)((uintptr_t)addr & ~((uintptr_t)POOL_PAGE_SIZE - 1)); - uint32_t h = mid_desc_hash(page); - pthread_mutex_lock(&g_mid_desc_mu[h]); - MidPageDesc* d = g_mid_desc_head[h]; - while (d) { if (d->page == page) break; d = d->next; } - if (d) { - if (d->owner_tid == 0) d->owner_tid = owner_tid; - } else { - MidPageDesc* nd = (MidPageDesc*)hkm_libc_malloc(sizeof(MidPageDesc)); // P0 Fix: Use libc malloc - if (nd) { nd->page = page; nd->class_idx = (uint8_t)class_idx; nd->owner_tid = owner_tid; nd->next = g_mid_desc_head[h]; g_mid_desc_head[h] = nd; } - } - pthread_mutex_unlock(&g_mid_desc_mu[h]); -} - -// Increment page in-use counter for given raw block pointer -static inline void mid_page_inuse_inc(void* raw) { - MidPageDesc* d = mid_desc_lookup(raw); - if (d) atomic_fetch_add_explicit(&d->in_use, 1, memory_order_relaxed); -} - -// Decrement page in-use counter and enqueue DONTNEED when it drops to 0 -extern int hak_batch_add_page(void* page, size_t size); -static inline void mid_page_inuse_dec_and_maybe_dn(void* raw) { - MidPageDesc* d = mid_desc_lookup(raw); - if (!d) return; - int nv = atomic_fetch_sub_explicit(&d->in_use, 1, memory_order_relaxed) - 1; - if (nv <= 0) { - // Fire once per empty transition - if (atomic_exchange_explicit(&d->pending_dn, 1, memory_order_acq_rel) == 0) { - hak_batch_add_page(d->page, POOL_PAGE_SIZE); - } - } -} +#include "box/pool_mid_desc.inc.h" // ---------------- Transfer Cache (per-thread per-class inbox) -------------- -typedef struct MidTC { - atomic_uintptr_t inbox[POOL_NUM_CLASSES]; -} MidTC; +#include "box/pool_mid_tc.inc.h" -#define MID_TC_BUCKETS 1024 -typedef struct MidTCEntry { uint64_t tid; MidTC* tc; struct MidTCEntry* next; } MidTCEntry; -static pthread_mutex_t g_mid_tc_mu[MID_TC_BUCKETS]; -static MidTCEntry* g_mid_tc_head[MID_TC_BUCKETS]; -static __thread MidTC* t_mid_tc = NULL; -static int g_tc_enabled = 1; // env: HAKMEM_TC_ENABLE (default 1) -static int g_tc_drain_unbounded = 1; // env: HAKMEM_TC_UNBOUNDED (default 1) -static int g_tc_drain_max = 0; // env: HAKMEM_TC_DRAIN_MAX (0=unbounded) -static int g_tc_drain_trigger = 2; // env: HAKMEM_TC_DRAIN_TRIGGER (ring->top < trigger) +#include "box/pool_mf2_types.inc.h" -static inline uint32_t mid_tc_hash(uint64_t tid) { - tid ^= tid >> 33; tid *= 0xff51afd7ed558ccdULL; tid ^= tid >> 33; tid *= 0xc4ceb9fe1a85ec53ULL; tid ^= tid >> 33; - return (uint32_t)(tid & (MID_TC_BUCKETS - 1)); -} - -// Thread-safe initialization using pthread_once -static pthread_once_t mid_tc_init_once_control = PTHREAD_ONCE_INIT; -static void mid_tc_init_impl(void) { - for (int i = 0; i < MID_TC_BUCKETS; i++) { - pthread_mutex_init(&g_mid_tc_mu[i], NULL); - g_mid_tc_head[i] = NULL; - } -} -static void mid_tc_init_once(void) { - pthread_once(&mid_tc_init_once_control, mid_tc_init_impl); -} - -static MidTC* mid_tc_get(void) { - if (t_mid_tc) return t_mid_tc; - mid_tc_init_once(); - MidTC* tc = (MidTC*)hkm_libc_calloc(1, sizeof(MidTC)); // P0 Fix: Use libc malloc - if (!tc) return NULL; - uint64_t tid = (uint64_t)(uintptr_t)pthread_self(); - uint32_t h = mid_tc_hash(tid); - pthread_mutex_lock(&g_mid_tc_mu[h]); - MidTCEntry* e = (MidTCEntry*)hkm_libc_malloc(sizeof(MidTCEntry)); // P0 Fix: Use libc malloc - if (e) { e->tid = tid; e->tc = tc; e->next = g_mid_tc_head[h]; g_mid_tc_head[h] = e; } - pthread_mutex_unlock(&g_mid_tc_mu[h]); - t_mid_tc = tc; - return tc; -} - -static MidTC* mid_tc_lookup_by_tid(uint64_t tid) { - mid_tc_init_once(); - uint32_t h = 
mid_tc_hash(tid); - MidTCEntry* e = g_mid_tc_head[h]; - while (e) { if (e->tid == tid) return e->tc; e = e->next; } - return NULL; -} - -static inline void mid_tc_push(MidTC* tc, int class_idx, PoolBlock* b) { - uintptr_t old_head; - do { - old_head = atomic_load_explicit(&tc->inbox[class_idx], memory_order_acquire); - b->next = (PoolBlock*)old_head; - } while (!atomic_compare_exchange_weak_explicit(&tc->inbox[class_idx], &old_head, (uintptr_t)b, memory_order_release, memory_order_relaxed)); -} - -static inline int mid_tc_drain_into_tls(int class_idx, PoolTLSRing* ring, PoolTLSBin* bin) { - MidTC* tc = mid_tc_get(); - if (!tc) return 0; - HKM_TIME_START(t_tc); - uintptr_t head = atomic_exchange_explicit(&tc->inbox[class_idx], (uintptr_t)0, memory_order_acq_rel); - if (!head) { HKM_TIME_END(HKM_CAT_TC_DRAIN, t_tc); return 0; } - int moved = 0; - int limit = (g_tc_drain_unbounded || g_tc_drain_max <= 0) ? INT32_MAX : g_tc_drain_max; - PoolBlock* cur = (PoolBlock*)head; - while (cur && moved < limit) { - PoolBlock* nxt = cur->next; - if (ring->top < POOL_L2_RING_CAP) { - ring->items[ring->top++] = cur; moved++; - } else { - cur->next = bin->lo_head; bin->lo_head = cur; bin->lo_count++; moved++; - } - cur = nxt; - } - while (cur) { PoolBlock* nxt = cur->next; mid_tc_push(tc, class_idx, cur); cur = nxt; } - HKM_TIME_END(HKM_CAT_TC_DRAIN, t_tc); - return moved; -} - -static inline int mid_tc_has_items(int class_idx) { - MidTC* tc = t_mid_tc; // do not allocate on peek - if (!tc) return 0; - return atomic_load_explicit(&tc->inbox[class_idx], memory_order_relaxed) != 0; -} - -// =========================================================================== -// MF2 Per-Page Sharding: Mimalloc-Inspired Architecture -// =========================================================================== -// -// Key idea: Each 64KB page has independent freelist (no sharing!) -// - O(1) page lookup from block address: (addr & ~0xFFFF) -// - Owner thread: fast path (no locks, no atomics) -// - Cross-thread free: lock-free remote stack -// - Expected: +50% (13.78 → 20.7 M/s, 60-75% of mimalloc) - -// MF2 Configuration Constants (Quick Win #5) -#define MF2_PENDING_QUEUE_BUDGET 4 // Max pages to drain from pending queue -#define MF2_DEBUG_SAMPLE_COUNT 20 // Number of debug samples to log -#define MF2_TSC_CYCLES_PER_US 3000 // Estimated TSC cycles per microsecond -#define MF2_PAGE_SIZE_SHIFT 16 // log2(64KB) for fast division -#define MF2_PAGE_ALIGNMENT 65536 // 64KB alignment for mmap - -// Debug Logging Macros (Quick Win #6) -// Conditional compilation for debug logs - set HAKMEM_DEBUG_MF2=1 to enable -#ifdef HAKMEM_DEBUG_MF2 - #define MF2_DEBUG_LOG(fmt, ...) fprintf(stderr, "[MF2] " fmt "\n", ##__VA_ARGS__) - #define MF2_ERROR_LOG(fmt, ...) fprintf(stderr, "[MF2 ERROR] " fmt "\n", ##__VA_ARGS__) -#else - #define MF2_DEBUG_LOG(fmt, ...) ((void)0) - #define MF2_ERROR_LOG(fmt, ...) 
fprintf(stderr, "[MF2 ERROR] " fmt "\n", ##__VA_ARGS__) -#endif - -// Forward declarations -static size_t g_class_sizes[POOL_NUM_CLASSES]; - -// MF2 Page descriptor: per-page metadata (one per 64KB page) -typedef struct MidPage { - // Page identity - void* base; // Page base address (64KB aligned) - uint8_t class_idx; // Size class index (0-6) - uint8_t flags; // Page flags (reserved for future use) - uint16_t _pad0; - - // Ownership - pthread_t owner_tid; // Owner thread ID (for fast-path check) - struct MF2_ThreadPages* owner_tp; // Owner thread's heap (for pending queue access) - uint64_t last_transfer_time; // rdtsc() timestamp of last ownership transfer (lease mechanism) - - // Page-local freelist (owner-only, NO LOCK!) - PoolBlock* freelist; // Local freelist head - uint16_t free_count; // Number of free blocks - uint16_t capacity; // Total blocks per page - - // Remote frees (cross-thread, lock-free MPSC stack) - atomic_uintptr_t remote_head; // Lock-free remote free stack - atomic_uint remote_count; // Remote free count (for quick check) - - // Lifecycle - atomic_int in_use; // Live allocations on this page - atomic_int pending_dn; // DONTNEED enqueued flag - - // Linkage (thread-local page lists) - struct MidPage* next_page; // Next page in thread's list - struct MidPage* prev_page; // Previous page in thread's list - - // Pending queue (remote drain notification) - _Atomic(_Bool) in_remote_pending; // Is this page in pending queue? - struct MidPage* next_pending; // Next page in pending queue - - // Padding to cache line boundary (avoid false sharing) - char _pad[64 - ((sizeof(void*) * 5 + sizeof(PoolBlock*) + sizeof(uint16_t) * 2 + - sizeof(atomic_uintptr_t) + sizeof(atomic_uint) + - sizeof(atomic_int) * 2 + sizeof(pthread_t) + - sizeof(_Atomic(_Bool)) + 4) % 64)]; -} MidPage; - -// Page registry: O(1) lookup from block address -// Use direct indexing: (addr >> 16) & MASK -#define MF2_PAGE_REGISTRY_BITS 16 // 64K entries (4GB address space with 64KB pages) -#define MF2_PAGE_REGISTRY_SIZE (1 << MF2_PAGE_REGISTRY_BITS) -#define MF2_PAGE_REGISTRY_MASK (MF2_PAGE_REGISTRY_SIZE - 1) - -typedef struct { - // Direct-mapped page table (no hash collisions!) 
- MidPage* pages[MF2_PAGE_REGISTRY_SIZE]; - - // Coarse-grained locks for rare updates (page alloc/free) - // 256 locks = 256-way parallelism for page registration - pthread_mutex_t locks[256]; - - // Statistics - atomic_uint_fast64_t total_pages; // Total pages allocated - atomic_uint_fast64_t active_pages; // Pages with live allocations -} MF2_PageRegistry; - -// Thread-local page lists (one list per size class) -typedef struct MF2_ThreadPages { - // Active pages (have free blocks) - MidPage* active_page[POOL_NUM_CLASSES]; - - // Partial pages (drained pages with free blocks, LIFO for cache locality) - // Checked before allocating new pages (fast reuse path) - MidPage* partial_pages[POOL_NUM_CLASSES]; - - // Full pages (no free blocks, but may receive remote frees) - // TODO: Gradually deprecate in favor of partial_pages - MidPage* full_pages[POOL_NUM_CLASSES]; - - // Pending queue (pages with remote frees, MPSC lock-free stack) - atomic_uintptr_t pages_remote_pending[POOL_NUM_CLASSES]; - - // Pending claim flags (prevent multi-consumer CAS thrashing) - // One adopter at a time per queue (test_and_set to claim, clear to release) - atomic_flag pending_claim[POOL_NUM_CLASSES]; - - // Page ownership count (for statistics) - uint32_t page_count[POOL_NUM_CLASSES]; - - // Thread identity (cached for fast comparison) - pthread_t my_tid; - - // Route P: Activity tracking for idle-based adoption - // Updated on every allocation (mf2_alloc_fast) - // Read by adopters to check if owner is idle - atomic_uint_fast64_t last_alloc_tsc; -} MF2_ThreadPages; - -// Global page registry (shared, rarely accessed) -static MF2_PageRegistry g_mf2_page_registry; - -// Thread-local page lists (hot path, no sharing!) -static __thread MF2_ThreadPages* t_mf2_pages = NULL; - -// =========================================================================== -// MF2 Global State (Quick Win #3b - Structured Globals) -// =========================================================================== -// Individual globals replaced with structured state below. -// Old declarations removed, replaced with macro-mapped struct instances. 
-// -// Benefits: -// - Logical grouping (config, registry, stats) -// - Better documentation -// - Easier to extend or refactor -// - Single source of truth for each category - -#define MF2_MAX_THREADS 256 - -// MF2 Configuration (environment variables) -typedef struct { - int enabled; // HAKMEM_MF2_ENABLE (0=disabled, 1=enabled) - int max_queues; // HAKMEM_MF2_MAX_QUEUES (default: 2) - int lease_ms; // HAKMEM_MF2_LEASE_MS (default: 10ms, 0=disabled) - int idle_threshold_us; // HAKMEM_MF2_IDLE_THRESHOLD_US (default: 150µs) -} MF2_Config; - -// MF2 Thread Registry (cross-thread coordination) -typedef struct { - MF2_ThreadPages* all_thread_pages[MF2_MAX_THREADS]; // Global registry - _Atomic int num_thread_pages; // Active thread count - _Atomic int adoptable_count[POOL_NUM_CLASSES]; // Non-empty pending queues - pthread_key_t tls_key; // Thread-local storage key - pthread_once_t key_once; // TLS initialization guard -} MF2_Registry; - -// MF2 Statistics (debug instrumentation) -typedef struct { - // Allocation path - atomic_uint_fast64_t alloc_fast_hit; - atomic_uint_fast64_t alloc_slow_hit; - atomic_uint_fast64_t page_reuse_count; - atomic_uint_fast64_t new_page_count; - - // Free path - atomic_uint_fast64_t free_owner_count; - atomic_uint_fast64_t free_remote_count; - - // Drain operations - atomic_uint_fast64_t drain_count; - atomic_uint_fast64_t drain_blocks; - atomic_uint_fast64_t drain_attempts; - atomic_uint_fast64_t drain_success; - atomic_uint_fast64_t slow_checked_drain; - atomic_uint_fast64_t slow_found_remote; - - // Full page scan (obsolete, kept for historical tracking) - atomic_uint_fast64_t full_scan_checked; - atomic_uint_fast64_t full_scan_found_remote; - atomic_uint_fast64_t eager_drain_scanned; - atomic_uint_fast64_t eager_drain_found; - - // Pending queue - atomic_uint_fast64_t pending_enqueued; - atomic_uint_fast64_t pending_drained; - atomic_uint_fast64_t pending_requeued; -} MF2_Stats; - -// Instantiate structured global state (Quick Win #3b) -static MF2_Config g_mf2_config = { - .enabled = 0, // Will be set by env var - .max_queues = 2, - .lease_ms = 10, - .idle_threshold_us = 150 -}; - -static MF2_Registry g_mf2_registry = { - .all_thread_pages = {0}, - .num_thread_pages = 0, - .adoptable_count = {0}, - .tls_key = 0, - .key_once = PTHREAD_ONCE_INIT -}; - -static MF2_Stats g_mf2_stats = { - // All fields initialized to 0 (atomic zero-initialization is valid) - .alloc_fast_hit = 0, - .alloc_slow_hit = 0, - .page_reuse_count = 0, - .new_page_count = 0, - .free_owner_count = 0, - .free_remote_count = 0, - .drain_count = 0, - .drain_blocks = 0, - .drain_attempts = 0, - .drain_success = 0, - .slow_checked_drain = 0, - .slow_found_remote = 0, - .full_scan_checked = 0, - .full_scan_found_remote = 0, - .eager_drain_scanned = 0, - .eager_drain_found = 0, - .pending_enqueued = 0, - .pending_drained = 0, - .pending_requeued = 0 -}; - -// Compatibility macros: Map old global names to struct fields -// This allows existing code to work unchanged while using structured state -#define g_mf2_enabled (g_mf2_config.enabled) -#define g_mf2_max_queues (g_mf2_config.max_queues) -#define g_mf2_lease_ms (g_mf2_config.lease_ms) -#define g_mf2_idle_threshold_us (g_mf2_config.idle_threshold_us) - -#define g_all_thread_pages (g_mf2_registry.all_thread_pages) -#define g_num_thread_pages (g_mf2_registry.num_thread_pages) -#define g_adoptable_count (g_mf2_registry.adoptable_count) -#define g_mf2_tls_key (g_mf2_registry.tls_key) -#define g_mf2_key_once (g_mf2_registry.key_once) - -#define 
g_mf2_alloc_fast_hit (g_mf2_stats.alloc_fast_hit) -#define g_mf2_alloc_slow_hit (g_mf2_stats.alloc_slow_hit) -#define g_mf2_page_reuse_count (g_mf2_stats.page_reuse_count) -#define g_mf2_new_page_count (g_mf2_stats.new_page_count) -#define g_mf2_free_owner_count (g_mf2_stats.free_owner_count) -#define g_mf2_free_remote_count (g_mf2_stats.free_remote_count) -#define g_mf2_drain_count (g_mf2_stats.drain_count) -#define g_mf2_drain_blocks (g_mf2_stats.drain_blocks) -#define g_mf2_drain_attempts (g_mf2_stats.drain_attempts) -#define g_mf2_drain_success (g_mf2_stats.drain_success) -#define g_mf2_slow_checked_drain (g_mf2_stats.slow_checked_drain) -#define g_mf2_slow_found_remote (g_mf2_stats.slow_found_remote) -#define g_mf2_full_scan_checked (g_mf2_stats.full_scan_checked) -#define g_mf2_full_scan_found_remote (g_mf2_stats.full_scan_found_remote) -#define g_mf2_eager_drain_scanned (g_mf2_stats.eager_drain_scanned) -#define g_mf2_eager_drain_found (g_mf2_stats.eager_drain_found) -#define g_mf2_pending_enqueued (g_mf2_stats.pending_enqueued) -#define g_mf2_pending_drained (g_mf2_stats.pending_drained) -#define g_mf2_pending_requeued (g_mf2_stats.pending_requeued) - -// =========================================================================== -// End of MF2 Data Structures -// =========================================================================== // --- MF2 Initialization Functions --- @@ -1018,293 +545,10 @@ static MidPage* mf2_dequeue_pending(MF2_ThreadPages* tp, int class_idx) { // End of Pending Queue Operations // =========================================================================== -// Forward declarations -static void* mf2_alloc_slow(int class_idx, size_t size, uintptr_t site_id); +#include "box/pool_mf2_helpers.inc.h" -// =========================================================================== -// Helper Functions (Clean & Modular) -// =========================================================================== -// Helper: Make page active (move old active to full_pages) -static inline void mf2_make_page_active(MF2_ThreadPages* tp, int class_idx, MidPage* page) { - if (!tp || !page) return; - - // Move old active page to full_pages (if any) - if (tp->active_page[class_idx]) { - MidPage* old_active = tp->active_page[class_idx]; - old_active->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = old_active; - } - - // Set new page as active - tp->active_page[class_idx] = page; - page->next_page = NULL; -} - -// Helper: Drain page and add to partial list (LIFO for cache locality) -// Returns true if page has free blocks after drain -static inline bool mf2_try_drain_to_partial(MF2_ThreadPages* tp, int class_idx, MidPage* page) { - if (!tp || !page) return false; - - // Drain remote frees - int drained = mf2_drain_remote_frees(page); - - // If page has freelist after drain, add to partial list (LIFO) - if (page->freelist) { - atomic_fetch_add(&g_mf2_page_reuse_count, 1); - page->next_page = tp->partial_pages[class_idx]; - tp->partial_pages[class_idx] = page; - return true; - } - - // No freelist, return to full_pages - page->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = page; - return false; -} - -// Helper: Drain page and activate if successful (Direct Handoff - backward compat) -// Returns true if page was activated -static inline bool mf2_try_drain_and_activate(MF2_ThreadPages* tp, int class_idx, MidPage* page) { - if (!tp || !page) return false; - - // Drain remote frees - int drained = mf2_drain_remote_frees(page); - - // If page has 
freelist after drain, make it active immediately - if (page->freelist) { - atomic_fetch_add(&g_mf2_page_reuse_count, 1); - mf2_make_page_active(tp, class_idx, page); - return true; - } - - // No freelist, return to full_pages - page->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = page; - return false; -} - -// Helper: Try to reuse pages from own pending queue (must-reuse gate part 1) -// Returns true if a page was successfully drained and activated -static bool mf2_try_reuse_own_pending(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return false; - - // Budget: Process up to N pages to avoid blocking - for (int budget = 0; budget < MF2_PENDING_QUEUE_BUDGET; budget++) { - MidPage* pending_page = mf2_dequeue_pending(tp, class_idx); - if (!pending_page) break; // Queue empty - - atomic_fetch_add(&g_mf2_pending_drained, 1); - - // Clear pending flag (no longer in queue) - atomic_store_explicit(&pending_page->in_remote_pending, false, memory_order_release); - - // DIRECT HANDOFF: Drain and activate if successful - if (mf2_try_drain_and_activate(tp, class_idx, pending_page)) { - return true; // Success! Page is now active - } - // No freelist after drain, page returned to full_pages by helper - } - return false; // No pages available for reuse -} - -// Helper: Try to drain remotes from active page (must-reuse gate part 2) -// Returns true if active page has freelist after drain -static bool mf2_try_drain_active_remotes(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return false; - - MidPage* page = tp->active_page[class_idx]; - if (!page) return false; - - atomic_fetch_add(&g_mf2_slow_checked_drain, 1); - unsigned int remote_cnt = atomic_load_explicit(&page->remote_count, memory_order_seq_cst); - - if (remote_cnt > 0) { - atomic_fetch_add(&g_mf2_slow_found_remote, 1); - int drained = mf2_drain_remote_frees(page); - if (drained > 0 && page->freelist) { - atomic_fetch_add(&g_mf2_drain_success, 1); - return true; // Success! 
Active page now has freelist - } - } - return false; // No remotes or drain failed -} - -// Helper: Allocate new page and make it active -// Returns the newly allocated page (or NULL on OOM) -static MidPage* mf2_alloc_and_activate_new_page(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return NULL; - - atomic_fetch_add(&g_mf2_new_page_count, 1); - - // DEBUG: Log why we're allocating new page (first N samples) - static _Atomic int new_page_samples = 0; - int sample_idx = atomic_fetch_add_explicit(&new_page_samples, 1, memory_order_relaxed); - if (sample_idx < MF2_DEBUG_SAMPLE_COUNT) { - // Count adoptable pages across all threads - int total_adoptable = 0; - for (int i = 0; i < POOL_NUM_CLASSES; i++) { - total_adoptable += atomic_load_explicit(&g_adoptable_count[i], memory_order_relaxed); - } - MF2_DEBUG_LOG("NEW_PAGE %d: class=%d, own_pending=%p, adoptable_total=%d, active=%p, full=%p", - sample_idx, class_idx, - (void*)atomic_load_explicit(&tp->pages_remote_pending[class_idx], memory_order_relaxed), - total_adoptable, - tp->active_page[class_idx], - tp->full_pages[class_idx]); - } - - MidPage* page = mf2_alloc_new_page(class_idx); - if (!page) { - return NULL; // OOM - } - - // Move current active page to full list (if any) - if (tp->active_page[class_idx]) { - MidPage* old_page = tp->active_page[class_idx]; - old_page->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = old_page; - } - - // Set new page as active - tp->active_page[class_idx] = page; - tp->page_count[class_idx]++; - - return page; -} - -// =========================================================================== -// End of Helper Functions -// =========================================================================== - -// Consumer-Driven Adoption: Try to adopt a page from ANY thread's pending queue -// Returns true if a page was successfully adopted and activated -// Called from alloc_slow when allocating thread needs memory -static bool mf2_try_adopt_pending(MF2_ThreadPages* me, int class_idx) { - if (!me) return false; - - // IMMEDIATE FIX #1: Early return if no adoptable pages (O(1) gating) - // Avoids scanning empty queues (major performance win!) - int adoptable = atomic_load_explicit(&g_adoptable_count[class_idx], memory_order_relaxed); - if (adoptable == 0) return false; // All queues empty, no scan needed - - // Get global thread registry - int num_tp = atomic_load_explicit(&g_num_thread_pages, memory_order_acquire); - if (num_tp == 0) return false; - - // IMMEDIATE FIX #2: Limit scan to MAX_QUEUES threads (configurable via HAKMEM_MF2_MAX_QUEUES) - // Prevents excessive scanning overhead (2-8 threads is usually enough) - int scan_limit = (num_tp < g_mf2_max_queues) ? num_tp : g_mf2_max_queues; - - // Round-robin scan (limited number of threads, not ALL!) 
- static _Atomic uint64_t adopt_counter = 0; - uint64_t start_idx = atomic_fetch_add_explicit(&adopt_counter, 1, memory_order_relaxed); - - for (int i = 0; i < scan_limit; i++) { - int tp_idx = (start_idx + i) % num_tp; - MF2_ThreadPages* other_tp = (MF2_ThreadPages*)atomic_load_explicit( - (atomic_uintptr_t*)&g_all_thread_pages[tp_idx], memory_order_acquire); - - if (!other_tp) continue; - - // Route P: Idle Detection - Only adopt from idle owners - // Check if owner is still actively allocating (threshold configurable via env var) - uint64_t now_tsc = mf2_rdtsc(); - uint64_t owner_last_alloc = atomic_load_explicit(&other_tp->last_alloc_tsc, memory_order_relaxed); - uint64_t idle_threshold_cycles = (uint64_t)g_mf2_idle_threshold_us * MF2_TSC_CYCLES_PER_US; - - if ((now_tsc - owner_last_alloc) < idle_threshold_cycles) { - continue; // Owner still active, skip adoption - } - - // IMMEDIATE FIX #3: Claim exclusive access (prevent multi-consumer CAS thrashing!) - // Only one thread scans each queue at a time → eliminates CAS contention - if (atomic_flag_test_and_set_explicit(&other_tp->pending_claim[class_idx], memory_order_acquire)) { - continue; // Another thread is already scanning this queue, skip - } - - // Try to dequeue a pending page from this thread - MidPage* page = mf2_dequeue_pending(other_tp, class_idx); - if (!page) { - // Queue empty, release claim and try next thread - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - continue; - } - - // Clear pending flag (no longer in queue) - atomic_store_explicit(&page->in_remote_pending, false, memory_order_release); - - // Check lease: Has enough time passed since last transfer? (configurable via HAKMEM_MF2_LEASE_MS) - // 0ms = disabled (no lease check), >0 = lease period in milliseconds - uint64_t now = mf2_rdtsc(); - uint64_t last_transfer = page->last_transfer_time; - if (g_mf2_lease_ms > 0 && last_transfer != 0) { - // Calculate lease cycles from ms (approx 3GHz CPU) - uint64_t lease_cycles = (uint64_t)g_mf2_lease_ms * (MF2_TSC_CYCLES_PER_US * 1000ULL); - if ((now - last_transfer) < lease_cycles) { - // Lease still active, return page to full_pages (don't thrash ownership) - page->next_page = other_tp->full_pages[class_idx]; - other_tp->full_pages[class_idx] = page; - // Release claim before continuing - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - continue; // Try next thread - } - } - - // Try to transfer ownership using CAS - pthread_t old_owner = page->owner_tid; - pthread_t new_owner = pthread_self(); - - // Note: pthread_t may not be atomic-compatible on all platforms - // For now, we'll use a simple write (ownership transfer is rare) - // TODO: If thrashing is observed, add atomic CAS with serialization - page->owner_tid = new_owner; - page->owner_tp = me; - page->last_transfer_time = now; - - // DEBUG: Log drain state - static _Atomic int adopt_samples = 0; - int sample_idx = atomic_fetch_add_explicit(&adopt_samples, 1, memory_order_relaxed); - unsigned int pre_remote = atomic_load_explicit(&page->remote_count, memory_order_relaxed); - unsigned int pre_free = page->free_count; - PoolBlock* pre_freelist = page->freelist; - - // Drain remote frees - int drained = mf2_drain_remote_frees(page); - - // DEBUG: Log result (first 10 samples) - if (sample_idx < 10) { - MF2_DEBUG_LOG("ADOPT_DRAIN %d: class=%d, remote_cnt=%u, drained=%d, pre_free=%u, post_free=%u, pre_freelist=%p, post_freelist=%p", - sample_idx, class_idx, pre_remote, drained, - 
pre_free, page->free_count, pre_freelist, page->freelist); - } - - // Make adopted page ACTIVE immediately (not partial!) - // Adoption needs immediate activation for caller's mf2_alloc_fast() - // Partial list is only for own pending queue drains - if (page->freelist) { - atomic_fetch_add(&g_mf2_page_reuse_count, 1); - atomic_fetch_add(&g_mf2_pending_drained, 1); - atomic_fetch_add(&g_mf2_drain_success, 1); - - // Make it active (move old active to full_pages) - mf2_make_page_active(me, class_idx, page); - - // Release claim before returning SUCCESS - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - return true; // SUCCESS! Page adopted and activated - } - - // No freelist after drain, return to MY full_pages (I'm the new owner!) - page->next_page = me->full_pages[class_idx]; - me->full_pages[class_idx] = page; - // Release claim before continuing search - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - // Continue searching for a better page - } - - return false; // No adoptable pages found -} +#include "box/pool_mf2_adoption.inc.h" // Fast allocation path (owner thread, NO LOCK!) static inline void* mf2_alloc_fast(int class_idx, size_t size, uintptr_t site_id) { @@ -1646,947 +890,18 @@ int hak_pool_get_shard_index(uintptr_t site_id) { return (int)((uint32_t)x & (POOL_NUM_SHARDS - 1)); } -// Bitmap helpers (O(1) empty class detection) -static inline void set_nonempty_bit(int class_idx, int shard_idx) { - // Set bit: freelist[class][shard] is non-empty (atomic OR) - atomic_fetch_or(&g_pool.nonempty_mask[class_idx], (uint64_t)(1ULL << shard_idx)); -} +// TLS helpers +#include "box/pool_tls_core.inc.h" -static inline void clear_nonempty_bit(int class_idx, int shard_idx) { - // Clear bit: freelist[class][shard] is empty (atomic AND) - atomic_fetch_and(&g_pool.nonempty_mask[class_idx], ~(uint64_t)(1ULL << shard_idx)); -} -static inline int is_shard_nonempty(int class_idx, int shard_idx) { - // Check if shard has blocks (atomic load) - uint64_t mask = atomic_load(&g_pool.nonempty_mask[class_idx]); - return (mask & (1ULL << shard_idx)) != 0; -} +// Refill/ACE (boxed) +#include "box/pool_refill.inc.h" -// Drain remote-free MPSC stack into freelist under the shard lock -static inline void drain_remote_locked(int class_idx, int shard_idx) { - uintptr_t head = atomic_exchange_explicit(&g_pool.remote_head[class_idx][shard_idx], (uintptr_t)0, memory_order_acq_rel); - unsigned drained = 0; - while (head) { - PoolBlock* b = (PoolBlock*)head; - head = (uintptr_t)b->next; // next pointer stored in first word - b->next = g_pool.freelist[class_idx][shard_idx]; - g_pool.freelist[class_idx][shard_idx] = b; - drained++; - } - if (drained) { - atomic_fetch_sub_explicit(&g_pool.remote_count[class_idx][shard_idx], drained, memory_order_relaxed); - if (g_pool.freelist[class_idx][shard_idx]) set_nonempty_bit(class_idx, shard_idx); - } -} +// Init/Shutdown + MF2 debug (boxed) +#include "box/pool_init_api.inc.h" -// Choose a non-empty shard near preferred using the nonempty mask. If none, return preferred. 
-static inline int choose_nonempty_shard(int class_idx, int preferred) { - uint64_t mask = atomic_load_explicit(&g_pool.nonempty_mask[class_idx], memory_order_acquire); - if (!mask) return preferred; - // Rotate so preferred becomes bit0 - int shift = preferred & 63; - uint64_t rot = (mask >> shift) | (mask << (64 - shift)); - if (!rot) return preferred; - int off = __builtin_ctzll(rot); - return (preferred + off) & (POOL_NUM_SHARDS - 1); -} -// Allocate a private page for TLS active page and split into a local list -static int alloc_tls_page(int class_idx, PoolTLSPage* ap) { - size_t user_size = g_class_sizes[class_idx]; - size_t block_size = HEADER_SIZE + user_size; - int blocks_per_page = POOL_PAGE_SIZE / block_size; - if (blocks_per_page <= 0) return 0; - void* page = mmap(NULL, POOL_PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (!page) return 0; - // Bump-run initialization (no per-block linking) - ap->page = page; - ap->bump = (char*)page; - ap->end = (char*)page + POOL_PAGE_SIZE; - ap->count = blocks_per_page; - // Register page with owner (this thread) for owner-fast free detection - mid_desc_register(page, class_idx, (uint64_t)(uintptr_t)pthread_self()); - g_pool.refills[class_idx]++; - g_pool.total_pages_allocated++; - g_pool.pages_by_class[class_idx]++; - g_pool.total_bytes_allocated += POOL_PAGE_SIZE; - return 1; -} +// Pool statistics (boxed) +#include "box/pool_stats.inc.h" -// Refill TLS ring/LIFO from active page without building links. Returns number added. -static inline int refill_tls_from_active_page(int class_idx, PoolTLSRing* ring, PoolTLSBin* bin, PoolTLSPage* ap, int need) { - if (!ap || !ap->page || ap->count <= 0 || ap->bump >= ap->end) return 0; - size_t blk = HEADER_SIZE + g_class_sizes[class_idx]; - int moved = 0; - int to_add = need; - while (to_add > 0 && ap->bump < ap->end && ap->count > 0) { - PoolBlock* b = (PoolBlock*)(void*)ap->bump; - if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) { - ring->items[ring->top++] = b; - } else { - b->next = bin->lo_head; bin->lo_head = b; bin->lo_count++; - } - ap->bump += blk; - ap->count--; - moved++; - to_add--; - } - if (ap->bump >= ap->end || ap->count <= 0) { - ap->page = NULL; ap->bump = ap->end; ap->count = 0; - } - return moved; -} - - -// ACE: adjust bundle factor per class based on windowed hits/misses -static inline void pool_update_bundle_factor(int class_idx) { - // Compute deltas since last snapshot - uint64_t h = g_pool.hits[class_idx]; - uint64_t m = g_pool.misses[class_idx]; - uint64_t dh = h - g_pool.last_hits[class_idx]; - uint64_t dm = m - g_pool.last_misses[class_idx]; - uint64_t dt = dh + dm; - if (dt < 256) return; // wait for window to accumulate - - int bf = g_pool.bundle_factor[class_idx]; - if (bf <= 0) bf = 1; - - // Ifミス優勢(ヒット率<60% かつ ミスがヒット+一定閾値超)→増やす - if (dt > 0) { - double hit_rate = (double)dh / (double)dt; - if (hit_rate < 0.60 && dm > (dh + 16)) { - if (bf < 4) bf++; - } else if (hit_rate > 0.90 && dh > (dm + 32)) { - if (bf > 1) bf--; - } - } - - g_pool.bundle_factor[class_idx] = bf; - // Advance snapshot - g_pool.last_hits[class_idx] = h; - g_pool.last_misses[class_idx] = m; -} - -// Refill freelist by allocating a new page (64KiB) -// Args: class_idx - size class index (0-4) -// shard_idx - shard index (0-63) -// Returns: 1 on success, 0 on failure -// -// Each block now includes AllocHeader + user data -static int refill_freelist(int class_idx, int shard_idx) { - if (class_idx < 0 || class_idx >= POOL_NUM_CLASSES) return 0; - if (shard_idx < 
0 || shard_idx >= POOL_NUM_SHARDS) return 0; - - size_t user_size = g_class_sizes[class_idx]; - size_t block_size = HEADER_SIZE + user_size; // Header + user data - - // Calculate blocks per page (with header overhead) - int blocks_per_page = POOL_PAGE_SIZE / block_size; - if (blocks_per_page == 0) return 0; // Safety: class too large for 64KiB page - - // Allocate page via mmap (page-granular, avoids malloc overhead) - void* page = mmap(NULL, POOL_PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (!page) return 0; - - // Update bundle factor based on windowed stats - pool_update_bundle_factor(class_idx); - int bundles = g_pool.bundle_factor[class_idx]; - if (bundles < 1) bundles = 1; - if (bundles > 4) bundles = 4; - - // Soft CAP guidance: use FrozenPolicy mid_cap to modulate bundling - // Semantics: mid_cap[class] is a soft target (in pages). We do not trim yet. - // If at/over cap → restrict bundling to 1; if far under cap → allow bundling up to deficit (max 4). - const FrozenPolicy* pol = hkm_policy_get(); - if (pol) { - uint16_t cap = 0; - if (class_idx < 5) cap = pol->mid_cap[class_idx]; - else if (class_idx == 5 && pol->mid_dyn1_bytes != 0) cap = pol->mid_cap_dyn1; - else if (class_idx == 6 && pol->mid_dyn2_bytes != 0) cap = pol->mid_cap_dyn2; - if (cap > 0) { - uint64_t have = g_pool.pages_by_class[class_idx]; - if (have >= cap) { - bundles = 1; // over cap: refill minimally - } else { - uint64_t deficit = (cap - have); - if (deficit < (uint64_t)bundles) bundles = (int)deficit; // don't exceed deficit - if (bundles < 1) bundles = 1; - if (bundles > 4) bundles = 4; - // Ensure at least min bundle under deficit for faster warm-up - if (deficit >= (uint64_t)g_pool_min_bundle && bundles < g_pool_min_bundle) bundles = g_pool_min_bundle; - } - } - } - - int pages_allocated_this_call = 0; - for (int b = 0; b < bundles; b++) { - // Split page into blocks and link into freelist - PoolBlock* freelist_head = NULL; - - for (int i = 0; i < blocks_per_page; i++) { - void* raw_block = (char*)page + (i * block_size); - // Prefetch next block header to reduce cache miss on link - __builtin_prefetch((char*)raw_block + block_size, 1, 1); - // Freelist uses raw pointer (header start). Header will be - // constructed after pop in hak_pool_try_alloc. 
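// Editorial note: while a block sits on the freelist its first word (the header area) is reused as
// the `next` link, so no valid AllocHeader exists here; hak_pool_try_alloc() rebuilds the header with
// mid_set_header() immediately after popping a block.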
- PoolBlock* block = (PoolBlock*)raw_block; - block->next = freelist_head; - freelist_head = block; - } - - // Prepend to existing freelist (if any) - if (g_pool.freelist[class_idx][shard_idx]) { - // Find tail of new list - PoolBlock* tail = freelist_head; - while (tail->next) { - tail = tail->next; - } - tail->next = g_pool.freelist[class_idx][shard_idx]; - } - - g_pool.freelist[class_idx][shard_idx] = freelist_head; - // Register this 64KiB page (shared owner) - mid_desc_register(page, class_idx, 0); - - // Next page if bundling - if (b + 1 < bundles) { - page = mmap(NULL, POOL_PAGE_SIZE, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (!page) break; - } - pages_allocated_this_call++; - } - - // Set non-empty bit (freelist now has blocks) - set_nonempty_bit(class_idx, shard_idx); - - // Update statistics - g_pool.refills[class_idx]++; - g_pool.total_pages_allocated += pages_allocated_this_call; - g_pool.pages_by_class[class_idx] += pages_allocated_this_call; - g_pool.total_bytes_allocated += (uint64_t)pages_allocated_this_call * (uint64_t)POOL_PAGE_SIZE; - - return 1; -} - -// =========================================================================== -// Public API -// =========================================================================== - -// Thread-safe initialization using pthread_once -static pthread_once_t hak_pool_init_once_control = PTHREAD_ONCE_INIT; -static void hak_pool_init_impl(void) { - // NOTE: Do NOT use memset() here! It would clobber 448 mutexes during concurrent init. - // All fields are explicitly initialized below. - // Configure dynamic Mid classes from FrozenPolicy (index 5/6) - const FrozenPolicy* pol = hkm_policy_get(); - if (pol && pol->mid_dyn1_bytes >= POOL_MIN_SIZE && pol->mid_dyn1_bytes <= POOL_MAX_SIZE) { - g_class_sizes[5] = pol->mid_dyn1_bytes; - } else { - g_class_sizes[5] = 0; // disabled - } - if (pol && pol->mid_dyn2_bytes >= POOL_MIN_SIZE && pol->mid_dyn2_bytes <= POOL_MAX_SIZE) { - g_class_sizes[6] = pol->mid_dyn2_bytes; - } else { - g_class_sizes[6] = 0; - } - // Initialize all g_pool fields explicitly (no memset!) - for (int c = 0; c < POOL_NUM_CLASSES; c++) { - // Initialize freelists to NULL - for (int s = 0; s < POOL_NUM_SHARDS; s++) { - g_pool.freelist[c][s] = NULL; - } - - // Initialize atomic variables and locks - atomic_store(&g_pool.nonempty_mask[c], 0); - for (int s = 0; s < POOL_NUM_SHARDS; s++) { - pthread_mutex_init(&g_pool.freelist_locks[c][s].m, NULL); - atomic_store(&g_pool.remote_head[c][s], (uintptr_t)0); - atomic_store(&g_pool.remote_count[c][s], 0); - } - - // Initialize per-class statistics - g_pool.hits[c] = 0; - g_pool.misses[c] = 0; - g_pool.refills[c] = 0; - g_pool.frees[c] = 0; - g_pool.pages_by_class[c] = 0; - - // Initialize ACE variables - g_pool.bundle_factor[c] = 1; - g_pool.last_hits[c] = 0; - g_pool.last_misses[c] = 0; - } - - // Initialize global statistics - g_pool.total_bytes_allocated = 0; - g_pool.total_pages_allocated = 0; - - // Initialize atomic metrics - atomic_store(&g_pool.trylock_attempts, 0); - atomic_store(&g_pool.trylock_success, 0); - atomic_store(&g_pool.ring_underflow, 0); - const char* e_tls = getenv("HAKMEM_POOL_TLS_FREE"); - g_pool.tls_free_enabled = (e_tls == NULL) ? 1 : (atoi(e_tls) != 0); - const char* e_wrap = getenv("HAKMEM_WRAP_L2"); - g_wrap_l2_enabled = (e_wrap && atoi(e_wrap) != 0) ? 
1 : 0; - const char* e_minb = getenv("HAKMEM_POOL_MIN_BUNDLE"); - if (e_minb) { int v = atoi(e_minb); if (v >= 1 && v <= 8) g_pool_min_bundle = v; } - const char* e_mix = getenv("HAKMEM_SHARD_MIX"); - g_shard_mix_enabled = (e_mix && atoi(e_mix) != 0) ? 1 : 0; - const char* e_ring = getenv("HAKMEM_POOL_TLS_RING"); - if (e_ring) g_tls_ring_enabled = (atoi(e_ring) != 0); - const char* e_hdr = getenv("HAKMEM_HDR_LIGHT"); - if (e_hdr) g_hdr_light_enabled = atoi(e_hdr); // 0=full, 1=minimal, 2=skip header writes/validation - const char* e_probe = getenv("HAKMEM_TRYLOCK_PROBES"); - if (e_probe) { int v = atoi(e_probe); if (v>=1 && v<=8) g_trylock_probes = v; } - const char* e_div = getenv("HAKMEM_RING_RETURN_DIV"); - if (e_div) { int v = atoi(e_div); if (v>=2 && v<=4) g_ring_return_div = v; } - const char* e_lo = getenv("HAKMEM_TLS_LO_MAX"); - if (e_lo) { int v = atoi(e_lo); if (v>=32 && v<=16384) g_tls_lo_max = v; } - const char* e_cs = getenv("HAKMEM_POOL_COUNT_SAMPLE"); - if (e_cs) { int v = atoi(e_cs); if (v>=0 && v<=16) g_count_sample_exp = v; } - const char* e_tc = getenv("HAKMEM_TC_ENABLE"); - if (e_tc) g_tc_enabled = (atoi(e_tc) != 0); - const char* e_tcu = getenv("HAKMEM_TC_UNBOUNDED"); - if (e_tcu) g_tc_drain_unbounded = (atoi(e_tcu) != 0); - const char* e_tcm = getenv("HAKMEM_TC_DRAIN_MAX"); - if (e_tcm) { int v = atoi(e_tcm); if (v>=0 && v<=65536) g_tc_drain_max = v; } - const char* e_tct = getenv("HAKMEM_TC_DRAIN_TRIGGER"); - if (e_tct) { int v = atoi(e_tct); if (v>=0 && v<=POOL_L2_RING_CAP) g_tc_drain_trigger = v; } - - // MF2: Per-Page Sharding - const char* e_mf2 = getenv("HAKMEM_MF2_ENABLE"); - if (e_mf2 && atoi(e_mf2) != 0) { - g_mf2_enabled = 1; - mf2_page_registry_init(); - - // MF2 tuning parameters - const char* e_maxq = getenv("HAKMEM_MF2_MAX_QUEUES"); - if (e_maxq) { - int v = atoi(e_maxq); - if (v >= 1 && v <= 256) g_mf2_max_queues = v; - } - const char* e_lease = getenv("HAKMEM_MF2_LEASE_MS"); - if (e_lease) { - int v = atoi(e_lease); - if (v >= 0 && v <= 1000) g_mf2_lease_ms = v; // 0=disabled, max 1000ms - } - const char* e_idle = getenv("HAKMEM_MF2_IDLE_THRESHOLD_US"); - if (e_idle) { - int v = atoi(e_idle); - if (v >= 0 && v <= 10000) g_mf2_idle_threshold_us = v; // 0µs~10ms - } - - HAKMEM_LOG("[Pool] MF2 Per-Page Sharding enabled\n"); - HAKMEM_LOG("[MF2] max_queues=%d, lease_ms=%d, idle_threshold_us=%d\n", - g_mf2_max_queues, g_mf2_lease_ms, g_mf2_idle_threshold_us); - } - - g_pool.initialized = 1; - - HAKMEM_LOG("[Pool] Initialized (L2 Hybrid Pool)\n"); - if (g_class_sizes[5] != 0 || g_class_sizes[6] != 0) { - HAKMEM_LOG("[Pool] Classes: 2KB, 4KB, 8KB, 16KB, 32KB%s%s%s\n", - g_class_sizes[5] ? ", dyn1=" : "", - g_class_sizes[5] ? (const char*)"" : (g_class_sizes[6]?",":""), - (g_class_sizes[5]||g_class_sizes[6]) ? 
"" : ""); - } else { - HAKMEM_LOG("[Pool] Classes: 2KB, 4KB, 8KB, 16KB, 32KB\n"); - } - HAKMEM_LOG("[Pool] Page size: %d KB\n", POOL_PAGE_SIZE / 1024); - HAKMEM_LOG("[Pool] Shards: %d (site-based)\n", POOL_NUM_SHARDS); -} - -void hak_pool_init(void) { - pthread_once(&hak_pool_init_once_control, hak_pool_init_impl); -} - -static void mf2_print_debug_stats(void) { - if (!g_mf2_enabled) return; - - fprintf(stderr, "\n[MF2 DEBUG STATS]\n"); - fprintf(stderr, "Alloc fast hits: %12lu\n", (unsigned long)atomic_load(&g_mf2_alloc_fast_hit)); - fprintf(stderr, "Alloc slow hits: %12lu\n", (unsigned long)atomic_load(&g_mf2_alloc_slow_hit)); - fprintf(stderr, "Page reuses: %12lu\n", (unsigned long)atomic_load(&g_mf2_page_reuse_count)); - fprintf(stderr, "New pages: %12lu\n", (unsigned long)atomic_load(&g_mf2_new_page_count)); - fprintf(stderr, "Owner frees: %12lu\n", (unsigned long)atomic_load(&g_mf2_free_owner_count)); - fprintf(stderr, "Remote frees: %12lu\n", (unsigned long)atomic_load(&g_mf2_free_remote_count)); - fprintf(stderr, "Slow checked: %12lu\n", (unsigned long)atomic_load(&g_mf2_slow_checked_drain)); - fprintf(stderr, "Slow found rem: %12lu\n", (unsigned long)atomic_load(&g_mf2_slow_found_remote)); - fprintf(stderr, "Full scan chk: %12lu\n", (unsigned long)atomic_load(&g_mf2_full_scan_checked)); - fprintf(stderr, "Full scan rem: %12lu\n", (unsigned long)atomic_load(&g_mf2_full_scan_found_remote)); - fprintf(stderr, "Eager scan: %12lu\n", (unsigned long)atomic_load(&g_mf2_eager_drain_scanned)); - fprintf(stderr, "Eager found: %12lu\n", (unsigned long)atomic_load(&g_mf2_eager_drain_found)); - fprintf(stderr, "Drain attempts: %12lu\n", (unsigned long)atomic_load(&g_mf2_drain_attempts)); - fprintf(stderr, "Drain successes: %12lu\n", (unsigned long)atomic_load(&g_mf2_drain_success)); - fprintf(stderr, "Remote drains: %12lu (blocks: %lu)\n", - (unsigned long)atomic_load(&g_mf2_drain_count), - (unsigned long)atomic_load(&g_mf2_drain_blocks)); - - // Pending queue statistics - fprintf(stderr, "\n[PENDING QUEUE]\n"); - fprintf(stderr, "Pending enqueued: %12lu\n", (unsigned long)atomic_load(&g_mf2_pending_enqueued)); - fprintf(stderr, "Pending drained: %12lu\n", (unsigned long)atomic_load(&g_mf2_pending_drained)); - fprintf(stderr, "Pending requeued: %12lu\n", (unsigned long)atomic_load(&g_mf2_pending_requeued)); - - // Calculate ratios - uint64_t total_allocs = atomic_load(&g_mf2_alloc_fast_hit) + atomic_load(&g_mf2_alloc_slow_hit); - uint64_t total_frees = atomic_load(&g_mf2_free_owner_count) + atomic_load(&g_mf2_free_remote_count); - if (total_allocs > 0) { - fprintf(stderr, "\nFast path hit rate: %.2f%%\n", - 100.0 * atomic_load(&g_mf2_alloc_fast_hit) / total_allocs); - } - if (total_frees > 0) { - fprintf(stderr, "Owner free rate: %.2f%%\n", - 100.0 * atomic_load(&g_mf2_free_owner_count) / total_frees); - } - fflush(stderr); -} - -__attribute__((destructor)) -static void mf2_destructor(void) { - mf2_print_debug_stats(); -} - -void hak_pool_shutdown(void) { - if (!g_pool.initialized) return; - - hak_pool_print_stats(); - mf2_print_debug_stats(); - - // Free all pages (walk freelists and free page heads) - // MVP: Skip for now (pages allocated via malloc, will be freed by system) - // Future: Track page allocations and munmap explicitly - - g_pool.initialized = 0; -} - -void* hak_pool_try_alloc(size_t size, uintptr_t site_id) { - hak_pool_init(); // pthread_once() ensures thread-safe init (no data race!) 
- // P1.7 approach: Avoid using pool during ALL wrapper calls (conservative but safe) - extern int hak_in_wrapper(void); - if (hak_in_wrapper() && !g_wrap_l2_enabled) return NULL; - if (!hak_pool_is_poolable(size)) return NULL; - - // Get class and shard indices - int class_idx = hak_pool_get_class_index(size); - if (class_idx < 0) return NULL; - - // MF2: Per-Page Sharding path - if (g_mf2_enabled) { - return mf2_alloc_fast(class_idx, size, site_id); - } - - // OLD PATH: TLS fast path (ring then local LIFO); drain TC only when needed - PoolTLSRing* ring = &g_tls_bin[class_idx].ring; - if (g_tc_enabled && ring->top < g_tc_drain_trigger && mid_tc_has_items(class_idx)) { - HKM_TIME_START(t_tc_drain); - if (mid_tc_drain_into_tls(class_idx, ring, &g_tls_bin[class_idx])) { - HKM_TIME_END(HKM_CAT_TC_DRAIN, t_tc_drain); - if (ring->top > 0) { - HKM_TIME_START(t_ring_pop0); - PoolBlock* tlsb = ring->items[--ring->top]; - HKM_TIME_END(HKM_CAT_POOL_TLS_RING_POP, t_ring_pop0); - void* raw = (void*)tlsb; - AllocHeader* hdr = (AllocHeader*)raw; - mid_set_header(hdr, g_class_sizes[class_idx], site_id); - mid_page_inuse_inc(raw); - t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; - if ((t_pool_rng & ((1u<top == 0) { - atomic_fetch_add_explicit(&g_pool.ring_underflow, 1, memory_order_relaxed); - } - if (ring->top > 0) { - HKM_TIME_START(t_ring_pop1); - PoolBlock* tlsb = ring->items[--ring->top]; - HKM_TIME_END(HKM_CAT_POOL_TLS_RING_POP, t_ring_pop1); - void* raw = (void*)tlsb; - AllocHeader* hdr = (AllocHeader*)raw; - mid_set_header(hdr, g_class_sizes[class_idx], site_id); - t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; - if ((t_pool_rng & ((1u<next; - if (g_tls_bin[class_idx].lo_count) g_tls_bin[class_idx].lo_count--; - HKM_TIME_END(HKM_CAT_POOL_TLS_LIFO_POP, t_lifo_pop0); - void* raw = (void*)b; AllocHeader* hdr = (AllocHeader*)raw; - mid_set_header(hdr, g_class_sizes[class_idx], site_id); - mid_page_inuse_inc(raw); - t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; - if ((t_pool_rng & ((1u<top; if (to_ring < 0) to_ring = 0; - while (head && to_ring-- > 0) { PoolBlock* nxt = head->next; ring->items[ring->top++] = head; head = nxt; } - while (head) { PoolBlock* nxt = head->next; head->next = g_tls_bin[class_idx].lo_head; g_tls_bin[class_idx].lo_head = head; g_tls_bin[class_idx].lo_count++; head = nxt; } - g_pool.freelist[class_idx][s] = head; - if (!head) clear_nonempty_bit(class_idx, s); - pthread_mutex_unlock(l); - if (ring->top > 0) { - PoolBlock* tlsb = ring->items[--ring->top]; - void* raw = (void*)tlsb; - AllocHeader* hdr = (AllocHeader*)raw; - mid_set_header(hdr, g_class_sizes[class_idx], site_id); - mid_page_inuse_inc(raw); - t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; - if ((t_pool_rng & ((1u< 0 && g_tls_active_page_a[class_idx].bump < g_tls_active_page_a[class_idx].end) ap = &g_tls_active_page_a[class_idx]; - else if (g_tls_active_page_b[class_idx].page && g_tls_active_page_b[class_idx].count > 0 && g_tls_active_page_b[class_idx].bump < g_tls_active_page_b[class_idx].end) ap = &g_tls_active_page_b[class_idx]; - else if (g_tls_active_page_c[class_idx].page && g_tls_active_page_c[class_idx].count > 0 && g_tls_active_page_c[class_idx].bump < g_tls_active_page_c[class_idx].end) ap = &g_tls_active_page_c[class_idx]; // QW2-adjusted - if (ap) { - // Opportunistically fill TLS ring from active page as well - if 
(g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) { - int need = POOL_L2_RING_CAP - ring->top; - (void)refill_tls_from_active_page(class_idx, ring, &g_tls_bin[class_idx], ap, need); - } - PoolBlock* b = NULL; - if (ring->top > 0) { b = ring->items[--ring->top]; } - else if (ap->page && ap->count > 0 && ap->bump < ap->end) { - b = (PoolBlock*)(void*)ap->bump; ap->bump += (HEADER_SIZE + g_class_sizes[class_idx]); ap->count--; if (ap->bump >= ap->end || ap->count<=0){ ap->page=NULL; ap->count=0; } - } - if (b) { - void* raw = (void*)b; - AllocHeader* hdr = (AllocHeader*)raw; - mid_set_header(hdr, g_class_sizes[class_idx], site_id); - mid_page_inuse_inc(raw); - g_pool.hits[class_idx]++; - return (char*)raw + HEADER_SIZE; - } - } - - // Lock the shard freelist for this (class, shard) - pthread_mutex_t* lock = &g_pool.freelist_locks[class_idx][shard_idx].m; - HKM_TIME_START(t_lock); - struct timespec ts_lk1; int lk1 = hkm_prof_begin(&ts_lk1); - (void)ts_lk1; (void)lk1; // Unused profiling variables - pthread_mutex_lock(lock); - HKM_TIME_END(HKM_CAT_POOL_LOCK, t_lock); - hkm_prof_end(lk1, HKP_POOL_LOCK, &ts_lk1); - - // Try to pop from freelist - PoolBlock* block = g_pool.freelist[class_idx][shard_idx]; - - if (!block) { - // Before refilling, try draining remote stack and simple shard steal - int stole = 0; - const FrozenPolicy* pol = hkm_policy_get(); - if (pol) { - uint16_t cap = 0; - if (class_idx < 5) cap = pol->mid_cap[class_idx]; - else if (class_idx == 5 && pol->mid_dyn1_bytes != 0) cap = pol->mid_cap_dyn1; - else if (class_idx == 6 && pol->mid_dyn2_bytes != 0) cap = pol->mid_cap_dyn2; - // Drain remote stack regardless of cap (cheap and helps reuse) - if (atomic_load_explicit(&g_pool.remote_count[class_idx][shard_idx], memory_order_relaxed) != 0) { - drain_remote_locked(class_idx, shard_idx); - block = g_pool.freelist[class_idx][shard_idx]; - } - if (!block && cap > 0 && g_pool.pages_by_class[class_idx] >= cap) { - HKM_TIME_START(t_steal); - for (int d = 1; d <= 4 && !stole; d++) { - int s1 = (shard_idx + d) & (POOL_NUM_SHARDS - 1); - int s2 = (shard_idx - d) & (POOL_NUM_SHARDS - 1); - if (is_shard_nonempty(class_idx, s1)) { - pthread_mutex_t* l2 = &g_pool.freelist_locks[class_idx][s1].m; - pthread_mutex_lock(l2); - PoolBlock* b2 = g_pool.freelist[class_idx][s1]; - if (b2) { - g_pool.freelist[class_idx][s1] = b2->next; - if (!g_pool.freelist[class_idx][s1]) clear_nonempty_bit(class_idx, s1); - block = b2; stole = 1; - } - pthread_mutex_unlock(l2); - } - if (!stole && is_shard_nonempty(class_idx, s2)) { - pthread_mutex_t* l3 = &g_pool.freelist_locks[class_idx][s2].m; - pthread_mutex_lock(l3); - PoolBlock* b3 = g_pool.freelist[class_idx][s2]; - if (b3) { - g_pool.freelist[class_idx][s2] = b3->next; - if (!g_pool.freelist[class_idx][s2]) clear_nonempty_bit(class_idx, s2); - block = b3; stole = 1; - } - pthread_mutex_unlock(l3); - } - } - HKM_TIME_END(HKM_CAT_SHARD_STEAL, t_steal); - } - } - - if (!stole && !block) { - // Freelist empty, refill page - { - // choose empty TLS slot for new page (check all 3 slots) // QW2-adjusted - PoolTLSPage* tap = NULL; - if (g_tls_active_page_a[class_idx].page == NULL || g_tls_active_page_a[class_idx].count == 0) tap = &g_tls_active_page_a[class_idx]; - else if (g_tls_active_page_b[class_idx].page == NULL || g_tls_active_page_b[class_idx].count == 0) tap = &g_tls_active_page_b[class_idx]; - else if (g_tls_active_page_c[class_idx].page == NULL || g_tls_active_page_c[class_idx].count == 0) tap = &g_tls_active_page_c[class_idx]; // QW2-adjusted - else tap 
= &g_tls_active_page_a[class_idx]; // fallback overwrite oldest if all 3 busy - HKM_TIME_START(t_alloc_page); - if (alloc_tls_page(class_idx, tap)) { - HKM_TIME_END(HKM_CAT_POOL_ALLOC_TLS_PAGE, t_alloc_page); - pthread_mutex_unlock(lock); - // rebind to the page we just allocated and top-up ring from bump-run - ap = tap; - if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) { - int need = POOL_L2_RING_CAP - ring->top; - (void)refill_tls_from_active_page(class_idx, ring, &g_tls_bin[class_idx], ap, need); - } - PoolBlock* takeb = NULL; - if (ring->top > 0) { HKM_TIME_START(t_ring_pop2); takeb = ring->items[--ring->top]; HKM_TIME_END(HKM_CAT_POOL_TLS_RING_POP, t_ring_pop2);} - else if (ap->page && ap->count > 0 && ap->bump < ap->end) { takeb = (PoolBlock*)(void*)ap->bump; ap->bump += (HEADER_SIZE + g_class_sizes[class_idx]); ap->count--; if (ap->bump >= ap->end || ap->count==0){ ap->page=NULL; ap->count=0; } } - void* raw2 = (void*)takeb; - AllocHeader* hdr2 = (AllocHeader*)raw2; - mid_set_header(hdr2, g_class_sizes[class_idx], site_id); - mid_page_inuse_inc(raw2); - g_pool.hits[class_idx]++; - return (char*)raw2 + HEADER_SIZE; - } - HKM_TIME_START(t_refill); - struct timespec ts_rf; int rf = hkm_prof_begin(&ts_rf); - int ok = refill_freelist(class_idx, shard_idx); - HKM_TIME_END(HKM_CAT_POOL_REFILL, t_refill); - hkm_prof_end(rf, HKP_POOL_REFILL, &ts_rf); - if (!ok) { - t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; - if ((t_pool_rng & ((1u<> 17; t_pool_rng ^= t_pool_rng << 5; - if ((t_pool_rng & ((1u<next; - // Adopt shared page to this thread (first touch) to improve TC routing - mid_desc_adopt(block, class_idx, (uint64_t)(uintptr_t)pthread_self()); - t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; - if ((t_pool_rng & ((1u<top < POOL_L2_RING_CAP) { HKM_CNT(HKM_CAT_TLS_FAST); ring->items[ring->top++] = block; HKM_TIME_START(t_ring_pop4); take = ring->items[--ring->top]; HKM_TIME_END(HKM_CAT_POOL_TLS_RING_POP, t_ring_pop4); } - else { block->next = g_tls_bin[class_idx].lo_head; g_tls_bin[class_idx].lo_head = block; g_tls_bin[class_idx].lo_count++; - if (g_tls_ring_enabled && ring->top > 0) { HKM_CNT(HKM_CAT_TLS_FAST); HKM_TIME_START(t_ring_pop5); take = ring->items[--ring->top]; HKM_TIME_END(HKM_CAT_POOL_TLS_RING_POP, t_ring_pop5); } - else { HKM_TIME_START(t_lifo_pop2); take = g_tls_bin[class_idx].lo_head; g_tls_bin[class_idx].lo_head = take->next; if (g_tls_bin[class_idx].lo_count) g_tls_bin[class_idx].lo_count--; HKM_TIME_END(HKM_CAT_POOL_TLS_LIFO_POP, t_lifo_pop2); } } - - // Construct header fields now (freelist used header area for links) - void* raw = (void*)take; - AllocHeader* hdr = (AllocHeader*)raw; - mid_set_header(hdr, g_class_sizes[class_idx], site_id); - mid_page_inuse_inc(raw); - - // Calculate user pointer (skip header) - void* user_ptr = (char*)raw + HEADER_SIZE; - - // ゼロ化禁止(calloc以外) - // デバッグモードのみパターン埋め - #ifdef HAKMEM_DEBUG_SANITIZE - memset(user_ptr, 0xA5, g_class_sizes[class_idx]); // パターン埋め - #endif - // 本番: ゼロ化なし(15-25% 高速化) - - return user_ptr; -} - -void hak_pool_free(void* ptr, size_t size, uintptr_t site_id) { - if (!ptr) return; - hak_pool_init(); // pthread_once() ensures thread-safe init (no data race!) 
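// Editorial sketch (not part of this patch): several statistics-sampling expressions in
// hak_pool_try_alloc() above (and in hak_pool_free() below) are truncated in this diff (the text
// after "(1u<" was lost). The visible parts follow the classic xorshift32 update (shifts 13/17/5)
// on the thread-local t_pool_rng, gated by a power-of-two mask. The sketch assumes the exponent is
// g_count_sample_exp (HAKMEM_POOL_COUNT_SAMPLE, 0..16, read in hak_pool_init_impl); the helper name
// and the gated statement are illustrative only.
#include <stdint.h>
static __thread uint32_t t_pool_rng_sketch = 0x9E3779B9u;  // xorshift32 needs a non-zero seed
static inline int pool_stat_sample(int sample_exp) {
    t_pool_rng_sketch ^= t_pool_rng_sketch << 13;
    t_pool_rng_sketch ^= t_pool_rng_sketch >> 17;
    t_pool_rng_sketch ^= t_pool_rng_sketch << 5;
    // sample_exp == 0 -> mask 0 -> count every event; otherwise roughly 1 in 2^sample_exp events.
    return (t_pool_rng_sketch & ((1u << sample_exp) - 1u)) == 0;
}
// Illustrative use: if (pool_stat_sample(g_count_sample_exp)) g_pool.hits[class_idx]++;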
- if (!hak_pool_is_poolable(size)) return; - - // MF2: Per-Page Sharding path - if (g_mf2_enabled) { - mf2_free(ptr); - return; - } - - // OLD PATH: ptr is user pointer, get raw pointer (header start) - void* raw = (char*)ptr - HEADER_SIZE; - - // Validate header unless we can prove Mid ownership via page descriptor. - AllocHeader* hdr = (AllocHeader*)raw; - int mid_by_desc = 0; - MidPageDesc* d_desc = mid_desc_lookup(ptr); - if (d_desc) mid_by_desc = 1; - if (!mid_by_desc && g_hdr_light_enabled < 2) { - if (hdr->magic != HAKMEM_MAGIC) { - MF2_ERROR_LOG("Invalid magic 0x%X in pool_free, expected 0x%X", - hdr->magic, HAKMEM_MAGIC); - return; // Skip free (corruption detected) - } - if (hdr->method != ALLOC_METHOD_POOL) { - MF2_ERROR_LOG("Wrong method %d in pool_free, expected POOL (%d)", - hdr->method, ALLOC_METHOD_POOL); - return; // Skip free (not a pool allocation) - } - } - - // Get class and shard indices - int class_idx = mid_by_desc ? (int)d_desc->class_idx : hak_pool_get_class_index(size); - if (class_idx < 0) return; - - int shard_idx = hak_pool_get_shard_index(site_id); - (void)shard_idx; // Unused in MF2 path - - PoolBlock* block = (PoolBlock*)raw; - if (g_pool.tls_free_enabled) { - // Same-thread fast path: prefer TLS caches. If header lacks owner (light), - // consult page descriptor for TLS-owned pages; otherwise fall back to remote. - int same_thread = 0; - if (g_hdr_light_enabled >= 1) { - MidPageDesc* d = mid_desc_lookup(raw); - if (d && d->owner_tid != 0 && d->owner_tid == (uint64_t)(uintptr_t)pthread_self()) { - same_thread = 1; - } - } else if (hdr->owner_tid != 0 && hdr->owner_tid == (uintptr_t)(uintptr_t)pthread_self()) { - same_thread = 1; - } - if (same_thread) { - PoolTLSRing* ring = &g_tls_bin[class_idx].ring; - if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) { - ring->items[ring->top++] = block; - } else { - // Push to TLS local LIFO; only溢れたときにremoteへ少量spill - block->next = g_tls_bin[class_idx].lo_head; - g_tls_bin[class_idx].lo_head = block; - g_tls_bin[class_idx].lo_count++; - if ((int)g_tls_bin[class_idx].lo_count > g_tls_lo_max) { - size_t spill = g_tls_bin[class_idx].lo_count / 2; - int shard = hak_pool_get_shard_index(site_id); - while (spill-- && g_tls_bin[class_idx].lo_head) { - PoolBlock* b = g_tls_bin[class_idx].lo_head; g_tls_bin[class_idx].lo_head = b->next; g_tls_bin[class_idx].lo_count--; - HKM_TIME_START(t_remote_push1); - uintptr_t old_head; - do { - old_head = atomic_load_explicit(&g_pool.remote_head[class_idx][shard], memory_order_acquire); - b->next = (PoolBlock*)old_head; - } while (!atomic_compare_exchange_weak_explicit(&g_pool.remote_head[class_idx][shard], &old_head, (uintptr_t)b, memory_order_release, memory_order_relaxed)); - atomic_fetch_add_explicit(&g_pool.remote_count[class_idx][shard], 1, memory_order_relaxed); - HKM_TIME_END(HKM_CAT_POOL_REMOTE_PUSH, t_remote_push1); - } - set_nonempty_bit(class_idx, shard); - } - } - - } else { - // Cross-thread: remote push to target shard - if (g_tc_enabled) { - uint64_t owner_tid = 0; - if (g_hdr_light_enabled < 2) owner_tid = hdr->owner_tid; - if (owner_tid == 0) { - MidPageDesc* d = mid_desc_lookup(raw); - if (d) owner_tid = d->owner_tid; - } - if (owner_tid != 0) { - MidTC* otc = mid_tc_lookup_by_tid(owner_tid); - if (otc) { mid_tc_push(otc, class_idx, block); return; } - } - } - int shard = hak_pool_get_shard_index(site_id); - uintptr_t old_head; - HKM_TIME_START(t_remote_push2); - do { - old_head = atomic_load_explicit(&g_pool.remote_head[class_idx][shard], memory_order_acquire); - 
block->next = (PoolBlock*)old_head; - } while (!atomic_compare_exchange_weak_explicit(&g_pool.remote_head[class_idx][shard], &old_head, (uintptr_t)block, memory_order_release, memory_order_relaxed)); - atomic_fetch_add_explicit(&g_pool.remote_count[class_idx][shard], 1, memory_order_relaxed); - HKM_TIME_END(HKM_CAT_POOL_REMOTE_PUSH, t_remote_push2); - set_nonempty_bit(class_idx, shard); - } - } else { - // Return to global freelist (A/B testing path) - int shard_idx = hak_pool_get_shard_index(site_id); - pthread_mutex_t* lock = &g_pool.freelist_locks[class_idx][shard_idx].m; - pthread_mutex_lock(lock); - block->next = g_pool.freelist[class_idx][shard_idx]; - g_pool.freelist[class_idx][shard_idx] = block; - set_nonempty_bit(class_idx, shard_idx); - pthread_mutex_unlock(lock); - } - t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; - if ((t_pool_rng & ((1u< 0) { - double hit_rate = (double)g_pool.hits[i] / (g_pool.hits[i] + g_pool.misses[i]) * 100.0; - printf(" Hit rate: %.1f%%\n", hit_rate); - } - } - - printf("\n----------------------------------------\n"); - printf("Summary:\n"); - printf(" Total hits: %lu\n", (unsigned long)total_hits); - printf(" Total misses: %lu\n", (unsigned long)total_misses); - printf(" Total refills: %lu\n", (unsigned long)total_refills); - printf(" Total frees: %lu\n", (unsigned long)total_frees); - printf(" Pages allocated: %lu\n", (unsigned long)g_pool.total_pages_allocated); - printf(" Bytes allocated: %lu KB\n", (unsigned long)(g_pool.total_bytes_allocated / 1024)); - - if (total_hits + total_misses > 0) { - double hit_rate = (double)total_hits / (total_hits + total_misses) * 100.0; - printf(" Overall hit rate: %.1f%%\n", hit_rate); - } - - printf("========================================\n"); -} - -void hak_pool_stats_snapshot(uint64_t hits[], uint64_t misses[], uint64_t refills[], uint64_t frees[]) { - if (!g_pool.initialized) { - // Zero out if not initialized - for (int i = 0; i < POOL_NUM_CLASSES; i++) { - if (hits) hits[i] = 0; - if (misses) misses[i] = 0; - if (refills) refills[i] = 0; - if (frees) frees[i] = 0; - } - return; - } - for (int i = 0; i < POOL_NUM_CLASSES; i++) { - if (hits) hits[i] = g_pool.hits[i]; - if (misses) misses[i] = g_pool.misses[i]; - if (refills) refills[i] = g_pool.refills[i]; - if (frees) frees[i] = g_pool.frees[i]; - } -} - -void hak_pool_extra_metrics_snapshot(uint64_t* trylock_attempts, uint64_t* trylock_success, uint64_t* ring_underflow) { - if (trylock_attempts) { - *trylock_attempts = atomic_load_explicit(&g_pool.trylock_attempts, memory_order_relaxed); - } - if (trylock_success) { - *trylock_success = atomic_load_explicit(&g_pool.trylock_success, memory_order_relaxed); - } - if (ring_underflow) { - *ring_underflow = atomic_load_explicit(&g_pool.ring_underflow, memory_order_relaxed); - } -} -int hak_pool_mid_lookup(void* ptr, size_t* out_size) { - // CRITICAL FIX: Check MF2 registry first if MF2 is enabled - if (g_mf2_enabled) { - MidPage* page = mf2_addr_to_page(ptr); - if (page) { - int c = (int)page->class_idx; - if (c < 0 || c >= POOL_NUM_CLASSES) return 0; - size_t sz = g_class_sizes[c]; - if (sz == 0) return 0; - if (out_size) *out_size = sz; - return 1; - } - // Not an MF2 page - fall through to old lookup - } - - // OLD PATH: Use mid_desc lookup - MidPageDesc* d = mid_desc_lookup(ptr); - if (!d) return 0; - int c = (int)d->class_idx; - if (c < 0 || c >= POOL_NUM_CLASSES) return 0; - size_t sz = g_class_sizes[c]; - if (sz == 0) return 0; - if (out_size) *out_size = sz; 
- return 1; -} - -void hak_pool_free_fast(void* ptr, uintptr_t site_id) { - if (!ptr || !g_pool.initialized) return; - - // CRITICAL FIX: If MF2 is enabled, mid_desc_lookup will FAIL because MF2 pages - // are registered in g_mf2_page_registry, not mid_desc! Route directly to MF2. - if (g_mf2_enabled) { - // Check if this is an MF2 page by looking it up in the MF2 registry - MidPage* page = mf2_addr_to_page(ptr); - - if (page) { - // MF2 page found - free through MF2 path - mf2_free(ptr); - return; - } - // Not an MF2 page - might be from old allocator or another tier - // Fall through to old path (though this shouldn't happen if MF2 is exclusive) - } - - // OLD PATH: Use mid_desc lookup - MidPageDesc* d = mid_desc_lookup(ptr); - if (!d) return; - size_t sz = g_class_sizes[(int)d->class_idx]; - if (sz == 0) return; - hak_pool_free(ptr, sz, site_id); -} +// Public API (boxed): alloc/free/lookup/free_fast +#include "box/pool_api.inc.h" diff --git a/core/hakmem_tiny.c b/core/hakmem_tiny.c index 0941af62..1b327046 100644 --- a/core/hakmem_tiny.c +++ b/core/hakmem_tiny.c @@ -9,29 +9,19 @@ #include "hakmem_tiny_batch_refill.h" // Phase 1: Batch refill/spill for mini-magazine #include "hakmem_tiny_stats.h" // Phase 1: Batched statistics (replaces XOR RNG) // Phase 2B modules -#include "hakmem_tiny_stats_api.h" // Phase 2B: Stats API -#include "hakmem_tiny_query_api.h" // Phase 2B-1: Query API -#include "hakmem_tiny_rss_api.h" // Phase 2B-2: RSS Utils -#include "hakmem_tiny_registry_api.h" // Phase 2B-3: Registry +#include "tiny_api.h" // Consolidated: stats_api, query_api, rss_api, registry_api #include "tiny_tls.h" #include "tiny_debug.h" #include "tiny_mmap_gate.h" #include "tiny_debug_ring.h" +#include "tiny_route.h" #include "tiny_tls_guard.h" +#include "tiny_ready.h" #include "hakmem_tiny_tls_list.h" #include "hakmem_tiny_remote_target.h" // Phase 2C-1: Remote target queue #include "hakmem_tiny_bg_spill.h" // Phase 2C-2: Background spill queue // NOTE: hakmem_tiny_tls_ops.h included later (after type definitions) -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include +#include "tiny_system.h" // Consolidated: stdio, stdlib, string, etc. 
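// Editorial sketch (not part of this patch): the cross-thread path in hak_pool_free() above pushes
// blocks onto g_pool.remote_head[class][shard] with a CAS loop, and drain_remote_locked() later
// detaches the whole stack with a single atomic_exchange and splices it into the shard freelist.
// A minimal, self-contained sketch of that MPSC (multi-producer/single-consumer) stack pattern;
// the node type and function names here are illustrative, not the pool's actual types.
#include <stdatomic.h>
#include <stdint.h>
typedef struct SketchNode { struct SketchNode* next; } SketchNode;
static _Atomic(uintptr_t) g_sketch_head = 0;

static void sketch_remote_push(SketchNode* n) {            // any thread (producer)
    uintptr_t old_head;
    do {
        old_head = atomic_load_explicit(&g_sketch_head, memory_order_acquire);
        n->next = (SketchNode*)old_head;                    // link to the observed head
    } while (!atomic_compare_exchange_weak_explicit(&g_sketch_head, &old_head, (uintptr_t)n,
                                                    memory_order_release, memory_order_relaxed));
}

static SketchNode* sketch_remote_drain(void) {              // owner/consumer, e.g. under the shard lock
    // One exchange detaches the entire stack; the caller then walks it without further atomics.
    return (SketchNode*)atomic_exchange_explicit(&g_sketch_head, (uintptr_t)0, memory_order_acq_rel);
}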
#include "hakmem_prof.h" #include "hakmem_trace.h" // Optional USDT (perf) tracepoints @@ -123,6 +113,7 @@ static __thread unsigned char g_tls_bench_warm_done[4]; // Return helper: record tiny alloc stat (guarded) then return pointer static inline void tiny_debug_track_alloc_ret(int cls, void* ptr); +// Inject route commit into return helper so any successful allocation commits a fingerprint #ifdef HAKMEM_ENABLE_STATS // Optional: sampling(ビルド時に有効化)。ホットパスは直接インライン呼び出し(間接分岐なし)。 #ifdef HAKMEM_TINY_STAT_SAMPLING @@ -136,9 +127,9 @@ static inline __attribute__((always_inline)) void hkm_stat_alloc(int cls) { #else static inline __attribute__((always_inline)) void hkm_stat_alloc(int cls) { stats_record_alloc(cls); } #endif -#define HAK_RET_ALLOC(cls, ptr) do { tiny_debug_track_alloc_ret((cls), (ptr)); hkm_stat_alloc((cls)); return (ptr); } while(0) +#define HAK_RET_ALLOC(cls, ptr) do { tiny_debug_track_alloc_ret((cls), (ptr)); hkm_stat_alloc((cls)); ROUTE_COMMIT((cls), 0x7F); return (ptr); } while(0) #else -#define HAK_RET_ALLOC(cls, ptr) do { tiny_debug_track_alloc_ret((cls), (ptr)); return (ptr); } while(0) +#define HAK_RET_ALLOC(cls, ptr) do { tiny_debug_track_alloc_ret((cls), (ptr)); ROUTE_COMMIT((cls), 0x7F); return (ptr); } while(0) #endif // Free-side stats: compile-time zero when stats disabled @@ -205,6 +196,61 @@ void* __attribute__((cold, noinline)) hak_tiny_alloc_slow(size_t size, int class static void* __attribute__((cold, noinline)) hak_tiny_alloc_slow(size_t size, int class_idx); #endif +// --------------------------------------------------------------------------- +// Box: adopt_gate_try (implementation moved from header for robust linkage) +// --------------------------------------------------------------------------- +#include "box/adopt_gate_box.h" +extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS]; +extern int g_super_reg_class_size[TINY_NUM_CLASSES]; +extern unsigned long long g_adopt_gate_calls[]; +extern unsigned long long g_adopt_gate_success[]; +extern unsigned long long g_reg_scan_attempts[]; +extern unsigned long long g_reg_scan_hits[]; +SuperSlab* adopt_gate_try(int class_idx, TinyTLSSlab* tls) { + g_adopt_gate_calls[class_idx]++; + ROUTE_MARK(13); + SuperSlab* ss = tiny_refill_try_fast(class_idx, tls); + if (ss) { g_adopt_gate_success[class_idx]++; return ss; } + g_reg_scan_attempts[class_idx]++; + int reg_size = g_super_reg_class_size[class_idx]; + int scan_limit = tiny_reg_scan_max(); + if (scan_limit > reg_size) scan_limit = reg_size; + uint32_t self_tid = tiny_self_u32(); + for (int i = 0; i < scan_limit; i++) { + SuperSlab* cand = g_super_reg_by_class[class_idx][i]; + if (!(cand && cand->magic == SUPERSLAB_MAGIC)) continue; + // Fast path: use nonempty_mask / freelist_mask to locate candidates in O(1) + uint32_t mask = cand->nonempty_mask; + // Fallback to atomic freelist_mask for cross-thread visibility + if (mask == 0) { + mask = atomic_load_explicit(&cand->freelist_mask, memory_order_acquire); + } + if (mask == 0) continue; // No visible freelists in this SS + int cap = ss_slabs_capacity(cand); + // Iterate set bits only + while (mask) { + int sidx = __builtin_ctz(mask); + mask &= (mask - 1); // clear lowest set bit + if (sidx >= cap) continue; + SlabHandle h = slab_try_acquire(cand, sidx, self_tid); + if (!slab_is_valid(&h)) continue; + if (slab_remote_pending(&h)) { + slab_drain_remote_full(&h); + } + if (slab_is_safe_to_bind(&h)) { + tiny_tls_bind_slab(tls, h.ss, h.slab_idx); + g_adopt_gate_success[class_idx]++; + 
g_reg_scan_hits[class_idx]++; + ROUTE_MARK(14); ROUTE_COMMIT(class_idx, 0x07); + slab_release(&h); + return h.ss; + } + slab_release(&h); + } + } + return NULL; +} + // ============================================================================ // Global State // ============================================================================ @@ -264,7 +310,7 @@ static int g_use_registry = 1; // Default ON for thread-safety static int g_tiny_refill_max = 64; // HAKMEM_TINY_REFILL_MAX (default 64) static int g_tiny_refill_max_hot = 192; // HAKMEM_TINY_REFILL_MAX_HOT for classes<=3 (default 192) -#include "hakmem_tiny_tls_list.h" +// hakmem_tiny_tls_list.h already included at top static __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES]; static int g_tls_list_enable = 1; static inline int tls_refill_from_tls_slab(int class_idx, TinyTLSList* tls, uint32_t want); @@ -436,7 +482,7 @@ void tiny_adopt_gate_on_remote_seen(int class_idx) { #include "tiny_sticky.h" // Mailbox box -#include "tiny_mailbox.h" +#include "box/mailbox_box.h" // Publish pipeline counters (visibility) unsigned long long g_pub_notify_calls[TINY_NUM_CLASSES] = {0}; @@ -513,6 +559,7 @@ static _Atomic(uint32_t) g_slab_partial_rr2[TINY_NUM_CLASSES]; unsigned long long g_rf_total_calls[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_hit_bench[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_hit_hot[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_hit_ready[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_hit_mail[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_hit_slab[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_hit_ss[TINY_NUM_CLASSES] = {0}; @@ -535,6 +582,10 @@ unsigned long long g_rf_time_ss_ns[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_time_reg_ns[TINY_NUM_CLASSES] = {0}; unsigned long long g_rf_time_mmap_ns[TINY_NUM_CLASSES] = {0}; +// Refill item source breakdown (freelist vs carve) +unsigned long long g_rf_freelist_items[TINY_NUM_CLASSES] = {0}; +unsigned long long g_rf_carve_items[TINY_NUM_CLASSES] = {0}; + static int g_rf_trace_en = -1; static inline int rf_trace_enabled(void) { if (__builtin_expect(g_rf_trace_en == -1, 0)) { @@ -566,6 +617,22 @@ unsigned long long g_free_via_ss_local[TINY_NUM_CLASSES] = {0}; unsigned long long g_free_via_ss_remote[TINY_NUM_CLASSES] = {0}; unsigned long long g_free_via_tls_sll[TINY_NUM_CLASSES] = {0}; unsigned long long g_free_via_mag[TINY_NUM_CLASSES] = {0}; + +// Front Gate Breakdown (debug counters) +unsigned long long g_front_sfc_hit[TINY_NUM_CLASSES] = {0}; +unsigned long long g_front_sll_hit[TINY_NUM_CLASSES] = {0}; +unsigned long long g_front_quick_hit[TINY_NUM_CLASSES] = {0}; +unsigned long long g_front_mag_hit[TINY_NUM_CLASSES] = {0}; + +// Free-side trigger counters +unsigned long long g_first_free_transitions[TINY_NUM_CLASSES] = {0}; +unsigned long long g_remote_free_transitions[TINY_NUM_CLASSES] = {0}; + +// Adopt/Registry gate counters +unsigned long long g_adopt_gate_calls[TINY_NUM_CLASSES] = {0}; +unsigned long long g_adopt_gate_success[TINY_NUM_CLASSES] = {0}; +unsigned long long g_reg_scan_attempts[TINY_NUM_CLASSES] = {0}; +unsigned long long g_reg_scan_hits[TINY_NUM_CLASSES] = {0}; unsigned long long g_free_via_fast_tls[TINY_NUM_CLASSES] = {0}; unsigned long long g_free_via_fastcache[TINY_NUM_CLASSES] = {0}; unsigned long long g_fast_spare_flush[TINY_NUM_CLASSES] = {0}; @@ -622,7 +689,7 @@ static inline uintptr_t hot_slot_pop(int class_idx) { // moved to tiny_publish.c -static void slab_partial_publish(int class_idx, SuperSlab* ss, int slab_idx) { +static 
__attribute__((unused)) void slab_partial_publish(int class_idx, SuperSlab* ss, int slab_idx) { if (!ss) return; uintptr_t ent = slab_entry_make(ss, slab_idx); for (int i = 0; i < SLAB_PARTIAL_RING; i++) { @@ -650,7 +717,7 @@ static void slab_partial_publish(int class_idx, SuperSlab* ss, int slab_idx) { g_slab_publish_dbg[class_idx]++; } -static uintptr_t slab_partial_adopt(int class_idx) { +static __attribute__((unused)) uintptr_t slab_partial_adopt(int class_idx) { for (int i = 0; i < SLAB_PARTIAL_RING; i++) { uintptr_t ent = atomic_exchange_explicit(&g_slab_partial_ring[class_idx][i], (uintptr_t)0, memory_order_acq_rel); if (ent) return ent; @@ -703,7 +770,13 @@ void ss_partial_publish(int class_idx, SuperSlab* ss) { + (has_remote ? 1u : 0u); if (score > best_score) { best_score = score; best = s; } } - if (best >= 0 && best < 256) ss->publish_hint = (uint8_t)best; else ss->publish_hint = 0xFF; + if (best >= 0 && best < 256) { + ss->publish_hint = (uint8_t)best; + // Box: Ready push — provide slab-level candidate to adopters + tiny_ready_push(class_idx, ss, best); + } else { + ss->publish_hint = 0xFF; + } for (int i = 0; i < SS_PARTIAL_RING; i++) { SuperSlab* expected = NULL; if (atomic_compare_exchange_strong_explicit(&g_ss_partial_ring[class_idx][i], &expected, ss, @@ -842,7 +915,7 @@ static inline int tiny_fast_push(int class_idx, void* ptr); // Functions: tiny_hot_pop_class0(), tiny_hot_pop_class1(), tiny_hot_pop_class2(), tiny_hot_pop_class3() // 88 lines (lines 407-494) -static __attribute__((cold, noinline)) void* tiny_slow_alloc_fast(int class_idx) { +static __attribute__((cold, noinline, unused)) void* tiny_slow_alloc_fast(int class_idx) { int tls_enabled = g_tls_list_enable; TinyTLSList* tls = &g_tls_lists[class_idx]; pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; @@ -939,7 +1012,7 @@ static __attribute__((cold, noinline)) void* tiny_slow_alloc_fast(int class_idx) // Function: tiny_fast_refill_and_take() - 39 lines (lines 584-622) // Hot-path cheap sampling counter to avoid rand() in allocation path // Phase 9.4: TLS single-linked freelist (mimalloc-inspired) for hottest classes (≤128B/≤256B) -static int g_tls_sll_enable = 1; // HAKMEM_TINY_TLS_SLL=0 to disable +int g_tls_sll_enable = 1; // HAKMEM_TINY_TLS_SLL=0 to disable // Phase 6-1.7: Export TLS variables for box refactor (Box 5/6 need access from hakmem.c) #ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR __thread void* g_tls_sll_head[TINY_NUM_CLASSES]; @@ -952,27 +1025,27 @@ static int g_tiny_ultra = 0; // HAKMEM_TINY_ULTRA=1 for SLL- static int g_ultra_validate = 0; // HAKMEM_TINY_ULTRA_VALIDATE=1 to enable per-pop validation // Ultra debug counters #if HAKMEM_DEBUG_COUNTERS -static uint64_t g_ultra_pop_hits[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_ultra_pop_hits[TINY_NUM_CLASSES] = {0}; static uint64_t g_ultra_refill_calls[TINY_NUM_CLASSES] = {0}; -static uint64_t g_ultra_resets[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_ultra_resets[TINY_NUM_CLASSES] = {0}; #endif // Path counters (normal mode visibility): lightweight, for debugging/bench only #if HAKMEM_DEBUG_COUNTERS -static uint64_t g_path_sll_pop[TINY_NUM_CLASSES] = {0}; -static uint64_t g_path_mag_pop[TINY_NUM_CLASSES] = {0}; -static uint64_t g_path_front_pop[TINY_NUM_CLASSES] = {0}; -static uint64_t g_path_superslab[TINY_NUM_CLASSES] = {0}; -static uint64_t g_path_refill_calls[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_path_sll_pop[TINY_NUM_CLASSES] = {0}; +static 
__attribute__((unused)) uint64_t g_path_mag_pop[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_path_front_pop[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_path_superslab[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_path_refill_calls[TINY_NUM_CLASSES] = {0}; // New: slow/bitmap/bump/bin instrumentation -static uint64_t g_alloc_slow_calls[TINY_NUM_CLASSES] = {0}; -static uint64_t g_superslab_refill_calls_dbg[TINY_NUM_CLASSES] = {0}; -static uint64_t g_bitmap_scan_calls[TINY_NUM_CLASSES] = {0}; -static uint64_t g_bgbin_pops[TINY_NUM_CLASSES] = {0}; -static uint64_t g_bump_hits[TINY_NUM_CLASSES] = {0}; -static uint64_t g_bump_arms[TINY_NUM_CLASSES] = {0}; -static uint64_t g_spec_calls[TINY_NUM_CLASSES] = {0}; -static uint64_t g_spec_hits[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_alloc_slow_calls[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_superslab_refill_calls_dbg[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_bitmap_scan_calls[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_bgbin_pops[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_bump_hits[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_bump_arms[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_spec_calls[TINY_NUM_CLASSES] = {0}; +static __attribute__((unused)) uint64_t g_spec_hits[TINY_NUM_CLASSES] = {0}; #endif static int g_path_debug_enabled = 0; @@ -1039,7 +1112,7 @@ static inline __attribute__((always_inline)) pthread_t tiny_self_pt(void) { } #include "tiny_refill.h" -#include "tiny_mmap_gate.h" +// tiny_mmap_gate.h already included at top #include "tiny_publish.h" static int g_sll_cap_override[TINY_NUM_CLASSES] = {0}; // HAKMEM_TINY_SLL_CAP_C{0..7} @@ -1524,12 +1597,18 @@ TinySlab* hak_tiny_owner_slab(void* ptr) { // - Eliminates: Registry lookups, mid_lookup, owner checks // ============================================================================ +// Forward declarations for Phase 6 alloc/free functions +#ifdef HAKMEM_TINY_PHASE6_ULTRA_SIMPLE + void* hak_tiny_alloc_ultra_simple(size_t size); + void hak_tiny_free_ultra_simple(void* ptr); +#endif + #if defined(HAKMEM_TINY_PHASE6_METADATA) && defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE) #error "Cannot enable both PHASE6_METADATA and PHASE6_ULTRA_SIMPLE" #endif // Phase 6-1.7: Box Theory Refactoring - Mutual exclusion check -#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR +#if HAKMEM_TINY_PHASE6_BOX_REFACTOR #if defined(HAKMEM_TINY_PHASE6_METADATA) || defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE) #error "Cannot enable PHASE6_BOX_REFACTOR with other Phase 6 options" #endif @@ -1563,14 +1642,33 @@ TinySlab* hak_tiny_owner_slab(void* ptr) { #elif defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE) // Phase 6-1.5: Alignment guessing (legacy) + + // Refill count globals (needed for compatibility) + int g_refill_count_global = 0; + int g_refill_count_hot = 0; + int g_refill_count_mid = 0; + int g_refill_count_class[TINY_NUM_CLASSES] = {0}; + #include "hakmem_tiny_ultra_simple.inc" + + // Wrapper functions for hakmem.c compatibility (not used in ULTRA_SIMPLE but needed for linking) + void* hak_tiny_alloc_fast_wrapper(size_t size) { + return hak_tiny_alloc_ultra_simple(size); + } + + void hak_tiny_free_fast_wrapper(void* ptr) { + hak_tiny_free_ultra_simple(ptr); + } #elif defined(HAKMEM_TINY_PHASE6_METADATA) // Phase 6-1.6: Metadata header (recommended) #include "hakmem_tiny_metadata.inc" #endif // Layer 
1-3: Main allocation function (simplified) -#define HAKMEM_TINY_USE_NEW_3LAYER 0 // TEMP: Disable for baseline comparison +// Build-time configurable via: -DHAKMEM_TINY_USE_NEW_3LAYER=1 +#ifndef HAKMEM_TINY_USE_NEW_3LAYER +#define HAKMEM_TINY_USE_NEW_3LAYER 0 // default OFF (legacy path) +#endif #if HAKMEM_TINY_USE_NEW_3LAYER #include "hakmem_tiny_alloc_new.inc" #else diff --git a/core/hakmem_tiny_free.inc b/core/hakmem_tiny_free.inc index 6553601e..59564560 100644 --- a/core/hakmem_tiny_free.inc +++ b/core/hakmem_tiny_free.inc @@ -3,6 +3,7 @@ #include "slab_handle.h" #include "tiny_refill.h" #include "tiny_tls_guard.h" +#include "box/free_publish_box.h" #include "mid_tcache.h" extern __thread void* g_tls_sll_head[TINY_NUM_CLASSES]; extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES]; @@ -132,6 +133,20 @@ void hak_tiny_free_with_slab(void* ptr, TinySlab* slab) { return; } + // A/B: Force SS freelist path for same-thread frees (publish on first-free) + do { + static int g_free_to_ss2 = -1; + if (__builtin_expect(g_free_to_ss2 == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_FREE_TO_SS"); + g_free_to_ss2 = (e && *e && *e != '0') ? 1 : 0; // default OFF + } + if (g_free_to_ss2) { + hak_tiny_free_superslab(ptr, ss); + HAK_STAT_FREE(class_idx); + return; + } + } while (0); + if (__builtin_expect(g_debug_fast0, 0)) { tiny_debug_ring_record(TINY_RING_EVENT_FRONT_BYPASS, (uint16_t)class_idx, ptr, (uintptr_t)slab_idx); void* prev = meta->freelist; @@ -190,1227 +205,14 @@ void hak_tiny_free_with_slab(void* ptr, TinySlab* slab) { return; } -#if !HAKMEM_BUILD_RELEASE - // SuperSlab uses Magazine for TLS caching (same as TinySlab) - tiny_small_mags_init_once(); - if (class_idx > 3) tiny_mag_init_if_needed(class_idx); - TinyTLSMag* mag = &g_tls_mags[class_idx]; - int cap = mag->cap; - - // 32/64B: SLL優先(mag優先は無効化) - // Prefer TinyQuickSlot (compile-out if HAKMEM_TINY_NO_QUICK) -#if !defined(HAKMEM_TINY_NO_QUICK) - if (g_quick_enable && class_idx <= 4) { - TinyQuickSlot* qs = &g_tls_quick[class_idx]; - if (__builtin_expect(qs->top < QUICK_CAP, 1)) { - qs->items[qs->top++] = ptr; - HAK_STAT_FREE(class_idx); - return; - } - } -#endif - - // Fast path: TLS SLL push for hottest classes - if (!g_tls_list_enable && g_tls_sll_enable && g_tls_sll_count[class_idx] < sll_cap_for_class(class_idx, (uint32_t)cap)) { - *(void**)ptr = g_tls_sll_head[class_idx]; - g_tls_sll_head[class_idx] = ptr; - g_tls_sll_count[class_idx]++; - // Active → Inactive: count down immediately (TLS保管中は"使用中"ではない) - ss_active_dec_one(ss); - HAK_TP1(sll_push, class_idx); - tiny_debug_ring_record(TINY_RING_EVENT_FREE_LOCAL, (uint16_t)class_idx, ptr, 3); - HAK_STAT_FREE(class_idx); - return; - } - - // Next: Magazine push(必要ならmag→SLLへバルク転送で空きを作る) - // Hysteresis: allow slight overfill before deciding to spill under lock - if (mag->top >= cap && g_spill_hyst > 0) { - (void)bulk_mag_to_sll_if_room(class_idx, mag, cap / 2); - } - if (mag->top < cap + g_spill_hyst) { - mag->items[mag->top].ptr = ptr; -#if HAKMEM_TINY_MAG_OWNER - mag->items[mag->top].owner = NULL; // SuperSlab owner not a TinySlab; leave NULL -#endif - mag->top++; -#if HAKMEM_DEBUG_COUNTERS - g_magazine_push_count++; // Phase 7.6: Track pushes -#endif - // Active → Inactive: decrement now(アプリ解放時に非アクティブ扱い) - ss_active_dec_one(ss); - HAK_TP1(mag_push, class_idx); - tiny_debug_ring_record(TINY_RING_EVENT_FREE_RETURN_MAG, (uint16_t)class_idx, ptr, 2); - HAK_STAT_FREE(class_idx); - return; - } - - // Background spill: queue to BG thread instead of locking (when enabled) - if 
(g_bg_spill_enable) { - uint32_t qlen = atomic_load_explicit(&g_bg_spill_len[class_idx], memory_order_relaxed); - if ((int)qlen < g_bg_spill_target) { - // Build a small chain: include current ptr and pop from mag up to limit - int limit = g_bg_spill_max_batch; - if (limit > cap/2) limit = cap/2; - if (limit > 32) limit = 32; // keep free-path bounded - void* head = ptr; - *(void**)head = NULL; - void* tail = head; // current tail - int taken = 1; - while (taken < limit && mag->top > 0) { - void* p2 = mag->items[--mag->top].ptr; - *(void**)p2 = head; - head = p2; - taken++; - } - // Push chain to spill queue (single CAS) - bg_spill_push_chain(class_idx, head, tail, taken); - tiny_debug_ring_record(TINY_RING_EVENT_FREE_RETURN_MAG, (uint16_t)class_idx, ptr, 3); - HAK_STAT_FREE(class_idx); - return; - } - } - - // Spill half (SuperSlab version - simpler than TinySlab) - pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; - hkm_prof_begin(NULL); - pthread_mutex_lock(lock); - // Batch spill: reduce lock frequency and work per call - int spill = cap / 2; - int over = mag->top - (cap + g_spill_hyst); - if (over > 0 && over < spill) spill = over; - - for (int i = 0; i < spill && mag->top > 0; i++) { - TinyMagItem it = mag->items[--mag->top]; - - // Phase 7.6: SuperSlab spill - return to freelist - SuperSlab* owner_ss = hak_super_lookup(it.ptr); - if (owner_ss && owner_ss->magic == SUPERSLAB_MAGIC) { - // Direct freelist push (same as old hak_tiny_free_superslab) - int slab_idx = slab_index_for(owner_ss, it.ptr); - TinySlabMeta* meta = &owner_ss->slabs[slab_idx]; - *(void**)it.ptr = meta->freelist; - meta->freelist = it.ptr; - meta->used--; - // Decrement SuperSlab active counter (spill returns blocks to SS) - ss_active_dec_one(owner_ss); - - // Phase 8.4: Empty SuperSlab detection (will use meta->used scan) - // TODO: Implement scan-based empty detection - // Empty SuperSlab detection/munmapは別途フラッシュAPIで実施(ホットパスから除外) - } - } - - pthread_mutex_unlock(lock); - hkm_prof_end(ss_time, HKP_TINY_SPILL, &tss); - - // Adaptive increase of cap after spill - int max_cap = tiny_cap_max_for_class(class_idx); - if (mag->cap < max_cap) { - int new_cap = mag->cap + (mag->cap / 2); - if (new_cap > max_cap) new_cap = max_cap; - if (new_cap > TINY_TLS_MAG_CAP) new_cap = TINY_TLS_MAG_CAP; - mag->cap = new_cap; - } - - // Finally, try FastCache push first (≤128B) — compile-out if HAKMEM_TINY_NO_FRONT_CACHE -#if !defined(HAKMEM_TINY_NO_FRONT_CACHE) - if (g_fastcache_enable && class_idx <= 4) { - if (fastcache_push(class_idx, ptr)) { - HAK_TP1(front_push, class_idx); - HAK_STAT_FREE(class_idx); - return; - } - } -#endif - // Then TLS SLL if room, else magazine - if (g_tls_sll_enable && g_tls_sll_count[class_idx] < sll_cap_for_class(class_idx, (uint32_t)mag->cap)) { - *(void**)ptr = g_tls_sll_head[class_idx]; - g_tls_sll_head[class_idx] = ptr; - g_tls_sll_count[class_idx]++; - } else { - mag->items[mag->top].ptr = ptr; -#if HAKMEM_TINY_MAG_OWNER - mag->items[mag->top].owner = slab; -#endif - mag->top++; - } - -#if HAKMEM_DEBUG_COUNTERS - g_magazine_push_count++; // Phase 7.6: Track pushes -#endif - HAK_STAT_FREE(class_idx); - return; -#endif // HAKMEM_BUILD_RELEASE - } - - // Phase 7.6: TinySlab path (original) - //g_tiny_free_with_slab_count++; // Phase 7.6: Track calls - DISABLED due to segfault - // Same-thread → TLS magazine; remote-thread → MPSC stack - if (pthread_equal(slab->owner_tid, tiny_self_pt())) { - int class_idx = slab->class_idx; - - if (g_tls_list_enable) { - TinyTLSList* tls = 
&g_tls_lists[class_idx]; - uint32_t seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_relaxed); - if (__builtin_expect(seq != g_tls_param_seen[class_idx], 0)) { - tiny_tls_refresh_params(class_idx, tls); - } - // TinyHotMag front push(8/16/32B, A/B) - if (__builtin_expect(g_hotmag_enable && class_idx <= 2, 1)) { - if (hotmag_push(class_idx, ptr)) { - HAK_STAT_FREE(class_idx); - return; - } - } - if (tls->count < tls->cap) { - tiny_tls_list_guard_push(class_idx, tls, ptr); - tls_list_push(tls, ptr); - HAK_STAT_FREE(class_idx); - return; - } - seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_relaxed); - if (__builtin_expect(seq != g_tls_param_seen[class_idx], 0)) { - tiny_tls_refresh_params(class_idx, tls); - } - tiny_tls_list_guard_push(class_idx, tls, ptr); - tls_list_push(tls, ptr); - if (tls_list_should_spill(tls)) { - tls_list_spill_excess(class_idx, tls); - } - HAK_STAT_FREE(class_idx); - return; - } - - tiny_mag_init_if_needed(class_idx); - TinyTLSMag* mag = &g_tls_mags[class_idx]; - int cap = mag->cap; - // 32/64B: SLL優先(mag優先は無効化) - // Fast path: FastCache push (preferred for ≤128B), then TLS SLL - if (g_fastcache_enable && class_idx <= 4) { - if (fastcache_push(class_idx, ptr)) { - HAK_STAT_FREE(class_idx); - return; - } - } - // Fast path: TLS SLL push (preferred) - if (!g_tls_list_enable && g_tls_sll_enable && class_idx <= 5) { - uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)cap); - if (g_tls_sll_count[class_idx] < sll_cap) { - *(void**)ptr = g_tls_sll_head[class_idx]; - g_tls_sll_head[class_idx] = ptr; - g_tls_sll_count[class_idx]++; - HAK_STAT_FREE(class_idx); - return; - } - } - // Next: if magazine has room, push immediately and return(満杯ならmag→SLLへバルク) - if (mag->top >= cap) { - (void)bulk_mag_to_sll_if_room(class_idx, mag, cap / 2); - } - // Remote-drain can be handled opportunistically on future calls. - if (mag->top < cap) { - mag->items[mag->top].ptr = ptr; -#if HAKMEM_TINY_MAG_OWNER - mag->items[mag->top].owner = slab; -#endif - mag->top++; - -#if HAKMEM_DEBUG_COUNTERS - g_magazine_push_count++; // Phase 7.6: Track pushes -#endif - // Note: SuperSlab uses separate path (slab == NULL branch above) - HAK_STAT_FREE(class_idx); // Phase 3 - return; - } - // Magazine full: before spilling, opportunistically drain remotes once under lock. 
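// Editorial note: the check below fires when the slab's remote_count reaches the per-class drain
// threshold, or whenever remote_head is non-NULL; draining under the class lock here recovers
// remotely-freed blocks before the more expensive spill of half the magazine that follows.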
- if (atomic_load_explicit(&slab->remote_count, memory_order_relaxed) >= (unsigned)g_remote_drain_thresh_per_class[class_idx] || atomic_load_explicit(&slab->remote_head, memory_order_acquire)) { - pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; - pthread_mutex_lock(lock); - HAK_TP1(remote_drain, class_idx); - tiny_remote_drain_locked(slab); - pthread_mutex_unlock(lock); - } - // Spill half under class lock - pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; - pthread_mutex_lock(lock); - int spill = cap / 2; - - // Phase 4.2: High-water threshold for gating Phase 4 logic - int high_water = (cap * 3) / 4; // 75% of capacity - - for (int i = 0; i < spill && mag->top > 0; i++) { - TinyMagItem it = mag->items[--mag->top]; - - // Phase 7.6: Check for SuperSlab first (mixed Magazine support) - SuperSlab* ss_owner = hak_super_lookup(it.ptr); - if (ss_owner && ss_owner->magic == SUPERSLAB_MAGIC) { - // SuperSlab spill - return to freelist - int slab_idx = slab_index_for(ss_owner, it.ptr); - TinySlabMeta* meta = &ss_owner->slabs[slab_idx]; - *(void**)it.ptr = meta->freelist; - meta->freelist = it.ptr; - meta->used--; - // 空SuperSlab処理はフラッシュ/バックグラウンドで対応(ホットパス除外) - HAK_STAT_FREE(class_idx); - continue; // Skip TinySlab processing - } - - TinySlab* owner = -#if HAKMEM_TINY_MAG_OWNER - it.owner; -#else - NULL; -#endif - if (!owner) { - owner = tls_active_owner_for_ptr(class_idx, it.ptr); - } - if (!owner) { - owner = hak_tiny_owner_slab(it.ptr); - } - if (!owner) continue; - - // Phase 4.2: Adaptive gating - skip Phase 4 when TLS Magazine is high-water - // Rationale: When mag->top >= 75%, next alloc will come from TLS anyway - // so pushing to mini-mag is wasted work - int is_high_water = (mag->top >= high_water); - - if (!is_high_water) { - // Low-water: Phase 4.1 logic (try mini-magazine first) - uint8_t cidx = owner->class_idx; // Option A: 1回だけ読む - TinySlab* tls_a = g_tls_active_slab_a[cidx]; - TinySlab* tls_b = g_tls_active_slab_b[cidx]; - - // Option B: Branch prediction hint (spill → TLS-active への戻りが likely) - if (__builtin_expect((owner == tls_a || owner == tls_b) && - !mini_mag_is_full(&owner->mini_mag), 1)) { - // Fast path: mini-magazineに戻す(bitmap触らない) - mini_mag_push(&owner->mini_mag, it.ptr); - HAK_TP1(spill_tiny, cidx); - HAK_STAT_FREE(cidx); - continue; // bitmap操作スキップ - } - } - // High-water or Phase 4.1 mini-mag full: fall through to bitmap - - // Slow path: bitmap直接書き込み(既存ロジック) - size_t bs = g_tiny_class_sizes[owner->class_idx]; - int idx = ((uintptr_t)it.ptr - (uintptr_t)owner->base) / bs; - if (hak_tiny_is_used(owner, idx)) { - hak_tiny_set_free(owner, idx); - int was_full = (owner->free_count == 0); - owner->free_count++; - if (was_full) move_to_free_list(owner->class_idx, owner); - if (owner->free_count == owner->total_count) { - // If this slab is TLS-active for this thread, clear the pointer before releasing - if (g_tls_active_slab_a[owner->class_idx] == owner) g_tls_active_slab_a[owner->class_idx] = NULL; - if (g_tls_active_slab_b[owner->class_idx] == owner) g_tls_active_slab_b[owner->class_idx] = NULL; - TinySlab** headp = &g_tiny_pool.free_slabs[owner->class_idx]; - TinySlab* prev = NULL; - for (TinySlab* s = *headp; s; prev = s, s = s->next) { - if (s == owner) { if (prev) prev->next = s->next; else *headp = s->next; break; } - } - release_slab(owner); - } - HAK_TP1(spill_tiny, owner->class_idx); - HAK_STAT_FREE(owner->class_idx); - } - } - pthread_mutex_unlock(lock); - hkm_prof_end(ss, HKP_TINY_SPILL, &tss); - // Adaptive increase of cap after spill - int 
max_cap = tiny_cap_max_for_class(class_idx); - if (mag->cap < max_cap) { - int new_cap = mag->cap + (mag->cap / 2); - if (new_cap > max_cap) new_cap = max_cap; - if (new_cap > TINY_TLS_MAG_CAP) new_cap = TINY_TLS_MAG_CAP; - mag->cap = new_cap; - } - // Finally: prefer TinyQuickSlot → SLL → UltraFront → HotMag → Magazine(順序で局所性を確保) -#if !HAKMEM_BUILD_RELEASE && !defined(HAKMEM_TINY_NO_QUICK) - if (g_quick_enable && class_idx <= 4) { - TinyQuickSlot* qs = &g_tls_quick[class_idx]; - if (__builtin_expect(qs->top < QUICK_CAP, 1)) { - qs->items[qs->top++] = ptr; - } else if (g_tls_sll_enable) { - uint32_t sll_cap2 = sll_cap_for_class(class_idx, (uint32_t)mag->cap); - if (g_tls_sll_count[class_idx] < sll_cap2) { - *(void**)ptr = g_tls_sll_head[class_idx]; - g_tls_sll_head[class_idx] = ptr; - g_tls_sll_count[class_idx]++; - } else if (!tiny_optional_push(class_idx, ptr)) { - mag->items[mag->top].ptr = ptr; -#if HAKMEM_TINY_MAG_OWNER - mag->items[mag->top].owner = slab; -#endif - mag->top++; - } - } else { - if (!tiny_optional_push(class_idx, ptr)) { - mag->items[mag->top].ptr = ptr; -#if HAKMEM_TINY_MAG_OWNER - mag->items[mag->top].owner = slab; -#endif - mag->top++; - } - } - } else -#endif - { - if (g_tls_sll_enable && class_idx <= 5) { - uint32_t sll_cap2 = sll_cap_for_class(class_idx, (uint32_t)mag->cap); - if (g_tls_sll_count[class_idx] < sll_cap2) { - *(void**)ptr = g_tls_sll_head[class_idx]; - g_tls_sll_head[class_idx] = ptr; - g_tls_sll_count[class_idx]++; - } else if (!tiny_optional_push(class_idx, ptr)) { - mag->items[mag->top].ptr = ptr; -#if HAKMEM_TINY_MAG_OWNER - mag->items[mag->top].owner = slab; -#endif - mag->top++; - } - } else { - if (!tiny_optional_push(class_idx, ptr)) { - mag->items[mag->top].ptr = ptr; -#if HAKMEM_TINY_MAG_OWNER - mag->items[mag->top].owner = slab; -#endif - mag->top++; - } - } - } - -#if HAKMEM_DEBUG_COUNTERS - g_magazine_push_count++; // Phase 7.6: Track pushes -#endif - // Note: SuperSlab uses separate path (slab == NULL branch above) - HAK_STAT_FREE(class_idx); // Phase 3 - return; - } else { - tiny_remote_push(slab, ptr); - } -} - +#include "tiny_free_magazine.inc.h" // ============================================================================ // Phase 6.23: SuperSlab Allocation Helpers // ============================================================================ // Phase 6.24: Allocate from SuperSlab slab (lazy freelist + linear allocation) -static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx) { - TinySlabMeta* meta = &ss->slabs[slab_idx]; - - // Ensure remote queue is drained before handing blocks back to TLS - if (atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0) { - uint32_t self_tid = tiny_self_u32(); - SlabHandle h = slab_try_acquire(ss, slab_idx, self_tid); - if (slab_is_valid(&h)) { - slab_drain_remote_full(&h); - int pending = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0; - if (__builtin_expect(pending, 0)) { - if (__builtin_expect(g_debug_remote_guard, 0)) { - uintptr_t head = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_relaxed); - tiny_remote_watch_note("alloc_pending_remote", - ss, - slab_idx, - (void*)head, - 0xA243u, - self_tid, - 0); - } - slab_release(&h); - return NULL; - } - slab_release(&h); - } else { - if (__builtin_expect(g_debug_remote_guard, 0)) { - tiny_remote_watch_note("alloc_acquire_fail", - ss, - slab_idx, - meta, - 0xA244u, - self_tid, - 0); - } - return NULL; - } - } - - if (__builtin_expect(g_debug_remote_guard, 0)) { 
- uintptr_t head_pending = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire); - if (head_pending != 0) { - tiny_remote_watch_note("alloc_remote_pending", - ss, - slab_idx, - (void*)head_pending, - 0xA247u, - tiny_self_u32(), - 1); - return NULL; - } - } - - // Phase 6.24: Linear allocation mode (freelist == NULL) - // This avoids the 4000-8000 cycle cost of building freelist on init - if (meta->freelist == NULL && meta->used < meta->capacity) { - // Linear allocation: sequential memory access (cache-friendly!) - size_t block_size = g_tiny_class_sizes[ss->size_class]; - void* slab_start = slab_data_start(ss, slab_idx); - - // First slab: skip SuperSlab header - if (slab_idx == 0) { - slab_start = (char*)slab_start + 1024; - } - - void* block = (char*)slab_start + (meta->used * block_size); - meta->used++; - tiny_remote_track_on_alloc(ss, slab_idx, block, "linear_alloc", 0); - tiny_remote_assert_not_remote(ss, slab_idx, block, "linear_alloc_ret", 0); - return block; // Fast path: O(1) pointer arithmetic - } - - // Freelist mode (after first free()) - if (meta->freelist) { - void* block = meta->freelist; - meta->freelist = *(void**)block; // Pop from freelist - meta->used++; - tiny_remote_track_on_alloc(ss, slab_idx, block, "freelist_alloc", 0); - tiny_remote_assert_not_remote(ss, slab_idx, block, "freelist_alloc_ret", 0); - return block; - } - - return NULL; // Slab is full -} - -// Phase 6.24 & 7.6: Refill TLS SuperSlab (with unified TLS cache + deferred allocation) -static SuperSlab* superslab_refill(int class_idx) { -#if HAKMEM_DEBUG_COUNTERS - g_superslab_refill_calls_dbg[class_idx]++; -#endif - TinyTLSSlab* tls = &g_tls_slabs[class_idx]; - static int g_ss_adopt_en = -1; // env: HAKMEM_TINY_SS_ADOPT=1; default auto-on if remote seen - if (g_ss_adopt_en == -1) { - char* e = getenv("HAKMEM_TINY_SS_ADOPT"); - if (e) { - g_ss_adopt_en = (*e != '0') ? 1 : 0; - } else { - extern _Atomic int g_ss_remote_seen; - g_ss_adopt_en = (atomic_load_explicit(&g_ss_remote_seen, memory_order_relaxed) != 0) ? 1 : 0; - } - } - extern int g_adopt_cool_period; - extern __thread int g_tls_adopt_cd[]; - if (g_adopt_cool_period == -1) { - char* cd = getenv("HAKMEM_TINY_SS_ADOPT_COOLDOWN"); - int v = (cd ? atoi(cd) : 0); - if (v < 0) v = 0; if (v > 1024) v = 1024; - g_adopt_cool_period = v; - } - - static int g_superslab_refill_debug_once = 0; - SuperSlab* prev_ss = tls->ss; - TinySlabMeta* prev_meta = tls->meta; - uint8_t prev_slab_idx = tls->slab_idx; - uint8_t prev_active = prev_ss ? prev_ss->active_slabs : 0; - uint32_t prev_bitmap = prev_ss ? prev_ss->slab_bitmap : 0; - uint32_t prev_meta_used = prev_meta ? prev_meta->used : 0; - uint32_t prev_meta_cap = prev_meta ? 
prev_meta->capacity : 0; - int free_idx_attempted = -2; // -2 = not evaluated, -1 = none, >=0 = chosen - int reused_slabs = 0; - - // Optional: Mid-size simple refill to avoid multi-layer scans (class>=4) - do { - static int g_mid_simple_warn = 0; - if (class_idx >= 4 && tiny_mid_refill_simple_enabled()) { - // If current TLS has a SuperSlab, prefer taking a virgin slab directly - if (tls->ss) { - int tls_cap = ss_slabs_capacity(tls->ss); - if (tls->ss->active_slabs < tls_cap) { - int free_idx = superslab_find_free_slab(tls->ss); - if (free_idx >= 0) { - uint32_t my_tid = tiny_self_u32(); - superslab_init_slab(tls->ss, free_idx, g_tiny_class_sizes[class_idx], my_tid); - tiny_tls_bind_slab(tls, tls->ss, free_idx); - return tls->ss; - } - } - } - // Otherwise allocate a fresh SuperSlab and bind first slab - SuperSlab* ssn = superslab_allocate((uint8_t)class_idx); - if (!ssn) { - if (!g_superslab_refill_debug_once && g_mid_simple_warn < 2) { - g_mid_simple_warn++; - int err = errno; - fprintf(stderr, "[DEBUG] mid_simple_refill OOM class=%d errno=%d\n", class_idx, err); - } - return NULL; - } - uint32_t my_tid = tiny_self_u32(); - superslab_init_slab(ssn, 0, g_tiny_class_sizes[class_idx], my_tid); - SuperSlab* old = tls->ss; - tiny_tls_bind_slab(tls, ssn, 0); - superslab_ref_inc(ssn); - if (old && old != ssn) { superslab_ref_dec(old); } - return ssn; - } - } while (0); - - - // First, try to adopt a published partial SuperSlab for this class - if (g_ss_adopt_en) { - if (g_adopt_cool_period > 0) { - if (g_tls_adopt_cd[class_idx] > 0) { - g_tls_adopt_cd[class_idx]--; - } else { - // eligible to adopt - } - } - if (g_adopt_cool_period == 0 || g_tls_adopt_cd[class_idx] == 0) { - SuperSlab* adopt = ss_partial_adopt(class_idx); - if (adopt && adopt->magic == SUPERSLAB_MAGIC) { - // ======================================================================== - // Quick Win #2: First-Fit Adopt (vs Best-Fit scoring all 32 slabs) - // For Larson, any slab with freelist works - no need to score all 32! - // Expected improvement: -3,000 cycles (from 32 atomic loads + 32 scores) - // ======================================================================== - int adopt_cap = ss_slabs_capacity(adopt); - int best = -1; - for (int s = 0; s < adopt_cap; s++) { - TinySlabMeta* m = &adopt->slabs[s]; - // Quick check: Does this slab have a freelist? - if (m->freelist) { - // Yes! Try to acquire it immediately (first-fit) - best = s; - break; // ✅ OPTIMIZATION: Stop at first slab with freelist! - } - // Optional: Also check remote_heads if we want to prioritize those - // (But for Larson, freelist is sufficient) - } - if (best >= 0) { - // Box: Try to acquire ownership atomically - uint32_t self = tiny_self_u32(); - SlabHandle h = slab_try_acquire(adopt, best, self); - if (slab_is_valid(&h)) { - slab_drain_remote_full(&h); - if (slab_remote_pending(&h)) { - if (__builtin_expect(g_debug_remote_guard, 0)) { - uintptr_t head = atomic_load_explicit(&h.ss->remote_heads[h.slab_idx], memory_order_relaxed); - tiny_remote_watch_note("adopt_remote_pending", - h.ss, - h.slab_idx, - (void*)head, - 0xA255u, - self, - 0); - } - // Remote still pending; give up adopt path and fall through to normal refill. 
- slab_release(&h); - } - - // Box 4 Boundary: bind は remote_head==0 を保証する必要がある - // slab_is_safe_to_bind() で TOCTOU-safe にチェック - if (slab_is_safe_to_bind(&h)) { - // Optional: move a few nodes to Front SLL to boost next hits - tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx); - // 安全に bind 可能(freelist 存在 && remote_head==0 保証) - tiny_tls_bind_slab(tls, h.ss, h.slab_idx); - if (g_adopt_cool_period > 0) { - g_tls_adopt_cd[class_idx] = g_adopt_cool_period; - } - return h.ss; - } - // Safe to bind 失敗(freelist なしor remote pending)→ adopt 中止 - slab_release(&h); - } - // Failed to acquire or no freelist - continue searching - } - // If no freelist found, ignore and continue (optional: republish) - } - } - } - - // Phase 7.6 Step 4: Check existing SuperSlab with priority order - if (tls->ss) { - // Priority 1: Reuse slabs with freelist (already freed blocks) - int tls_cap = ss_slabs_capacity(tls->ss); - uint32_t nonempty_mask = 0; - do { - static int g_mask_en = -1; - if (__builtin_expect(g_mask_en == -1, 0)) { - const char* e = getenv("HAKMEM_TINY_FREELIST_MASK"); - g_mask_en = (e && *e && *e != '0') ? 1 : 0; - } - if (__builtin_expect(g_mask_en, 0)) { - nonempty_mask = atomic_load_explicit(&tls->ss->freelist_mask, memory_order_acquire); - break; - } - for (int i = 0; i < tls_cap; i++) { - if (tls->ss->slabs[i].freelist) nonempty_mask |= (1u << i); - } - } while (0); - - // O(1) lookup: scan mask with ctz (1 instruction!) - while (__builtin_expect(nonempty_mask != 0, 1)) { - int i = __builtin_ctz(nonempty_mask); // Find first non-empty slab (O(1)) - nonempty_mask &= ~(1u << i); // Clear bit for next iteration - - // FIX #1 DELETED (Race condition fix): - // Previous drain without ownership caused concurrent freelist corruption. - // Ownership protocol: MUST bind+owner_cas BEFORE drain (see Fix #3 in tiny_refill.h). - // Remote frees will be drained when the slab is adopted (see tiny_refill.h paths). - - uint32_t self_tid = tiny_self_u32(); - SlabHandle h = slab_try_acquire(tls->ss, i, self_tid); - if (slab_is_valid(&h)) { - if (slab_remote_pending(&h)) { - slab_drain_remote_full(&h); - if (__builtin_expect(g_debug_remote_guard, 0)) { - uintptr_t head = atomic_load_explicit(&h.ss->remote_heads[h.slab_idx], memory_order_relaxed); - tiny_remote_watch_note("reuse_remote_pending", - h.ss, - h.slab_idx, - (void*)head, - 0xA254u, - self_tid, - 0); - } - slab_release(&h); - continue; - } - // Box 4 Boundary: bind は remote_head==0 を保証する必要がある - if (slab_is_safe_to_bind(&h)) { - // Optional: move a few nodes to Front SLL to boost next hits - tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx); - reused_slabs = 1; - tiny_tls_bind_slab(tls, h.ss, h.slab_idx); - return h.ss; - } - // Safe to bind 失敗 → 次の slab を試す - slab_release(&h); - } - } - - // Priority 2: Use unused slabs (virgin slabs) - if (tls->ss->active_slabs < tls_cap) { - // Find next free slab - int free_idx = superslab_find_free_slab(tls->ss); - free_idx_attempted = free_idx; - if (free_idx >= 0) { - // Initialize this slab - uint32_t my_tid = tiny_self_u32(); - superslab_init_slab(tls->ss, free_idx, g_tiny_class_sizes[class_idx], my_tid); - - // Update TLS cache (unified update) - tiny_tls_bind_slab(tls, tls->ss, free_idx); - - return tls->ss; - } - } - } - - // Try to adopt a partial SuperSlab from registry (one-shot, cheap scan) - // This reduces pressure to allocate new SS when other threads freed blocks. 
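// --- Illustrative sketch (not part of the patch): the registry adoption scan that follows
// is a bounded first-fit search. Its shape, under assumed types — a per-class array of
// candidate SuperSlabs plus an opaque "try to claim" callback standing in for the real
// acquire/drain/bind protocol; every name here is a placeholder.
#include <stdbool.h>
#include <stddef.h>

typedef struct SketchSS { bool has_free_block; } SketchSS;
typedef bool (*sketch_try_claim_fn)(SketchSS* ss);

// Scan at most scan_max registry slots and return the first claimable candidate.
// Bounding the scan keeps refill cost O(scan_max) instead of O(registry size).
static SketchSS* sketch_adopt_first_fit(SketchSS** reg, int reg_size,
                                        int scan_max, sketch_try_claim_fn try_claim) {
    int limit = (scan_max < reg_size) ? scan_max : reg_size;
    for (int i = 0; i < limit; i++) {
        SketchSS* ss = reg[i];
        if (!ss || !ss->has_free_block) continue;  // cheap pre-filter
        if (try_claim(ss)) return ss;              // first fit: stop at the first success
    }
    return NULL;                                   // caller falls back to a fresh SuperSlab
}
// --- End sketch.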
- // Phase 6: Registry Optimization - Use per-class registry for O(class_size) scan - if (!tls->ss) { - // Phase 6: Use per-class registry (262K → ~10-100 entries per class!) - extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS]; - extern int g_super_reg_class_size[TINY_NUM_CLASSES]; - - const int scan_max = tiny_reg_scan_max(); - int reg_size = g_super_reg_class_size[class_idx]; - int scan_limit = (scan_max < reg_size) ? scan_max : reg_size; - - for (int i = 0; i < scan_limit; i++) { - SuperSlab* ss = g_super_reg_by_class[class_idx][i]; - if (!ss || ss->magic != SUPERSLAB_MAGIC) continue; - // Note: class_idx check is not needed (per-class registry!) - - // Pick first slab with freelist (Box 4: 所有権取得 + remote check) - int reg_cap = ss_slabs_capacity(ss); - uint32_t self_tid = tiny_self_u32(); - for (int s = 0; s < reg_cap; s++) { - if (ss->slabs[s].freelist) { - SlabHandle h = slab_try_acquire(ss, s, self_tid); - if (slab_is_valid(&h)) { - slab_drain_remote_full(&h); - if (slab_is_safe_to_bind(&h)) { - tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx); - tiny_tls_bind_slab(tls, ss, s); - return ss; - } - slab_release(&h); - } - } - } - } - } - - // Must-adopt-before-mmap gate: attempt sticky/hot/bench/mailbox/registry small-window - { - SuperSlab* gate_ss = tiny_must_adopt_gate(class_idx, tls); - if (gate_ss) return gate_ss; - } - - // Allocate new SuperSlab - SuperSlab* ss = superslab_allocate((uint8_t)class_idx); - if (!ss) { - if (!g_superslab_refill_debug_once) { - g_superslab_refill_debug_once = 1; - int err = errno; - fprintf(stderr, - "[DEBUG] superslab_refill NULL detail: class=%d prev_ss=%p active=%u bitmap=0x%08x prev_meta=%p used=%u cap=%u slab_idx=%u reused_freelist=%d free_idx=%d errno=%d\n", - class_idx, - (void*)prev_ss, - (unsigned)prev_active, - prev_bitmap, - (void*)prev_meta, - (unsigned)prev_meta_used, - (unsigned)prev_meta_cap, - (unsigned)prev_slab_idx, - reused_slabs, - free_idx_attempted, - err); - } - return NULL; // OOM - } - - // Initialize first slab - uint32_t my_tid = tiny_self_u32(); - superslab_init_slab(ss, 0, g_tiny_class_sizes[class_idx], my_tid); - - // Cache in unified TLS(前のSS参照を解放) - SuperSlab* old = tls->ss; - tiny_tls_bind_slab(tls, ss, 0); - // Maintain refcount(将来の空回収に備え、TLS参照をカウント) - superslab_ref_inc(ss); - if (old && old != ss) { - superslab_ref_dec(old); - } - - return ss; -} - -// Phase 6.24: SuperSlab-based allocation (TLS unified, Medium fix) -static inline void* hak_tiny_alloc_superslab(int class_idx) { - // DEBUG: Function entry trace - tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_ENTER, 0x01, (void*)(uintptr_t)class_idx, 0); - - // MidTC fast path: 128..1024B(class>=4)はTLS tcacheを最優先 - do { - void* mp = midtc_pop(class_idx); - if (mp) { - HAK_RET_ALLOC(class_idx, mp); - } - } while (0); - - // Phase 6.24: 1 TLS read (down from 3) - TinyTLSSlab* tls = &g_tls_slabs[class_idx]; - - TinySlabMeta* meta = tls->meta; - int slab_idx = tls->slab_idx; - if (meta && slab_idx >= 0 && tls->ss) { - // A/B: Relaxed read for remote head presence check - static int g_alloc_remote_relax = -1; // env: HAKMEM_TINY_ALLOC_REMOTE_RELAX=1 → relaxed - if (__builtin_expect(g_alloc_remote_relax == -1, 0)) { - const char* e = getenv("HAKMEM_TINY_ALLOC_REMOTE_RELAX"); - g_alloc_remote_relax = (e && *e && *e != '0') ? 1 : 0; - } - uintptr_t pending = atomic_load_explicit(&tls->ss->remote_heads[slab_idx], - g_alloc_remote_relax ? 
memory_order_relaxed - : memory_order_acquire); - if (__builtin_expect(pending != 0, 0)) { - uint32_t self_tid = tiny_self_u32(); - if (ss_owner_try_acquire(meta, self_tid)) { - _ss_remote_drain_to_freelist_unsafe(tls->ss, slab_idx, meta); - } - } - } - - // FIX #2 DELETED (Race condition fix): - // Previous drain-all-slabs without ownership caused concurrent freelist corruption. - // Problem: Thread A owns slab 5, Thread B drains all slabs including 5 → both modify freelist → crash. - // Ownership protocol: MUST bind+owner_cas BEFORE drain (see Fix #3 in tiny_refill.h). - // Remote frees will be drained when the slab is adopted via refill paths. - - // Fast path: Direct metadata access (no repeated TLS reads!) - if (meta && meta->freelist == NULL && meta->used < meta->capacity && tls->slab_base) { - // Linear allocation (lazy init) - size_t block_size = g_tiny_class_sizes[tls->ss->size_class]; - void* block = (void*)(tls->slab_base + ((size_t)meta->used * block_size)); - meta->used++; - // Track active blocks in SuperSlab for conservative reclamation - ss_active_inc(tls->ss); - HAK_RET_ALLOC(class_idx, block); // Phase 8.4: Zero hot-path overhead - } - - if (meta && meta->freelist) { - // Freelist allocation - void* block = meta->freelist; - // Safety: bounds/alignment check (debug) - if (__builtin_expect(g_tiny_safe_free, 0)) { - size_t blk = g_tiny_class_sizes[tls->ss->size_class]; - uint8_t* base = tiny_slab_base_for(tls->ss, tls->slab_idx); - uintptr_t delta = (uintptr_t)block - (uintptr_t)base; - int align_ok = ((delta % blk) == 0); - int range_ok = (delta / blk) < meta->capacity; - if (!align_ok || !range_ok) { - uintptr_t info = ((uintptr_t)(align_ok ? 1u : 0u) << 32) | (uint32_t)(range_ok ? 1u : 0u); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)tls->ss->size_class, block, info | 0xA100u); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return NULL; } - return NULL; - } - } - void* next = *(void**)block; - meta->freelist = next; - meta->used++; - // Optional: clear freelist bit when becomes empty - do { - static int g_mask_en = -1; - if (__builtin_expect(g_mask_en == -1, 0)) { - const char* e = getenv("HAKMEM_TINY_FREELIST_MASK"); - g_mask_en = (e && *e && *e != '0') ? 1 : 0; - } - if (__builtin_expect(g_mask_en, 0) && next == NULL) { - uint32_t bit = (1u << slab_idx); - atomic_fetch_and_explicit(&tls->ss->freelist_mask, ~bit, memory_order_release); - } - } while (0); - // Track active blocks in SuperSlab for conservative reclamation - ss_active_inc(tls->ss); - HAK_RET_ALLOC(class_idx, block); // Phase 8.4: Zero hot-path overhead - } - - // Slow path: Refill TLS slab - SuperSlab* ss = superslab_refill(class_idx); - if (!ss) { - static int log_oom = 0; - if (log_oom < 2) { fprintf(stderr, "[DEBUG] superslab_refill returned NULL (OOM)\n"); log_oom++; } - return NULL; // OOM - } - - // Retry allocation (metadata already cached in superslab_refill) - meta = tls->meta; - - // DEBUG: Check each condition (disabled for benchmarks) - // static int log_retry = 0; - // if (log_retry < 2) { - // fprintf(stderr, "[DEBUG] Retry alloc: meta=%p, freelist=%p, used=%u, capacity=%u, slab_base=%p\n", - // (void*)meta, meta ? meta->freelist : NULL, - // meta ? meta->used : 0, meta ? 
meta->capacity : 0, - // (void*)tls->slab_base); - // log_retry++; - // } - - if (meta && meta->freelist == NULL && meta->used < meta->capacity && tls->slab_base) { - size_t block_size = g_tiny_class_sizes[ss->size_class]; - void* block = (void*)(tls->slab_base + ((size_t)meta->used * block_size)); - - // Disabled for benchmarks - // static int log_success = 0; - // if (log_success < 2) { - // fprintf(stderr, "[DEBUG] Superslab alloc SUCCESS: ptr=%p, class=%d, used=%u->%u\n", - // block, class_idx, meta->used, meta->used + 1); - // log_success++; - // } - - meta->used++; - // Track active blocks in SuperSlab for conservative reclamation - ss_active_inc(ss); - HAK_RET_ALLOC(class_idx, block); // Phase 8.4: Zero hot-path overhead - } - - // Disabled for benchmarks - // static int log_fail = 0; - // if (log_fail < 2) { - // fprintf(stderr, "[DEBUG] Retry alloc FAILED - returning NULL\n"); - // log_fail++; - // } - return NULL; -} - -// Phase 6.22-B: SuperSlab fast free path -static inline void hak_tiny_free_superslab(void* ptr, SuperSlab* ss) { - HAK_DBG_INC(g_superslab_free_count); // Phase 7.6: Track SuperSlab frees - // Get slab index (supports 1MB/2MB SuperSlabs) - int slab_idx = slab_index_for(ss, ptr); - size_t ss_size = (size_t)1ULL << ss->lg_size; - uintptr_t ss_base = (uintptr_t)ss; - if (__builtin_expect(slab_idx < 0, 0)) { - uintptr_t aux = tiny_remote_pack_diag(0xBAD1u, ss_base, ss_size, (uintptr_t)ptr); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - return; - } - TinySlabMeta* meta = &ss->slabs[slab_idx]; - if (__builtin_expect(tiny_remote_watch_is(ptr), 0)) { - tiny_remote_watch_note("free_enter", ss, slab_idx, ptr, 0xA240u, tiny_self_u32(), 0); - extern __thread TinyTLSSlab g_tls_slabs[]; - tiny_alloc_dump_tls_state(ss->size_class, "watch_free_enter", &g_tls_slabs[ss->size_class]); -#if !HAKMEM_BUILD_RELEASE - extern __thread TinyTLSMag g_tls_mags[]; - TinyTLSMag* watch_mag = &g_tls_mags[ss->size_class]; - fprintf(stderr, - "[REMOTE_WATCH_MAG] cls=%u mag_top=%d cap=%d\n", - ss->size_class, - watch_mag->top, - watch_mag->cap); -#endif - } - if (__builtin_expect(g_tiny_safe_free, 0)) { - size_t blk = g_tiny_class_sizes[ss->size_class]; - uint8_t* base = tiny_slab_base_for(ss, slab_idx); - uintptr_t delta = (uintptr_t)ptr - (uintptr_t)base; - int cap_ok = (meta->capacity > 0) ? 
1 : 0; - int align_ok = (delta % blk) == 0; - int range_ok = cap_ok && (delta / blk) < meta->capacity; - if (!align_ok || !range_ok) { - uint32_t code = 0xA100u; - if (align_ok) code |= 0x2u; - if (range_ok) code |= 0x1u; - uintptr_t aux = tiny_remote_pack_diag(code, ss_base, ss_size, (uintptr_t)ptr); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - return; - } - // Duplicate in freelist (best-effort scan up to 64) - void* scan = meta->freelist; int scanned = 0; int dup = 0; - while (scan && scanned < 64) { if (scan == ptr) { dup = 1; break; } scan = *(void**)scan; scanned++; } - if (dup) { - uintptr_t aux = tiny_remote_pack_diag(0xDFu, ss_base, ss_size, (uintptr_t)ptr); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - return; - } - } - - // Phase 6.23: Same-thread check - uint32_t my_tid = tiny_self_u32(); - const int debug_guard = g_debug_remote_guard; - static __thread int g_debug_free_count = 0; - if (!g_tiny_force_remote && meta->owner_tid != 0 && meta->owner_tid == my_tid) { - // Fast path: Direct freelist push (same-thread) - if (g_debug_free_count < 1) { - fprintf(stderr, "[FREE_SS] SAME-THREAD: owner=%u my=%u\n", - meta->owner_tid, my_tid); - g_debug_free_count++; - } - if (__builtin_expect(meta->used == 0, 0)) { - uintptr_t aux = tiny_remote_pack_diag(0x00u, ss_base, ss_size, (uintptr_t)ptr); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - return; - } - tiny_remote_track_expect_alloc(ss, slab_idx, ptr, "local_free_enter", my_tid); - if (!tiny_remote_guard_allow_local_push(ss, slab_idx, meta, ptr, "local_free", my_tid)) { - int transitioned = ss_remote_push(ss, slab_idx, ptr); - meta->used--; - ss_active_dec_one(ss); - if (transitioned) { - ss_partial_publish((int)ss->size_class, ss); - } - return; - } - // Optional: MidTC (TLS tcache for 128..1024B) - do { - int cls = (int)ss->size_class; - if (midtc_enabled() && cls >= 4) { - if (midtc_push(cls, ptr)) { - // Treat as returned to TLS cache (not SS freelist) - meta->used--; - ss_active_dec_one(ss); - return; - } - } - } while (0); - - void* prev = meta->freelist; - *(void**)ptr = prev; - meta->freelist = ptr; - do { - static int g_mask_en = -1; - if (__builtin_expect(g_mask_en == -1, 0)) { - const char* e = getenv("HAKMEM_TINY_FREELIST_MASK"); - g_mask_en = (e && *e && *e != '0') ? 
1 : 0; - } - if (__builtin_expect(g_mask_en, 0) && prev == NULL) { - uint32_t bit = (1u << slab_idx); - atomic_fetch_or_explicit(&ss->freelist_mask, bit, memory_order_release); - } - } while (0); - tiny_remote_track_on_local_free(ss, slab_idx, ptr, "local_free", my_tid); - meta->used--; - // Decrement SuperSlab active counter (actual return to SS) - ss_active_dec_one(ss); - if (prev == NULL) { - ss_partial_publish((int)ss->size_class, ss); - } - - if (__builtin_expect(debug_guard, 0)) { - fprintf(stderr, "[REMOTE_LOCAL] cls=%u slab=%d owner=%u my=%u ptr=%p prev=%p used=%u\n", - ss->size_class, slab_idx, meta->owner_tid, my_tid, ptr, prev, meta->used); - } - - // 空検出は別途(ホットパス除外) - } else { - if (__builtin_expect(meta->owner_tid == my_tid && meta->owner_tid == 0, 0)) { - uintptr_t aux = tiny_remote_pack_diag(0xA300u, ss_base, ss_size, (uintptr_t)ptr); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); - if (debug_guard) { - fprintf(stderr, "[REMOTE_OWNER_ZERO] cls=%u slab=%d ptr=%p my=%u used=%u\n", - ss->size_class, slab_idx, ptr, my_tid, (unsigned)meta->used); - } - } - tiny_remote_track_expect_alloc(ss, slab_idx, ptr, "remote_free_enter", my_tid); - // Slow path: Remote free (cross-thread) - if (g_debug_free_count < 5) { - fprintf(stderr, "[FREE_SS] CROSS-THREAD: owner=%u my=%u slab_idx=%d\n", - meta->owner_tid, my_tid, slab_idx); - g_debug_free_count++; - } - if (__builtin_expect(g_tiny_safe_free, 0)) { - // Best-effort duplicate scan in remote stack (up to 64 nodes) - uintptr_t head = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire); - uintptr_t base = ss_base; - int scanned = 0; int dup = 0; - uintptr_t cur = head; - while (cur && scanned < 64) { - if ((cur < base) || (cur >= base + ss_size)) { - uintptr_t aux = tiny_remote_pack_diag(0xA200u, base, ss_size, cur); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)cur, aux); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - break; - } - if ((void*)cur == ptr) { dup = 1; break; } - if (__builtin_expect(g_remote_side_enable, 0)) { - if (!tiny_remote_sentinel_ok((void*)cur)) { - uintptr_t aux = tiny_remote_pack_diag(0xA202u, base, ss_size, cur); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)cur, aux); - uintptr_t observed = atomic_load_explicit((_Atomic uintptr_t*)(void*)cur, memory_order_relaxed); - tiny_remote_report_corruption("scan", (void*)cur, observed); - fprintf(stderr, - "[REMOTE_SENTINEL] cls=%u slab=%d cur=%p head=%p ptr=%p scanned=%d observed=0x%016" PRIxPTR " owner=%u used=%u freelist=%p remote_head=%p\n", - ss->size_class, - slab_idx, - (void*)cur, - (void*)head, - ptr, - scanned, - observed, - meta->owner_tid, - (unsigned)meta->used, - meta->freelist, - (void*)atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_relaxed)); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - break; - } - cur = tiny_remote_side_get(ss, slab_idx, (void*)cur); - } else { - if ((cur & (uintptr_t)(sizeof(void*) - 1)) != 0) { - uintptr_t aux = tiny_remote_pack_diag(0xA201u, base, ss_size, cur); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)cur, aux); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - break; - } - cur = (uintptr_t)(*(void**)(void*)cur); - } - scanned++; - } - if (dup) { - uintptr_t aux = tiny_remote_pack_diag(0xD1u, ss_base, ss_size, (uintptr_t)ptr); - 
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - return; - } - } - if (__builtin_expect(meta->used == 0, 0)) { - uintptr_t aux = tiny_remote_pack_diag(0x01u, ss_base, ss_size, (uintptr_t)ptr); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - return; - } - static int g_ss_adopt_en2 = -1; // env cached - if (g_ss_adopt_en2 == -1) { - char* e = getenv("HAKMEM_TINY_SS_ADOPT"); - // 既定: Remote Queueを使う(1)。env指定時のみ上書き。 - g_ss_adopt_en2 = (e == NULL) ? 1 : ((*e != '0') ? 1 : 0); - if (__builtin_expect(debug_guard, 0)) { - fprintf(stderr, "[FREE_SS] g_ss_adopt_en2=%d (env='%s')\n", g_ss_adopt_en2, e ? e : "(null)"); - } - } - if (g_ss_adopt_en2) { - // Use remote queue - uintptr_t head_word = __atomic_load_n((uintptr_t*)ptr, __ATOMIC_RELAXED); - fprintf(stderr, "[REMOTE_PUSH_CALL] cls=%u slab=%d owner=%u my=%u ptr=%p used=%u remote_count=%u head=%p word=0x%016" PRIxPTR "\n", - ss->size_class, - slab_idx, - meta->owner_tid, - my_tid, - ptr, - (unsigned)meta->used, - atomic_load_explicit(&ss->remote_counts[slab_idx], memory_order_relaxed), - (void*)atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_relaxed), - head_word); - int dup_remote = tiny_remote_queue_contains_guard(ss, slab_idx, ptr); - if (!dup_remote && __builtin_expect(g_remote_side_enable, 0)) { - dup_remote = (head_word == TINY_REMOTE_SENTINEL) || tiny_remote_side_contains(ss, slab_idx, ptr); - } - if (__builtin_expect(head_word == TINY_REMOTE_SENTINEL && !dup_remote && g_debug_remote_guard, 0)) { - tiny_remote_watch_note("dup_scan_miss", ss, slab_idx, ptr, 0xA215u, my_tid, 0); - } - if (dup_remote) { - uintptr_t aux = tiny_remote_pack_diag(0xA214u, ss_base, ss_size, (uintptr_t)ptr); - tiny_remote_watch_mark(ptr, "dup_prevent", my_tid); - tiny_remote_watch_note("dup_prevent", ss, slab_idx, ptr, 0xA214u, my_tid, 0); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - return; - } - if (__builtin_expect(g_remote_side_enable && (head_word & 0xFFFFu) == 0x6261u, 0)) { - // TLS guard scribble detected on the node's first word → same-pointer double free across routes - uintptr_t aux = tiny_remote_pack_diag(0xA213u, ss_base, ss_size, (uintptr_t)ptr); - tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); - tiny_remote_watch_mark(ptr, "pre_push", my_tid); - tiny_remote_watch_note("pre_push", ss, slab_idx, ptr, 0xA231u, my_tid, 0); - tiny_remote_report_corruption("pre_push", ptr, head_word); - if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } - return; - } - if (__builtin_expect(tiny_remote_watch_is(ptr), 0)) { - tiny_remote_watch_note("free_remote", ss, slab_idx, ptr, 0xA232u, my_tid, 0); - } - int was_empty = ss_remote_push(ss, slab_idx, ptr); - meta->used--; - ss_active_dec_one(ss); - if (was_empty) { - ss_partial_publish((int)ss->size_class, ss); - } - } else { - // Fallback: direct freelist push (legacy) - fprintf(stderr, "[FREE_SS] Using LEGACY freelist push (not remote queue)\n"); - void* prev = meta->freelist; - *(void**)ptr = prev; - meta->freelist = ptr; - do { - static int g_mask_en = -1; - if (__builtin_expect(g_mask_en == -1, 0)) { - const char* e = getenv("HAKMEM_TINY_FREELIST_MASK"); - g_mask_en = (e && *e && *e != '0') ? 
1 : 0; - } - if (__builtin_expect(g_mask_en, 0) && prev == NULL) { - uint32_t bit = (1u << slab_idx); - atomic_fetch_or_explicit(&ss->freelist_mask, bit, memory_order_release); - } - } while (0); - meta->used--; - ss_active_dec_one(ss); - if (prev == NULL) { - ss_partial_publish((int)ss->size_class, ss); - } - } - - // 空検出は別途(ホットパス除外) - } -} +#include "tiny_superslab_alloc.inc.h" +#include "tiny_superslab_free.inc.h" void hak_tiny_free(void* ptr) { if (!ptr || !g_tiny_initialized) return; @@ -1474,6 +276,13 @@ void hak_tiny_free(void* ptr) { fast_ss = hak_super_lookup(ptr); if (fast_ss && fast_ss->magic == SUPERSLAB_MAGIC) { fast_class_idx = fast_ss->size_class; + // BUGFIX: Validate size_class before using as array index (prevents OOB = 85% of FREE_TO_SS SEGV) + if (__builtin_expect(fast_class_idx < 0 || fast_class_idx >= TINY_NUM_CLASSES, 0)) { + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, 0xF0, ptr, (uintptr_t)fast_class_idx); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + fast_ss = NULL; + fast_class_idx = -1; + } } else { fast_ss = NULL; } @@ -1515,6 +324,12 @@ void hak_tiny_free(void* ptr) { } } if (ss && ss->magic == SUPERSLAB_MAGIC) { + // BUGFIX: Validate size_class before using as array index (prevents OOB) + if (__builtin_expect(ss->size_class < 0 || ss->size_class >= TINY_NUM_CLASSES, 0)) { + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, 0xF2, ptr, (uintptr_t)ss->size_class); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + return; + } // Direct SuperSlab free (avoid second lookup TOCTOU) hak_tiny_free_superslab(ptr, ss); HAK_STAT_FREE(ss->size_class); diff --git a/core/tiny_api.h b/core/tiny_api.h new file mode 100644 index 00000000..353e042e --- /dev/null +++ b/core/tiny_api.h @@ -0,0 +1,12 @@ +// tiny_api.h - API headers for Tiny allocator +// Consolidates Phase 2B API modules + +#ifndef TINY_API_H +#define TINY_API_H + +#include "hakmem_tiny_stats_api.h" // Phase 2B: Stats API +#include "hakmem_tiny_query_api.h" // Phase 2B-1: Query API +#include "hakmem_tiny_rss_api.h" // Phase 2B-2: RSS Utils +#include "hakmem_tiny_registry_api.h" // Phase 2B-3: Registry + +#endif // TINY_API_H diff --git a/core/tiny_free_magazine.inc.h b/core/tiny_free_magazine.inc.h new file mode 100644 index 00000000..85358c36 --- /dev/null +++ b/core/tiny_free_magazine.inc.h @@ -0,0 +1,420 @@ +// tiny_free_magazine.inc.h - Magazine Layer for hak_tiny_free_with_slab() +// Purpose: TLS caching (TinyQuickSlot, TLS SLL, Magazine) and spill logic +// Extracted from: hakmem_tiny_free.inc lines 208-620 +// Box Theory: Box 5 (TLS Cache) integration +// +// Context: This file is #included within hak_tiny_free_with_slab() function body +// Prerequisites: ss, meta, class_idx, ptr variables must be defined in calling scope + +#if !HAKMEM_BUILD_RELEASE + // SuperSlab uses Magazine for TLS caching (same as TinySlab) + tiny_small_mags_init_once(); + if (class_idx > 3) tiny_mag_init_if_needed(class_idx); + TinyTLSMag* mag = &g_tls_mags[class_idx]; + int cap = mag->cap; + + // 32/64B: SLL優先(mag優先は無効化) + // Prefer TinyQuickSlot (compile-out if HAKMEM_TINY_NO_QUICK) +#if !defined(HAKMEM_TINY_NO_QUICK) + if (g_quick_enable && class_idx <= 4) { + TinyQuickSlot* qs = &g_tls_quick[class_idx]; + if (__builtin_expect(qs->top < QUICK_CAP, 1)) { + qs->items[qs->top++] = ptr; + HAK_STAT_FREE(class_idx); + return; + } + } +#endif + + // Fast path: TLS SLL push for hottest classes + if (!g_tls_list_enable && g_tls_sll_enable && g_tls_sll_count[class_idx] < sll_cap_for_class(class_idx, 
(uint32_t)cap)) { + *(void**)ptr = g_tls_sll_head[class_idx]; + g_tls_sll_head[class_idx] = ptr; + g_tls_sll_count[class_idx]++; + // BUGFIX: Decrement used counter (was missing, causing Fail-Fast on next free) + meta->used--; + // Active → Inactive: count down immediately (TLS保管中は"使用中"ではない) + ss_active_dec_one(ss); + HAK_TP1(sll_push, class_idx); + tiny_debug_ring_record(TINY_RING_EVENT_FREE_LOCAL, (uint16_t)class_idx, ptr, 3); + HAK_STAT_FREE(class_idx); + return; + } + + // Next: Magazine push(必要ならmag→SLLへバルク転送で空きを作る) + // Hysteresis: allow slight overfill before deciding to spill under lock + if (mag->top >= cap && g_spill_hyst > 0) { + (void)bulk_mag_to_sll_if_room(class_idx, mag, cap / 2); + } + if (mag->top < cap + g_spill_hyst) { + mag->items[mag->top].ptr = ptr; +#if HAKMEM_TINY_MAG_OWNER + mag->items[mag->top].owner = NULL; // SuperSlab owner not a TinySlab; leave NULL +#endif + mag->top++; +#if HAKMEM_DEBUG_COUNTERS + g_magazine_push_count++; // Phase 7.6: Track pushes +#endif + // Active → Inactive: decrement now(アプリ解放時に非アクティブ扱い) + ss_active_dec_one(ss); + HAK_TP1(mag_push, class_idx); + tiny_debug_ring_record(TINY_RING_EVENT_FREE_RETURN_MAG, (uint16_t)class_idx, ptr, 2); + HAK_STAT_FREE(class_idx); + return; + } + + // Background spill: queue to BG thread instead of locking (when enabled) + if (g_bg_spill_enable) { + uint32_t qlen = atomic_load_explicit(&g_bg_spill_len[class_idx], memory_order_relaxed); + if ((int)qlen < g_bg_spill_target) { + // Build a small chain: include current ptr and pop from mag up to limit + int limit = g_bg_spill_max_batch; + if (limit > cap/2) limit = cap/2; + if (limit > 32) limit = 32; // keep free-path bounded + void* head = ptr; + *(void**)head = NULL; + void* tail = head; // current tail + int taken = 1; + while (taken < limit && mag->top > 0) { + void* p2 = mag->items[--mag->top].ptr; + *(void**)p2 = head; + head = p2; + taken++; + } + // Push chain to spill queue (single CAS) + bg_spill_push_chain(class_idx, head, tail, taken); + tiny_debug_ring_record(TINY_RING_EVENT_FREE_RETURN_MAG, (uint16_t)class_idx, ptr, 3); + HAK_STAT_FREE(class_idx); + return; + } + } + + // Spill half (SuperSlab version - simpler than TinySlab) + pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; + hkm_prof_begin(NULL); + pthread_mutex_lock(lock); + // Batch spill: reduce lock frequency and work per call + int spill = cap / 2; + int over = mag->top - (cap + g_spill_hyst); + if (over > 0 && over < spill) spill = over; + + for (int i = 0; i < spill && mag->top > 0; i++) { + TinyMagItem it = mag->items[--mag->top]; + + // Phase 7.6: SuperSlab spill - return to freelist + SuperSlab* owner_ss = hak_super_lookup(it.ptr); + if (owner_ss && owner_ss->magic == SUPERSLAB_MAGIC) { + // Direct freelist push (same as old hak_tiny_free_superslab) + int slab_idx = slab_index_for(owner_ss, it.ptr); + // BUGFIX: Validate slab_idx before array access (prevents OOB) + if (slab_idx < 0 || slab_idx >= ss_slabs_capacity(owner_ss)) { + continue; // Skip invalid index + } + TinySlabMeta* meta = &owner_ss->slabs[slab_idx]; + *(void**)it.ptr = meta->freelist; + meta->freelist = it.ptr; + meta->used--; + // Decrement SuperSlab active counter (spill returns blocks to SS) + ss_active_dec_one(owner_ss); + + // Phase 8.4: Empty SuperSlab detection (will use meta->used scan) + // TODO: Implement scan-based empty detection + // Empty SuperSlab detection/munmapは別途フラッシュAPIで実施(ホットパスから除外) + } + } + + pthread_mutex_unlock(lock); + hkm_prof_end(ss_time, HKP_TINY_SPILL, &tss); + + // Adaptive 
increase of cap after spill + int max_cap = tiny_cap_max_for_class(class_idx); + if (mag->cap < max_cap) { + int new_cap = mag->cap + (mag->cap / 2); + if (new_cap > max_cap) new_cap = max_cap; + if (new_cap > TINY_TLS_MAG_CAP) new_cap = TINY_TLS_MAG_CAP; + mag->cap = new_cap; + } + + // Finally, try FastCache push first (≤128B) — compile-out if HAKMEM_TINY_NO_FRONT_CACHE +#if !defined(HAKMEM_TINY_NO_FRONT_CACHE) + if (g_fastcache_enable && class_idx <= 4) { + if (fastcache_push(class_idx, ptr)) { + HAK_TP1(front_push, class_idx); + HAK_STAT_FREE(class_idx); + return; + } + } +#endif + // Then TLS SLL if room, else magazine + if (g_tls_sll_enable && g_tls_sll_count[class_idx] < sll_cap_for_class(class_idx, (uint32_t)mag->cap)) { + *(void**)ptr = g_tls_sll_head[class_idx]; + g_tls_sll_head[class_idx] = ptr; + g_tls_sll_count[class_idx]++; + } else { + mag->items[mag->top].ptr = ptr; +#if HAKMEM_TINY_MAG_OWNER + mag->items[mag->top].owner = slab; +#endif + mag->top++; + } + +#if HAKMEM_DEBUG_COUNTERS + g_magazine_push_count++; // Phase 7.6: Track pushes +#endif + HAK_STAT_FREE(class_idx); + return; +#endif // HAKMEM_BUILD_RELEASE + } + + // Phase 7.6: TinySlab path (original) + //g_tiny_free_with_slab_count++; // Phase 7.6: Track calls - DISABLED due to segfault + // Same-thread → TLS magazine; remote-thread → MPSC stack + if (pthread_equal(slab->owner_tid, tiny_self_pt())) { + int class_idx = slab->class_idx; + + if (g_tls_list_enable) { + TinyTLSList* tls = &g_tls_lists[class_idx]; + uint32_t seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_relaxed); + if (__builtin_expect(seq != g_tls_param_seen[class_idx], 0)) { + tiny_tls_refresh_params(class_idx, tls); + } + // TinyHotMag front push(8/16/32B, A/B) + if (__builtin_expect(g_hotmag_enable && class_idx <= 2, 1)) { + if (hotmag_push(class_idx, ptr)) { + HAK_STAT_FREE(class_idx); + return; + } + } + if (tls->count < tls->cap) { + tiny_tls_list_guard_push(class_idx, tls, ptr); + tls_list_push(tls, ptr); + HAK_STAT_FREE(class_idx); + return; + } + seq = atomic_load_explicit(&g_tls_param_seq[class_idx], memory_order_relaxed); + if (__builtin_expect(seq != g_tls_param_seen[class_idx], 0)) { + tiny_tls_refresh_params(class_idx, tls); + } + tiny_tls_list_guard_push(class_idx, tls, ptr); + tls_list_push(tls, ptr); + if (tls_list_should_spill(tls)) { + tls_list_spill_excess(class_idx, tls); + } + HAK_STAT_FREE(class_idx); + return; + } + + tiny_mag_init_if_needed(class_idx); + TinyTLSMag* mag = &g_tls_mags[class_idx]; + int cap = mag->cap; + // 32/64B: SLL優先(mag優先は無効化) + // Fast path: FastCache push (preferred for ≤128B), then TLS SLL + if (g_fastcache_enable && class_idx <= 4) { + if (fastcache_push(class_idx, ptr)) { + HAK_STAT_FREE(class_idx); + return; + } + } + // Fast path: TLS SLL push (preferred) + if (!g_tls_list_enable && g_tls_sll_enable && class_idx <= 5) { + uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)cap); + if (g_tls_sll_count[class_idx] < sll_cap) { + *(void**)ptr = g_tls_sll_head[class_idx]; + g_tls_sll_head[class_idx] = ptr; + g_tls_sll_count[class_idx]++; + HAK_STAT_FREE(class_idx); + return; + } + } + // Next: if magazine has room, push immediately and return(満杯ならmag→SLLへバルク) + if (mag->top >= cap) { + (void)bulk_mag_to_sll_if_room(class_idx, mag, cap / 2); + } + // Remote-drain can be handled opportunistically on future calls. 
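// --- Illustrative sketch (not part of the patch): bulk_mag_to_sll_if_room() above makes
// room in the array-based TLS magazine by moving a batch of pointers into the TLS
// singly-linked list, without taking the class lock. A condensed version under assumed
// types; capacities and names are placeholders.
#include <stddef.h>
#include <stdint.h>

typedef struct { void* items[256]; int top; } SketchMag;
typedef struct { void* head; uint32_t count; uint32_t cap; } SketchSLL;

// Move at most 'want' entries from the magazine to the SLL; returns the number moved.
static int sketch_bulk_mag_to_sll(SketchMag* mag, SketchSLL* sll, int want) {
    int moved = 0;
    while (moved < want && mag->top > 0 && sll->count < sll->cap) {
        void* p = mag->items[--mag->top];
        *(void**)p = sll->head;   // the block's first word is the SLL link
        sll->head = p;
        sll->count++;
        moved++;
    }
    return moved;
}
// --- End sketch.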
+ if (mag->top < cap) { + mag->items[mag->top].ptr = ptr; +#if HAKMEM_TINY_MAG_OWNER + mag->items[mag->top].owner = slab; +#endif + mag->top++; + +#if HAKMEM_DEBUG_COUNTERS + g_magazine_push_count++; // Phase 7.6: Track pushes +#endif + // Note: SuperSlab uses separate path (slab == NULL branch above) + HAK_STAT_FREE(class_idx); // Phase 3 + return; + } + // Magazine full: before spilling, opportunistically drain remotes once under lock. + if (atomic_load_explicit(&slab->remote_count, memory_order_relaxed) >= (unsigned)g_remote_drain_thresh_per_class[class_idx] || atomic_load_explicit(&slab->remote_head, memory_order_acquire)) { + pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; + pthread_mutex_lock(lock); + HAK_TP1(remote_drain, class_idx); + tiny_remote_drain_locked(slab); + pthread_mutex_unlock(lock); + } + // Spill half under class lock + pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; + pthread_mutex_lock(lock); + int spill = cap / 2; + + // Phase 4.2: High-water threshold for gating Phase 4 logic + int high_water = (cap * 3) / 4; // 75% of capacity + + for (int i = 0; i < spill && mag->top > 0; i++) { + TinyMagItem it = mag->items[--mag->top]; + + // Phase 7.6: Check for SuperSlab first (mixed Magazine support) + SuperSlab* ss_owner = hak_super_lookup(it.ptr); + if (ss_owner && ss_owner->magic == SUPERSLAB_MAGIC) { + // SuperSlab spill - return to freelist + int slab_idx = slab_index_for(ss_owner, it.ptr); + // BUGFIX: Validate slab_idx before array access (prevents OOB) + if (slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss_owner)) { + HAK_STAT_FREE(class_idx); + continue; // Skip invalid index + } + TinySlabMeta* meta = &ss_owner->slabs[slab_idx]; + *(void**)it.ptr = meta->freelist; + meta->freelist = it.ptr; + meta->used--; + // 空SuperSlab処理はフラッシュ/バックグラウンドで対応(ホットパス除外) + HAK_STAT_FREE(class_idx); + continue; // Skip TinySlab processing + } + + TinySlab* owner = +#if HAKMEM_TINY_MAG_OWNER + it.owner; +#else + NULL; +#endif + if (!owner) { + owner = tls_active_owner_for_ptr(class_idx, it.ptr); + } + if (!owner) { + owner = hak_tiny_owner_slab(it.ptr); + } + if (!owner) continue; + + // Phase 4.2: Adaptive gating - skip Phase 4 when TLS Magazine is high-water + // Rationale: When mag->top >= 75%, next alloc will come from TLS anyway + // so pushing to mini-mag is wasted work + int is_high_water = (mag->top >= high_water); + + if (!is_high_water) { + // Low-water: Phase 4.1 logic (try mini-magazine first) + uint8_t cidx = owner->class_idx; // Option A: 1回だけ読む + TinySlab* tls_a = g_tls_active_slab_a[cidx]; + TinySlab* tls_b = g_tls_active_slab_b[cidx]; + + // Option B: Branch prediction hint (spill → TLS-active への戻りが likely) + if (__builtin_expect((owner == tls_a || owner == tls_b) && + !mini_mag_is_full(&owner->mini_mag), 1)) { + // Fast path: mini-magazineに戻す(bitmap触らない) + mini_mag_push(&owner->mini_mag, it.ptr); + HAK_TP1(spill_tiny, cidx); + HAK_STAT_FREE(cidx); + continue; // bitmap操作スキップ + } + } + // High-water or Phase 4.1 mini-mag full: fall through to bitmap + + // Slow path: bitmap直接書き込み(既存ロジック) + size_t bs = g_tiny_class_sizes[owner->class_idx]; + int idx = ((uintptr_t)it.ptr - (uintptr_t)owner->base) / bs; + if (hak_tiny_is_used(owner, idx)) { + hak_tiny_set_free(owner, idx); + int was_full = (owner->free_count == 0); + owner->free_count++; + if (was_full) move_to_free_list(owner->class_idx, owner); + if (owner->free_count == owner->total_count) { + // If this slab is TLS-active for this thread, clear the pointer before releasing + if 
(g_tls_active_slab_a[owner->class_idx] == owner) g_tls_active_slab_a[owner->class_idx] = NULL; + if (g_tls_active_slab_b[owner->class_idx] == owner) g_tls_active_slab_b[owner->class_idx] = NULL; + TinySlab** headp = &g_tiny_pool.free_slabs[owner->class_idx]; + TinySlab* prev = NULL; + for (TinySlab* s = *headp; s; prev = s, s = s->next) { + if (s == owner) { if (prev) prev->next = s->next; else *headp = s->next; break; } + } + release_slab(owner); + } + HAK_TP1(spill_tiny, owner->class_idx); + HAK_STAT_FREE(owner->class_idx); + } + } + pthread_mutex_unlock(lock); + hkm_prof_end(ss, HKP_TINY_SPILL, &tss); + // Adaptive increase of cap after spill + int max_cap = tiny_cap_max_for_class(class_idx); + if (mag->cap < max_cap) { + int new_cap = mag->cap + (mag->cap / 2); + if (new_cap > max_cap) new_cap = max_cap; + if (new_cap > TINY_TLS_MAG_CAP) new_cap = TINY_TLS_MAG_CAP; + mag->cap = new_cap; + } + // Finally: prefer TinyQuickSlot → SLL → UltraFront → HotMag → Magazine(順序で局所性を確保) +#if !HAKMEM_BUILD_RELEASE && !defined(HAKMEM_TINY_NO_QUICK) + if (g_quick_enable && class_idx <= 4) { + TinyQuickSlot* qs = &g_tls_quick[class_idx]; + if (__builtin_expect(qs->top < QUICK_CAP, 1)) { + qs->items[qs->top++] = ptr; + } else if (g_tls_sll_enable) { + uint32_t sll_cap2 = sll_cap_for_class(class_idx, (uint32_t)mag->cap); + if (g_tls_sll_count[class_idx] < sll_cap2) { + *(void**)ptr = g_tls_sll_head[class_idx]; + g_tls_sll_head[class_idx] = ptr; + g_tls_sll_count[class_idx]++; + } else if (!tiny_optional_push(class_idx, ptr)) { + mag->items[mag->top].ptr = ptr; +#if HAKMEM_TINY_MAG_OWNER + mag->items[mag->top].owner = slab; +#endif + mag->top++; + } + } else { + if (!tiny_optional_push(class_idx, ptr)) { + mag->items[mag->top].ptr = ptr; +#if HAKMEM_TINY_MAG_OWNER + mag->items[mag->top].owner = slab; +#endif + mag->top++; + } + } + } else +#endif + { + if (g_tls_sll_enable && class_idx <= 5) { + uint32_t sll_cap2 = sll_cap_for_class(class_idx, (uint32_t)mag->cap); + if (g_tls_sll_count[class_idx] < sll_cap2) { + *(void**)ptr = g_tls_sll_head[class_idx]; + g_tls_sll_head[class_idx] = ptr; + g_tls_sll_count[class_idx]++; + } else if (!tiny_optional_push(class_idx, ptr)) { + mag->items[mag->top].ptr = ptr; +#if HAKMEM_TINY_MAG_OWNER + mag->items[mag->top].owner = slab; +#endif + mag->top++; + } + } else { + if (!tiny_optional_push(class_idx, ptr)) { + mag->items[mag->top].ptr = ptr; +#if HAKMEM_TINY_MAG_OWNER + mag->items[mag->top].owner = slab; +#endif + mag->top++; + } + } + } + +#if HAKMEM_DEBUG_COUNTERS + g_magazine_push_count++; // Phase 7.6: Track pushes +#endif + // Note: SuperSlab uses separate path (slab == NULL branch above) + HAK_STAT_FREE(class_idx); // Phase 3 + return; + } else { + tiny_remote_push(slab, ptr); + } +} diff --git a/core/tiny_superslab_alloc.inc.h b/core/tiny_superslab_alloc.inc.h new file mode 100644 index 00000000..65a953c6 --- /dev/null +++ b/core/tiny_superslab_alloc.inc.h @@ -0,0 +1,558 @@ +// tiny_superslab_alloc.inc.h - SuperSlab Allocation Layer +// Purpose: Slab allocation, refill, and adoption logic +// Extracted from: hakmem_tiny_free.inc lines 626-1170 +// Box Theory: Box 4 (Refill/Adoption) integration +// +// Public functions: +// - superslab_alloc_from_slab(): Allocate from specific slab (linear or freelist) +// - superslab_refill(): Refill TLS slab (adoption, registry scan, fresh alloc) +// - hak_tiny_alloc_superslab(): Main SuperSlab allocation entry point + +// ============================================================================ +// Phase 6.23: SuperSlab 
Allocation Helpers +// ============================================================================ + +// Phase 6.24: Allocate from SuperSlab slab (lazy freelist + linear allocation) +static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx) { + TinySlabMeta* meta = &ss->slabs[slab_idx]; + + // Ensure remote queue is drained before handing blocks back to TLS + if (atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0) { + uint32_t self_tid = tiny_self_u32(); + SlabHandle h = slab_try_acquire(ss, slab_idx, self_tid); + if (slab_is_valid(&h)) { + slab_drain_remote_full(&h); + int pending = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0; + if (__builtin_expect(pending, 0)) { + if (__builtin_expect(g_debug_remote_guard, 0)) { + uintptr_t head = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_relaxed); + tiny_remote_watch_note("alloc_pending_remote", + ss, + slab_idx, + (void*)head, + 0xA243u, + self_tid, + 0); + } + slab_release(&h); + return NULL; + } + slab_release(&h); + } else { + if (__builtin_expect(g_debug_remote_guard, 0)) { + tiny_remote_watch_note("alloc_acquire_fail", + ss, + slab_idx, + meta, + 0xA244u, + self_tid, + 0); + } + return NULL; + } + } + + if (__builtin_expect(g_debug_remote_guard, 0)) { + uintptr_t head_pending = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire); + if (head_pending != 0) { + tiny_remote_watch_note("alloc_remote_pending", + ss, + slab_idx, + (void*)head_pending, + 0xA247u, + tiny_self_u32(), + 1); + return NULL; + } + } + + // Phase 6.24: Linear allocation mode (freelist == NULL) + // This avoids the 4000-8000 cycle cost of building freelist on init + if (meta->freelist == NULL && meta->used < meta->capacity) { + // Linear allocation: sequential memory access (cache-friendly!) + size_t block_size = g_tiny_class_sizes[ss->size_class]; + void* slab_start = slab_data_start(ss, slab_idx); + + // First slab: skip SuperSlab header + if (slab_idx == 0) { + slab_start = (char*)slab_start + 1024; + } + + void* block = (char*)slab_start + (meta->used * block_size); + meta->used++; + tiny_remote_track_on_alloc(ss, slab_idx, block, "linear_alloc", 0); + tiny_remote_assert_not_remote(ss, slab_idx, block, "linear_alloc_ret", 0); + return block; // Fast path: O(1) pointer arithmetic + } + + // Freelist mode (after first free()) + if (meta->freelist) { + void* block = meta->freelist; + meta->freelist = *(void**)block; // Pop from freelist + meta->used++; + tiny_remote_track_on_alloc(ss, slab_idx, block, "freelist_alloc", 0); + tiny_remote_assert_not_remote(ss, slab_idx, block, "freelist_alloc_ret", 0); + return block; + } + + return NULL; // Slab is full +} + +// Phase 6.24 & 7.6: Refill TLS SuperSlab (with unified TLS cache + deferred allocation) +static SuperSlab* superslab_refill(int class_idx) { +#if HAKMEM_DEBUG_COUNTERS + g_superslab_refill_calls_dbg[class_idx]++; +#endif + TinyTLSSlab* tls = &g_tls_slabs[class_idx]; + static int g_ss_adopt_en = -1; // env: HAKMEM_TINY_SS_ADOPT=1; default auto-on if remote seen + if (g_ss_adopt_en == -1) { + char* e = getenv("HAKMEM_TINY_SS_ADOPT"); + if (e) { + g_ss_adopt_en = (*e != '0') ? 1 : 0; + } else { + extern _Atomic int g_ss_remote_seen; + g_ss_adopt_en = (atomic_load_explicit(&g_ss_remote_seen, memory_order_relaxed) != 0) ? 1 : 0; + } + } + extern int g_adopt_cool_period; + extern __thread int g_tls_adopt_cd[]; + if (g_adopt_cool_period == -1) { + char* cd = getenv("HAKMEM_TINY_SS_ADOPT_COOLDOWN"); + int v = (cd ? 
atoi(cd) : 0); + if (v < 0) v = 0; if (v > 1024) v = 1024; + g_adopt_cool_period = v; + } + + static int g_superslab_refill_debug_once = 0; + SuperSlab* prev_ss = tls->ss; + TinySlabMeta* prev_meta = tls->meta; + uint8_t prev_slab_idx = tls->slab_idx; + uint8_t prev_active = prev_ss ? prev_ss->active_slabs : 0; + uint32_t prev_bitmap = prev_ss ? prev_ss->slab_bitmap : 0; + uint32_t prev_meta_used = prev_meta ? prev_meta->used : 0; + uint32_t prev_meta_cap = prev_meta ? prev_meta->capacity : 0; + int free_idx_attempted = -2; // -2 = not evaluated, -1 = none, >=0 = chosen + int reused_slabs = 0; + + // Optional: Mid-size simple refill to avoid multi-layer scans (class>=4) + do { + static int g_mid_simple_warn = 0; + if (class_idx >= 4 && tiny_mid_refill_simple_enabled()) { + // If current TLS has a SuperSlab, prefer taking a virgin slab directly + if (tls->ss) { + int tls_cap = ss_slabs_capacity(tls->ss); + if (tls->ss->active_slabs < tls_cap) { + int free_idx = superslab_find_free_slab(tls->ss); + if (free_idx >= 0) { + uint32_t my_tid = tiny_self_u32(); + superslab_init_slab(tls->ss, free_idx, g_tiny_class_sizes[class_idx], my_tid); + tiny_tls_bind_slab(tls, tls->ss, free_idx); + return tls->ss; + } + } + } + // Otherwise allocate a fresh SuperSlab and bind first slab + SuperSlab* ssn = superslab_allocate((uint8_t)class_idx); + if (!ssn) { + if (!g_superslab_refill_debug_once && g_mid_simple_warn < 2) { + g_mid_simple_warn++; + int err = errno; + fprintf(stderr, "[DEBUG] mid_simple_refill OOM class=%d errno=%d\n", class_idx, err); + } + return NULL; + } + uint32_t my_tid = tiny_self_u32(); + superslab_init_slab(ssn, 0, g_tiny_class_sizes[class_idx], my_tid); + SuperSlab* old = tls->ss; + tiny_tls_bind_slab(tls, ssn, 0); + superslab_ref_inc(ssn); + if (old && old != ssn) { superslab_ref_dec(old); } + return ssn; + } + } while (0); + + + // First, try to adopt a published partial SuperSlab for this class + if (g_ss_adopt_en) { + if (g_adopt_cool_period > 0) { + if (g_tls_adopt_cd[class_idx] > 0) { + g_tls_adopt_cd[class_idx]--; + } else { + // eligible to adopt + } + } + if (g_adopt_cool_period == 0 || g_tls_adopt_cd[class_idx] == 0) { + SuperSlab* adopt = ss_partial_adopt(class_idx); + if (adopt && adopt->magic == SUPERSLAB_MAGIC) { + // ======================================================================== + // Quick Win #2: First-Fit Adopt (vs Best-Fit scoring all 32 slabs) + // For Larson, any slab with freelist works - no need to score all 32! + // Expected improvement: -3,000 cycles (from 32 atomic loads + 32 scores) + // ======================================================================== + int adopt_cap = ss_slabs_capacity(adopt); + int best = -1; + for (int s = 0; s < adopt_cap; s++) { + TinySlabMeta* m = &adopt->slabs[s]; + // Quick check: Does this slab have a freelist? + if (m->freelist) { + // Yes! Try to acquire it immediately (first-fit) + best = s; + break; // ✅ OPTIMIZATION: Stop at first slab with freelist! 
+ } + // Optional: Also check remote_heads if we want to prioritize those + // (But for Larson, freelist is sufficient) + } + if (best >= 0) { + // Box: Try to acquire ownership atomically + uint32_t self = tiny_self_u32(); + SlabHandle h = slab_try_acquire(adopt, best, self); + if (slab_is_valid(&h)) { + slab_drain_remote_full(&h); + if (slab_remote_pending(&h)) { + if (__builtin_expect(g_debug_remote_guard, 0)) { + uintptr_t head = atomic_load_explicit(&h.ss->remote_heads[h.slab_idx], memory_order_relaxed); + tiny_remote_watch_note("adopt_remote_pending", + h.ss, + h.slab_idx, + (void*)head, + 0xA255u, + self, + 0); + } + // Remote still pending; give up adopt path and fall through to normal refill. + slab_release(&h); + } + + // Box 4 Boundary: bind must guarantee remote_head==0 + // slab_is_safe_to_bind() checks this in a TOCTOU-safe way + if (slab_is_safe_to_bind(&h)) { + // Optional: move a few nodes to Front SLL to boost next hits + tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx); + // Safe to bind (freelist present && remote_head==0 guaranteed) + tiny_tls_bind_slab(tls, h.ss, h.slab_idx); + if (g_adopt_cool_period > 0) { + g_tls_adopt_cd[class_idx] = g_adopt_cool_period; + } + return h.ss; + } + // Not safe to bind (no freelist or remote pending) → abort adopt + slab_release(&h); + } + // Failed to acquire or no freelist - continue searching + } + // If no freelist found, ignore and continue (optional: republish) + } + } + } + + // Phase 7.6 Step 4: Check existing SuperSlab with priority order + if (tls->ss) { + // Priority 1: Reuse slabs with freelist (already freed blocks) + int tls_cap = ss_slabs_capacity(tls->ss); + uint32_t nonempty_mask = 0; + do { + static int g_mask_en = -1; + if (__builtin_expect(g_mask_en == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_FREELIST_MASK"); + g_mask_en = (e && *e && *e != '0') ? 1 : 0; + } + if (__builtin_expect(g_mask_en, 0)) { + nonempty_mask = atomic_load_explicit(&tls->ss->freelist_mask, memory_order_acquire); + break; + } + for (int i = 0; i < tls_cap; i++) { + if (tls->ss->slabs[i].freelist) nonempty_mask |= (1u << i); + } + } while (0); + + // O(1) lookup: scan mask with ctz (1 instruction!) + while (__builtin_expect(nonempty_mask != 0, 1)) { + int i = __builtin_ctz(nonempty_mask); // Find first non-empty slab (O(1)) + nonempty_mask &= ~(1u << i); // Clear bit for next iteration + + // FIX #1 DELETED (Race condition fix): + // Previous drain without ownership caused concurrent freelist corruption. + // Ownership protocol: MUST bind+owner_cas BEFORE drain (see Fix #3 in tiny_refill.h). + // Remote frees will be drained when the slab is adopted (see tiny_refill.h paths).
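The Priority 1 reuse path builds (or, when HAKMEM_TINY_FREELIST_MASK is enabled, loads) a bitmask of slabs whose freelist is non-empty and walks it with __builtin_ctz rather than rescanning every TinySlabMeta. Below is a small standalone sketch of that mask walk; the try_slab callback is a stand-in for the real acquire/drain/bind sequence, not part of this patch.

#include <stdbool.h>
#include <stdint.h>

/* Walk the set bits of 'mask' from lowest to highest and hand each candidate
 * slab index to 'try_slab'. Stops at the first accepted candidate. */
static bool scan_nonempty_mask(uint32_t mask,
                               bool (*try_slab)(int slab_idx, void* ctx),
                               void* ctx) {
    while (mask != 0) {
        int idx = __builtin_ctz(mask);   /* lowest set bit, O(1) */
        mask &= mask - 1;                /* clear it for the next round */
        if (try_slab(idx, ctx))
            return true;                 /* a slab was bound; stop scanning */
    }
    return false;                        /* nothing usable; fall through to other refill paths */
}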
+ + uint32_t self_tid = tiny_self_u32(); + SlabHandle h = slab_try_acquire(tls->ss, i, self_tid); + if (slab_is_valid(&h)) { + if (slab_remote_pending(&h)) { + slab_drain_remote_full(&h); + if (__builtin_expect(g_debug_remote_guard, 0)) { + uintptr_t head = atomic_load_explicit(&h.ss->remote_heads[h.slab_idx], memory_order_relaxed); + tiny_remote_watch_note("reuse_remote_pending", + h.ss, + h.slab_idx, + (void*)head, + 0xA254u, + self_tid, + 0); + } + slab_release(&h); + continue; + } + // Box 4 Boundary: bind must guarantee remote_head==0 + if (slab_is_safe_to_bind(&h)) { + // Optional: move a few nodes to Front SLL to boost next hits + tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx); + reused_slabs = 1; + tiny_tls_bind_slab(tls, h.ss, h.slab_idx); + return h.ss; + } + // Not safe to bind → try the next slab + slab_release(&h); + } + } + + // Priority 2: Use unused slabs (virgin slabs) + if (tls->ss->active_slabs < tls_cap) { + // Find next free slab + int free_idx = superslab_find_free_slab(tls->ss); + free_idx_attempted = free_idx; + if (free_idx >= 0) { + // Initialize this slab + uint32_t my_tid = tiny_self_u32(); + superslab_init_slab(tls->ss, free_idx, g_tiny_class_sizes[class_idx], my_tid); + + // Update TLS cache (unified update) + tiny_tls_bind_slab(tls, tls->ss, free_idx); + + return tls->ss; + } + } + } + + // Try to adopt a partial SuperSlab from registry (one-shot, cheap scan) + // This reduces pressure to allocate new SS when other threads freed blocks. + // Phase 6: Registry Optimization - Use per-class registry for O(class_size) scan + if (!tls->ss) { + // Phase 6: Use per-class registry (262K → ~10-100 entries per class!) + extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS]; + extern int g_super_reg_class_size[TINY_NUM_CLASSES]; + + const int scan_max = tiny_reg_scan_max(); + int reg_size = g_super_reg_class_size[class_idx]; + int scan_limit = (scan_max < reg_size) ? scan_max : reg_size; + + for (int i = 0; i < scan_limit; i++) { + SuperSlab* ss = g_super_reg_by_class[class_idx][i]; + if (!ss || ss->magic != SUPERSLAB_MAGIC) continue; + // Note: class_idx check is not needed (per-class registry!)
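Both the adopt path and the reuse loop above follow the same ownership discipline: acquire the slab, drain its remote queue while owning it, and bind it to TLS only if the remote head is then observed empty; otherwise release and move on. The sketch below condenses that ordering with a simplified slab type and hypothetical drain/bind callbacks; it is not the real SlabHandle API, and the CAS-from-zero ownership model is an assumption made for the example.

#include <stdatomic.h>
#include <stdbool.h>
#include <stdint.h>

/* Simplified stand-in for the slab metadata; the real TinySlabMeta/SlabHandle
 * layout is richer. 'drain' and 'bind' are placeholder callbacks. */
typedef struct {
    _Atomic uint32_t  owner_tid;     /* 0 = unowned */
    _Atomic uintptr_t remote_head;   /* MPSC stack of cross-thread frees */
    void*             freelist;      /* owner-private freelist */
} ExampleSlab;

static bool adopt_slab(ExampleSlab* s, uint32_t my_tid,
                       void (*drain)(ExampleSlab*),
                       void (*bind)(ExampleSlab*)) {
    uint32_t expected = 0;
    if (!atomic_compare_exchange_strong(&s->owner_tid, &expected, my_tid))
        return false;                                   /* owned elsewhere: skip this slab */
    drain(s);                                           /* owner-only: move remote frees into the freelist */
    if (s->freelist != NULL &&
        atomic_load_explicit(&s->remote_head, memory_order_acquire) == 0) {
        bind(s);                                        /* safe: freelist present, no pending remotes */
        return true;
    }
    atomic_store_explicit(&s->owner_tid, 0, memory_order_release);  /* not usable: give ownership back */
    return false;
}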
+ + // Pick first slab with freelist (Box 4: acquire ownership + remote check) + int reg_cap = ss_slabs_capacity(ss); + uint32_t self_tid = tiny_self_u32(); + for (int s = 0; s < reg_cap; s++) { + if (ss->slabs[s].freelist) { + SlabHandle h = slab_try_acquire(ss, s, self_tid); + if (slab_is_valid(&h)) { + slab_drain_remote_full(&h); + if (slab_is_safe_to_bind(&h)) { + tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx); + tiny_tls_bind_slab(tls, ss, s); + return ss; + } + slab_release(&h); + } + } + } + } + } + + // Must-adopt-before-mmap gate: attempt sticky/hot/bench/mailbox/registry small-window + { + SuperSlab* gate_ss = tiny_must_adopt_gate(class_idx, tls); + if (gate_ss) return gate_ss; + } + + // Allocate new SuperSlab + SuperSlab* ss = superslab_allocate((uint8_t)class_idx); + if (!ss) { + if (!g_superslab_refill_debug_once) { + g_superslab_refill_debug_once = 1; + int err = errno; + fprintf(stderr, + "[DEBUG] superslab_refill NULL detail: class=%d prev_ss=%p active=%u bitmap=0x%08x prev_meta=%p used=%u cap=%u slab_idx=%u reused_freelist=%d free_idx=%d errno=%d\n", + class_idx, + (void*)prev_ss, + (unsigned)prev_active, + prev_bitmap, + (void*)prev_meta, + (unsigned)prev_meta_used, + (unsigned)prev_meta_cap, + (unsigned)prev_slab_idx, + reused_slabs, + free_idx_attempted, + err); + } + return NULL; // OOM + } + + // Initialize first slab + uint32_t my_tid = tiny_self_u32(); + superslab_init_slab(ss, 0, g_tiny_class_sizes[class_idx], my_tid); + + // Cache in unified TLS (and release the previous SS reference) + SuperSlab* old = tls->ss; + tiny_tls_bind_slab(tls, ss, 0); + // Maintain refcount (count the TLS reference so empty SuperSlabs can be reclaimed later) + superslab_ref_inc(ss); + if (old && old != ss) { + superslab_ref_dec(old); + } + + return ss; +} + +// Phase 6.24: SuperSlab-based allocation (TLS unified, Medium fix) +static inline void* hak_tiny_alloc_superslab(int class_idx) { + // DEBUG: Function entry trace (gated to avoid ring spam) + do { + static int g_alloc_ring = -1; + if (__builtin_expect(g_alloc_ring == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_ALLOC_RING"); + g_alloc_ring = (e && *e && *e != '0') ? 1 : 0; + } + if (g_alloc_ring) { + tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_ENTER, 0x01, (void*)(uintptr_t)class_idx, 0); + } + } while (0); + + // MidTC fast path: for 128..1024B (class>=4), prefer the TLS tcache first + do { + void* mp = midtc_pop(class_idx); + if (mp) { + HAK_RET_ALLOC(class_idx, mp); + } + } while (0); + + // Phase 6.24: 1 TLS read (down from 3) + TinyTLSSlab* tls = &g_tls_slabs[class_idx]; + + TinySlabMeta* meta = tls->meta; + int slab_idx = tls->slab_idx; + if (meta && slab_idx >= 0 && tls->ss) { + // A/B: Relaxed read for remote head presence check + static int g_alloc_remote_relax = -1; // env: HAKMEM_TINY_ALLOC_REMOTE_RELAX=1 → relaxed + if (__builtin_expect(g_alloc_remote_relax == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_ALLOC_REMOTE_RELAX"); + g_alloc_remote_relax = (e && *e && *e != '0') ? 1 : 0; + } + uintptr_t pending = atomic_load_explicit(&tls->ss->remote_heads[slab_idx], + g_alloc_remote_relax ? memory_order_relaxed + : memory_order_acquire); + if (__builtin_expect(pending != 0, 0)) { + uint32_t self_tid = tiny_self_u32(); + if (ss_owner_try_acquire(meta, self_tid)) { + _ss_remote_drain_to_freelist_unsafe(tls->ss, slab_idx, meta); + } + } + } + + // FIX #2 DELETED (Race condition fix): + // Previous drain-all-slabs without ownership caused concurrent freelist corruption. + // Problem: Thread A owns slab 5, Thread B drains all slabs including 5 → both modify freelist → crash.
+ // Ownership protocol: MUST bind+owner_cas BEFORE drain (see Fix #3 in tiny_refill.h). + // Remote frees will be drained when the slab is adopted via refill paths. + + // Fast path: Direct metadata access (no repeated TLS reads!) + if (meta && meta->freelist == NULL && meta->used < meta->capacity && tls->slab_base) { + // Linear allocation (lazy init) + size_t block_size = g_tiny_class_sizes[tls->ss->size_class]; + void* block = (void*)(tls->slab_base + ((size_t)meta->used * block_size)); + meta->used++; + // Track active blocks in SuperSlab for conservative reclamation + ss_active_inc(tls->ss); + // Route: slab linear + ROUTE_MARK(11); ROUTE_COMMIT(class_idx, 0x60); + HAK_RET_ALLOC(class_idx, block); // Phase 8.4: Zero hot-path overhead + } + + if (meta && meta->freelist) { + // Freelist allocation + void* block = meta->freelist; + // Safety: bounds/alignment check (debug) + if (__builtin_expect(g_tiny_safe_free, 0)) { + size_t blk = g_tiny_class_sizes[tls->ss->size_class]; + uint8_t* base = tiny_slab_base_for(tls->ss, tls->slab_idx); + uintptr_t delta = (uintptr_t)block - (uintptr_t)base; + int align_ok = ((delta % blk) == 0); + int range_ok = (delta / blk) < meta->capacity; + if (!align_ok || !range_ok) { + uintptr_t info = ((uintptr_t)(align_ok ? 1u : 0u) << 32) | (uint32_t)(range_ok ? 1u : 0u); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)tls->ss->size_class, block, info | 0xA100u); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return NULL; } + return NULL; + } + } + void* next = *(void**)block; + meta->freelist = next; + meta->used++; + // Optional: clear freelist bit when becomes empty + do { + static int g_mask_en = -1; + if (__builtin_expect(g_mask_en == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_FREELIST_MASK"); + g_mask_en = (e && *e && *e != '0') ? 1 : 0; + } + if (__builtin_expect(g_mask_en, 0) && next == NULL) { + uint32_t bit = (1u << slab_idx); + atomic_fetch_and_explicit(&tls->ss->freelist_mask, ~bit, memory_order_release); + } + } while (0); + // Track active blocks in SuperSlab for conservative reclamation + ss_active_inc(tls->ss); + // Route: slab freelist + ROUTE_MARK(12); ROUTE_COMMIT(class_idx, 0x61); + HAK_RET_ALLOC(class_idx, block); // Phase 8.4: Zero hot-path overhead + } + + // Slow path: Refill TLS slab + SuperSlab* ss = superslab_refill(class_idx); + if (!ss) { + static int log_oom = 0; + if (log_oom < 2) { fprintf(stderr, "[DEBUG] superslab_refill returned NULL (OOM)\n"); log_oom++; } + return NULL; // OOM + } + + // Retry allocation (metadata already cached in superslab_refill) + meta = tls->meta; + + // DEBUG: Check each condition (disabled for benchmarks) + // static int log_retry = 0; + // if (log_retry < 2) { + // fprintf(stderr, "[DEBUG] Retry alloc: meta=%p, freelist=%p, used=%u, capacity=%u, slab_base=%p\n", + // (void*)meta, meta ? meta->freelist : NULL, + // meta ? meta->used : 0, meta ? 
meta->capacity : 0, + // (void*)tls->slab_base); + // log_retry++; + // } + + if (meta && meta->freelist == NULL && meta->used < meta->capacity && tls->slab_base) { + size_t block_size = g_tiny_class_sizes[ss->size_class]; + void* block = (void*)(tls->slab_base + ((size_t)meta->used * block_size)); + + // Disabled for benchmarks + // static int log_success = 0; + // if (log_success < 2) { + // fprintf(stderr, "[DEBUG] Superslab alloc SUCCESS: ptr=%p, class=%d, used=%u->%u\n", + // block, class_idx, meta->used, meta->used + 1); + // log_success++; + // } + + meta->used++; + // Track active blocks in SuperSlab for conservative reclamation + ss_active_inc(ss); + HAK_RET_ALLOC(class_idx, block); // Phase 8.4: Zero hot-path overhead + } + + // Disabled for benchmarks + // static int log_fail = 0; + // if (log_fail < 2) { + // fprintf(stderr, "[DEBUG] Retry alloc FAILED - returning NULL\n"); + // log_fail++; + // } + return NULL; +} diff --git a/core/tiny_superslab_free.inc.h b/core/tiny_superslab_free.inc.h new file mode 100644 index 00000000..5331444c --- /dev/null +++ b/core/tiny_superslab_free.inc.h @@ -0,0 +1,313 @@ +// tiny_superslab_free.inc.h - SuperSlab Free Layer +// Purpose: Same-thread and cross-thread free handling +// Extracted from: hakmem_tiny_free.inc lines 1171-1475 +// Box Theory: Box 6 (Free Fast Path) + Box 2 (Remote Queue) integration +// +// Public functions: +// - hak_tiny_free_superslab(): Main SuperSlab free entry point + +// Phase 6.22-B: SuperSlab fast free path +static inline void hak_tiny_free_superslab(void* ptr, SuperSlab* ss) { + ROUTE_MARK(16); // free_enter + HAK_DBG_INC(g_superslab_free_count); // Phase 7.6: Track SuperSlab frees + // Get slab index (supports 1MB/2MB SuperSlabs) + int slab_idx = slab_index_for(ss, ptr); + size_t ss_size = (size_t)1ULL << ss->lg_size; + uintptr_t ss_base = (uintptr_t)ss; + if (__builtin_expect(slab_idx < 0, 0)) { + uintptr_t aux = tiny_remote_pack_diag(0xBAD1u, ss_base, ss_size, (uintptr_t)ptr); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + return; + } + TinySlabMeta* meta = &ss->slabs[slab_idx]; + if (__builtin_expect(tiny_remote_watch_is(ptr), 0)) { + tiny_remote_watch_note("free_enter", ss, slab_idx, ptr, 0xA240u, tiny_self_u32(), 0); + extern __thread TinyTLSSlab g_tls_slabs[]; + tiny_alloc_dump_tls_state(ss->size_class, "watch_free_enter", &g_tls_slabs[ss->size_class]); +#if !HAKMEM_BUILD_RELEASE + extern __thread TinyTLSMag g_tls_mags[]; + TinyTLSMag* watch_mag = &g_tls_mags[ss->size_class]; + fprintf(stderr, + "[REMOTE_WATCH_MAG] cls=%u mag_top=%d cap=%d\n", + ss->size_class, + watch_mag->top, + watch_mag->cap); +#endif + } + // BUGFIX: Validate size_class before using as array index (prevents OOB) + if (__builtin_expect(ss->size_class < 0 || ss->size_class >= TINY_NUM_CLASSES, 0)) { + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, 0xF1, ptr, (uintptr_t)ss->size_class); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + return; + } + if (__builtin_expect(g_tiny_safe_free, 0)) { + size_t blk = g_tiny_class_sizes[ss->size_class]; + uint8_t* base = tiny_slab_base_for(ss, slab_idx); + uintptr_t delta = (uintptr_t)ptr - (uintptr_t)base; + int cap_ok = (meta->capacity > 0) ? 
1 : 0; + int align_ok = (delta % blk) == 0; + int range_ok = cap_ok && (delta / blk) < meta->capacity; + if (!align_ok || !range_ok) { + uint32_t code = 0xA100u; + if (align_ok) code |= 0x2u; + if (range_ok) code |= 0x1u; + uintptr_t aux = tiny_remote_pack_diag(code, ss_base, ss_size, (uintptr_t)ptr); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + return; + } + // Duplicate in freelist (best-effort scan up to 64) + void* scan = meta->freelist; int scanned = 0; int dup = 0; + while (scan && scanned < 64) { if (scan == ptr) { dup = 1; break; } scan = *(void**)scan; scanned++; } + if (dup) { + uintptr_t aux = tiny_remote_pack_diag(0xDFu, ss_base, ss_size, (uintptr_t)ptr); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + return; + } + } + + // Phase 6.23: Same-thread check + uint32_t my_tid = tiny_self_u32(); + const int debug_guard = g_debug_remote_guard; + static __thread int g_debug_free_count = 0; + if (!g_tiny_force_remote && meta->owner_tid != 0 && meta->owner_tid == my_tid) { + ROUTE_MARK(17); // free_same_thread + // Fast path: Direct freelist push (same-thread) + if (0 && debug_guard && g_debug_free_count < 1) { + fprintf(stderr, "[FREE_SS] SAME-THREAD: owner=%u my=%u\n", + meta->owner_tid, my_tid); + g_debug_free_count++; + } + if (__builtin_expect(meta->used == 0, 0)) { + uintptr_t aux = tiny_remote_pack_diag(0x00u, ss_base, ss_size, (uintptr_t)ptr); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + return; + } + tiny_remote_track_expect_alloc(ss, slab_idx, ptr, "local_free_enter", my_tid); + if (!tiny_remote_guard_allow_local_push(ss, slab_idx, meta, ptr, "local_free", my_tid)) { + #include "box/free_remote_box.h" + int transitioned = tiny_free_remote_box(ss, slab_idx, meta, ptr, my_tid); + if (transitioned) { + extern unsigned long long g_remote_free_transitions[]; + g_remote_free_transitions[ss->size_class]++; + // Free-side route: remote transition observed + do { + static int g_route_free = -1; if (__builtin_expect(g_route_free == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_ROUTE_FREE"); + g_route_free = (e && *e && *e != '0') ? 1 : 0; } + if (g_route_free) route_free_commit((int)ss->size_class, (1ull<<18), 0xE2); + } while (0); + } + return; + } + // Optional: MidTC (TLS tcache for 128..1024B) — allow bypass via env HAKMEM_TINY_FREE_TO_SS=1 + do { + static int g_free_to_ss = -1; + if (__builtin_expect(g_free_to_ss == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_FREE_TO_SS"); + g_free_to_ss = (e && *e && *e != '0') ? 
1 : 0; // default OFF + } + if (!g_free_to_ss) { + int cls = (int)ss->size_class; + if (midtc_enabled() && cls >= 4) { + if (midtc_push(cls, ptr)) { + // Treat as returned to TLS cache (not SS freelist) + meta->used--; + ss_active_dec_one(ss); + return; + } + } + } + } while (0); + + #include "box/free_local_box.h" + // Perform freelist push (+first-free publish if applicable) + void* prev_before = meta->freelist; + tiny_free_local_box(ss, slab_idx, meta, ptr, my_tid); + if (prev_before == NULL) { + ROUTE_MARK(19); // first_free_transition + extern unsigned long long g_first_free_transitions[]; + g_first_free_transitions[ss->size_class]++; + ROUTE_MARK(20); // mailbox_publish + // Free-side route commit (one-shot) + do { + static int g_route_free = -1; if (__builtin_expect(g_route_free == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_ROUTE_FREE"); + g_route_free = (e && *e && *e != '0') ? 1 : 0; } + int cls = (int)ss->size_class; + if (g_route_free) route_free_commit(cls, (1ull<<19) | (1ull<<20), 0xE1); + } while (0); + } + + if (__builtin_expect(debug_guard, 0)) { + fprintf(stderr, "[REMOTE_LOCAL] cls=%u slab=%d owner=%u my=%u ptr=%p prev=%p used=%u\n", + ss->size_class, slab_idx, meta->owner_tid, my_tid, ptr, prev_before, meta->used); + } + + // Empty-slab detection is handled elsewhere (kept off the hot path) + } else { + ROUTE_MARK(18); // free_remote_transition + if (__builtin_expect(meta->owner_tid == my_tid && meta->owner_tid == 0, 0)) { + uintptr_t aux = tiny_remote_pack_diag(0xA300u, ss_base, ss_size, (uintptr_t)ptr); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); + if (debug_guard) { + fprintf(stderr, "[REMOTE_OWNER_ZERO] cls=%u slab=%d ptr=%p my=%u used=%u\n", + ss->size_class, slab_idx, ptr, my_tid, (unsigned)meta->used); + } + } + tiny_remote_track_expect_alloc(ss, slab_idx, ptr, "remote_free_enter", my_tid); + // Slow path: Remote free (cross-thread) + if (0 && debug_guard && g_debug_free_count < 5) { + fprintf(stderr, "[FREE_SS] CROSS-THREAD: owner=%u my=%u slab_idx=%d\n", + meta->owner_tid, my_tid, slab_idx); + g_debug_free_count++; + } + if (__builtin_expect(g_tiny_safe_free, 0)) { + // Best-effort duplicate scan in remote stack (up to 64 nodes) + uintptr_t head = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire); + uintptr_t base = ss_base; + int scanned = 0; int dup = 0; + uintptr_t cur = head; + while (cur && scanned < 64) { + if ((cur < base) || (cur >= base + ss_size)) { + uintptr_t aux = tiny_remote_pack_diag(0xA200u, base, ss_size, cur); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)cur, aux); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + break; + } + if ((void*)cur == ptr) { dup = 1; break; } + if (__builtin_expect(g_remote_side_enable, 0)) { + if (!tiny_remote_sentinel_ok((void*)cur)) { + uintptr_t aux = tiny_remote_pack_diag(0xA202u, base, ss_size, cur); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)cur, aux); + uintptr_t observed = atomic_load_explicit((_Atomic uintptr_t*)(void*)cur, memory_order_relaxed); + tiny_remote_report_corruption("scan", (void*)cur, observed); + fprintf(stderr, + "[REMOTE_SENTINEL] cls=%u slab=%d cur=%p head=%p ptr=%p scanned=%d observed=0x%016" PRIxPTR " owner=%u used=%u freelist=%p remote_head=%p\n", + ss->size_class, + slab_idx, + (void*)cur, + (void*)head, + ptr, + scanned, + observed, + meta->owner_tid, + (unsigned)meta->used, + meta->freelist, + (void*)atomic_load_explicit(&ss->remote_heads[slab_idx], 
memory_order_relaxed)); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + break; + } + cur = tiny_remote_side_get(ss, slab_idx, (void*)cur); + } else { + if ((cur & (uintptr_t)(sizeof(void*) - 1)) != 0) { + uintptr_t aux = tiny_remote_pack_diag(0xA201u, base, ss_size, cur); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)cur, aux); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + break; + } + cur = (uintptr_t)(*(void**)(void*)cur); + } + scanned++; + } + if (dup) { + uintptr_t aux = tiny_remote_pack_diag(0xD1u, ss_base, ss_size, (uintptr_t)ptr); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + return; + } + } + if (__builtin_expect(meta->used == 0, 0)) { + uintptr_t aux = tiny_remote_pack_diag(0x01u, ss_base, ss_size, (uintptr_t)ptr); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + return; + } + static int g_ss_adopt_en2 = -1; // env cached + if (g_ss_adopt_en2 == -1) { + char* e = getenv("HAKMEM_TINY_SS_ADOPT"); + // Default: use the remote queue (1). Only an explicit env setting overrides this. + g_ss_adopt_en2 = (e == NULL) ? 1 : ((*e != '0') ? 1 : 0); + if (__builtin_expect(debug_guard, 0)) { + fprintf(stderr, "[FREE_SS] g_ss_adopt_en2=%d (env='%s')\n", g_ss_adopt_en2, e ? e : "(null)"); + } + } + if (g_ss_adopt_en2) { + // Use remote queue + uintptr_t head_word = __atomic_load_n((uintptr_t*)ptr, __ATOMIC_RELAXED); + if (debug_guard) fprintf(stderr, "[REMOTE_PUSH_CALL] cls=%u slab=%d owner=%u my=%u ptr=%p used=%u remote_count=%u head=%p word=0x%016" PRIxPTR "\n", + ss->size_class, + slab_idx, + meta->owner_tid, + my_tid, + ptr, + (unsigned)meta->used, + atomic_load_explicit(&ss->remote_counts[slab_idx], memory_order_relaxed), + (void*)atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_relaxed), + head_word); + int dup_remote = tiny_remote_queue_contains_guard(ss, slab_idx, ptr); + if (!dup_remote && __builtin_expect(g_remote_side_enable, 0)) { + dup_remote = (head_word == TINY_REMOTE_SENTINEL) || tiny_remote_side_contains(ss, slab_idx, ptr); + } + if (__builtin_expect(head_word == TINY_REMOTE_SENTINEL && !dup_remote && g_debug_remote_guard, 0)) { + tiny_remote_watch_note("dup_scan_miss", ss, slab_idx, ptr, 0xA215u, my_tid, 0); + } + if (dup_remote) { + uintptr_t aux = tiny_remote_pack_diag(0xA214u, ss_base, ss_size, (uintptr_t)ptr); + tiny_remote_watch_mark(ptr, "dup_prevent", my_tid); + tiny_remote_watch_note("dup_prevent", ss, slab_idx, ptr, 0xA214u, my_tid, 0); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + return; + } + if (__builtin_expect(g_remote_side_enable && (head_word & 0xFFFFu) == 0x6261u, 0)) { + // TLS guard scribble detected on the node's first word → same-pointer double free across routes + uintptr_t aux = tiny_remote_pack_diag(0xA213u, ss_base, ss_size, (uintptr_t)ptr); + tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux); + tiny_remote_watch_mark(ptr, "pre_push", my_tid); + tiny_remote_watch_note("pre_push", ss, slab_idx, ptr, 0xA231u, my_tid, 0); + tiny_remote_report_corruption("pre_push", ptr, head_word); + if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; } + return; + } + if (__builtin_expect(tiny_remote_watch_is(ptr), 0)) { + tiny_remote_watch_note("free_remote", ss, 
slab_idx, ptr, 0xA232u, my_tid, 0); + } + int was_empty = ss_remote_push(ss, slab_idx, ptr); + meta->used--; + ss_active_dec_one(ss); + if (was_empty) { + extern unsigned long long g_remote_free_transitions[]; + g_remote_free_transitions[ss->size_class]++; + ss_partial_publish((int)ss->size_class, ss); + } + } else { + // Fallback: direct freelist push (legacy) + if (debug_guard) fprintf(stderr, "[FREE_SS] Using LEGACY freelist push (not remote queue)\n"); + void* prev = meta->freelist; + *(void**)ptr = prev; + meta->freelist = ptr; + do { + static int g_mask_en = -1; + if (__builtin_expect(g_mask_en == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_FREELIST_MASK"); + g_mask_en = (e && *e && *e != '0') ? 1 : 0; + } + if (__builtin_expect(g_mask_en, 0) && prev == NULL) { + uint32_t bit = (1u << slab_idx); + atomic_fetch_or_explicit(&ss->freelist_mask, bit, memory_order_release); + } + } while (0); + meta->used--; + ss_active_dec_one(ss); + if (prev == NULL) { + ss_partial_publish((int)ss->size_class, ss); + } + } + + // Empty-slab detection is handled elsewhere (kept off the hot path) + } +} diff --git a/core/tiny_system.h b/core/tiny_system.h new file mode 100644 index 00000000..aabbbc08 --- /dev/null +++ b/core/tiny_system.h @@ -0,0 +1,18 @@ +// tiny_system.h - System includes for Tiny allocator +// Consolidates all standard library includes to reduce clutter + +#ifndef TINY_SYSTEM_H +#define TINY_SYSTEM_H + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#endif // TINY_SYSTEM_H
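The cross-thread branch above relies on ss_remote_push() reporting whether the per-slab remote stack was empty before the push, which is what gates ss_partial_publish(). Below is a minimal sketch of such a push on an MPSC stack; the node layout (next pointer stored in the freed block's first word) and the exact return convention are assumptions inferred from how the call is used above, not the allocator's actual implementation.

#include <stdatomic.h>
#include <stdint.h>

/* Push a freed block onto a per-slab remote stack (MPSC Treiber stack).
 * Returns 1 when the stack was empty before this push, i.e. the slab just
 * transitioned from "no pending remote frees" to "has pending remote frees". */
static int example_remote_push(_Atomic uintptr_t* head, void* block) {
    uintptr_t old = atomic_load_explicit(head, memory_order_relaxed);
    for (;;) {
        *(uintptr_t*)block = old;        /* link via the block's first word */
        if (atomic_compare_exchange_weak_explicit(head, &old, (uintptr_t)block,
                                                  memory_order_release,
                                                  memory_order_relaxed))
            return old == 0;             /* empty → non-empty transition */
        /* CAS failure reloads 'old'; retry with the fresh head */
    }
}

Under this convention only the empty-to-non-empty transition publishes the SuperSlab, so repeated remote frees to a slab that already has pending nodes stay O(1) and generate no extra publication traffic.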