diff --git a/core/box/front_gate_classifier.c b/core/box/front_gate_classifier.c
index 557d9878..464be322 100644
--- a/core/box/front_gate_classifier.c
+++ b/core/box/front_gate_classifier.c
@@ -8,6 +8,7 @@
 #include <stdio.h>    // For fprintf in debug
 #include <stdlib.h>   // For abort in debug
+#include <sys/mman.h> // For mincore() in Step 3 safety check
 #include "front_gate_classifier.h"
 #include "../tiny_region_id.h"  // Must come before hakmem_tiny_superslab.h for HEADER_MAGIC
 #include "../hakmem_tiny_superslab.h"
@@ -207,27 +208,25 @@ ptr_classification_t classify_ptr(void* ptr) {
         return result;
     }
 
-    // Step 3: Try AllocHeader (HAKMEM header) for Mid/Large/Mmap
-    do {
-        if (!ptr) break;
-        // Quick page-safety check: avoid crossing page for header read
-        uintptr_t off = (uintptr_t)ptr & 0xFFFu;
-        int safe_same_page = (off >= HEADER_SIZE);
-        void* raw = (char*)ptr - HEADER_SIZE;
-        if (!safe_same_page) {
-            if (!hak_is_memory_readable(raw)) break;
-        }
-        AllocHeader* hdr = (AllocHeader*)raw;
-        if (hdr->magic == HAKMEM_MAGIC) {
-            result.kind = PTR_KIND_MID_LARGE;  // HAKMEM-owned (non-Tiny)
-#if !HAKMEM_BUILD_RELEASE
-            g_classify_unknown_hit++;  // reuse for stats without adding a new counter
-#endif
-            return result;
-        }
-    } while (0);
-
-    // Step 4: Not recognized → UNKNOWN (route to libc or slow path)
+    // Step 3: SAFETY FIX - Skip AllocHeader probe for unknown pointers
+    //
+    // RATIONALE:
+    // - If the pointer isn't in the Pool TLS or SuperSlab registries, it's either:
+    //   1. Mid/Large allocation (has AllocHeader)
+    //   2. External allocation (libc, stack, etc.)
+    // - We CANNOT safely distinguish (1) from (2) without dereferencing memory
+    // - Dereferencing unknown memory can SEGV (e.g., ptr at a page boundary)
+    // - SAFER approach: return UNKNOWN and let the free wrapper handle it
+    //
+    // FREE WRAPPER BEHAVIOR (hak_free_api.inc.h):
+    // - PTR_KIND_UNKNOWN routes to Mid/Large registry lookups (hak_pool_mid_lookup, hak_l25_lookup)
+    // - If those fail → routes to AllocHeader dispatch (safe, same-page check)
+    // - If AllocHeader invalid → routes to __libc_free()
+    //
+    // PERFORMANCE IMPACT:
+    // - Only affects pointers NOT in our registries (rare)
+    // - Avoids SEGV on external pointers (correctness > performance)
+    //
     result.kind = PTR_KIND_UNKNOWN;
 #if !HAKMEM_BUILD_RELEASE
diff --git a/core/box/hak_alloc_api.inc.h b/core/box/hak_alloc_api.inc.h
index 51622243..bde1289a 100644
--- a/core/box/hak_alloc_api.inc.h
+++ b/core/box/hak_alloc_api.inc.h
@@ -21,14 +21,6 @@ static inline void* hak_os_map_boundary(size_t size, uintptr_t site_id) {
 
 __attribute__((always_inline)) inline void* hak_alloc_at(size_t size, hak_callsite_t site) {
-#if !HAKMEM_BUILD_RELEASE
-    static _Atomic uint64_t hak_alloc_call_count = 0;
-    uint64_t call_num = atomic_fetch_add(&hak_alloc_call_count, 1);
-    if (call_num > 14250 && call_num < 14280 && size <= 1024) {
-        fprintf(stderr, "[HAK_ALLOC_AT] call=%lu size=%zu\n", call_num, size);
-        fflush(stderr);
-    }
-#endif
 #if HAKMEM_DEBUG_TIMING
     HKM_TIME_START(t0);
@@ -38,30 +30,12 @@ inline void* hak_alloc_at(size_t size, hak_callsite_t site) {
     uintptr_t site_id = (uintptr_t)site;
 
     if (__builtin_expect(size <= TINY_MAX_SIZE, 1)) {
-#if !HAKMEM_BUILD_RELEASE
-        if (call_num > 14250 && call_num < 14280 && size <= 1024) {
-            fprintf(stderr, "[HAK_ALLOC_AT] call=%lu entering tiny path\n", call_num);
-            fflush(stderr);
-        }
-#endif
 #if HAKMEM_DEBUG_TIMING
         HKM_TIME_START(t_tiny);
 #endif
         void* tiny_ptr = NULL;
 #ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
-#if !HAKMEM_BUILD_RELEASE
-        if (call_num > 14250 && call_num < 14280 && size <= 1024) {
-            fprintf(stderr, "[HAK_ALLOC_AT] call=%lu calling hak_tiny_alloc_fast_wrapper\n", call_num);
-            fflush(stderr);
-        }
-#endif
         tiny_ptr = hak_tiny_alloc_fast_wrapper(size);
-#if !HAKMEM_BUILD_RELEASE
-        if (call_num > 14250 && call_num < 14280 && size <= 1024) {
-            fprintf(stderr, "[HAK_ALLOC_AT] call=%lu hak_tiny_alloc_fast_wrapper returned %p\n", call_num, tiny_ptr);
-            fflush(stderr);
-        }
-#endif
 #elif defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE)
         tiny_ptr = hak_tiny_alloc_ultra_simple(size);
 #elif defined(HAKMEM_TINY_PHASE6_METADATA)
diff --git a/core/box/hak_free_api.inc.h b/core/box/hak_free_api.inc.h
index 2014da86..a468af47 100644
--- a/core/box/hak_free_api.inc.h
+++ b/core/box/hak_free_api.inc.h
@@ -2,6 +2,7 @@
 #ifndef HAK_FREE_API_INC_H
 #define HAK_FREE_API_INC_H
 
+#include <sys/mman.h>                  // For mincore() in AllocHeader safety check
 #include "hakmem_tiny_superslab.h"     // For SUPERSLAB_MAGIC, SuperSlab
 #include "../tiny_free_fast_v2.inc.h"  // Phase 7: Header-based ultra-fast free
 #include "../ptr_trace.h"              // Debug: pointer trace immediate dump on libc fallback
@@ -191,9 +192,26 @@ void hak_free_at(void* ptr, size_t size, hak_callsite_t site) {
   {
     void* raw = (char*)ptr - HEADER_SIZE;
 
-    // CRITICAL FIX (2025-11-07): Check if memory is accessible before dereferencing
-    // This prevents SEGV when ptr has no header (Tiny alloc where SS lookup failed, or libc alloc)
-    if (!hak_is_memory_readable(raw)) {
+    // CRITICAL FIX (2025-11-14): Use real mincore() to check memory accessibility
+    // Phase 9 gutted hak_is_memory_readable() to always return 1 (unsafe!)
+    // We MUST verify memory is mapped before dereferencing AllocHeader
+    int is_mapped = 0;
+    #ifdef __linux__
+    {
+      unsigned char vec;
+      // Check both pages if the header crosses a page boundary
+      void* page1 = (void*)((uintptr_t)raw & ~0xFFFUL);
+      void* page2 = (void*)(((uintptr_t)raw + sizeof(AllocHeader) - 1) & ~0xFFFUL);
+      is_mapped = (mincore(page1, 1, &vec) == 0);
+      if (is_mapped && page2 != page1) {
+        is_mapped = (mincore(page2, 1, &vec) == 0);
+      }
+    }
+    #else
+    is_mapped = 1;  // Assume mapped on non-Linux
+    #endif
+
+    if (!is_mapped) {  // Memory not accessible, ptr likely has no header
      hak_free_route_log("unmapped_header_fallback", ptr);
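The mincore()-based probe above is the piece both the classifier change and the free-path change rely on. Below is a minimal standalone sketch of that two-page check, assuming Linux, 4 KiB pages, and sizeof(AllocHeader) <= 4 KiB; the helper name hak_hdr_is_mapped is illustrative and does not exist in the tree.

// Sketch only (not part of the patch): returns 1 if every page covered by the
// header bytes at `raw` is mapped, 0 if any of them is unmapped.
#include <stddef.h>
#include <stdint.h>
#include <sys/mman.h>   // mincore()

static int hak_hdr_is_mapped(const void* raw, size_t header_size) {
#ifdef __linux__
    unsigned char vec;
    uintptr_t first = (uintptr_t)raw & ~0xFFFUL;                        // page of first header byte
    uintptr_t last  = ((uintptr_t)raw + header_size - 1) & ~0xFFFUL;    // page of last header byte
    if (mincore((void*)first, 1, &vec) != 0) return 0;                  // first page not mapped
    if (last != first && mincore((void*)last, 1, &vec) != 0) return 0;  // header crosses into an unmapped page
    return 1;
#else
    (void)raw; (void)header_size;
    return 1;   // assume mapped on non-Linux, matching the #else branch in the hunk above
#endif
}

Factoring the check this way would let classify_ptr() and hak_free_at() share one implementation instead of open-coding the page math twice; whether that refactor is worthwhile is outside the scope of this patch.
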
{
-            fprintf(stderr, "[HAK_ALLOC_AT] call=%lu calling hak_tiny_alloc_fast_wrapper\n", call_num);
diff --git a/core/hakmem_build_flags.h b/core/hakmem_build_flags.h
index 638bc8ce..96ea335a 100644
--- a/core/hakmem_build_flags.h
+++ b/core/hakmem_build_flags.h
@@ -63,6 +63,13 @@
 # define HAKMEM_TINY_AGGRESSIVE_INLINE 0
 #endif
 
+// Inline TLS SLL pop (experimental, A/B only)
+// Default: OFF (HAKMEM_TINY_INLINE_SLL=0) to keep Box TLS-SLL API as the standard path.
+// Enable explicitly via build flag: -DHAKMEM_TINY_INLINE_SLL=1 (bench/debug only).
+#ifndef HAKMEM_TINY_INLINE_SLL +# define HAKMEM_TINY_INLINE_SLL 0 +#endif + // Phase 7 Task 3: Pre-warm TLS cache at init // Default: OFF (enable after implementation) // Build: make PREWARM_TLS=1 or make phase7 diff --git a/core/hakmem_pool.c.bak2 b/core/hakmem_pool.c.bak2 deleted file mode 100644 index 0b507e3f..00000000 --- a/core/hakmem_pool.c.bak2 +++ /dev/null @@ -1,1454 +0,0 @@ -// ============================================================================ -// hakmem_pool.c - L2 Hybrid Pool Implementation (Mid-Size: 2-32KiB) -// ============================================================================ -// -// サイズクラス定義: -// ┌──────────┬─────────┬──────────────┬─────────────┐ -// │ クラス │ サイズ │ 初期CAP │ ページ構成 │ -// ├──────────┼─────────┼──────────────┼─────────────┤ -// │ Class 0 │ 2 KiB │ 64 pages │ 32 blocks/p │ -// │ Class 1 │ 4 KiB │ 64 pages │ 16 blocks/p │ -// │ Class 2 │ 8 KiB │ 64 pages │ 8 blocks/p │ -// │ Class 3 │ 16 KiB │ 32 pages │ 4 blocks/p │ -// │ Class 4 │ 32 KiB │ 16 pages │ 2 blocks/p │ -// │ DYN1 │ 6 KiB* │ 0 (無効) │ 可変 │ -// │ DYN2 │ (未使用)│ 0 (無効) │ 可変 │ -// └──────────┴─────────┴──────────────┴─────────────┘ -// * DYN1はギャップ(8-16KB)を埋めるための動的クラス -// -// W_MAX (切り上げ許容倍率): -// - 意味: 要求サイズの何倍までのクラスを許容するか -// - デフォルト: 1.40 (40%までの切り上げを許容) -// - 例: 3KiBの要求 → 4KiBクラス使用OK (1.33倍 < 1.40) -// - 環境変数: HAKMEM_WMAX_MID=1.6 で変更可能 -// -// CAP (在庫量): -// - 意味: 各クラスで保持する最大ページ数 -// - 初期値: {64,64,64,32,16} - 保守的(フットプリント優先) -// - 推奨値: {256,256,256,128,64} - パフォーマンス優先 -// - 環境変数: HAKMEM_CAP_MID=256,256,256,128,64 で設定 -// - 学習モード: HAKMEM_LEARN=1 で自動調整 -// -// TLSリング構造: -// - POOL_L2_RING_CAP: リングバッファ容量(デフォルト16) -// - ActivePage A/B: bump-run方式(ロックフリー) -// - LIFO overflow: リングから溢れた分 -// -// パフォーマンスチューニング: -// 1. 初期CAP 4倍化: HAKMEM_CAP_MID=256,256,256,128,64 -// 2. W_MAX緩和: HAKMEM_WMAX_MID=1.6 -// 3. DYN1有効化: HAKMEM_MID_DYN1=6144 HAKMEM_CAP_MID_DYN1=64 -// 4. 学習モード: HAKMEM_LEARN=1 -// -// License: MIT -// Last Updated: 2025-10-26 (Code Cleanup完了) - -#include "hakmem_pool.h" -#include "hakmem_config.h" -#include "hakmem_internal.h" // For AllocHeader and HAKMEM_MAGIC -#include "hakmem_syscall.h" // Box 3 syscall layer (bypasses LD_PRELOAD) -#include -#include -#include -#include -#include -#include -#include -#include "hakmem_prof.h" -#include "hakmem_policy.h" // FrozenPolicy caps (Soft CAP gating) -#include "hakmem_debug.h" - -// False sharing mitigation: padded mutex type (64B) -typedef struct { pthread_mutex_t m; char _pad[64 - (sizeof(pthread_mutex_t) % 64)]; } PaddedMutex; - -// =========================================================================== -// Internal Data Structures -// =========================================================================== -#include "box/pool_tls_types.inc.h" - -// Mid page descriptor registry (64KiB pages → {class_idx, owner_tid}) -#include "box/pool_mid_desc.inc.h" - -// ---------------- Transfer Cache (per-thread per-class inbox) -------------- -#include "box/pool_mid_tc.inc.h" - -// =========================================================================== -// MF2 Per-Page Sharding: Mimalloc-Inspired Architecture -// =========================================================================== -// -// Key idea: Each 64KB page has independent freelist (no sharing!) 
-// - O(1) page lookup from block address: (addr & ~0xFFFF) -// - Owner thread: fast path (no locks, no atomics) -// - Cross-thread free: lock-free remote stack -// - Expected: +50% (13.78 → 20.7 M/s, 60-75% of mimalloc) - -// MF2 Configuration Constants (Quick Win #5) -#define MF2_PENDING_QUEUE_BUDGET 4 // Max pages to drain from pending queue -#define MF2_DEBUG_SAMPLE_COUNT 20 // Number of debug samples to log -#define MF2_TSC_CYCLES_PER_US 3000 // Estimated TSC cycles per microsecond -#define MF2_PAGE_SIZE_SHIFT 16 // log2(64KB) for fast division -#define MF2_PAGE_ALIGNMENT 65536 // 64KB alignment for mmap - -// Debug Logging Macros (Quick Win #6) -// Conditional compilation for debug logs - set HAKMEM_DEBUG_MF2=1 to enable -#ifdef HAKMEM_DEBUG_MF2 - #define MF2_DEBUG_LOG(fmt, ...) fprintf(stderr, "[MF2] " fmt "\n", ##__VA_ARGS__) - #define MF2_ERROR_LOG(fmt, ...) fprintf(stderr, "[MF2 ERROR] " fmt "\n", ##__VA_ARGS__) -#else - #define MF2_DEBUG_LOG(fmt, ...) ((void)0) - #define MF2_ERROR_LOG(fmt, ...) fprintf(stderr, "[MF2 ERROR] " fmt "\n", ##__VA_ARGS__) -#endif - -// Forward declarations -static size_t g_class_sizes[POOL_NUM_CLASSES]; - -// MF2 Page descriptor: per-page metadata (one per 64KB page) -typedef struct MidPage { - // Page identity - void* base; // Page base address (64KB aligned) - uint8_t class_idx; // Size class index (0-6) - uint8_t flags; // Page flags (reserved for future use) - uint16_t _pad0; - - // Ownership - pthread_t owner_tid; // Owner thread ID (for fast-path check) - struct MF2_ThreadPages* owner_tp; // Owner thread's heap (for pending queue access) - uint64_t last_transfer_time; // rdtsc() timestamp of last ownership transfer (lease mechanism) - - // Page-local freelist (owner-only, NO LOCK!) - PoolBlock* freelist; // Local freelist head - uint16_t free_count; // Number of free blocks - uint16_t capacity; // Total blocks per page - - // Remote frees (cross-thread, lock-free MPSC stack) - atomic_uintptr_t remote_head; // Lock-free remote free stack - atomic_uint remote_count; // Remote free count (for quick check) - - // Lifecycle - atomic_int in_use; // Live allocations on this page - atomic_int pending_dn; // DONTNEED enqueued flag - - // Linkage (thread-local page lists) - struct MidPage* next_page; // Next page in thread's list - struct MidPage* prev_page; // Previous page in thread's list - - // Pending queue (remote drain notification) - _Atomic(_Bool) in_remote_pending; // Is this page in pending queue? - struct MidPage* next_pending; // Next page in pending queue - - // Padding to cache line boundary (avoid false sharing) - char _pad[64 - ((sizeof(void*) * 5 + sizeof(PoolBlock*) + sizeof(uint16_t) * 2 + - sizeof(atomic_uintptr_t) + sizeof(atomic_uint) + - sizeof(atomic_int) * 2 + sizeof(pthread_t) + - sizeof(_Atomic(_Bool)) + 4) % 64)]; -} MidPage; - -// Page registry: O(1) lookup from block address -// Use direct indexing: (addr >> 16) & MASK -#define MF2_PAGE_REGISTRY_BITS 16 // 64K entries (4GB address space with 64KB pages) -#define MF2_PAGE_REGISTRY_SIZE (1 << MF2_PAGE_REGISTRY_BITS) -#define MF2_PAGE_REGISTRY_MASK (MF2_PAGE_REGISTRY_SIZE - 1) - -typedef struct { - // Direct-mapped page table (no hash collisions!) 
- MidPage* pages[MF2_PAGE_REGISTRY_SIZE]; - - // Coarse-grained locks for rare updates (page alloc/free) - // 256 locks = 256-way parallelism for page registration - pthread_mutex_t locks[256]; - - // Statistics - atomic_uint_fast64_t total_pages; // Total pages allocated - atomic_uint_fast64_t active_pages; // Pages with live allocations -} MF2_PageRegistry; - -// Thread-local page lists (one list per size class) -typedef struct MF2_ThreadPages { - // Active pages (have free blocks) - MidPage* active_page[POOL_NUM_CLASSES]; - - // Partial pages (drained pages with free blocks, LIFO for cache locality) - // Checked before allocating new pages (fast reuse path) - MidPage* partial_pages[POOL_NUM_CLASSES]; - - // Full pages (no free blocks, but may receive remote frees) - // TODO: Gradually deprecate in favor of partial_pages - MidPage* full_pages[POOL_NUM_CLASSES]; - - // Pending queue (pages with remote frees, MPSC lock-free stack) - atomic_uintptr_t pages_remote_pending[POOL_NUM_CLASSES]; - - // Pending claim flags (prevent multi-consumer CAS thrashing) - // One adopter at a time per queue (test_and_set to claim, clear to release) - atomic_flag pending_claim[POOL_NUM_CLASSES]; - - // Page ownership count (for statistics) - uint32_t page_count[POOL_NUM_CLASSES]; - - // Thread identity (cached for fast comparison) - pthread_t my_tid; - - // Route P: Activity tracking for idle-based adoption - // Updated on every allocation (mf2_alloc_fast) - // Read by adopters to check if owner is idle - atomic_uint_fast64_t last_alloc_tsc; -} MF2_ThreadPages; - -// Global page registry (shared, rarely accessed) -static MF2_PageRegistry g_mf2_page_registry; - -// Thread-local page lists (hot path, no sharing!) -static __thread MF2_ThreadPages* t_mf2_pages = NULL; - -// =========================================================================== -// MF2 Global State (Quick Win #3b - Structured Globals) -// =========================================================================== -// Individual globals replaced with structured state below. -// Old declarations removed, replaced with macro-mapped struct instances. 
-// -// Benefits: -// - Logical grouping (config, registry, stats) -// - Better documentation -// - Easier to extend or refactor -// - Single source of truth for each category - -#define MF2_MAX_THREADS 256 - -// MF2 Configuration (environment variables) -typedef struct { - int enabled; // HAKMEM_MF2_ENABLE (0=disabled, 1=enabled) - int max_queues; // HAKMEM_MF2_MAX_QUEUES (default: 2) - int lease_ms; // HAKMEM_MF2_LEASE_MS (default: 10ms, 0=disabled) - int idle_threshold_us; // HAKMEM_MF2_IDLE_THRESHOLD_US (default: 150µs) -} MF2_Config; - -// MF2 Thread Registry (cross-thread coordination) -typedef struct { - MF2_ThreadPages* all_thread_pages[MF2_MAX_THREADS]; // Global registry - _Atomic int num_thread_pages; // Active thread count - _Atomic int adoptable_count[POOL_NUM_CLASSES]; // Non-empty pending queues - pthread_key_t tls_key; // Thread-local storage key - pthread_once_t key_once; // TLS initialization guard -} MF2_Registry; - -// MF2 Statistics (debug instrumentation) -typedef struct { - // Allocation path - atomic_uint_fast64_t alloc_fast_hit; - atomic_uint_fast64_t alloc_slow_hit; - atomic_uint_fast64_t page_reuse_count; - atomic_uint_fast64_t new_page_count; - - // Free path - atomic_uint_fast64_t free_owner_count; - atomic_uint_fast64_t free_remote_count; - - // Drain operations - atomic_uint_fast64_t drain_count; - atomic_uint_fast64_t drain_blocks; - atomic_uint_fast64_t drain_attempts; - atomic_uint_fast64_t drain_success; - atomic_uint_fast64_t slow_checked_drain; - atomic_uint_fast64_t slow_found_remote; - - // Full page scan (obsolete, kept for historical tracking) - atomic_uint_fast64_t full_scan_checked; - atomic_uint_fast64_t full_scan_found_remote; - atomic_uint_fast64_t eager_drain_scanned; - atomic_uint_fast64_t eager_drain_found; - - // Pending queue - atomic_uint_fast64_t pending_enqueued; - atomic_uint_fast64_t pending_drained; - atomic_uint_fast64_t pending_requeued; -} MF2_Stats; - -// Instantiate structured global state (Quick Win #3b) -static MF2_Config g_mf2_config = { - .enabled = 0, // Will be set by env var - .max_queues = 2, - .lease_ms = 10, - .idle_threshold_us = 150 -}; - -static MF2_Registry g_mf2_registry = { - .all_thread_pages = {0}, - .num_thread_pages = 0, - .adoptable_count = {0}, - .tls_key = 0, - .key_once = PTHREAD_ONCE_INIT -}; - -static MF2_Stats g_mf2_stats = { - // All fields initialized to 0 (atomic zero-initialization is valid) - .alloc_fast_hit = 0, - .alloc_slow_hit = 0, - .page_reuse_count = 0, - .new_page_count = 0, - .free_owner_count = 0, - .free_remote_count = 0, - .drain_count = 0, - .drain_blocks = 0, - .drain_attempts = 0, - .drain_success = 0, - .slow_checked_drain = 0, - .slow_found_remote = 0, - .full_scan_checked = 0, - .full_scan_found_remote = 0, - .eager_drain_scanned = 0, - .eager_drain_found = 0, - .pending_enqueued = 0, - .pending_drained = 0, - .pending_requeued = 0 -}; - -// Compatibility macros: Map old global names to struct fields -// This allows existing code to work unchanged while using structured state -#define g_mf2_enabled (g_mf2_config.enabled) -#define g_mf2_max_queues (g_mf2_config.max_queues) -#define g_mf2_lease_ms (g_mf2_config.lease_ms) -#define g_mf2_idle_threshold_us (g_mf2_config.idle_threshold_us) - -#define g_all_thread_pages (g_mf2_registry.all_thread_pages) -#define g_num_thread_pages (g_mf2_registry.num_thread_pages) -#define g_adoptable_count (g_mf2_registry.adoptable_count) -#define g_mf2_tls_key (g_mf2_registry.tls_key) -#define g_mf2_key_once (g_mf2_registry.key_once) - -#define 
g_mf2_alloc_fast_hit (g_mf2_stats.alloc_fast_hit) -#define g_mf2_alloc_slow_hit (g_mf2_stats.alloc_slow_hit) -#define g_mf2_page_reuse_count (g_mf2_stats.page_reuse_count) -#define g_mf2_new_page_count (g_mf2_stats.new_page_count) -#define g_mf2_free_owner_count (g_mf2_stats.free_owner_count) -#define g_mf2_free_remote_count (g_mf2_stats.free_remote_count) -#define g_mf2_drain_count (g_mf2_stats.drain_count) -#define g_mf2_drain_blocks (g_mf2_stats.drain_blocks) -#define g_mf2_drain_attempts (g_mf2_stats.drain_attempts) -#define g_mf2_drain_success (g_mf2_stats.drain_success) -#define g_mf2_slow_checked_drain (g_mf2_stats.slow_checked_drain) -#define g_mf2_slow_found_remote (g_mf2_stats.slow_found_remote) -#define g_mf2_full_scan_checked (g_mf2_stats.full_scan_checked) -#define g_mf2_full_scan_found_remote (g_mf2_stats.full_scan_found_remote) -#define g_mf2_eager_drain_scanned (g_mf2_stats.eager_drain_scanned) -#define g_mf2_eager_drain_found (g_mf2_stats.eager_drain_found) -#define g_mf2_pending_enqueued (g_mf2_stats.pending_enqueued) -#define g_mf2_pending_drained (g_mf2_stats.pending_drained) -#define g_mf2_pending_requeued (g_mf2_stats.pending_requeued) - -// =========================================================================== -// End of MF2 Data Structures -// =========================================================================== - -// --- MF2 Initialization Functions --- - -// Thread-safe initialization using pthread_once -static pthread_once_t mf2_page_registry_init_control = PTHREAD_ONCE_INIT; -static void mf2_page_registry_init_impl(void) { - // Initialize all page slots to NULL - memset(&g_mf2_page_registry, 0, sizeof(g_mf2_page_registry)); - - // Initialize 256 coarse-grained locks for registry updates - for (int i = 0; i < 256; i++) { - pthread_mutex_init(&g_mf2_page_registry.locks[i], NULL); - } - - // Initialize counters - atomic_store(&g_mf2_page_registry.total_pages, 0); - atomic_store(&g_mf2_page_registry.active_pages, 0); -} -static void mf2_page_registry_init(void) { - pthread_once(&mf2_page_registry_init_control, mf2_page_registry_init_impl); -} - -// Strategy A: ThreadPages destructor (cleanup on thread exit) -static void mf2_thread_pages_destructor(void* arg) { - MF2_ThreadPages* tp = (MF2_ThreadPages*)arg; - if (!tp) return; - - // SAFETY: Don't remove from global registry or free memory - // Reason: Causes "malloc(): unsorted double linked list corrupted" crashes - // Tradeoff: Small memory leak (one ThreadPages struct per thread lifetime) - // TODO: Investigate safe cleanup mechanism - - // Remove from global registry (DISABLED for safety) - // for (int i = 0; i < atomic_load_explicit(&g_num_thread_pages, memory_order_acquire); i++) { - // if (atomic_load_explicit((atomic_uintptr_t*)&g_all_thread_pages[i], memory_order_acquire) == (uintptr_t)tp) { - // atomic_store_explicit((atomic_uintptr_t*)&g_all_thread_pages[i], 0, memory_order_release); - // break; - // } - // } - - // Free all pages owned by this thread (DISABLED for safety) - // hkm_libc_free(tp); - - (void)tp; // Suppress unused warning -} - -// Strategy A: Initialize pthread_key (once only) -static void mf2_init_tls_key(void) { - pthread_key_create(&g_mf2_tls_key, mf2_thread_pages_destructor); -} - -// Helper: rdtsc() - Read CPU timestamp counter (for Route P idle detection) -static inline uint64_t mf2_rdtsc(void) { -#if defined(__x86_64__) || defined(__i386__) - uint32_t lo, hi; - __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); - return ((uint64_t)hi << 32) | lo; -#else - // Fallback for 
non-x86 architectures (use clock_gettime approximation) - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec; -#endif -} - -static MF2_ThreadPages* mf2_thread_pages_get(void) { - if (t_mf2_pages) return t_mf2_pages; - - // Initialize pthread_key (once only) - pthread_once(&g_mf2_key_once, mf2_init_tls_key); - - // Allocate thread-local page lists - MF2_ThreadPages* tp = (MF2_ThreadPages*)hkm_libc_calloc(1, sizeof(MF2_ThreadPages)); - if (!tp) return NULL; - - // Initialize with current thread ID - tp->my_tid = pthread_self(); - - // All page lists start empty (NULL) - for (int c = 0; c < POOL_NUM_CLASSES; c++) { - tp->active_page[c] = NULL; - tp->full_pages[c] = NULL; - atomic_store_explicit(&tp->pages_remote_pending[c], 0, memory_order_relaxed); - atomic_flag_clear_explicit(&tp->pending_claim[c], memory_order_relaxed); - tp->page_count[c] = 0; - } - - // Route P: Initialize activity tracking - atomic_store_explicit(&tp->last_alloc_tsc, mf2_rdtsc(), memory_order_relaxed); - - // Strategy A: Register in global array for round-robin drain - int idx = atomic_fetch_add_explicit(&g_num_thread_pages, 1, memory_order_acq_rel); - if (idx < MF2_MAX_THREADS) { - atomic_store_explicit((atomic_uintptr_t*)&g_all_thread_pages[idx], (uintptr_t)tp, memory_order_release); - - // DEBUG: Log first 10 thread registrations - Disabled for performance - // static _Atomic int reg_samples = 0; - // int rs = atomic_fetch_add_explicit(®_samples, 1, memory_order_relaxed); - // if (rs < 10) { - // fprintf(stderr, "[TLS_REGISTER %d] tid=%lu, tp=%p, idx=%d\n", - // rs, (unsigned long)tp->my_tid, tp, idx); - // } - } else { - MF2_ERROR_LOG("Too many threads! MAX=%d", MF2_MAX_THREADS); - } - - // Set pthread-specific data for destructor - pthread_setspecific(g_mf2_tls_key, tp); - - t_mf2_pages = tp; - return tp; -} - -// --- MF2 Page Allocation & Lookup --- - -// O(1) page lookup from block address (mimalloc's secret sauce!) -static inline MidPage* mf2_addr_to_page(void* addr) { - // Step 1: Get page base address (64KB aligned) - // 0xFFFF = 65535, ~0xFFFF clears bottom 16 bits - void* page_base = (void*)((uintptr_t)addr & ~0xFFFFULL); - - // Step 2: Index into registry (direct-mapped, 64K entries) - // (addr >> 16) extracts page index, & 0xFFFF wraps to registry size - size_t idx = ((uintptr_t)page_base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1); - - // Step 3: Direct lookup (no hash collision handling needed with 64K entries) - MidPage* page = g_mf2_page_registry.pages[idx]; - - // ALIGNMENT VERIFICATION (Step 3) - Sample first 100 lookups - static _Atomic int lookup_count = 0; - // DEBUG: Disabled for performance - // int count = atomic_fetch_add_explicit(&lookup_count, 1, memory_order_relaxed); - // if (count < 100) { - // int found = (page != NULL); - // int match = (page && page->base == page_base); - // fprintf(stderr, "[LOOKUP %d] addr=%p → page_base=%p → idx=%zu → found=%s", - // count, addr, page_base, idx, found ? "YES" : "NO"); - // if (page) { - // fprintf(stderr, ", page->base=%p, match=%s", - // page->base, match ? 
"YES" : "NO"); - // } - // fprintf(stderr, "\n"); - // } - - // Validation: Ensure page base matches (handles potential collisions) - if (page && page->base == page_base) { - return page; - } - - // Collision or not registered (shouldn't happen in normal operation) - return NULL; -} - -// Register a page in the global registry (called once per page allocation) -static void mf2_register_page(MidPage* page) { - if (!page) return; - - // Calculate registry index from page base - size_t idx = ((uintptr_t)page->base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1); - - // ALIGNMENT VERIFICATION (Step 2) - DEBUG: Disabled for performance - // static int register_count = 0; - // if (register_count < 10) { - // fprintf(stderr, "[REGISTER %d] Page %p → idx %zu (aligned=%s)\n", - // register_count, page->base, idx, - // (((uintptr_t)page->base & 0xFFFF) == 0) ? "YES" : "NO"); - // register_count++; - // } - - // Coarse-grained lock (256 locks for 64K entries = 256 entries/lock) - int lock_idx = idx % 256; - pthread_mutex_lock(&g_mf2_page_registry.locks[lock_idx]); - - // Check for collision (should be rare with 64K entries) - if (g_mf2_page_registry.pages[idx] != NULL) { - // Collision detected - this is a problem! - // For MVP, we'll just log and overwrite (TODO: handle collisions properly) - HAKMEM_LOG("[MF2] WARNING: Page registry collision at index %zu\n", idx); - } - - // Register the page - g_mf2_page_registry.pages[idx] = page; - - // Update counters - atomic_fetch_add_explicit(&g_mf2_page_registry.total_pages, 1, memory_order_relaxed); - atomic_fetch_add_explicit(&g_mf2_page_registry.active_pages, 1, memory_order_relaxed); - - pthread_mutex_unlock(&g_mf2_page_registry.locks[lock_idx]); -} - -// Unregister a page from the global registry (called when returning page to OS) -__attribute__((unused)) static void mf2_unregister_page(MidPage* page) { - if (!page) return; - - size_t idx = ((uintptr_t)page->base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1); - int lock_idx = idx % 256; - - pthread_mutex_lock(&g_mf2_page_registry.locks[lock_idx]); - - if (g_mf2_page_registry.pages[idx] == page) { - g_mf2_page_registry.pages[idx] = NULL; - atomic_fetch_sub_explicit(&g_mf2_page_registry.active_pages, 1, memory_order_relaxed); - } - - pthread_mutex_unlock(&g_mf2_page_registry.locks[lock_idx]); -} - -// Allocate and initialize a new 64KB page for given size class -static MidPage* mf2_alloc_new_page(int class_idx) { - if (class_idx < 0 || class_idx >= POOL_NUM_CLASSES) return NULL; - - // Get user size class (2KB, 4KB, 8KB, 16KB, 32KB) - size_t user_size = g_class_sizes[class_idx]; - if (user_size == 0) return NULL; // Dynamic class disabled - - // CRITICAL FIX: Each block needs HEADER_SIZE + user_size - // The header stores metadata (AllocHeader), user_size is the usable space - size_t block_size = HEADER_SIZE + user_size; - - // Step 1: Allocate 64KB page (aligned to 64KB boundary) - // CRITICAL FIX #4: Must ensure 64KB alignment! - // mmap() only guarantees 4KB alignment, breaking addr_to_page() lookup. - // This caused 97% of frees to fail silently (fatal bug!) - // - // CRITICAL FIX: Use mmap() + alignment adjustment to avoid recursion! - // Using wrapped posix_memalign with WRAP_L2=1 causes infinite recursion. 
- - // Allocate 2x size to allow alignment adjustment - size_t alloc_size = POOL_PAGE_SIZE * 2; // 128KB - void* raw = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (raw == MAP_FAILED) { - return NULL; // OOM - } - - // Find 64KB aligned address within allocation - uintptr_t addr = (uintptr_t)raw; - uintptr_t aligned = (addr + 0xFFFF) & ~0xFFFFULL; // Round up to 64KB boundary - void* page_base = (void*)aligned; - - // Free unused prefix (if any) - size_t prefix_size = aligned - addr; - if (prefix_size > 0) { - munmap(raw, prefix_size); - } - - // Free unused suffix - size_t suffix_offset = prefix_size + POOL_PAGE_SIZE; - if (suffix_offset < alloc_size) { - munmap((char*)raw + suffix_offset, alloc_size - suffix_offset); - } - - // DEBUG: Log first few allocations - static _Atomic int mmap_count = 0; - int mc = atomic_fetch_add_explicit(&mmap_count, 1, memory_order_relaxed); - if (mc < 5) { - MF2_DEBUG_LOG("MMAP_ALLOC %d: raw=%p, aligned=%p, prefix=%zu, suffix=%zu", - mc, raw, page_base, prefix_size, alloc_size - suffix_offset); - } - - // ALIGNMENT VERIFICATION (Step 1) - if (((uintptr_t)page_base & 0xFFFF) != 0) { - MF2_ERROR_LOG("ALIGNMENT BUG: Page %p not 64KB aligned! (offset=%zu)", - page_base, ((uintptr_t)page_base & 0xFFFF)); - } - - // Zero-fill (required for posix_memalign) - // Note: This adds ~15μs overhead, but is necessary for correctness - memset(page_base, 0, POOL_PAGE_SIZE); - - // Step 2: Allocate MidPage descriptor - MidPage* page = (MidPage*)hkm_libc_calloc(1, sizeof(MidPage)); - if (!page) { - // CRITICAL FIX: Use munmap for mmap-allocated memory - munmap(page_base, POOL_PAGE_SIZE); - return NULL; - } - - // Step 3: Initialize page descriptor - page->base = page_base; - page->class_idx = (uint8_t)class_idx; - page->flags = 0; - page->owner_tid = pthread_self(); - page->owner_tp = mf2_thread_pages_get(); // Store owner's ThreadPages for pending queue - page->last_transfer_time = 0; // No transfer yet (lease mechanism) - - // Step 4: Build freelist chain (walk through page and link blocks) - // Calculate how many blocks fit in 64KB page (including header overhead) - size_t usable_size = POOL_PAGE_SIZE; - size_t num_blocks = usable_size / block_size; - - page->capacity = (uint16_t)num_blocks; - page->free_count = (uint16_t)num_blocks; - - // Build linked list of free blocks - PoolBlock* freelist_head = NULL; - PoolBlock* freelist_tail = NULL; - - for (size_t i = 0; i < num_blocks; i++) { - char* block_addr = (char*)page_base + (i * block_size); - PoolBlock* block = (PoolBlock*)block_addr; - - block->next = NULL; - - if (freelist_head == NULL) { - freelist_head = block; - freelist_tail = block; - } else { - freelist_tail->next = block; - freelist_tail = block; - } - } - - page->freelist = freelist_head; - - // Step 5: Initialize remote stack (for cross-thread frees) - atomic_store(&page->remote_head, (uintptr_t)0); - atomic_store(&page->remote_count, 0); - - // Step 6: Initialize lifecycle counters - atomic_store(&page->in_use, 0); // No blocks allocated yet - atomic_store(&page->pending_dn, 0); - - // Step 7: Initialize linkage - page->next_page = NULL; - page->prev_page = NULL; - - // Initialize pending queue fields - atomic_store_explicit(&page->in_remote_pending, false, memory_order_relaxed); - page->next_pending = NULL; - - // Step 8: Register page in global registry - mf2_register_page(page); - - return page; -} - -// --- MF2 Allocation & Free Operations --- - -// Forward declarations -static void 
mf2_enqueue_pending(MF2_ThreadPages* owner_tp, MidPage* page); - -// Drain remote frees (cross-thread) into page's local freelist -// Called by owner thread when local freelist is empty -static int mf2_drain_remote_frees(MidPage* page) { - if (!page) return 0; - - atomic_fetch_add(&g_mf2_drain_attempts, 1); - - // Check if there are any remote frees (FIX #6: use seq_cst to ensure total ordering - DEBUG) - unsigned int remote_count = atomic_load_explicit(&page->remote_count, memory_order_seq_cst); - if (remote_count == 0) { - return 0; // Nothing to drain - } - - // Atomically swap remote stack head with NULL (lock-free pop all) - uintptr_t head = atomic_exchange_explicit(&page->remote_head, (uintptr_t)0, - memory_order_acq_rel); - if (!head) { - atomic_store_explicit(&page->remote_count, 0, memory_order_release); - return 0; // Race: someone else drained it - } - - // Reset remote count (FIX #6: use release for future drain checks to see) - atomic_store_explicit(&page->remote_count, 0, memory_order_release); - - // Walk the remote stack and count blocks - int drained = 0; - PoolBlock* cur = (PoolBlock*)head; - PoolBlock* tail = NULL; - - while (cur) { - drained++; - tail = cur; - cur = cur->next; - } - - // Append remote stack to local freelist (splice in front for simplicity) - if (tail) { - tail->next = page->freelist; - page->freelist = (PoolBlock*)head; - page->free_count += drained; - } - - atomic_fetch_add(&g_mf2_drain_count, 1); - atomic_fetch_add(&g_mf2_drain_blocks, drained); - - // CRITICAL FIX: Check if new remotes arrived DURING drain - // If so, re-enqueue to owner's pending queue (avoid losing remotes!) - unsigned int post_drain_count = atomic_load_explicit(&page->remote_count, memory_order_acquire); - if (post_drain_count >= 1 && page->owner_tp) { // Use same threshold as initial enqueue - // New remotes arrived during drain, re-enqueue for next round - // Note: This is safe because flag was cleared earlier - mf2_enqueue_pending(page->owner_tp, page); - } - - return drained; -} - -// =========================================================================== -// Pending Queue Operations (MPSC Lock-Free Stack) -// =========================================================================== - -// Enqueue page to owner's pending queue (called by remote threads) -// MPSC: Multiple producers (remote free threads), single consumer (owner) -static void mf2_enqueue_pending(MF2_ThreadPages* owner_tp, MidPage* page) { - if (!owner_tp || !page) return; - - // Already in pending? 
Skip (avoid duplicate enqueue) - _Bool was_pending = atomic_exchange_explicit(&page->in_remote_pending, true, memory_order_acq_rel); - if (was_pending) { - return; // Already enqueued, nothing to do - } - - atomic_fetch_add(&g_mf2_pending_enqueued, 1); - - // Push to owner's pending stack (Treiber stack algorithm) - uintptr_t old_head; - do { - old_head = atomic_load_explicit(&owner_tp->pages_remote_pending[page->class_idx], memory_order_relaxed); - page->next_pending = (MidPage*)old_head; - } while (!atomic_compare_exchange_weak_explicit( - &owner_tp->pages_remote_pending[page->class_idx], - &old_head, (uintptr_t)page, - memory_order_release, // Publish page - memory_order_relaxed)); - - // 0→1 detection: Increment adoptable count for this class - // This enables O(1) early return in try_adopt (if count==0, no scan needed) - if (old_head == 0) { - atomic_fetch_add_explicit(&g_adoptable_count[page->class_idx], 1, memory_order_relaxed); - } -} - -// Dequeue one page from pending queue (called by owner thread or adopter) -// Uses CAS for correctness (multi-consumer in adoption path) -static MidPage* mf2_dequeue_pending(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return NULL; - - uintptr_t old_head; - do { - old_head = atomic_load_explicit(&tp->pages_remote_pending[class_idx], memory_order_acquire); - if (old_head == 0) { - return NULL; // Queue empty - } - MidPage* page = (MidPage*)old_head; - - // CAS to pop head - if (atomic_compare_exchange_weak_explicit( - &tp->pages_remote_pending[class_idx], - &old_head, (uintptr_t)page->next_pending, - memory_order_acq_rel, memory_order_relaxed)) { - // Successfully dequeued - MidPage* next = page->next_pending; - page->next_pending = NULL; // Clear link - - // If queue became empty (next==NULL), decrement adoptable count - // This enables O(1) early return in try_adopt when all queues empty - if (next == NULL) { - atomic_fetch_sub_explicit(&g_adoptable_count[class_idx], 1, memory_order_relaxed); - } - - return page; - } - } while (1); -} - -// =========================================================================== -// End of Pending Queue Operations -// =========================================================================== - -// Forward declarations -static void* mf2_alloc_slow(int class_idx, size_t size, uintptr_t site_id); - -// =========================================================================== -// Helper Functions (Clean & Modular) -// =========================================================================== - -// Helper: Make page active (move old active to full_pages) -static inline void mf2_make_page_active(MF2_ThreadPages* tp, int class_idx, MidPage* page) { - if (!tp || !page) return; - - // Move old active page to full_pages (if any) - if (tp->active_page[class_idx]) { - MidPage* old_active = tp->active_page[class_idx]; - old_active->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = old_active; - } - - // Set new page as active - tp->active_page[class_idx] = page; - page->next_page = NULL; -} - -// Helper: Drain page and add to partial list (LIFO for cache locality) -// Returns true if page has free blocks after drain -static inline bool mf2_try_drain_to_partial(MF2_ThreadPages* tp, int class_idx, MidPage* page) { - if (!tp || !page) return false; - - // Drain remote frees - int drained = mf2_drain_remote_frees(page); - - // If page has freelist after drain, add to partial list (LIFO) - if (page->freelist) { - atomic_fetch_add(&g_mf2_page_reuse_count, 1); - page->next_page = 
tp->partial_pages[class_idx]; - tp->partial_pages[class_idx] = page; - return true; - } - - // No freelist, return to full_pages - page->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = page; - return false; -} - -// Helper: Drain page and activate if successful (Direct Handoff - backward compat) -// Returns true if page was activated -static inline bool mf2_try_drain_and_activate(MF2_ThreadPages* tp, int class_idx, MidPage* page) { - if (!tp || !page) return false; - - // Drain remote frees - int drained = mf2_drain_remote_frees(page); - - // If page has freelist after drain, make it active immediately - if (page->freelist) { - atomic_fetch_add(&g_mf2_page_reuse_count, 1); - mf2_make_page_active(tp, class_idx, page); - return true; - } - - // No freelist, return to full_pages - page->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = page; - return false; -} - -// Helper: Try to reuse pages from own pending queue (must-reuse gate part 1) -// Returns true if a page was successfully drained and activated -static bool mf2_try_reuse_own_pending(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return false; - - // Budget: Process up to N pages to avoid blocking - for (int budget = 0; budget < MF2_PENDING_QUEUE_BUDGET; budget++) { - MidPage* pending_page = mf2_dequeue_pending(tp, class_idx); - if (!pending_page) break; // Queue empty - - atomic_fetch_add(&g_mf2_pending_drained, 1); - - // Clear pending flag (no longer in queue) - atomic_store_explicit(&pending_page->in_remote_pending, false, memory_order_release); - - // DIRECT HANDOFF: Drain and activate if successful - if (mf2_try_drain_and_activate(tp, class_idx, pending_page)) { - return true; // Success! Page is now active - } - // No freelist after drain, page returned to full_pages by helper - } - return false; // No pages available for reuse -} - -// Helper: Try to drain remotes from active page (must-reuse gate part 2) -// Returns true if active page has freelist after drain -static bool mf2_try_drain_active_remotes(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return false; - - MidPage* page = tp->active_page[class_idx]; - if (!page) return false; - - atomic_fetch_add(&g_mf2_slow_checked_drain, 1); - unsigned int remote_cnt = atomic_load_explicit(&page->remote_count, memory_order_seq_cst); - - if (remote_cnt > 0) { - atomic_fetch_add(&g_mf2_slow_found_remote, 1); - int drained = mf2_drain_remote_frees(page); - if (drained > 0 && page->freelist) { - atomic_fetch_add(&g_mf2_drain_success, 1); - return true; // Success! 
Active page now has freelist - } - } - return false; // No remotes or drain failed -} - -// Helper: Allocate new page and make it active -// Returns the newly allocated page (or NULL on OOM) -static MidPage* mf2_alloc_and_activate_new_page(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return NULL; - - atomic_fetch_add(&g_mf2_new_page_count, 1); - - // DEBUG: Log why we're allocating new page (first N samples) - static _Atomic int new_page_samples = 0; - int sample_idx = atomic_fetch_add_explicit(&new_page_samples, 1, memory_order_relaxed); - if (sample_idx < MF2_DEBUG_SAMPLE_COUNT) { - // Count adoptable pages across all threads - int total_adoptable = 0; - for (int i = 0; i < POOL_NUM_CLASSES; i++) { - total_adoptable += atomic_load_explicit(&g_adoptable_count[i], memory_order_relaxed); - } - MF2_DEBUG_LOG("NEW_PAGE %d: class=%d, own_pending=%p, adoptable_total=%d, active=%p, full=%p", - sample_idx, class_idx, - (void*)atomic_load_explicit(&tp->pages_remote_pending[class_idx], memory_order_relaxed), - total_adoptable, - tp->active_page[class_idx], - tp->full_pages[class_idx]); - } - - MidPage* page = mf2_alloc_new_page(class_idx); - if (!page) { - return NULL; // OOM - } - - // Move current active page to full list (if any) - if (tp->active_page[class_idx]) { - MidPage* old_page = tp->active_page[class_idx]; - old_page->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = old_page; - } - - // Set new page as active - tp->active_page[class_idx] = page; - tp->page_count[class_idx]++; - - return page; -} - -// =========================================================================== -// End of Helper Functions -// =========================================================================== - -// Consumer-Driven Adoption: Try to adopt a page from ANY thread's pending queue -// Returns true if a page was successfully adopted and activated -// Called from alloc_slow when allocating thread needs memory -static bool mf2_try_adopt_pending(MF2_ThreadPages* me, int class_idx) { - if (!me) return false; - - // IMMEDIATE FIX #1: Early return if no adoptable pages (O(1) gating) - // Avoids scanning empty queues (major performance win!) - int adoptable = atomic_load_explicit(&g_adoptable_count[class_idx], memory_order_relaxed); - if (adoptable == 0) return false; // All queues empty, no scan needed - - // Get global thread registry - int num_tp = atomic_load_explicit(&g_num_thread_pages, memory_order_acquire); - if (num_tp == 0) return false; - - // IMMEDIATE FIX #2: Limit scan to MAX_QUEUES threads (configurable via HAKMEM_MF2_MAX_QUEUES) - // Prevents excessive scanning overhead (2-8 threads is usually enough) - int scan_limit = (num_tp < g_mf2_max_queues) ? num_tp : g_mf2_max_queues; - - // Round-robin scan (limited number of threads, not ALL!) 
- static _Atomic uint64_t adopt_counter = 0; - uint64_t start_idx = atomic_fetch_add_explicit(&adopt_counter, 1, memory_order_relaxed); - - for (int i = 0; i < scan_limit; i++) { - int tp_idx = (start_idx + i) % num_tp; - MF2_ThreadPages* other_tp = (MF2_ThreadPages*)atomic_load_explicit( - (atomic_uintptr_t*)&g_all_thread_pages[tp_idx], memory_order_acquire); - - if (!other_tp) continue; - - // Route P: Idle Detection - Only adopt from idle owners - // Check if owner is still actively allocating (threshold configurable via env var) - uint64_t now_tsc = mf2_rdtsc(); - uint64_t owner_last_alloc = atomic_load_explicit(&other_tp->last_alloc_tsc, memory_order_relaxed); - uint64_t idle_threshold_cycles = (uint64_t)g_mf2_idle_threshold_us * MF2_TSC_CYCLES_PER_US; - - if ((now_tsc - owner_last_alloc) < idle_threshold_cycles) { - continue; // Owner still active, skip adoption - } - - // IMMEDIATE FIX #3: Claim exclusive access (prevent multi-consumer CAS thrashing!) - // Only one thread scans each queue at a time → eliminates CAS contention - if (atomic_flag_test_and_set_explicit(&other_tp->pending_claim[class_idx], memory_order_acquire)) { - continue; // Another thread is already scanning this queue, skip - } - - // Try to dequeue a pending page from this thread - MidPage* page = mf2_dequeue_pending(other_tp, class_idx); - if (!page) { - // Queue empty, release claim and try next thread - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - continue; - } - - // Clear pending flag (no longer in queue) - atomic_store_explicit(&page->in_remote_pending, false, memory_order_release); - - // Check lease: Has enough time passed since last transfer? (configurable via HAKMEM_MF2_LEASE_MS) - // 0ms = disabled (no lease check), >0 = lease period in milliseconds - uint64_t now = mf2_rdtsc(); - uint64_t last_transfer = page->last_transfer_time; - if (g_mf2_lease_ms > 0 && last_transfer != 0) { - // Calculate lease cycles from ms (approx 3GHz CPU) - uint64_t lease_cycles = (uint64_t)g_mf2_lease_ms * (MF2_TSC_CYCLES_PER_US * 1000ULL); - if ((now - last_transfer) < lease_cycles) { - // Lease still active, return page to full_pages (don't thrash ownership) - page->next_page = other_tp->full_pages[class_idx]; - other_tp->full_pages[class_idx] = page; - // Release claim before continuing - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - continue; // Try next thread - } - } - - // Try to transfer ownership using CAS - pthread_t old_owner = page->owner_tid; - pthread_t new_owner = pthread_self(); - - // Note: pthread_t may not be atomic-compatible on all platforms - // For now, we'll use a simple write (ownership transfer is rare) - // TODO: If thrashing is observed, add atomic CAS with serialization - page->owner_tid = new_owner; - page->owner_tp = me; - page->last_transfer_time = now; - - // DEBUG: Log drain state - static _Atomic int adopt_samples = 0; - int sample_idx = atomic_fetch_add_explicit(&adopt_samples, 1, memory_order_relaxed); - unsigned int pre_remote = atomic_load_explicit(&page->remote_count, memory_order_relaxed); - unsigned int pre_free = page->free_count; - PoolBlock* pre_freelist = page->freelist; - - // Drain remote frees - int drained = mf2_drain_remote_frees(page); - - // DEBUG: Log result (first 10 samples) - if (sample_idx < 10) { - MF2_DEBUG_LOG("ADOPT_DRAIN %d: class=%d, remote_cnt=%u, drained=%d, pre_free=%u, post_free=%u, pre_freelist=%p, post_freelist=%p", - sample_idx, class_idx, pre_remote, drained, - 
pre_free, page->free_count, pre_freelist, page->freelist); - } - - // Make adopted page ACTIVE immediately (not partial!) - // Adoption needs immediate activation for caller's mf2_alloc_fast() - // Partial list is only for own pending queue drains - if (page->freelist) { - atomic_fetch_add(&g_mf2_page_reuse_count, 1); - atomic_fetch_add(&g_mf2_pending_drained, 1); - atomic_fetch_add(&g_mf2_drain_success, 1); - - // Make it active (move old active to full_pages) - mf2_make_page_active(me, class_idx, page); - - // Release claim before returning SUCCESS - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - return true; // SUCCESS! Page adopted and activated - } - - // No freelist after drain, return to MY full_pages (I'm the new owner!) - page->next_page = me->full_pages[class_idx]; - me->full_pages[class_idx] = page; - // Release claim before continuing search - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - // Continue searching for a better page - } - - return false; // No adoptable pages found -} - -// Fast allocation path (owner thread, NO LOCK!) -static inline void* mf2_alloc_fast(int class_idx, size_t size, uintptr_t site_id) { - // Get thread-local page lists - MF2_ThreadPages* tp = mf2_thread_pages_get(); - if (!tp) return NULL; - - // Get active page for this class - MidPage* page = tp->active_page[class_idx]; - if (!page) { - // No active page, go to slow path - return mf2_alloc_slow(class_idx, size, site_id); - } - - // FAST PATH: Pop from page-local freelist (NO LOCK!) - if (page->freelist) { - atomic_fetch_add(&g_mf2_alloc_fast_hit, 1); - - // Route P: Update activity tracking for idle detection - atomic_store_explicit(&tp->last_alloc_tsc, mf2_rdtsc(), memory_order_relaxed); - - PoolBlock* block = page->freelist; - page->freelist = block->next; - page->free_count--; - - // Increment in-use count (atomic for cross-thread visibility) - atomic_fetch_add_explicit(&page->in_use, 1, memory_order_relaxed); - - // Return user pointer (skip header) - return (char*)block + HEADER_SIZE; - } - - // Local freelist empty, go to slow path - return mf2_alloc_slow(class_idx, size, site_id); -} - -// Slow allocation path (drain remote or allocate new page) -static void* mf2_alloc_slow(int class_idx, size_t size, uintptr_t site_id) { - (void)site_id; // Unused for now - - atomic_fetch_add(&g_mf2_alloc_slow_hit, 1); - - // Get thread-local page lists - MF2_ThreadPages* tp = mf2_thread_pages_get(); - if (!tp) return NULL; - - // =========================================================================== - // Allocation Strategy (Must-Reuse Order) - // =========================================================================== - // 1. MUST-REUSE GATE (Part 1): Drain own pending queue - // - Process up to 4 pages to avoid blocking - // - Direct handoff: activate first successful drain immediately - if (mf2_try_reuse_own_pending(tp, class_idx)) { - return mf2_alloc_fast(class_idx, size, site_id); - } - - // 2. 
MUST-REUSE GATE (Part 2): Drain active page remotes - // - Check if current active page has remote frees - // - Drain and retry allocation if successful - if (mf2_try_drain_active_remotes(tp, class_idx)) { - return mf2_alloc_fast(class_idx, size, site_id); - } - - // HISTORICAL NOTE: full_pages scan removed - // Old approach: Scan full_pages looking for pages with remotes - // Problem: Drained pages consumed before owner can scan them - // New approach: Direct Handoff immediately activates drained pages - // Result: full_pages scan always finds 0 pages (100% waste) - // - // Benchmark evidence (before removal): - // - Full scan checked: 1,879,484 pages - // - Full scan found: 0 pages (0% success rate!) - - // 3. Consumer-Driven Adoption (Route P with idle detection) - // - Only adopt from idle owners (haven't allocated in >150µs) - // - Prevents "adoption stealing" from active owners - if (mf2_try_adopt_pending(tp, class_idx)) { - return mf2_alloc_fast(class_idx, size, site_id); - } - - // 4. MUST-REUSE GATE (Final): Allocate new page (last resort) - // - Only reached after exhausting all reuse opportunities - // - Order: pending queue → active drain → adoption → NEW - MidPage* page = mf2_alloc_and_activate_new_page(tp, class_idx); - if (!page) { - return NULL; // OOM - } - - // Retry allocation from new page - return mf2_alloc_fast(class_idx, size, site_id); -} - -// Forward declaration of slow free path -static void mf2_free_slow(MidPage* page, void* ptr); - -// Strategy A: Global Round-Robin Drain (Cross-Thread Pending Queue) -// Fast free path (owner thread, NO LOCK!) -static inline void mf2_free_fast(MidPage* page, void* ptr) { - if (!page || !ptr) return; - - atomic_fetch_add(&g_mf2_free_owner_count, 1); - - // Get block pointer (rewind to header) - PoolBlock* block = (PoolBlock*)((char*)ptr - HEADER_SIZE); - - // FAST PATH: Push to page-local freelist (NO LOCK!) 
- block->next = page->freelist; - page->freelist = block; - page->free_count++; - - // Decrement in-use count (atomic for cross-thread visibility) - int old_in_use = atomic_fetch_sub_explicit(&page->in_use, 1, memory_order_release); - - // Check if page is now empty (all blocks free) - if (old_in_use == 1 && page->free_count == page->capacity) { - // Memory efficiency: Return empty pages to OS via MADV_DONTNEED - // Keeps VA mapped (no munmap), but releases physical memory - hak_batch_add_page(page->base, POOL_PAGE_SIZE); - } -} - -// Slow free path (cross-thread free to remote stack) -static void mf2_free_slow(MidPage* page, void* ptr) { - if (!page || !ptr) return; - - atomic_fetch_add(&g_mf2_free_remote_count, 1); - - // Get block pointer - PoolBlock* block = (PoolBlock*)((char*)ptr - HEADER_SIZE); - - // Push to page's remote stack (lock-free MPSC) - uintptr_t old_head; - do { - old_head = atomic_load_explicit(&page->remote_head, memory_order_acquire); - block->next = (PoolBlock*)old_head; - } while (!atomic_compare_exchange_weak_explicit( - &page->remote_head, &old_head, (uintptr_t)block, - memory_order_release, memory_order_relaxed)); - - // Increment remote count and detect threshold for enqueueing - unsigned int old_count = atomic_fetch_add_explicit(&page->remote_count, 1, memory_order_seq_cst); - - // CRITICAL FIX: Use threshold-based enqueueing instead of 0→1 edge - // Problem: 0→1 causes ping-pong (drain 1 block → next free triggers 0→1 again) - // Solution: Only enqueue when remotes accumulate to threshold (better batching) - // - // Threshold values (configurable via HAKMEM_MF2_ENQUEUE_THRESHOLD, default=4): - // 1 = immediate (0→1 edge, causes ping-pong) - // 4 = balanced (batch 4 blocks before notifying owner) - // 8 = aggressive batching (higher latency, but better efficiency) - // - // We enqueue on transitions TO the threshold (old_count == threshold-1) - static int g_enqueue_threshold = 1; // 1=immediate (0→1 edge), 2=batch-2, 4=batch-4 - if (old_count + 1 == (unsigned int)g_enqueue_threshold) { - // Remote count just reached threshold, notify owner - if (page->owner_tp) { - mf2_enqueue_pending(page->owner_tp, page); - } - } - - // DEBUG: Sample first 10 remote frees - Disabled for performance - // static _Atomic int remote_free_samples = 0; - // int sample = atomic_fetch_add_explicit(&remote_free_samples, 1, memory_order_relaxed); - // if (sample < 10) { - // fprintf(stderr, "[REMOTE_FREE %d] ptr=%p → page=%p (base=%p), remote_count=%u (was %u), EDGE=%s\n", - // sample, ptr, page, page->base, old_count + 1, old_count, (old_count == 0) ? "YES" : "NO"); - // } - - // Decrement in-use count - int old_in_use = atomic_fetch_sub_explicit(&page->in_use, 1, memory_order_release); - - // Check if page is now empty (FIX #6: acquire to see all remote frees) - if (old_in_use == 1 && page->free_count + atomic_load_explicit(&page->remote_count, memory_order_acquire) >= page->capacity) { - // Memory efficiency: Return empty pages to OS via MADV_DONTNEED - // Keeps VA mapped (no munmap), but releases physical memory - hak_batch_add_page(page->base, POOL_PAGE_SIZE); - } -} - -// Top-level free dispatcher -static void mf2_free(void* ptr) { - if (!ptr) return; - - // O(1) page lookup (mimalloc's magic!) 
- MidPage* page = mf2_addr_to_page(ptr); - if (!page) { - // Not a MF2 page (shouldn't happen if MF2 is enabled properly) - return; - } - - // Check if we're the owner (fast path) - MF2_ThreadPages* tp = mf2_thread_pages_get(); - - if (tp && page->owner_tid == tp->my_tid) { - // Fast: Owner thread, push to local freelist (NO LOCK!) - mf2_free_fast(page, ptr); - } else { - // Slow: Cross-thread free, push to remote stack (lock-free) - mf2_free_slow(page, ptr); - } -} - -// =========================================================================== -// Global pool state (simplified: single-threaded for MVP) -static struct { - PoolBlock* freelist[POOL_NUM_CLASSES][POOL_NUM_SHARDS]; - - // Locks: per (class, shard) freelist to allow concurrent operations - PaddedMutex freelist_locks[POOL_NUM_CLASSES][POOL_NUM_SHARDS]; - - // Non-empty bitmap (O(1) empty class skip) - // Bit i = 1 if freelist[class][shard] is non-empty - // Use atomic to avoid class-wide locks - atomic_uint_fast64_t nonempty_mask[POOL_NUM_CLASSES]; // 1 bit per shard - - // Remote-free MPSC stacks per (class, shard): lock-free producers, drained under lock on alloc - atomic_uintptr_t remote_head[POOL_NUM_CLASSES][POOL_NUM_SHARDS]; - atomic_uint remote_count[POOL_NUM_CLASSES][POOL_NUM_SHARDS]; - - // Statistics - uint64_t hits[POOL_NUM_CLASSES] __attribute__((aligned(64))); - uint64_t misses[POOL_NUM_CLASSES] __attribute__((aligned(64))); - uint64_t refills[POOL_NUM_CLASSES] __attribute__((aligned(64))); - uint64_t frees[POOL_NUM_CLASSES] __attribute__((aligned(64))); - uint64_t total_bytes_allocated __attribute__((aligned(64))); - uint64_t total_pages_allocated __attribute__((aligned(64))); - - // Per-class page accounting (for Soft CAP guidance) - uint64_t pages_by_class[POOL_NUM_CLASSES] __attribute__((aligned(64))); - - // ACE: per-class bundle factor for refill (1..4) + last snapshot - int bundle_factor[POOL_NUM_CLASSES]; - uint64_t last_hits[POOL_NUM_CLASSES]; - uint64_t last_misses[POOL_NUM_CLASSES]; - - int initialized; - int tls_free_enabled; // env: HAKMEM_POOL_TLS_FREE (default: 1) - - // Extra metrics (for learner logging): all relaxed atomics - atomic_uint_fast64_t trylock_attempts __attribute__((aligned(64))); - atomic_uint_fast64_t trylock_success __attribute__((aligned(64))); - atomic_uint_fast64_t ring_underflow __attribute__((aligned(64))); -} g_pool; - -static int g_wrap_l2_enabled = 0; // env: HAKMEM_WRAP_L2=1 to allow in wrappers -static int g_shard_mix_enabled = 0; // env: HAKMEM_SHARD_MIX=1 to enable stronger hashing -static int g_tls_ring_enabled = 1; // env: HAKMEM_POOL_TLS_RING=1 to enable TLS ring -static int g_trylock_probes = 3; // env: HAKMEM_TRYLOCK_PROBES (1..8) -static int g_ring_return_div = 2; // env: HAKMEM_RING_RETURN_DIV (2=half, 3=third) -static int g_tls_lo_max = 256; // env: HAKMEM_TLS_LO_MAX (LIFO size cap) -int g_hdr_light_enabled = 0; // env: HAKMEM_HDR_LIGHT=1 (minimize extra fields), =2 (no header writes/validation) -static int g_pool_min_bundle = 2; // env: HAKMEM_POOL_MIN_BUNDLE (default 2) -// Sampled counter updates to reduce hot-path stores: 1/2^k -static int g_count_sample_exp = 10; // env: HAKMEM_POOL_COUNT_SAMPLE (0..16) -static __thread uint32_t t_pool_rng = 0x243f6a88u; // per-thread RNG for sampling - -// Size class table (for O(1) lookup). Index 5/6 are Bridge classes for 32-64KB gap. 
-// 7 classes including Bridge classes (40KB, 52KB) to fill 32-64KB gap -static size_t g_class_sizes[POOL_NUM_CLASSES] = { - POOL_CLASS_2KB, // 2 KB - POOL_CLASS_4KB, // 4 KB - POOL_CLASS_8KB, // 8 KB - POOL_CLASS_16KB, // 16 KB - POOL_CLASS_32KB, // 32 KB - POOL_CLASS_40KB, // 40 KB (Bridge class 0) - POOL_CLASS_52KB // 52 KB (Bridge class 1) -}; - -// Blocks per page (for each class) -__attribute__((unused)) static const int g_blocks_per_page[POOL_NUM_CLASSES] = { - POOL_PAGE_SIZE / POOL_CLASS_2KB, // 32 blocks (2KiB) - POOL_PAGE_SIZE / POOL_CLASS_4KB, // 16 blocks (4KiB) - POOL_PAGE_SIZE / POOL_CLASS_8KB, // 8 blocks (8KiB) - POOL_PAGE_SIZE / POOL_CLASS_16KB, // 4 blocks (16KiB) - POOL_PAGE_SIZE / POOL_CLASS_32KB, // 2 blocks (32KiB) - POOL_PAGE_SIZE / POOL_CLASS_40KB, // 1 block (40KiB Bridge) - POOL_PAGE_SIZE / POOL_CLASS_52KB // 1 block (52KiB Bridge) -}; - -// =========================================================================== -// Helper Functions -// =========================================================================== - -// Write minimal header for Mid allocation (fast-return friendly) -static inline void mid_set_header(AllocHeader* hdr, size_t class_sz, uintptr_t site_id) { - // For Mid, prefer headerless operation when HDR_LIGHT>=1. - // Debug or non-Mid callers can still write full headers elsewhere. - if (g_hdr_light_enabled >= 1) return; // skip header on alloc hot path - hdr->magic = HAKMEM_MAGIC; - hdr->method = ALLOC_METHOD_POOL; - hdr->size = class_sz; - if (!g_hdr_light_enabled) { - hdr->alloc_site = site_id; - hdr->class_bytes = 0; - hdr->owner_tid = (uintptr_t)(uintptr_t)pthread_self(); - } -} - -// Branchless LUT (Lookup Table) for O(1) class determination -// Expanded to 53 entries for Bridge classes (40KB, 52KB) -static const uint8_t SIZE_TO_CLASS[53] = { - 0,0,0, // 0-2KB → Class 0 - 1,1, // 3-4KB → Class 1 - 2,2,2,2, // 5-8KB → Class 2 - 3,3,3,3,3,3,3,3, // 9-16KB → Class 3 - 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, // 17-32KB → Class 4 - 5,5,5,5,5,5,5,5, // 33-40KB → Class 5 (Bridge class 0) - 6,6,6,6,6,6,6,6,6,6,6,6 // 41-52KB → Class 6 (Bridge class 1) -}; - -// Get size class index from size (0-6, or -1 if out of range) -// Updated range check for Bridge classes (0-52KB) -static inline int hak_pool_get_class_index(size_t size) { - // Fast path: exact match against configured class sizes (covers Bridge classes) - // Note: size passed here should already be a rounded class size from ACE. - for (int i = 0; i < POOL_NUM_CLASSES; i++) { - size_t cs = g_class_sizes[i]; - if (cs != 0 && size == cs) return i; - } - // Fallback: map arbitrary size to nearest fixed class range via LUT (legacy behavior) - uint32_t kb = (uint32_t)((size + 1023) >> 10); // Round up to KB units - return (kb < 53) ? 
SIZE_TO_CLASS[kb] : -1; // Expanded to 53KB for Bridge classes -} - -// Get shard index from site_id (0-63) -int hak_pool_get_shard_index(uintptr_t site_id) { - if (!g_shard_mix_enabled) { - // Legacy: Shift by 4 to reduce collision (instruction alignment) - return (int)((site_id >> 4) & (POOL_NUM_SHARDS - 1)); - } - // SplitMix64-like mixer with thread id salt for better dispersion - uint64_t x = (uint64_t)site_id; - uint64_t tid = (uint64_t)(uintptr_t)pthread_self(); - x ^= (tid << 1); - x += 0x9e3779b97f4a7c15ULL; - x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL; - x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL; - x = (x ^ (x >> 31)); - return (int)((uint32_t)x & (POOL_NUM_SHARDS - 1)); -} - -// TLS helpers -#include "box/pool_tls_core.inc.h" - - -// Refill/ACE (boxed) -#include "box/pool_refill.inc.h" - -// Init/Shutdown + MF2 debug (boxed) -#include "box/pool_init_api.inc.h" - -// Pool statistics (boxed) -#include "box/pool_stats.inc.h" - -// Public API (boxed): alloc/free/lookup/free_fast -#include "box/pool_api.inc.h" diff --git a/core/hakmem_pool.c.bak3 b/core/hakmem_pool.c.bak3 deleted file mode 100644 index f7dec263..00000000 --- a/core/hakmem_pool.c.bak3 +++ /dev/null @@ -1,1190 +0,0 @@ -// ============================================================================ -// hakmem_pool.c - L2 Hybrid Pool Implementation (Mid-Size: 2-32KiB) -// ============================================================================ -// -// サイズクラス定義: -// ┌──────────┬─────────┬──────────────┬─────────────┐ -// │ クラス │ サイズ │ 初期CAP │ ページ構成 │ -// ├──────────┼─────────┼──────────────┼─────────────┤ -// │ Class 0 │ 2 KiB │ 64 pages │ 32 blocks/p │ -// │ Class 1 │ 4 KiB │ 64 pages │ 16 blocks/p │ -// │ Class 2 │ 8 KiB │ 64 pages │ 8 blocks/p │ -// │ Class 3 │ 16 KiB │ 32 pages │ 4 blocks/p │ -// │ Class 4 │ 32 KiB │ 16 pages │ 2 blocks/p │ -// │ DYN1 │ 6 KiB* │ 0 (無効) │ 可変 │ -// │ DYN2 │ (未使用)│ 0 (無効) │ 可変 │ -// └──────────┴─────────┴──────────────┴─────────────┘ -// * DYN1はギャップ(8-16KB)を埋めるための動的クラス -// -// W_MAX (切り上げ許容倍率): -// - 意味: 要求サイズの何倍までのクラスを許容するか -// - デフォルト: 1.40 (40%までの切り上げを許容) -// - 例: 3KiBの要求 → 4KiBクラス使用OK (1.33倍 < 1.40) -// - 環境変数: HAKMEM_WMAX_MID=1.6 で変更可能 -// -// CAP (在庫量): -// - 意味: 各クラスで保持する最大ページ数 -// - 初期値: {64,64,64,32,16} - 保守的(フットプリント優先) -// - 推奨値: {256,256,256,128,64} - パフォーマンス優先 -// - 環境変数: HAKMEM_CAP_MID=256,256,256,128,64 で設定 -// - 学習モード: HAKMEM_LEARN=1 で自動調整 -// -// TLSリング構造: -// - POOL_L2_RING_CAP: リングバッファ容量(デフォルト16) -// - ActivePage A/B: bump-run方式(ロックフリー) -// - LIFO overflow: リングから溢れた分 -// -// パフォーマンスチューニング: -// 1. 初期CAP 4倍化: HAKMEM_CAP_MID=256,256,256,128,64 -// 2. W_MAX緩和: HAKMEM_WMAX_MID=1.6 -// 3. DYN1有効化: HAKMEM_MID_DYN1=6144 HAKMEM_CAP_MID_DYN1=64 -// 4. 
学習モード: HAKMEM_LEARN=1 -// -// License: MIT -// Last Updated: 2025-10-26 (Code Cleanup完了) - -#include "hakmem_pool.h" -#include "hakmem_config.h" -#include "hakmem_internal.h" // For AllocHeader and HAKMEM_MAGIC -#include "hakmem_syscall.h" // Box 3 syscall layer (bypasses LD_PRELOAD) -#include -#include -#include -#include -#include -#include -#include -#include "hakmem_prof.h" -#include "hakmem_policy.h" // FrozenPolicy caps (Soft CAP gating) -#include "hakmem_debug.h" - -// False sharing mitigation: padded mutex type (64B) -typedef struct { pthread_mutex_t m; char _pad[64 - (sizeof(pthread_mutex_t) % 64)]; } PaddedMutex; - -// =========================================================================== -// Internal Data Structures -// =========================================================================== -#include "box/pool_tls_types.inc.h" - -// Mid page descriptor registry (64KiB pages → {class_idx, owner_tid}) -#include "box/pool_mid_desc.inc.h" - -// ---------------- Transfer Cache (per-thread per-class inbox) -------------- -#include "box/pool_mid_tc.inc.h" - -#include "box/pool_mf2_types.inc.h" - - -// --- MF2 Initialization Functions --- - -// Thread-safe initialization using pthread_once -static pthread_once_t mf2_page_registry_init_control = PTHREAD_ONCE_INIT; -static void mf2_page_registry_init_impl(void) { - // Initialize all page slots to NULL - memset(&g_mf2_page_registry, 0, sizeof(g_mf2_page_registry)); - - // Initialize 256 coarse-grained locks for registry updates - for (int i = 0; i < 256; i++) { - pthread_mutex_init(&g_mf2_page_registry.locks[i], NULL); - } - - // Initialize counters - atomic_store(&g_mf2_page_registry.total_pages, 0); - atomic_store(&g_mf2_page_registry.active_pages, 0); -} -static void mf2_page_registry_init(void) { - pthread_once(&mf2_page_registry_init_control, mf2_page_registry_init_impl); -} - -// Strategy A: ThreadPages destructor (cleanup on thread exit) -static void mf2_thread_pages_destructor(void* arg) { - MF2_ThreadPages* tp = (MF2_ThreadPages*)arg; - if (!tp) return; - - // SAFETY: Don't remove from global registry or free memory - // Reason: Causes "malloc(): unsorted double linked list corrupted" crashes - // Tradeoff: Small memory leak (one ThreadPages struct per thread lifetime) - // TODO: Investigate safe cleanup mechanism - - // Remove from global registry (DISABLED for safety) - // for (int i = 0; i < atomic_load_explicit(&g_num_thread_pages, memory_order_acquire); i++) { - // if (atomic_load_explicit((atomic_uintptr_t*)&g_all_thread_pages[i], memory_order_acquire) == (uintptr_t)tp) { - // atomic_store_explicit((atomic_uintptr_t*)&g_all_thread_pages[i], 0, memory_order_release); - // break; - // } - // } - - // Free all pages owned by this thread (DISABLED for safety) - // hkm_libc_free(tp); - - (void)tp; // Suppress unused warning -} - -// Strategy A: Initialize pthread_key (once only) -static void mf2_init_tls_key(void) { - pthread_key_create(&g_mf2_tls_key, mf2_thread_pages_destructor); -} - -// Helper: rdtsc() - Read CPU timestamp counter (for Route P idle detection) -static inline uint64_t mf2_rdtsc(void) { -#if defined(__x86_64__) || defined(__i386__) - uint32_t lo, hi; - __asm__ __volatile__ ("rdtsc" : "=a"(lo), "=d"(hi)); - return ((uint64_t)hi << 32) | lo; -#else - // Fallback for non-x86 architectures (use clock_gettime approximation) - struct timespec ts; - clock_gettime(CLOCK_MONOTONIC, &ts); - return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec; -#endif -} - -static MF2_ThreadPages* 
mf2_thread_pages_get(void) { - if (t_mf2_pages) return t_mf2_pages; - - // Initialize pthread_key (once only) - pthread_once(&g_mf2_key_once, mf2_init_tls_key); - - // Allocate thread-local page lists - MF2_ThreadPages* tp = (MF2_ThreadPages*)hkm_libc_calloc(1, sizeof(MF2_ThreadPages)); - if (!tp) return NULL; - - // Initialize with current thread ID - tp->my_tid = pthread_self(); - - // All page lists start empty (NULL) - for (int c = 0; c < POOL_NUM_CLASSES; c++) { - tp->active_page[c] = NULL; - tp->full_pages[c] = NULL; - atomic_store_explicit(&tp->pages_remote_pending[c], 0, memory_order_relaxed); - atomic_flag_clear_explicit(&tp->pending_claim[c], memory_order_relaxed); - tp->page_count[c] = 0; - } - - // Route P: Initialize activity tracking - atomic_store_explicit(&tp->last_alloc_tsc, mf2_rdtsc(), memory_order_relaxed); - - // Strategy A: Register in global array for round-robin drain - int idx = atomic_fetch_add_explicit(&g_num_thread_pages, 1, memory_order_acq_rel); - if (idx < MF2_MAX_THREADS) { - atomic_store_explicit((atomic_uintptr_t*)&g_all_thread_pages[idx], (uintptr_t)tp, memory_order_release); - - // DEBUG: Log first 10 thread registrations - Disabled for performance - // static _Atomic int reg_samples = 0; - // int rs = atomic_fetch_add_explicit(®_samples, 1, memory_order_relaxed); - // if (rs < 10) { - // fprintf(stderr, "[TLS_REGISTER %d] tid=%lu, tp=%p, idx=%d\n", - // rs, (unsigned long)tp->my_tid, tp, idx); - // } - } else { - MF2_ERROR_LOG("Too many threads! MAX=%d", MF2_MAX_THREADS); - } - - // Set pthread-specific data for destructor - pthread_setspecific(g_mf2_tls_key, tp); - - t_mf2_pages = tp; - return tp; -} - -// --- MF2 Page Allocation & Lookup --- - -// O(1) page lookup from block address (mimalloc's secret sauce!) -static inline MidPage* mf2_addr_to_page(void* addr) { - // Step 1: Get page base address (64KB aligned) - // 0xFFFF = 65535, ~0xFFFF clears bottom 16 bits - void* page_base = (void*)((uintptr_t)addr & ~0xFFFFULL); - - // Step 2: Index into registry (direct-mapped, 64K entries) - // (addr >> 16) extracts page index, & 0xFFFF wraps to registry size - size_t idx = ((uintptr_t)page_base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1); - - // Step 3: Direct lookup (no hash collision handling needed with 64K entries) - MidPage* page = g_mf2_page_registry.pages[idx]; - - // ALIGNMENT VERIFICATION (Step 3) - Sample first 100 lookups - static _Atomic int lookup_count = 0; - // DEBUG: Disabled for performance - // int count = atomic_fetch_add_explicit(&lookup_count, 1, memory_order_relaxed); - // if (count < 100) { - // int found = (page != NULL); - // int match = (page && page->base == page_base); - // fprintf(stderr, "[LOOKUP %d] addr=%p → page_base=%p → idx=%zu → found=%s", - // count, addr, page_base, idx, found ? "YES" : "NO"); - // if (page) { - // fprintf(stderr, ", page->base=%p, match=%s", - // page->base, match ? 
"YES" : "NO"); - // } - // fprintf(stderr, "\n"); - // } - - // Validation: Ensure page base matches (handles potential collisions) - if (page && page->base == page_base) { - return page; - } - - // Collision or not registered (shouldn't happen in normal operation) - return NULL; -} - -// Register a page in the global registry (called once per page allocation) -static void mf2_register_page(MidPage* page) { - if (!page) return; - - // Calculate registry index from page base - size_t idx = ((uintptr_t)page->base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1); - - // ALIGNMENT VERIFICATION (Step 2) - DEBUG: Disabled for performance - // static int register_count = 0; - // if (register_count < 10) { - // fprintf(stderr, "[REGISTER %d] Page %p → idx %zu (aligned=%s)\n", - // register_count, page->base, idx, - // (((uintptr_t)page->base & 0xFFFF) == 0) ? "YES" : "NO"); - // register_count++; - // } - - // Coarse-grained lock (256 locks for 64K entries = 256 entries/lock) - int lock_idx = idx % 256; - pthread_mutex_lock(&g_mf2_page_registry.locks[lock_idx]); - - // Check for collision (should be rare with 64K entries) - if (g_mf2_page_registry.pages[idx] != NULL) { - // Collision detected - this is a problem! - // For MVP, we'll just log and overwrite (TODO: handle collisions properly) - HAKMEM_LOG("[MF2] WARNING: Page registry collision at index %zu\n", idx); - } - - // Register the page - g_mf2_page_registry.pages[idx] = page; - - // Update counters - atomic_fetch_add_explicit(&g_mf2_page_registry.total_pages, 1, memory_order_relaxed); - atomic_fetch_add_explicit(&g_mf2_page_registry.active_pages, 1, memory_order_relaxed); - - pthread_mutex_unlock(&g_mf2_page_registry.locks[lock_idx]); -} - -// Unregister a page from the global registry (called when returning page to OS) -__attribute__((unused)) static void mf2_unregister_page(MidPage* page) { - if (!page) return; - - size_t idx = ((uintptr_t)page->base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1); - int lock_idx = idx % 256; - - pthread_mutex_lock(&g_mf2_page_registry.locks[lock_idx]); - - if (g_mf2_page_registry.pages[idx] == page) { - g_mf2_page_registry.pages[idx] = NULL; - atomic_fetch_sub_explicit(&g_mf2_page_registry.active_pages, 1, memory_order_relaxed); - } - - pthread_mutex_unlock(&g_mf2_page_registry.locks[lock_idx]); -} - -// Allocate and initialize a new 64KB page for given size class -static MidPage* mf2_alloc_new_page(int class_idx) { - if (class_idx < 0 || class_idx >= POOL_NUM_CLASSES) return NULL; - - // Get user size class (2KB, 4KB, 8KB, 16KB, 32KB) - size_t user_size = g_class_sizes[class_idx]; - if (user_size == 0) return NULL; // Dynamic class disabled - - // CRITICAL FIX: Each block needs HEADER_SIZE + user_size - // The header stores metadata (AllocHeader), user_size is the usable space - size_t block_size = HEADER_SIZE + user_size; - - // Step 1: Allocate 64KB page (aligned to 64KB boundary) - // CRITICAL FIX #4: Must ensure 64KB alignment! - // mmap() only guarantees 4KB alignment, breaking addr_to_page() lookup. - // This caused 97% of frees to fail silently (fatal bug!) - // - // CRITICAL FIX: Use mmap() + alignment adjustment to avoid recursion! - // Using wrapped posix_memalign with WRAP_L2=1 causes infinite recursion. 
- - // Allocate 2x size to allow alignment adjustment - size_t alloc_size = POOL_PAGE_SIZE * 2; // 128KB - void* raw = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANONYMOUS, -1, 0); - if (raw == MAP_FAILED) { - return NULL; // OOM - } - - // Find 64KB aligned address within allocation - uintptr_t addr = (uintptr_t)raw; - uintptr_t aligned = (addr + 0xFFFF) & ~0xFFFFULL; // Round up to 64KB boundary - void* page_base = (void*)aligned; - - // Free unused prefix (if any) - size_t prefix_size = aligned - addr; - if (prefix_size > 0) { - munmap(raw, prefix_size); - } - - // Free unused suffix - size_t suffix_offset = prefix_size + POOL_PAGE_SIZE; - if (suffix_offset < alloc_size) { - munmap((char*)raw + suffix_offset, alloc_size - suffix_offset); - } - - // DEBUG: Log first few allocations - static _Atomic int mmap_count = 0; - int mc = atomic_fetch_add_explicit(&mmap_count, 1, memory_order_relaxed); - if (mc < 5) { - MF2_DEBUG_LOG("MMAP_ALLOC %d: raw=%p, aligned=%p, prefix=%zu, suffix=%zu", - mc, raw, page_base, prefix_size, alloc_size - suffix_offset); - } - - // ALIGNMENT VERIFICATION (Step 1) - if (((uintptr_t)page_base & 0xFFFF) != 0) { - MF2_ERROR_LOG("ALIGNMENT BUG: Page %p not 64KB aligned! (offset=%zu)", - page_base, ((uintptr_t)page_base & 0xFFFF)); - } - - // Zero-fill (required for posix_memalign) - // Note: This adds ~15μs overhead, but is necessary for correctness - memset(page_base, 0, POOL_PAGE_SIZE); - - // Step 2: Allocate MidPage descriptor - MidPage* page = (MidPage*)hkm_libc_calloc(1, sizeof(MidPage)); - if (!page) { - // CRITICAL FIX: Use munmap for mmap-allocated memory - munmap(page_base, POOL_PAGE_SIZE); - return NULL; - } - - // Step 3: Initialize page descriptor - page->base = page_base; - page->class_idx = (uint8_t)class_idx; - page->flags = 0; - page->owner_tid = pthread_self(); - page->owner_tp = mf2_thread_pages_get(); // Store owner's ThreadPages for pending queue - page->last_transfer_time = 0; // No transfer yet (lease mechanism) - - // Step 4: Build freelist chain (walk through page and link blocks) - // Calculate how many blocks fit in 64KB page (including header overhead) - size_t usable_size = POOL_PAGE_SIZE; - size_t num_blocks = usable_size / block_size; - - page->capacity = (uint16_t)num_blocks; - page->free_count = (uint16_t)num_blocks; - - // Build linked list of free blocks - PoolBlock* freelist_head = NULL; - PoolBlock* freelist_tail = NULL; - - for (size_t i = 0; i < num_blocks; i++) { - char* block_addr = (char*)page_base + (i * block_size); - PoolBlock* block = (PoolBlock*)block_addr; - - block->next = NULL; - - if (freelist_head == NULL) { - freelist_head = block; - freelist_tail = block; - } else { - freelist_tail->next = block; - freelist_tail = block; - } - } - - page->freelist = freelist_head; - - // Step 5: Initialize remote stack (for cross-thread frees) - atomic_store(&page->remote_head, (uintptr_t)0); - atomic_store(&page->remote_count, 0); - - // Step 6: Initialize lifecycle counters - atomic_store(&page->in_use, 0); // No blocks allocated yet - atomic_store(&page->pending_dn, 0); - - // Step 7: Initialize linkage - page->next_page = NULL; - page->prev_page = NULL; - - // Initialize pending queue fields - atomic_store_explicit(&page->in_remote_pending, false, memory_order_relaxed); - page->next_pending = NULL; - - // Step 8: Register page in global registry - mf2_register_page(page); - - return page; -} - -// --- MF2 Allocation & Free Operations --- - -// Forward declarations -static void 
mf2_enqueue_pending(MF2_ThreadPages* owner_tp, MidPage* page); - -// Drain remote frees (cross-thread) into page's local freelist -// Called by owner thread when local freelist is empty -static int mf2_drain_remote_frees(MidPage* page) { - if (!page) return 0; - - atomic_fetch_add(&g_mf2_drain_attempts, 1); - - // Check if there are any remote frees (FIX #6: use seq_cst to ensure total ordering - DEBUG) - unsigned int remote_count = atomic_load_explicit(&page->remote_count, memory_order_seq_cst); - if (remote_count == 0) { - return 0; // Nothing to drain - } - - // Atomically swap remote stack head with NULL (lock-free pop all) - uintptr_t head = atomic_exchange_explicit(&page->remote_head, (uintptr_t)0, - memory_order_acq_rel); - if (!head) { - atomic_store_explicit(&page->remote_count, 0, memory_order_release); - return 0; // Race: someone else drained it - } - - // Reset remote count (FIX #6: use release for future drain checks to see) - atomic_store_explicit(&page->remote_count, 0, memory_order_release); - - // Walk the remote stack and count blocks - int drained = 0; - PoolBlock* cur = (PoolBlock*)head; - PoolBlock* tail = NULL; - - while (cur) { - drained++; - tail = cur; - cur = cur->next; - } - - // Append remote stack to local freelist (splice in front for simplicity) - if (tail) { - tail->next = page->freelist; - page->freelist = (PoolBlock*)head; - page->free_count += drained; - } - - atomic_fetch_add(&g_mf2_drain_count, 1); - atomic_fetch_add(&g_mf2_drain_blocks, drained); - - // CRITICAL FIX: Check if new remotes arrived DURING drain - // If so, re-enqueue to owner's pending queue (avoid losing remotes!) - unsigned int post_drain_count = atomic_load_explicit(&page->remote_count, memory_order_acquire); - if (post_drain_count >= 1 && page->owner_tp) { // Use same threshold as initial enqueue - // New remotes arrived during drain, re-enqueue for next round - // Note: This is safe because flag was cleared earlier - mf2_enqueue_pending(page->owner_tp, page); - } - - return drained; -} - -// =========================================================================== -// Pending Queue Operations (MPSC Lock-Free Stack) -// =========================================================================== - -// Enqueue page to owner's pending queue (called by remote threads) -// MPSC: Multiple producers (remote free threads), single consumer (owner) -static void mf2_enqueue_pending(MF2_ThreadPages* owner_tp, MidPage* page) { - if (!owner_tp || !page) return; - - // Already in pending? 
Skip (avoid duplicate enqueue) - _Bool was_pending = atomic_exchange_explicit(&page->in_remote_pending, true, memory_order_acq_rel); - if (was_pending) { - return; // Already enqueued, nothing to do - } - - atomic_fetch_add(&g_mf2_pending_enqueued, 1); - - // Push to owner's pending stack (Treiber stack algorithm) - uintptr_t old_head; - do { - old_head = atomic_load_explicit(&owner_tp->pages_remote_pending[page->class_idx], memory_order_relaxed); - page->next_pending = (MidPage*)old_head; - } while (!atomic_compare_exchange_weak_explicit( - &owner_tp->pages_remote_pending[page->class_idx], - &old_head, (uintptr_t)page, - memory_order_release, // Publish page - memory_order_relaxed)); - - // 0→1 detection: Increment adoptable count for this class - // This enables O(1) early return in try_adopt (if count==0, no scan needed) - if (old_head == 0) { - atomic_fetch_add_explicit(&g_adoptable_count[page->class_idx], 1, memory_order_relaxed); - } -} - -// Dequeue one page from pending queue (called by owner thread or adopter) -// Uses CAS for correctness (multi-consumer in adoption path) -static MidPage* mf2_dequeue_pending(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return NULL; - - uintptr_t old_head; - do { - old_head = atomic_load_explicit(&tp->pages_remote_pending[class_idx], memory_order_acquire); - if (old_head == 0) { - return NULL; // Queue empty - } - MidPage* page = (MidPage*)old_head; - - // CAS to pop head - if (atomic_compare_exchange_weak_explicit( - &tp->pages_remote_pending[class_idx], - &old_head, (uintptr_t)page->next_pending, - memory_order_acq_rel, memory_order_relaxed)) { - // Successfully dequeued - MidPage* next = page->next_pending; - page->next_pending = NULL; // Clear link - - // If queue became empty (next==NULL), decrement adoptable count - // This enables O(1) early return in try_adopt when all queues empty - if (next == NULL) { - atomic_fetch_sub_explicit(&g_adoptable_count[class_idx], 1, memory_order_relaxed); - } - - return page; - } - } while (1); -} - -// =========================================================================== -// End of Pending Queue Operations -// =========================================================================== - -// Forward declarations -static void* mf2_alloc_slow(int class_idx, size_t size, uintptr_t site_id); - -// =========================================================================== -// Helper Functions (Clean & Modular) -// =========================================================================== - -// Helper: Make page active (move old active to full_pages) -static inline void mf2_make_page_active(MF2_ThreadPages* tp, int class_idx, MidPage* page) { - if (!tp || !page) return; - - // Move old active page to full_pages (if any) - if (tp->active_page[class_idx]) { - MidPage* old_active = tp->active_page[class_idx]; - old_active->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = old_active; - } - - // Set new page as active - tp->active_page[class_idx] = page; - page->next_page = NULL; -} - -// Helper: Drain page and add to partial list (LIFO for cache locality) -// Returns true if page has free blocks after drain -static inline bool mf2_try_drain_to_partial(MF2_ThreadPages* tp, int class_idx, MidPage* page) { - if (!tp || !page) return false; - - // Drain remote frees - int drained = mf2_drain_remote_frees(page); - - // If page has freelist after drain, add to partial list (LIFO) - if (page->freelist) { - atomic_fetch_add(&g_mf2_page_reuse_count, 1); - page->next_page = 
tp->partial_pages[class_idx]; - tp->partial_pages[class_idx] = page; - return true; - } - - // No freelist, return to full_pages - page->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = page; - return false; -} - -// Helper: Drain page and activate if successful (Direct Handoff - backward compat) -// Returns true if page was activated -static inline bool mf2_try_drain_and_activate(MF2_ThreadPages* tp, int class_idx, MidPage* page) { - if (!tp || !page) return false; - - // Drain remote frees - int drained = mf2_drain_remote_frees(page); - - // If page has freelist after drain, make it active immediately - if (page->freelist) { - atomic_fetch_add(&g_mf2_page_reuse_count, 1); - mf2_make_page_active(tp, class_idx, page); - return true; - } - - // No freelist, return to full_pages - page->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = page; - return false; -} - -// Helper: Try to reuse pages from own pending queue (must-reuse gate part 1) -// Returns true if a page was successfully drained and activated -static bool mf2_try_reuse_own_pending(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return false; - - // Budget: Process up to N pages to avoid blocking - for (int budget = 0; budget < MF2_PENDING_QUEUE_BUDGET; budget++) { - MidPage* pending_page = mf2_dequeue_pending(tp, class_idx); - if (!pending_page) break; // Queue empty - - atomic_fetch_add(&g_mf2_pending_drained, 1); - - // Clear pending flag (no longer in queue) - atomic_store_explicit(&pending_page->in_remote_pending, false, memory_order_release); - - // DIRECT HANDOFF: Drain and activate if successful - if (mf2_try_drain_and_activate(tp, class_idx, pending_page)) { - return true; // Success! Page is now active - } - // No freelist after drain, page returned to full_pages by helper - } - return false; // No pages available for reuse -} - -// Helper: Try to drain remotes from active page (must-reuse gate part 2) -// Returns true if active page has freelist after drain -static bool mf2_try_drain_active_remotes(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return false; - - MidPage* page = tp->active_page[class_idx]; - if (!page) return false; - - atomic_fetch_add(&g_mf2_slow_checked_drain, 1); - unsigned int remote_cnt = atomic_load_explicit(&page->remote_count, memory_order_seq_cst); - - if (remote_cnt > 0) { - atomic_fetch_add(&g_mf2_slow_found_remote, 1); - int drained = mf2_drain_remote_frees(page); - if (drained > 0 && page->freelist) { - atomic_fetch_add(&g_mf2_drain_success, 1); - return true; // Success! 
Active page now has freelist - } - } - return false; // No remotes or drain failed -} - -// Helper: Allocate new page and make it active -// Returns the newly allocated page (or NULL on OOM) -static MidPage* mf2_alloc_and_activate_new_page(MF2_ThreadPages* tp, int class_idx) { - if (!tp) return NULL; - - atomic_fetch_add(&g_mf2_new_page_count, 1); - - // DEBUG: Log why we're allocating new page (first N samples) - static _Atomic int new_page_samples = 0; - int sample_idx = atomic_fetch_add_explicit(&new_page_samples, 1, memory_order_relaxed); - if (sample_idx < MF2_DEBUG_SAMPLE_COUNT) { - // Count adoptable pages across all threads - int total_adoptable = 0; - for (int i = 0; i < POOL_NUM_CLASSES; i++) { - total_adoptable += atomic_load_explicit(&g_adoptable_count[i], memory_order_relaxed); - } - MF2_DEBUG_LOG("NEW_PAGE %d: class=%d, own_pending=%p, adoptable_total=%d, active=%p, full=%p", - sample_idx, class_idx, - (void*)atomic_load_explicit(&tp->pages_remote_pending[class_idx], memory_order_relaxed), - total_adoptable, - tp->active_page[class_idx], - tp->full_pages[class_idx]); - } - - MidPage* page = mf2_alloc_new_page(class_idx); - if (!page) { - return NULL; // OOM - } - - // Move current active page to full list (if any) - if (tp->active_page[class_idx]) { - MidPage* old_page = tp->active_page[class_idx]; - old_page->next_page = tp->full_pages[class_idx]; - tp->full_pages[class_idx] = old_page; - } - - // Set new page as active - tp->active_page[class_idx] = page; - tp->page_count[class_idx]++; - - return page; -} - -// =========================================================================== -// End of Helper Functions -// =========================================================================== - -// Consumer-Driven Adoption: Try to adopt a page from ANY thread's pending queue -// Returns true if a page was successfully adopted and activated -// Called from alloc_slow when allocating thread needs memory -static bool mf2_try_adopt_pending(MF2_ThreadPages* me, int class_idx) { - if (!me) return false; - - // IMMEDIATE FIX #1: Early return if no adoptable pages (O(1) gating) - // Avoids scanning empty queues (major performance win!) - int adoptable = atomic_load_explicit(&g_adoptable_count[class_idx], memory_order_relaxed); - if (adoptable == 0) return false; // All queues empty, no scan needed - - // Get global thread registry - int num_tp = atomic_load_explicit(&g_num_thread_pages, memory_order_acquire); - if (num_tp == 0) return false; - - // IMMEDIATE FIX #2: Limit scan to MAX_QUEUES threads (configurable via HAKMEM_MF2_MAX_QUEUES) - // Prevents excessive scanning overhead (2-8 threads is usually enough) - int scan_limit = (num_tp < g_mf2_max_queues) ? num_tp : g_mf2_max_queues; - - // Round-robin scan (limited number of threads, not ALL!) 
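One detail of the idle gate applied inside the scan loop below: the microsecond threshold is converted to TSC cycles with a fixed cycles-per-microsecond constant. A sketch of that conversion, assuming MF2_TSC_CYCLES_PER_US reflects the roughly 3 GHz approximation mentioned in the lease comment further down (about 3000 cycles per microsecond):

    #include <stdint.h>

    // Hypothetical stand-in for MF2_TSC_CYCLES_PER_US (~3 GHz core, roughly 3000 cycles per microsecond).
    #define EXAMPLE_TSC_CYCLES_PER_US 3000ULL

    // e.g. a 150 us threshold -> 150 * 3000 = 450000 cycles since last_alloc_tsc before an owner counts as idle.
    static inline uint64_t example_idle_threshold_cycles(uint64_t threshold_us) {
        return threshold_us * EXAMPLE_TSC_CYCLES_PER_US;
    }

The fixed constant is only an approximation of the real TSC rate, which is acceptable because the gate is a heuristic, not a correctness condition.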
- static _Atomic uint64_t adopt_counter = 0; - uint64_t start_idx = atomic_fetch_add_explicit(&adopt_counter, 1, memory_order_relaxed); - - for (int i = 0; i < scan_limit; i++) { - int tp_idx = (start_idx + i) % num_tp; - MF2_ThreadPages* other_tp = (MF2_ThreadPages*)atomic_load_explicit( - (atomic_uintptr_t*)&g_all_thread_pages[tp_idx], memory_order_acquire); - - if (!other_tp) continue; - - // Route P: Idle Detection - Only adopt from idle owners - // Check if owner is still actively allocating (threshold configurable via env var) - uint64_t now_tsc = mf2_rdtsc(); - uint64_t owner_last_alloc = atomic_load_explicit(&other_tp->last_alloc_tsc, memory_order_relaxed); - uint64_t idle_threshold_cycles = (uint64_t)g_mf2_idle_threshold_us * MF2_TSC_CYCLES_PER_US; - - if ((now_tsc - owner_last_alloc) < idle_threshold_cycles) { - continue; // Owner still active, skip adoption - } - - // IMMEDIATE FIX #3: Claim exclusive access (prevent multi-consumer CAS thrashing!) - // Only one thread scans each queue at a time → eliminates CAS contention - if (atomic_flag_test_and_set_explicit(&other_tp->pending_claim[class_idx], memory_order_acquire)) { - continue; // Another thread is already scanning this queue, skip - } - - // Try to dequeue a pending page from this thread - MidPage* page = mf2_dequeue_pending(other_tp, class_idx); - if (!page) { - // Queue empty, release claim and try next thread - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - continue; - } - - // Clear pending flag (no longer in queue) - atomic_store_explicit(&page->in_remote_pending, false, memory_order_release); - - // Check lease: Has enough time passed since last transfer? (configurable via HAKMEM_MF2_LEASE_MS) - // 0ms = disabled (no lease check), >0 = lease period in milliseconds - uint64_t now = mf2_rdtsc(); - uint64_t last_transfer = page->last_transfer_time; - if (g_mf2_lease_ms > 0 && last_transfer != 0) { - // Calculate lease cycles from ms (approx 3GHz CPU) - uint64_t lease_cycles = (uint64_t)g_mf2_lease_ms * (MF2_TSC_CYCLES_PER_US * 1000ULL); - if ((now - last_transfer) < lease_cycles) { - // Lease still active, return page to full_pages (don't thrash ownership) - page->next_page = other_tp->full_pages[class_idx]; - other_tp->full_pages[class_idx] = page; - // Release claim before continuing - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - continue; // Try next thread - } - } - - // Try to transfer ownership using CAS - pthread_t old_owner = page->owner_tid; - pthread_t new_owner = pthread_self(); - - // Note: pthread_t may not be atomic-compatible on all platforms - // For now, we'll use a simple write (ownership transfer is rare) - // TODO: If thrashing is observed, add atomic CAS with serialization - page->owner_tid = new_owner; - page->owner_tp = me; - page->last_transfer_time = now; - - // DEBUG: Log drain state - static _Atomic int adopt_samples = 0; - int sample_idx = atomic_fetch_add_explicit(&adopt_samples, 1, memory_order_relaxed); - unsigned int pre_remote = atomic_load_explicit(&page->remote_count, memory_order_relaxed); - unsigned int pre_free = page->free_count; - PoolBlock* pre_freelist = page->freelist; - - // Drain remote frees - int drained = mf2_drain_remote_frees(page); - - // DEBUG: Log result (first 10 samples) - if (sample_idx < 10) { - MF2_DEBUG_LOG("ADOPT_DRAIN %d: class=%d, remote_cnt=%u, drained=%d, pre_free=%u, post_free=%u, pre_freelist=%p, post_freelist=%p", - sample_idx, class_idx, pre_remote, drained, - 
pre_free, page->free_count, pre_freelist, page->freelist); - } - - // Make adopted page ACTIVE immediately (not partial!) - // Adoption needs immediate activation for caller's mf2_alloc_fast() - // Partial list is only for own pending queue drains - if (page->freelist) { - atomic_fetch_add(&g_mf2_page_reuse_count, 1); - atomic_fetch_add(&g_mf2_pending_drained, 1); - atomic_fetch_add(&g_mf2_drain_success, 1); - - // Make it active (move old active to full_pages) - mf2_make_page_active(me, class_idx, page); - - // Release claim before returning SUCCESS - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - return true; // SUCCESS! Page adopted and activated - } - - // No freelist after drain, return to MY full_pages (I'm the new owner!) - page->next_page = me->full_pages[class_idx]; - me->full_pages[class_idx] = page; - // Release claim before continuing search - atomic_flag_clear_explicit(&other_tp->pending_claim[class_idx], memory_order_release); - // Continue searching for a better page - } - - return false; // No adoptable pages found -} - -// Fast allocation path (owner thread, NO LOCK!) -static inline void* mf2_alloc_fast(int class_idx, size_t size, uintptr_t site_id) { - // Get thread-local page lists - MF2_ThreadPages* tp = mf2_thread_pages_get(); - if (!tp) return NULL; - - // Get active page for this class - MidPage* page = tp->active_page[class_idx]; - if (!page) { - // No active page, go to slow path - return mf2_alloc_slow(class_idx, size, site_id); - } - - // FAST PATH: Pop from page-local freelist (NO LOCK!) - if (page->freelist) { - atomic_fetch_add(&g_mf2_alloc_fast_hit, 1); - - // Route P: Update activity tracking for idle detection - atomic_store_explicit(&tp->last_alloc_tsc, mf2_rdtsc(), memory_order_relaxed); - - PoolBlock* block = page->freelist; - page->freelist = block->next; - page->free_count--; - - // Increment in-use count (atomic for cross-thread visibility) - atomic_fetch_add_explicit(&page->in_use, 1, memory_order_relaxed); - - // Return user pointer (skip header) - return (char*)block + HEADER_SIZE; - } - - // Local freelist empty, go to slow path - return mf2_alloc_slow(class_idx, size, site_id); -} - -// Slow allocation path (drain remote or allocate new page) -static void* mf2_alloc_slow(int class_idx, size_t size, uintptr_t site_id) { - (void)site_id; // Unused for now - - atomic_fetch_add(&g_mf2_alloc_slow_hit, 1); - - // Get thread-local page lists - MF2_ThreadPages* tp = mf2_thread_pages_get(); - if (!tp) return NULL; - - // =========================================================================== - // Allocation Strategy (Must-Reuse Order) - // =========================================================================== - // 1. MUST-REUSE GATE (Part 1): Drain own pending queue - // - Process up to 4 pages to avoid blocking - // - Direct handoff: activate first successful drain immediately - if (mf2_try_reuse_own_pending(tp, class_idx)) { - return mf2_alloc_fast(class_idx, size, site_id); - } - - // 2. 
MUST-REUSE GATE (Part 2): Drain active page remotes - // - Check if current active page has remote frees - // - Drain and retry allocation if successful - if (mf2_try_drain_active_remotes(tp, class_idx)) { - return mf2_alloc_fast(class_idx, size, site_id); - } - - // HISTORICAL NOTE: full_pages scan removed - // Old approach: Scan full_pages looking for pages with remotes - // Problem: Drained pages consumed before owner can scan them - // New approach: Direct Handoff immediately activates drained pages - // Result: full_pages scan always finds 0 pages (100% waste) - // - // Benchmark evidence (before removal): - // - Full scan checked: 1,879,484 pages - // - Full scan found: 0 pages (0% success rate!) - - // 3. Consumer-Driven Adoption (Route P with idle detection) - // - Only adopt from idle owners (haven't allocated in >150µs) - // - Prevents "adoption stealing" from active owners - if (mf2_try_adopt_pending(tp, class_idx)) { - return mf2_alloc_fast(class_idx, size, site_id); - } - - // 4. MUST-REUSE GATE (Final): Allocate new page (last resort) - // - Only reached after exhausting all reuse opportunities - // - Order: pending queue → active drain → adoption → NEW - MidPage* page = mf2_alloc_and_activate_new_page(tp, class_idx); - if (!page) { - return NULL; // OOM - } - - // Retry allocation from new page - return mf2_alloc_fast(class_idx, size, site_id); -} - -// Forward declaration of slow free path -static void mf2_free_slow(MidPage* page, void* ptr); - -// Strategy A: Global Round-Robin Drain (Cross-Thread Pending Queue) -// Fast free path (owner thread, NO LOCK!) -static inline void mf2_free_fast(MidPage* page, void* ptr) { - if (!page || !ptr) return; - - atomic_fetch_add(&g_mf2_free_owner_count, 1); - - // Get block pointer (rewind to header) - PoolBlock* block = (PoolBlock*)((char*)ptr - HEADER_SIZE); - - // FAST PATH: Push to page-local freelist (NO LOCK!) 
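Both free paths in this file hand fully-empty 64 KiB pages to hak_batch_add_page() for MADV_DONTNEED; the batching code itself is not part of this hunk, so the following is only a minimal illustration of the underlying advice (the VA range stays mapped while the kernel reclaims the physical pages):

    #include <stddef.h>
    #include <sys/mman.h>

    // Illustration only: the real path batches pages and issues the advice on page-aligned 64 KiB ranges.
    static inline int example_release_physical(void* page_base, size_t len) {
        // For private anonymous mappings, MADV_DONTNEED drops the backing pages; the next touch
        // refaults as zero-filled memory, so callers must treat the old contents as gone.
        return madvise(page_base, len, MADV_DONTNEED);
    }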
- block->next = page->freelist; - page->freelist = block; - page->free_count++; - - // Decrement in-use count (atomic for cross-thread visibility) - int old_in_use = atomic_fetch_sub_explicit(&page->in_use, 1, memory_order_release); - - // Check if page is now empty (all blocks free) - if (old_in_use == 1 && page->free_count == page->capacity) { - // Memory efficiency: Return empty pages to OS via MADV_DONTNEED - // Keeps VA mapped (no munmap), but releases physical memory - hak_batch_add_page(page->base, POOL_PAGE_SIZE); - } -} - -// Slow free path (cross-thread free to remote stack) -static void mf2_free_slow(MidPage* page, void* ptr) { - if (!page || !ptr) return; - - atomic_fetch_add(&g_mf2_free_remote_count, 1); - - // Get block pointer - PoolBlock* block = (PoolBlock*)((char*)ptr - HEADER_SIZE); - - // Push to page's remote stack (lock-free MPSC) - uintptr_t old_head; - do { - old_head = atomic_load_explicit(&page->remote_head, memory_order_acquire); - block->next = (PoolBlock*)old_head; - } while (!atomic_compare_exchange_weak_explicit( - &page->remote_head, &old_head, (uintptr_t)block, - memory_order_release, memory_order_relaxed)); - - // Increment remote count and detect threshold for enqueueing - unsigned int old_count = atomic_fetch_add_explicit(&page->remote_count, 1, memory_order_seq_cst); - - // CRITICAL FIX: Use threshold-based enqueueing instead of 0→1 edge - // Problem: 0→1 causes ping-pong (drain 1 block → next free triggers 0→1 again) - // Solution: Only enqueue when remotes accumulate to threshold (better batching) - // - // Threshold values (configurable via HAKMEM_MF2_ENQUEUE_THRESHOLD, default=4): - // 1 = immediate (0→1 edge, causes ping-pong) - // 4 = balanced (batch 4 blocks before notifying owner) - // 8 = aggressive batching (higher latency, but better efficiency) - // - // We enqueue on transitions TO the threshold (old_count == threshold-1) - static int g_enqueue_threshold = 1; // 1=immediate (0→1 edge), 2=batch-2, 4=batch-4 - if (old_count + 1 == (unsigned int)g_enqueue_threshold) { - // Remote count just reached threshold, notify owner - if (page->owner_tp) { - mf2_enqueue_pending(page->owner_tp, page); - } - } - - // DEBUG: Sample first 10 remote frees - Disabled for performance - // static _Atomic int remote_free_samples = 0; - // int sample = atomic_fetch_add_explicit(&remote_free_samples, 1, memory_order_relaxed); - // if (sample < 10) { - // fprintf(stderr, "[REMOTE_FREE %d] ptr=%p → page=%p (base=%p), remote_count=%u (was %u), EDGE=%s\n", - // sample, ptr, page, page->base, old_count + 1, old_count, (old_count == 0) ? "YES" : "NO"); - // } - - // Decrement in-use count - int old_in_use = atomic_fetch_sub_explicit(&page->in_use, 1, memory_order_release); - - // Check if page is now empty (FIX #6: acquire to see all remote frees) - if (old_in_use == 1 && page->free_count + atomic_load_explicit(&page->remote_count, memory_order_acquire) >= page->capacity) { - // Memory efficiency: Return empty pages to OS via MADV_DONTNEED - // Keeps VA mapped (no munmap), but releases physical memory - hak_batch_add_page(page->base, POOL_PAGE_SIZE); - } -} - -// Top-level free dispatcher -static void mf2_free(void* ptr) { - if (!ptr) return; - - // O(1) page lookup (mimalloc's magic!) 
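A note on the enqueue threshold in mf2_free_slow above: the block comment describes HAKMEM_MF2_ENQUEUE_THRESHOLD with a default of 4, but both copies of the function in this diff hard-code a static threshold of 1 and never read the environment. If the env override is ever wired up, it would presumably look something like the following hypothetical helper (not present in the source):

    #include <stdlib.h>

    // Hypothetical one-time parse of HAKMEM_MF2_ENQUEUE_THRESHOLD; the code in this diff currently
    // behaves as if this always returned 1. Benign race: concurrent first calls parse the same value twice.
    static int example_enqueue_threshold(void) {
        static int cached = 0;
        if (cached == 0) {
            const char* s = getenv("HAKMEM_MF2_ENQUEUE_THRESHOLD");
            int v = s ? atoi(s) : 1;
            if (v < 1) v = 1;
            if (v > 64) v = 64;
            cached = v;
        }
        return cached;
    }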
- MidPage* page = mf2_addr_to_page(ptr); - if (!page) { - // Not a MF2 page (shouldn't happen if MF2 is enabled properly) - return; - } - - // Check if we're the owner (fast path) - MF2_ThreadPages* tp = mf2_thread_pages_get(); - - if (tp && page->owner_tid == tp->my_tid) { - // Fast: Owner thread, push to local freelist (NO LOCK!) - mf2_free_fast(page, ptr); - } else { - // Slow: Cross-thread free, push to remote stack (lock-free) - mf2_free_slow(page, ptr); - } -} - -// =========================================================================== -// Global pool state (simplified: single-threaded for MVP) -static struct { - PoolBlock* freelist[POOL_NUM_CLASSES][POOL_NUM_SHARDS]; - - // Locks: per (class, shard) freelist to allow concurrent operations - PaddedMutex freelist_locks[POOL_NUM_CLASSES][POOL_NUM_SHARDS]; - - // Non-empty bitmap (O(1) empty class skip) - // Bit i = 1 if freelist[class][shard] is non-empty - // Use atomic to avoid class-wide locks - atomic_uint_fast64_t nonempty_mask[POOL_NUM_CLASSES]; // 1 bit per shard - - // Remote-free MPSC stacks per (class, shard): lock-free producers, drained under lock on alloc - atomic_uintptr_t remote_head[POOL_NUM_CLASSES][POOL_NUM_SHARDS]; - atomic_uint remote_count[POOL_NUM_CLASSES][POOL_NUM_SHARDS]; - - // Statistics - uint64_t hits[POOL_NUM_CLASSES] __attribute__((aligned(64))); - uint64_t misses[POOL_NUM_CLASSES] __attribute__((aligned(64))); - uint64_t refills[POOL_NUM_CLASSES] __attribute__((aligned(64))); - uint64_t frees[POOL_NUM_CLASSES] __attribute__((aligned(64))); - uint64_t total_bytes_allocated __attribute__((aligned(64))); - uint64_t total_pages_allocated __attribute__((aligned(64))); - - // Per-class page accounting (for Soft CAP guidance) - uint64_t pages_by_class[POOL_NUM_CLASSES] __attribute__((aligned(64))); - - // ACE: per-class bundle factor for refill (1..4) + last snapshot - int bundle_factor[POOL_NUM_CLASSES]; - uint64_t last_hits[POOL_NUM_CLASSES]; - uint64_t last_misses[POOL_NUM_CLASSES]; - - int initialized; - int tls_free_enabled; // env: HAKMEM_POOL_TLS_FREE (default: 1) - - // Extra metrics (for learner logging): all relaxed atomics - atomic_uint_fast64_t trylock_attempts __attribute__((aligned(64))); - atomic_uint_fast64_t trylock_success __attribute__((aligned(64))); - atomic_uint_fast64_t ring_underflow __attribute__((aligned(64))); -} g_pool; - -static int g_wrap_l2_enabled = 0; // env: HAKMEM_WRAP_L2=1 to allow in wrappers -static int g_shard_mix_enabled = 0; // env: HAKMEM_SHARD_MIX=1 to enable stronger hashing -static int g_tls_ring_enabled = 1; // env: HAKMEM_POOL_TLS_RING=1 to enable TLS ring -static int g_trylock_probes = 3; // env: HAKMEM_TRYLOCK_PROBES (1..8) -static int g_ring_return_div = 2; // env: HAKMEM_RING_RETURN_DIV (2=half, 3=third) -static int g_tls_lo_max = 256; // env: HAKMEM_TLS_LO_MAX (LIFO size cap) -int g_hdr_light_enabled = 0; // env: HAKMEM_HDR_LIGHT=1 (minimize extra fields), =2 (no header writes/validation) -static int g_pool_min_bundle = 2; // env: HAKMEM_POOL_MIN_BUNDLE (default 2) -// Sampled counter updates to reduce hot-path stores: 1/2^k -static int g_count_sample_exp = 10; // env: HAKMEM_POOL_COUNT_SAMPLE (0..16) -static __thread uint32_t t_pool_rng = 0x243f6a88u; // per-thread RNG for sampling - -// Size class table (for O(1) lookup). Index 5/6 are Bridge classes for 32-64KB gap. 
-// 7 classes including Bridge classes (40KB, 52KB) to fill 32-64KB gap -static size_t g_class_sizes[POOL_NUM_CLASSES] = { - POOL_CLASS_2KB, // 2 KB - POOL_CLASS_4KB, // 4 KB - POOL_CLASS_8KB, // 8 KB - POOL_CLASS_16KB, // 16 KB - POOL_CLASS_32KB, // 32 KB - POOL_CLASS_40KB, // 40 KB (Bridge class 0) - POOL_CLASS_52KB // 52 KB (Bridge class 1) -}; - -// Blocks per page (for each class) -__attribute__((unused)) static const int g_blocks_per_page[POOL_NUM_CLASSES] = { - POOL_PAGE_SIZE / POOL_CLASS_2KB, // 32 blocks (2KiB) - POOL_PAGE_SIZE / POOL_CLASS_4KB, // 16 blocks (4KiB) - POOL_PAGE_SIZE / POOL_CLASS_8KB, // 8 blocks (8KiB) - POOL_PAGE_SIZE / POOL_CLASS_16KB, // 4 blocks (16KiB) - POOL_PAGE_SIZE / POOL_CLASS_32KB, // 2 blocks (32KiB) - POOL_PAGE_SIZE / POOL_CLASS_40KB, // 1 block (40KiB Bridge) - POOL_PAGE_SIZE / POOL_CLASS_52KB // 1 block (52KiB Bridge) -}; - -// =========================================================================== -// Helper Functions -// =========================================================================== - -// Write minimal header for Mid allocation (fast-return friendly) -static inline void mid_set_header(AllocHeader* hdr, size_t class_sz, uintptr_t site_id) { - // For Mid, prefer headerless operation when HDR_LIGHT>=1. - // Debug or non-Mid callers can still write full headers elsewhere. - if (g_hdr_light_enabled >= 1) return; // skip header on alloc hot path - hdr->magic = HAKMEM_MAGIC; - hdr->method = ALLOC_METHOD_POOL; - hdr->size = class_sz; - if (!g_hdr_light_enabled) { - hdr->alloc_site = site_id; - hdr->class_bytes = 0; - hdr->owner_tid = (uintptr_t)(uintptr_t)pthread_self(); - } -} - -// Branchless LUT (Lookup Table) for O(1) class determination -// Expanded to 53 entries for Bridge classes (40KB, 52KB) -static const uint8_t SIZE_TO_CLASS[53] = { - 0,0,0, // 0-2KB → Class 0 - 1,1, // 3-4KB → Class 1 - 2,2,2,2, // 5-8KB → Class 2 - 3,3,3,3,3,3,3,3, // 9-16KB → Class 3 - 4,4,4,4,4,4,4,4,4,4,4,4,4,4,4,4, // 17-32KB → Class 4 - 5,5,5,5,5,5,5,5, // 33-40KB → Class 5 (Bridge class 0) - 6,6,6,6,6,6,6,6,6,6,6,6 // 41-52KB → Class 6 (Bridge class 1) -}; - -// Get size class index from size (0-6, or -1 if out of range) -// Updated range check for Bridge classes (0-52KB) -static inline int hak_pool_get_class_index(size_t size) { - // Fast path: exact match against configured class sizes (covers Bridge classes) - // Note: size passed here should already be a rounded class size from ACE. - for (int i = 0; i < POOL_NUM_CLASSES; i++) { - size_t cs = g_class_sizes[i]; - if (cs != 0 && size == cs) return i; - } - // Fallback: map arbitrary size to nearest fixed class range via LUT (legacy behavior) - uint32_t kb = (uint32_t)((size + 1023) >> 10); // Round up to KB units - return (kb < 53) ? 
SIZE_TO_CLASS[kb] : -1; // Expanded to 53KB for Bridge classes -} - -// Get shard index from site_id (0-63) -int hak_pool_get_shard_index(uintptr_t site_id) { - if (!g_shard_mix_enabled) { - // Legacy: Shift by 4 to reduce collision (instruction alignment) - return (int)((site_id >> 4) & (POOL_NUM_SHARDS - 1)); - } - // SplitMix64-like mixer with thread id salt for better dispersion - uint64_t x = (uint64_t)site_id; - uint64_t tid = (uint64_t)(uintptr_t)pthread_self(); - x ^= (tid << 1); - x += 0x9e3779b97f4a7c15ULL; - x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL; - x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL; - x = (x ^ (x >> 31)); - return (int)((uint32_t)x & (POOL_NUM_SHARDS - 1)); -} - -// TLS helpers -#include "box/pool_tls_core.inc.h" - - -// Refill/ACE (boxed) -#include "box/pool_refill.inc.h" - -// Init/Shutdown + MF2 debug (boxed) -#include "box/pool_init_api.inc.h" - -// Pool statistics (boxed) -#include "box/pool_stats.inc.h" - -// Public API (boxed): alloc/free/lookup/free_fast -#include "box/pool_api.inc.h" diff --git a/core/tiny_alloc_fast.inc.h b/core/tiny_alloc_fast.inc.h index 6e6aa758..080356db 100644 --- a/core/tiny_alloc_fast.inc.h +++ b/core/tiny_alloc_fast.inc.h @@ -612,11 +612,11 @@ static inline void* tiny_alloc_fast(size_t size) { if (__builtin_expect(g_tls_sll_enable && !s_front_direct_alloc, 1)) { // For classes 0..3 keep ultra-inline POP; for >=4 use safe Box POP to avoid UB on bad heads. if (class_idx <= 3) { -#if defined(HAKMEM_TINY_INLINE_SLL) && HAKMEM_TINY_AGGRESSIVE_INLINE - // Experimental: Use inline SLL pop macro (enable via HAKMEM_TINY_INLINE_SLL=1) +#if HAKMEM_TINY_INLINE_SLL + // Experimental: Inline SLL pop (A/B only, requires HAKMEM_TINY_INLINE_SLL=1) TINY_ALLOC_FAST_POP_INLINE(class_idx, ptr); #else - // Default: Safe Box API (bypasses inline SLL when Front-Direct) + // Default: Safe Box API (Box TLS-SLL) for all standard builds ptr = tiny_alloc_fast_pop(class_idx); #endif } else { @@ -656,11 +656,11 @@ static inline void* tiny_alloc_fast(size_t size) { // Skip SLL retry if Front-Direct OR SLL disabled if (__builtin_expect(g_tls_sll_enable && !s_front_direct_alloc, 1)) { if (class_idx <= 3) { -#if defined(HAKMEM_TINY_INLINE_SLL) && HAKMEM_TINY_AGGRESSIVE_INLINE - // Experimental: Use inline SLL pop macro (enable via HAKMEM_TINY_INLINE_SLL=1) +#if HAKMEM_TINY_INLINE_SLL + // Experimental: Inline SLL pop (A/B only, requires HAKMEM_TINY_INLINE_SLL=1) TINY_ALLOC_FAST_POP_INLINE(class_idx, ptr); #else - // Default: Safe Box API (bypasses inline SLL when Front-Direct) + // Default: Safe Box API (Box TLS-SLL) for all standard builds ptr = tiny_alloc_fast_pop(class_idx); #endif } else {
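On the guard change in the two tiny_alloc_fast.inc.h hunks above: with a plain #if, an identifier that is not defined evaluates to 0, so builds that never pass -DHAKMEM_TINY_INLINE_SLL=1 fall through to the safe Box API pop, and the inline path no longer depends on HAKMEM_TINY_AGGRESSIVE_INLINE at all. A tiny illustrative sketch of how the new guard resolves (example macro only):

    // Illustrative only: how the new guard resolves for different build flags.
    //   (flag absent)               -> HAKMEM_TINY_INLINE_SLL is treated as 0 in #if -> Box API pop
    //   -DHAKMEM_TINY_INLINE_SLL=0  -> 0 -> Box API pop
    //   -DHAKMEM_TINY_INLINE_SLL=1  -> 1 -> inline SLL pop (A/B experiment)
    #if HAKMEM_TINY_INLINE_SLL
    #  define EXAMPLE_TINY_POP "inline-sll"
    #else
    #  define EXAMPLE_TINY_POP "box-api"
    #endif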