// hakmem/core/hakmem_pool.c

// ============================================================================
// hakmem_pool.c - L2 Hybrid Pool Implementation (Mid-Size: 2-32KiB)
// ============================================================================
//
// Size class definitions:
// ┌──────────┬──────────┬──────────────┬─────────────┐
// │ Class    │ Size     │ Initial CAP  │ Page layout │
// ├──────────┼──────────┼──────────────┼─────────────┤
// │ Class 0  │  2 KiB   │  64 pages    │ 32 blocks/p │
// │ Class 1  │  4 KiB   │  64 pages    │ 16 blocks/p │
// │ Class 2  │  8 KiB   │  64 pages    │  8 blocks/p │
// │ Class 3  │ 16 KiB   │  32 pages    │  4 blocks/p │
// │ Class 4  │ 32 KiB   │  16 pages    │  2 blocks/p │
// │ DYN1     │ 6 KiB*   │  0 (off)     │ variable    │
// │ DYN2     │ (unused) │  0 (off)     │ variable    │
// └──────────┴──────────┴──────────────┴─────────────┘
// * DYN1 is a dynamic class intended to fill a gap (8-16KB)
// NOTE(review): this table lists DYN1/DYN2 as classes 5/6, whereas the
// g_class_sizes table in this file defines 40KB/52KB "Bridge" classes at
// those indices - confirm which description is current.
//
// W_MAX (round-up tolerance factor):
//   - Meaning: how many times larger than the requested size a class may be
//   - Default: 1.40 (allows rounding up by up to 40%)
//   - Example: 3KiB request → 4KiB class is acceptable (1.33x < 1.40)
//   - Environment variable: change with HAKMEM_WMAX_MID=1.6
//
// CAP (inventory):
//   - Meaning: maximum number of pages kept per class
//   - Initial values: {64,64,64,32,16} - conservative (footprint first)
//   - Recommended values: {256,256,256,128,64} - performance first
//   - Environment variable: set with HAKMEM_CAP_MID=256,256,256,128,64
//   - Learning mode: HAKMEM_LEARN=1 tunes it automatically
//
// TLS ring structure:
//   - POOL_L2_RING_CAP: ring buffer capacity (default 16)
//   - ActivePage A/B: bump-run scheme (lock-free)
//   - LIFO overflow: entries that spill out of the ring
//
// Performance tuning:
//   1. Quadruple the initial CAP: HAKMEM_CAP_MID=256,256,256,128,64
//   2. Relax W_MAX: HAKMEM_WMAX_MID=1.6
//   3. Enable DYN1: HAKMEM_MID_DYN1=6144 HAKMEM_CAP_MID_DYN1=64
//   4. Learning mode: HAKMEM_LEARN=1
//
// License: MIT
// Last Updated: 2025-10-26 (Code Cleanup complete)

#include "hakmem_pool.h"
#include "hakmem_config.h"
#include "hakmem_internal.h"  // For AllocHeader and HAKMEM_MAGIC
#include "hakmem_syscall.h"   // Box 3 syscall layer (bypasses LD_PRELOAD)
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <stdbool.h>
#include <sys/mman.h>
#include <pthread.h>
#include <stdatomic.h>
#include "hakmem_prof.h"
#include "hakmem_policy.h"   // FrozenPolicy caps (Soft CAP gating)
#include "hakmem_debug.h"

// Mutex wrapper whose total size is rounded up to a multiple of 64 bytes, used
// to keep adjacent locks in an array from sharing storage (false-sharing
// mitigation). When sizeof(pthread_mutex_t) is already a multiple of 64 the
// expression still yields a full 64-byte pad, so the pad array is never empty.
typedef struct {
    pthread_mutex_t m;
    char _pad[64 - (sizeof(pthread_mutex_t) % 64)];
} PaddedMutex;

// ===========================================================================
// Internal Data Structures
// ===========================================================================
#include "box/pool_tls_types.inc.h"

// Mid page descriptor registry (64KiB pages → {class_idx, owner_tid})
#include "box/pool_mid_desc.inc.h"

// ---------------- Transfer Cache (per-thread per-class inbox) --------------
#include "box/pool_mid_tc.inc.h"

#include "box/pool_mf2_types.inc.h"


// --- MF2 Initialization Functions ---

// One-shot guard: the registry is initialized at most once, no matter how many
// threads call mf2_page_registry_init().
static pthread_once_t mf2_page_registry_init_control = PTHREAD_ONCE_INIT;

// pthread_once callback. Zeroes the whole registry (so every page slot starts
// out NULL), creates the 256 registry locks with default attributes, and
// resets both page counters.
static void mf2_page_registry_init_impl(void) {
    memset(&g_mf2_page_registry, 0, sizeof(g_mf2_page_registry));

    int lock_no = 0;
    while (lock_no < 256) {
        pthread_mutex_init(&g_mf2_page_registry.locks[lock_no], NULL);
        ++lock_no;
    }

    atomic_store(&g_mf2_page_registry.total_pages, 0);
    atomic_store(&g_mf2_page_registry.active_pages, 0);
}

// Idempotent entry point: runs the initializer above exactly once.
static void mf2_page_registry_init(void) {
    pthread_once(&mf2_page_registry_init_control, mf2_page_registry_init_impl);
}

// Strategy A: pthread-key destructor for a thread's MF2_ThreadPages.
//
// Intentionally a no-op. Per the original author's note, unregistering the
// structure from the global thread array and freeing it led to
// "malloc(): unsorted double linked list corrupted" crashes, so both steps are
// disabled. The accepted cost is that the ThreadPages structure of an exited
// thread is never released.
// TODO: Investigate a safe cleanup mechanism.
static void mf2_thread_pages_destructor(void* arg) {
    if (arg == NULL) {
        return;
    }

    MF2_ThreadPages* tp = (MF2_ThreadPages*)arg;
    (void)tp;  // nothing is done with it; silences the unused-variable warning
}

// Strategy A: pthread_once callback that creates the MF2 thread-specific key
// and attaches mf2_thread_pages_destructor to it.
// The result of pthread_key_create() is deliberately not inspected here.
static void mf2_init_tls_key(void) {
    (void)pthread_key_create(&g_mf2_tls_key, mf2_thread_pages_destructor);
}

// Cheap timestamp source (used for Route P idle detection).
// On x86/x86-64 this is the raw 64-bit TSC value read with `rdtsc`; on every
// other architecture it is CLOCK_MONOTONIC expressed in nanoseconds, so the
// unit of the returned value depends on the build target.
static inline uint64_t mf2_rdtsc(void) {
#if defined(__x86_64__) || defined(__i386__)
    uint32_t low_half;
    uint32_t high_half;
    __asm__ __volatile__ ("rdtsc" : "=a"(low_half), "=d"(high_half));

    uint64_t ticks = high_half;
    ticks <<= 32;
    ticks |= low_half;
    return ticks;
#else
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    uint64_t nanos = (uint64_t)now.tv_sec;
    nanos *= 1000000000ULL;
    nanos += (uint64_t)now.tv_nsec;
    return nanos;
#endif
}

// Returns the calling thread's MF2_ThreadPages, creating it on first use.
//
// First-use path: makes sure the pthread key exists, allocates a zeroed
// structure through hkm_libc_calloc, initializes the per-class lists, claims a
// slot in g_all_thread_pages, and stores the pointer both as pthread-specific
// data (so the destructor runs at thread exit) and in t_mf2_pages.
//
// Returns NULL only when the allocation fails.
static MF2_ThreadPages* mf2_thread_pages_get(void) {
    MF2_ThreadPages* cached = t_mf2_pages;
    if (cached != NULL) {
        return cached;
    }

    // Create the pthread key exactly once, process-wide.
    pthread_once(&g_mf2_key_once, mf2_init_tls_key);

    MF2_ThreadPages* fresh = (MF2_ThreadPages*)hkm_libc_calloc(1, sizeof(MF2_ThreadPages));
    if (fresh == NULL) {
        return NULL;
    }

    fresh->my_tid = pthread_self();

    // Every class starts with no pages and nothing pending.
    for (int cls = 0; cls < POOL_NUM_CLASSES; ++cls) {
        fresh->active_page[cls] = NULL;
        fresh->full_pages[cls] = NULL;
        atomic_store_explicit(&fresh->pages_remote_pending[cls], 0, memory_order_relaxed);
        atomic_flag_clear_explicit(&fresh->pending_claim[cls], memory_order_relaxed);
        fresh->page_count[cls] = 0;
    }

    // Route P: seed the activity timestamp.
    atomic_store_explicit(&fresh->last_alloc_tsc, mf2_rdtsc(), memory_order_relaxed);

    // Strategy A: claim the next slot in the global thread array.
    // NOTE(review): the counter is bumped even when the array is full, so it can
    // exceed MF2_MAX_THREADS - confirm that readers clamp it.
    const int slot = atomic_fetch_add_explicit(&g_num_thread_pages, 1, memory_order_acq_rel);
    if (slot >= MF2_MAX_THREADS) {
        MF2_ERROR_LOG("Too many threads! MAX=%d", MF2_MAX_THREADS);
    } else {
        atomic_store_explicit((atomic_uintptr_t*)&g_all_thread_pages[slot], (uintptr_t)fresh, memory_order_release);
    }

    // Hand the pointer to pthreads so the key destructor sees it at thread exit.
    pthread_setspecific(g_mf2_tls_key, fresh);

    t_mf2_pages = fresh;
    return fresh;
}

// --- MF2 Page Allocation & Lookup ---

// O(1) lookup of the MidPage descriptor that owns a block address.
//
// The address is rounded down to its 64KB page base, and bits 16 and up of that
// base (masked with MF2_PAGE_REGISTRY_SIZE - 1) select a slot in the
// direct-mapped registry. The masking assumes MF2_PAGE_REGISTRY_SIZE is a
// power of two - TODO confirm against its definition.
//
// Returns the registered page when its base matches exactly; returns NULL when
// the slot is empty or holds a different page (index collision or an address
// that never belonged to an MF2 page).
//
// The slot is read without taking the registry lock.
static inline MidPage* mf2_addr_to_page(void* addr) {
    // Clear the low 16 bits to get the 64KB-aligned page base.
    void* page_base = (void*)((uintptr_t)addr & ~0xFFFFULL);

    size_t idx = ((uintptr_t)page_base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1);
    MidPage* page = g_mf2_page_registry.pages[idx];

    // The base comparison rejects a different page that hashed to the same slot.
    if (page && page->base == page_base) {
        return page;
    }
    return NULL;
}

// Publishes a page in the global registry so mf2_addr_to_page() can find it.
//
// The slot index comes from the page base (bits 16 and up, masked to the
// registry size); the slot is written under one of the 256 stripe locks
// (idx % 256). An occupied slot is only logged and then overwritten - there is
// no collision handling yet (TODO). Both page counters are incremented in
// every case, including the overwrite case.
static void mf2_register_page(MidPage* page) {
    if (page == NULL) {
        return;
    }

    const size_t slot = ((uintptr_t)page->base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1);
    const int stripe = slot % 256;
    pthread_mutex_t* guard = &g_mf2_page_registry.locks[stripe];

    pthread_mutex_lock(guard);

    if (g_mf2_page_registry.pages[slot] != NULL) {
        // Another page already lives here; it is about to be replaced.
        HAKMEM_LOG("[MF2] WARNING: Page registry collision at index %zu\n", slot);
    }
    g_mf2_page_registry.pages[slot] = page;

    atomic_fetch_add_explicit(&g_mf2_page_registry.total_pages, 1, memory_order_relaxed);
    atomic_fetch_add_explicit(&g_mf2_page_registry.active_pages, 1, memory_order_relaxed);

    pthread_mutex_unlock(guard);
}

// Removes a page from the global registry (intended for when a page is handed
// back to the OS; currently marked unused).
//
// The slot is cleared, and active_pages decremented, only if it still points
// at this exact page; a slot that holds some other page is left untouched.
__attribute__((unused)) static void mf2_unregister_page(MidPage* page) {
    if (page == NULL) {
        return;
    }

    const size_t slot = ((uintptr_t)page->base >> 16) & (MF2_PAGE_REGISTRY_SIZE - 1);
    const int stripe = slot % 256;
    pthread_mutex_t* guard = &g_mf2_page_registry.locks[stripe];

    pthread_mutex_lock(guard);
    if (g_mf2_page_registry.pages[slot] == page) {
        g_mf2_page_registry.pages[slot] = NULL;
        atomic_fetch_sub_explicit(&g_mf2_page_registry.active_pages, 1, memory_order_relaxed);
    }
    pthread_mutex_unlock(guard);
}

// Allocate and initialize a new 64KB page for given size class
static MidPage* mf2_alloc_new_page(int class_idx) {
    if (class_idx < 0 || class_idx >= POOL_NUM_CLASSES) return NULL;

    // Get user size class (2KB, 4KB, 8KB, 16KB, 32KB)
    size_t user_size = g_class_sizes[class_idx];
    if (user_size == 0) return NULL; // Dynamic class disabled

    // CRITICAL FIX: Each block needs HEADER_SIZE + user_size
    // The header stores metadata (AllocHeader), user_size is the usable space
    size_t block_size = HEADER_SIZE + user_size;

    // Step 1: Allocate 64KB page (aligned to 64KB boundary)
    // CRITICAL FIX #4: Must ensure 64KB alignment!
    // mmap() only guarantees 4KB alignment, breaking addr_to_page() lookup.
    // This caused 97% of frees to fail silently (fatal bug!)
    //
    // CRITICAL FIX: Use mmap() + alignment adjustment to avoid recursion!
    // Using wrapped posix_memalign with WRAP_L2=1 causes infinite recursion.

    // Allocate 2x size to allow alignment adjustment
    size_t alloc_size = POOL_PAGE_SIZE * 2;  // 128KB
    void* raw = mmap(NULL, alloc_size, PROT_READ | PROT_WRITE,
                     MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (raw == MAP_FAILED) {
        return NULL; // OOM
    }

    // Find 64KB aligned address within allocation
    uintptr_t addr = (uintptr_t)raw;
    uintptr_t aligned = (addr + 0xFFFF) & ~0xFFFFULL;  // Round up to 64KB boundary
    void* page_base = (void*)aligned;

    // Free unused prefix (if any)
    size_t prefix_size = aligned - addr;
    if (prefix_size > 0) {
        munmap(raw, prefix_size);
    }

    // Free unused suffix
    size_t suffix_offset = prefix_size + POOL_PAGE_SIZE;
    if (suffix_offset < alloc_size) {
        munmap((char*)raw + suffix_offset, alloc_size - suffix_offset);
    }

    // DEBUG: Log first few allocations
    static _Atomic int mmap_count = 0;
    int mc = atomic_fetch_add_explicit(&mmap_count, 1, memory_order_relaxed);
    if (mc < 5) {
        MF2_DEBUG_LOG("MMAP_ALLOC %d: raw=%p, aligned=%p, prefix=%zu, suffix=%zu",
                      mc, raw, page_base, prefix_size, alloc_size - suffix_offset);
    }

    // ALIGNMENT VERIFICATION (Step 1)
    if (((uintptr_t)page_base & 0xFFFF) != 0) {
        MF2_ERROR_LOG("ALIGNMENT BUG: Page %p not 64KB aligned! (offset=%zu)",
                      page_base, ((uintptr_t)page_base & 0xFFFF));
    }

    // Zero-fill (required for posix_memalign)
    // Note: This adds ~15μs overhead, but is necessary for correctness
    memset(page_base, 0, POOL_PAGE_SIZE);

    // Step 2: Allocate MidPage descriptor
    MidPage* page = (MidPage*)hkm_libc_calloc(1, sizeof(MidPage));
    if (!page) {
        // CRITICAL FIX: Use munmap for mmap-allocated memory
        munmap(page_base, POOL_PAGE_SIZE);
        return NULL;
    }

    // Step 3: Initialize page descriptor
    page->base = page_base;
    page->class_idx = (uint8_t)class_idx;
    page->flags = 0;
    page->owner_tid = pthread_self();
    page->owner_tp = mf2_thread_pages_get();  // Store owner's ThreadPages for pending queue
    page->last_transfer_time = 0;  // No transfer yet (lease mechanism)

    // Step 4: Build freelist chain (walk through page and link blocks)
    // Calculate how many blocks fit in 64KB page (including header overhead)
    size_t usable_size = POOL_PAGE_SIZE;
    size_t num_blocks = usable_size / block_size;

    page->capacity = (uint16_t)num_blocks;
    page->free_count = (uint16_t)num_blocks;

    // Build linked list of free blocks
    PoolBlock* freelist_head = NULL;
    PoolBlock* freelist_tail = NULL;

    for (size_t i = 0; i < num_blocks; i++) {
        char* block_addr = (char*)page_base + (i * block_size);
        PoolBlock* block = (PoolBlock*)block_addr;

        block->next = NULL;

        if (freelist_head == NULL) {
            freelist_head = block;
            freelist_tail = block;
        } else {
            freelist_tail->next = block;
            freelist_tail = block;
        }
    }

    page->freelist = freelist_head;

    // Step 5: Initialize remote stack (for cross-thread frees)
    atomic_store(&page->remote_head, (uintptr_t)0);
    atomic_store(&page->remote_count, 0);

    // Step 6: Initialize lifecycle counters
    atomic_store(&page->in_use, 0);  // No blocks allocated yet
    atomic_store(&page->pending_dn, 0);

    // Step 7: Initialize linkage
    page->next_page = NULL;
    page->prev_page = NULL;

    // Initialize pending queue fields
    atomic_store_explicit(&page->in_remote_pending, false, memory_order_relaxed);
    page->next_pending = NULL;

    // Step 8: Register page in global registry
    mf2_register_page(page);

    return page;
}

// --- MF2 Allocation & Free Operations ---

// Forward declarations
static void mf2_enqueue_pending(MF2_ThreadPages* owner_tp, MidPage* page);

// Moves every block on the page's remote (cross-thread) stack onto the page's
// local freelist.
//
// Intended to be called by the owning thread: page->freelist and
// page->free_count are updated here without any synchronization.
//
// Returns the number of blocks moved; 0 when page is NULL, when remote_count
// reads 0, or when the remote stack turns out to be empty.
//
// NOTE(review): remote_count is reset with a plain store after the exchange. A
// remote free that pushes AND increments the count between the exchange and
// that store leaves its block on remote_head with the count back at 0; the
// re-check at the end reads 0 in that case, so the block appears to stay
// undrained until a later remote free bumps the count again - confirm.
static int mf2_drain_remote_frees(MidPage* page) {
    if (!page) return 0;

    atomic_fetch_add(&g_mf2_drain_attempts, 1);

    // Cheap early-out when no remote frees are recorded (seq_cst load, FIX #6).
    unsigned int remote_count = atomic_load_explicit(&page->remote_count, memory_order_seq_cst);
    if (remote_count == 0) {
        return 0; // Nothing to drain
    }

    // Detach the whole remote stack in one step by swapping its head with 0.
    uintptr_t head = atomic_exchange_explicit(&page->remote_head, (uintptr_t)0,
                                              memory_order_acq_rel);
    if (!head) {
        // Count was non-zero but the stack is empty: clear the count and bail out.
        atomic_store_explicit(&page->remote_count, 0, memory_order_release);
        return 0; // Race: someone else drained it
    }

    // Reset the remote count (release, so later drain checks observe it - FIX #6).
    atomic_store_explicit(&page->remote_count, 0, memory_order_release);

    // Walk the detached list to count its blocks and find its last node.
    int drained = 0;
    PoolBlock* cur = (PoolBlock*)head;
    PoolBlock* tail = NULL;

    while (cur) {
        drained++;
        tail = cur;
        cur = cur->next;
    }

    // Splice the detached list in FRONT of the local freelist: its tail is
    // linked to the old freelist head and its head becomes the new freelist.
    if (tail) {
        tail->next = page->freelist;
        page->freelist = (PoolBlock*)head;
        page->free_count += drained;
    }

    atomic_fetch_add(&g_mf2_drain_count, 1);
    atomic_fetch_add(&g_mf2_drain_blocks, drained);

    // If more remote frees were counted while draining, put the page back on
    // its owner's pending queue so they are picked up in a later round.
    unsigned int post_drain_count = atomic_load_explicit(&page->remote_count, memory_order_acquire);
    if (post_drain_count >= 1 && page->owner_tp) {  // Use same threshold as initial enqueue
        // mf2_enqueue_pending() is a no-op while in_remote_pending is still set;
        // the original note states the flag was cleared earlier (not in this
        // function - presumably by the caller; verify).
        mf2_enqueue_pending(page->owner_tp, page);
    }

    return drained;
}

// ===========================================================================
// Pending Queue Operations (MPSC Lock-Free Stack)
// ===========================================================================

// Puts a page on its owner's per-class pending stack so the owner learns that
// the page has remote frees waiting. Any thread may call this (multiple
// producers).
//
// The in_remote_pending flag acts as a dedup gate: only the caller that flips
// it from false to true performs the push; everyone else returns immediately.
// When the push lands on a previously empty stack, the class's adoptable
// counter is incremented.
static void mf2_enqueue_pending(MF2_ThreadPages* owner_tp, MidPage* page) {
    if (owner_tp == NULL || page == NULL) {
        return;
    }

    // Dedup gate: somebody else already queued this page.
    if (atomic_exchange_explicit(&page->in_remote_pending, true, memory_order_acq_rel)) {
        return;
    }

    atomic_fetch_add(&g_mf2_pending_enqueued, 1);

    // Lock-free push: link the page in front of the observed head, then try to
    // swing the head to the page; start over if the head moved in between.
    uintptr_t prev_head;
    for (;;) {
        prev_head = atomic_load_explicit(&owner_tp->pages_remote_pending[page->class_idx], memory_order_relaxed);
        page->next_pending = (MidPage*)prev_head;

        if (atomic_compare_exchange_weak_explicit(
                &owner_tp->pages_remote_pending[page->class_idx],
                &prev_head, (uintptr_t)page,
                memory_order_release,   // publishes the page and its link
                memory_order_relaxed)) {
            break;
        }
    }

    // Empty → non-empty transition: one more queue worth looking at.
    if (prev_head != 0) {
        return;
    }
    atomic_fetch_add_explicit(&g_adoptable_count[page->class_idx], 1, memory_order_relaxed);
}

// Pops one page from the pending stack of `tp` for the given class.
// Called by the owner thread or by an adopter, hence the CAS-based pop.
//
// Returns the popped page with its next_pending link cleared, or NULL when tp
// is NULL or the stack is empty. When the popped page was the last one, the
// class's adoptable counter is decremented.
static MidPage* mf2_dequeue_pending(MF2_ThreadPages* tp, int class_idx) {
    if (tp == NULL) {
        return NULL;
    }

    for (;;) {
        uintptr_t seen_head = atomic_load_explicit(&tp->pages_remote_pending[class_idx], memory_order_acquire);
        if (seen_head == 0) {
            return NULL;  // nothing queued
        }

        MidPage* candidate = (MidPage*)seen_head;
        uintptr_t successor = (uintptr_t)candidate->next_pending;

        // Try to swing the head to the successor; retry on any interference.
        if (!atomic_compare_exchange_weak_explicit(
                &tp->pages_remote_pending[class_idx],
                &seen_head, successor,
                memory_order_acq_rel, memory_order_relaxed)) {
            continue;
        }

        // We own the page now: detach it from the chain.
        MidPage* remaining = candidate->next_pending;
        candidate->next_pending = NULL;

        // Non-empty → empty transition for this queue.
        if (remaining == NULL) {
            atomic_fetch_sub_explicit(&g_adoptable_count[class_idx], 1, memory_order_relaxed);
        }
        return candidate;
    }
}

// ===========================================================================
// End of Pending Queue Operations
// ===========================================================================

#include "box/pool_mf2_helpers.inc.h"


#include "box/pool_mf2_adoption.inc.h"

// Owner-thread fast path: pop one block from the active page's local freelist
// without taking any lock.
//
// Falls through to mf2_alloc_slow() when the thread has no active page for the
// class or that page's freelist is empty. Returns the user pointer (block
// address + HEADER_SIZE), or NULL when the thread state cannot be obtained or
// the slow path fails.
static inline void* mf2_alloc_fast(int class_idx, size_t size, uintptr_t site_id) {
    MF2_ThreadPages* tp = mf2_thread_pages_get();
    if (tp == NULL) {
        return NULL;
    }

    MidPage* active = tp->active_page[class_idx];
    if (active == NULL || active->freelist == NULL) {
        // Nothing to pop locally.
        return mf2_alloc_slow(class_idx, size, site_id);
    }

    atomic_fetch_add(&g_mf2_alloc_fast_hit, 1);

    // Route P: record that this thread allocated just now (idle detection).
    atomic_store_explicit(&tp->last_alloc_tsc, mf2_rdtsc(), memory_order_relaxed);

    // Unlink the head block.
    PoolBlock* taken = active->freelist;
    active->freelist = taken->next;
    --active->free_count;

    // in_use is atomic because other threads read and modify it too.
    atomic_fetch_add_explicit(&active->in_use, 1, memory_order_relaxed);

    // The caller gets the area behind the block header.
    return (char*)taken + HEADER_SIZE;
}

// Slow allocation path: refill the thread's active page for the class, then
// retry through mf2_alloc_fast().
//
// Refill sources are tried strictly in this order, stopping at the first one
// that reports success ("must-reuse" ordering - a new page is the last resort):
//   1. the thread's own pending queue      (mf2_try_reuse_own_pending)
//   2. remote frees on the active page     (mf2_try_drain_active_remotes)
//   3. adoption from another thread        (mf2_try_adopt_pending)
//   4. a brand-new page                    (mf2_alloc_and_activate_new_page)
//
// Historical note: an earlier scan over full_pages was removed because, per
// the author's measurements, it never found a reusable page.
//
// Returns NULL when the thread state cannot be obtained or no new page can be
// allocated. site_id is currently only forwarded.
static void* mf2_alloc_slow(int class_idx, size_t size, uintptr_t site_id) {
    (void)site_id;

    atomic_fetch_add(&g_mf2_alloc_slow_hit, 1);

    MF2_ThreadPages* tp = mf2_thread_pages_get();
    if (tp == NULL) {
        return NULL;
    }

    // Short-circuit evaluation keeps the reuse order and stops at the first hit.
    const bool reused = mf2_try_reuse_own_pending(tp, class_idx)
                     || mf2_try_drain_active_remotes(tp, class_idx)
                     || mf2_try_adopt_pending(tp, class_idx);

    if (!reused) {
        // Every reuse opportunity is exhausted: fall back to a new page.
        MidPage* fresh = mf2_alloc_and_activate_new_page(tp, class_idx);
        if (fresh == NULL) {
            return NULL; // OOM
        }
    }

    return mf2_alloc_fast(class_idx, size, site_id);
}

// Slow (cross-thread) free path, defined further down.
static void mf2_free_slow(MidPage* page, void* ptr);

// Owner-thread free: push the block back onto the page-local freelist without
// any lock and decrement the page's in-use counter.
//
// When this was the last block in use and the local free count equals the page
// capacity, the page is passed to hak_batch_add_page() (per the original note,
// this releases the physical memory via MADV_DONTNEED while keeping the
// mapping).
static inline void mf2_free_fast(MidPage* page, void* ptr) {
    if (page == NULL || ptr == NULL) {
        return;
    }

    atomic_fetch_add(&g_mf2_free_owner_count, 1);

    // The block starts HEADER_SIZE bytes in front of the user pointer.
    PoolBlock* returned = (PoolBlock*)((char*)ptr - HEADER_SIZE);

    returned->next = page->freelist;
    page->freelist = returned;
    ++page->free_count;

    // in_use is atomic because remote threads decrement it as well.
    const int in_use_before = atomic_fetch_sub_explicit(&page->in_use, 1, memory_order_release);
    if (in_use_before != 1) {
        return;  // other blocks are still handed out
    }
    if (page->free_count != page->capacity) {
        return;  // some blocks are not on the local freelist
    }

    hak_batch_add_page(page->base, POOL_PAGE_SIZE);
}

// Slow free path: a thread that does not own the page pushes the block onto
// the page's lock-free remote stack and, when the remote count reaches the
// enqueue threshold, notifies the owner through its pending queue.
//
// Order of effects: push onto remote_head, increment remote_count, optional
// enqueue, decrement in_use, optional hand-off to hak_batch_add_page().
static void mf2_free_slow(MidPage* page, void* ptr) {
    if (!page || !ptr) return;

    atomic_fetch_add(&g_mf2_free_remote_count, 1);

    // The block starts HEADER_SIZE bytes in front of the user pointer.
    PoolBlock* block = (PoolBlock*)((char*)ptr - HEADER_SIZE);

    // Lock-free push onto the page's remote stack (many producers).
    uintptr_t old_head;
    do {
        old_head = atomic_load_explicit(&page->remote_head, memory_order_acquire);
        block->next = (PoolBlock*)old_head;
    } while (!atomic_compare_exchange_weak_explicit(
        &page->remote_head, &old_head, (uintptr_t)block,
        memory_order_release, memory_order_relaxed));

    // Count the remote free; old_count is the value before this increment.
    unsigned int old_count = atomic_fetch_add_explicit(&page->remote_count, 1, memory_order_seq_cst);

    // Threshold-based enqueueing: the owner is notified only on the transition
    // TO the threshold (old_count == threshold - 1).
    //   1 = immediate (0→1 edge; the original note warns this can ping-pong)
    //   4 = balanced (batch 4 blocks before notifying the owner)
    //   8 = aggressive batching (higher latency, better efficiency)
    //
    // NOTE: the threshold is hard-coded to 1 in this function. Earlier comments
    // described it as configurable via HAKMEM_MF2_ENQUEUE_THRESHOLD with a
    // default of 4; nothing in this function reads that variable.
    static int g_enqueue_threshold = 1;  // 1=immediate (0→1 edge), 2=batch-2, 4=batch-4
    if (old_count + 1 == (unsigned int)g_enqueue_threshold) {
        // Remote count just reached the threshold: notify the owner, if known.
        if (page->owner_tp) {
            mf2_enqueue_pending(page->owner_tp, page);
        }
    }

    // One block fewer in use; old_in_use is the value before the decrement.
    int old_in_use = atomic_fetch_sub_explicit(&page->in_use, 1, memory_order_release);

    // Page looks completely free when this was the last block in use and local
    // plus remote free counts cover the capacity (acquire load, FIX #6).
    // NOTE(review): page->free_count is a plain field that the owner thread
    // updates without synchronization, yet it is read here from a non-owner
    // thread - confirm this is acceptable.
    if (old_in_use == 1 && page->free_count + atomic_load_explicit(&page->remote_count, memory_order_acquire) >= page->capacity) {
        // Per the original note: returns the physical memory via MADV_DONTNEED
        // while keeping the virtual mapping (no munmap).
        hak_batch_add_page(page->base, POOL_PAGE_SIZE);
    }
}

// Top-level MF2 free: finds the page that owns `ptr` and routes the block to
// the owner fast path or the cross-thread slow path.
//
// NULL pointers and addresses that do not resolve to a registered MF2 page are
// ignored silently. If the calling thread's state cannot be obtained, the free
// is treated as a cross-thread free.
static void mf2_free(void* ptr) {
    if (ptr == NULL) {
        return;
    }

    MidPage* page = mf2_addr_to_page(ptr);
    if (page == NULL) {
        return;  // not an MF2 page
    }

    MF2_ThreadPages* self = mf2_thread_pages_get();
    const bool is_owner = (self != NULL) && (page->owner_tid == self->my_tid);

    if (is_owner) {
        mf2_free_fast(page, ptr);   // lock-free push to the local freelist
    } else {
        mf2_free_slow(page, ptr);   // lock-free push to the remote stack
    }
}

// ===========================================================================
// Global pool state for the sharded (class x shard) pool.
// NOTE(review): the original heading called this "simplified: single-threaded
// for MVP", but the structure carries per-shard locks and atomics - the
// heading looks stale; confirm.
static struct {
    // Head of the free-block list for every (class, shard) pair.
    PoolBlock* freelist[POOL_NUM_CLASSES][POOL_NUM_SHARDS];

    // Locks: per (class, shard) freelist to allow concurrent operations
    PaddedMutex freelist_locks[POOL_NUM_CLASSES][POOL_NUM_SHARDS];

    // Non-empty bitmap (O(1) empty class skip)
    // Bit i = 1 if freelist[class][shard] is non-empty
    // Use atomic to avoid class-wide locks
    atomic_uint_fast64_t nonempty_mask[POOL_NUM_CLASSES];  // 1 bit per shard

    // Remote-free MPSC stacks per (class, shard): lock-free producers, drained under lock on alloc
    atomic_uintptr_t remote_head[POOL_NUM_CLASSES][POOL_NUM_SHARDS];
    atomic_uint      remote_count[POOL_NUM_CLASSES][POOL_NUM_SHARDS];

    // Statistics (each array/counter starts on its own 64-byte boundary)
    uint64_t hits[POOL_NUM_CLASSES] __attribute__((aligned(64)));
    uint64_t misses[POOL_NUM_CLASSES] __attribute__((aligned(64)));
    uint64_t refills[POOL_NUM_CLASSES] __attribute__((aligned(64)));
    uint64_t frees[POOL_NUM_CLASSES] __attribute__((aligned(64)));
    uint64_t total_bytes_allocated __attribute__((aligned(64)));
    uint64_t total_pages_allocated __attribute__((aligned(64)));

    // Per-class page accounting (for Soft CAP guidance)
    uint64_t pages_by_class[POOL_NUM_CLASSES] __attribute__((aligned(64)));

    // ACE: per-class bundle factor for refill (1..4) + last snapshot
    int bundle_factor[POOL_NUM_CLASSES];
    uint64_t last_hits[POOL_NUM_CLASSES];
    uint64_t last_misses[POOL_NUM_CLASSES];

    int initialized;       // presumably non-zero once pool init has run; set outside this chunk
    int tls_free_enabled;  // env: HAKMEM_POOL_TLS_FREE (default: 1)

    // Extra metrics (for learner logging): all relaxed atomics
    atomic_uint_fast64_t trylock_attempts __attribute__((aligned(64)));
    atomic_uint_fast64_t trylock_success __attribute__((aligned(64)));
    atomic_uint_fast64_t ring_underflow  __attribute__((aligned(64)));
} g_pool;

// Runtime tuning knobs. The values below are the compiled-in defaults; the
// environment variables named in the comments are read elsewhere (not in this
// chunk), so the exact parsing rules should be checked there.
static int g_wrap_l2_enabled = 0; // env: HAKMEM_WRAP_L2=1 to allow in wrappers
static int g_shard_mix_enabled = 0; // env: HAKMEM_SHARD_MIX=1 to enable stronger hashing
static int g_tls_ring_enabled = 1;  // env: HAKMEM_POOL_TLS_RING (TLS ring; compiled-in default: enabled)
static int g_trylock_probes = 3;     // env: HAKMEM_TRYLOCK_PROBES (1..8)
static int g_ring_return_div = 2;    // env: HAKMEM_RING_RETURN_DIV (2=half, 3=third)
static int g_tls_lo_max = 256;       // env: HAKMEM_TLS_LO_MAX (LIFO size cap)
// Not static: has external linkage, so other translation units can reference it.
int g_hdr_light_enabled = 0;  // env: HAKMEM_HDR_LIGHT=1 (minimize extra fields), =2 (no header writes/validation)
static int g_pool_min_bundle = 2;  // env: HAKMEM_POOL_MIN_BUNDLE (default 2)
// Sampled counter updates to reduce hot-path stores: 1/2^k
static int g_count_sample_exp = 10;      // env: HAKMEM_POOL_COUNT_SAMPLE (0..16)
static __thread uint32_t t_pool_rng = 0x243f6a88u;  // per-thread RNG for sampling

// User-visible block size of every pool class, indexed by class number.
// Indices 0-4 are the power-of-two classes; indices 5 and 6 are the "Bridge"
// classes (40KB, 52KB) that cover the 32-64KB gap. An entry of 0 is treated as
// "class disabled" by the code that consumes this table.
static size_t g_class_sizes[POOL_NUM_CLASSES] = {
    [0] = POOL_CLASS_2KB,
    [1] = POOL_CLASS_4KB,
    [2] = POOL_CLASS_8KB,
    [3] = POOL_CLASS_16KB,
    [4] = POOL_CLASS_32KB,
    [5] = POOL_CLASS_40KB,  // Bridge class 0
    [6] = POOL_CLASS_52KB   // Bridge class 1
};

// Number of whole blocks of each class that fit into one pool page, ignoring
// any per-block header (integer division of POOL_PAGE_SIZE by the class size).
// The counts in the comments assume a 64KiB page. Currently unreferenced,
// hence the `unused` attribute.
__attribute__((unused)) static const int g_blocks_per_page[POOL_NUM_CLASSES] = {
    [0] = POOL_PAGE_SIZE / POOL_CLASS_2KB,   // 32
    [1] = POOL_PAGE_SIZE / POOL_CLASS_4KB,   // 16
    [2] = POOL_PAGE_SIZE / POOL_CLASS_8KB,   // 8
    [3] = POOL_PAGE_SIZE / POOL_CLASS_16KB,  // 4
    [4] = POOL_PAGE_SIZE / POOL_CLASS_32KB,  // 2
    [5] = POOL_PAGE_SIZE / POOL_CLASS_40KB,  // 1 (Bridge)
    [6] = POOL_PAGE_SIZE / POOL_CLASS_52KB   // 1 (Bridge)
};

// ===========================================================================
// Helper Functions
// ===========================================================================

// Fills in the allocation header of a Mid-size block.
//
// Behaviour by g_hdr_light_enabled:
//   >= 1 : nothing is written (headerless hot path)
//   == 0 : all fields are written (magic, method, size, alloc_site,
//          class_bytes, owner_tid)
//   <  0 : only magic, method and size are written
static inline void mid_set_header(AllocHeader* hdr, size_t class_sz, uintptr_t site_id) {
    const int light_mode = g_hdr_light_enabled;
    if (light_mode >= 1) {
        return;  // skip header writes entirely on the alloc hot path
    }

    hdr->magic = HAKMEM_MAGIC;
    hdr->method = ALLOC_METHOD_POOL;
    hdr->size = class_sz;

    if (light_mode != 0) {
        return;
    }

    // Full header: record the call site and the allocating thread.
    hdr->alloc_site = site_id;
    hdr->class_bytes = 0;
    hdr->owner_tid = (uintptr_t)pthread_self();
}

// Lookup table from a size in whole KiB (rounded up, 0..52) to its class index.
// 53 entries so that the Bridge classes (40KB, 52KB) are covered.
static const uint8_t SIZE_TO_CLASS[53] = {
    /* KiB  0.. 2 → class 0 */
    0, 0, 0,
    /* KiB  3.. 4 → class 1 */
    1, 1,
    /* KiB  5.. 8 → class 2 */
    2, 2, 2, 2,
    /* KiB  9..16 → class 3 */
    3, 3, 3, 3, 3, 3, 3, 3,
    /* KiB 17..32 → class 4 */
    4, 4, 4, 4, 4, 4, 4, 4,
    4, 4, 4, 4, 4, 4, 4, 4,
    /* KiB 33..40 → class 5 (Bridge class 0) */
    5, 5, 5, 5, 5, 5, 5, 5,
    /* KiB 41..52 → class 6 (Bridge class 1) */
    6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6, 6
};

// Maps a request size in bytes to its pool class index.
//
// 1. Exact match against the configured class sizes (entries of 0, i.e.
//    disabled classes, are skipped). Callers are expected to pass an already
//    rounded class size, so this is the common case.
// 2. Otherwise the size is rounded up to whole KiB and looked up in
//    SIZE_TO_CLASS (legacy behaviour).
//
// Returns the class index, or -1 when the size is larger than the table
// covers (more than 52 KiB).
//
// The range check is done on the byte size BEFORE rounding: rounding first
// (`size + 1023`) wraps around for sizes near SIZE_MAX, and narrowing the KiB
// count to 32 bits drops high bits, both of which used to map huge requests
// onto a small class instead of rejecting them.
static inline int hak_pool_get_class_index(size_t size) {
    for (int i = 0; i < POOL_NUM_CLASSES; i++) {
        size_t cs = g_class_sizes[i];
        if (cs != 0 && size == cs) return i;
    }

    // Largest byte size whose rounded-up KiB count is still a valid LUT index.
    const size_t lut_entries = sizeof(SIZE_TO_CLASS) / sizeof(SIZE_TO_CLASS[0]);
    const size_t max_lut_bytes = (lut_entries - 1) << 10;
    if (size > max_lut_bytes) {
        return -1;
    }

    // Safe now: size + 1023 cannot overflow and the index is within the table.
    return SIZE_TO_CLASS[(size + 1023) >> 10];
}

// Picks the shard for an allocation site. The result is always masked with
// POOL_NUM_SHARDS - 1 (0-63 per the original comment, i.e. assuming 64 shards).
//
// With g_shard_mix_enabled set, the site id is salted with the calling
// thread's id and run through a SplitMix64-style mixer, so the same site can
// land on different shards in different threads. Otherwise (legacy mode) the
// shard is simply bits 4 and up of the site id.
int hak_pool_get_shard_index(uintptr_t site_id) {
    if (g_shard_mix_enabled) {
        const uint64_t thread_salt = (uint64_t)(uintptr_t)pthread_self();

        uint64_t h = (uint64_t)site_id;
        h ^= (thread_salt << 1);
        h += 0x9e3779b97f4a7c15ULL;
        h ^= h >> 30;
        h *= 0xbf58476d1ce4e5b9ULL;
        h ^= h >> 27;
        h *= 0x94d049bb133111ebULL;
        h ^= h >> 31;

        return (int)((uint32_t)h & (POOL_NUM_SHARDS - 1));
    }

    // Legacy: drop the low 4 bits (instruction alignment) to reduce collisions.
    return (int)((site_id >> 4) & (POOL_NUM_SHARDS - 1));
}

// TLS helpers
#include "box/pool_tls_core.inc.h"


// Refill/ACE (boxed)
#include "box/pool_refill.inc.h"

// Init/Shutdown + MF2 debug (boxed)
#include "box/pool_init_api.inc.h"

// Pool statistics (boxed)
#include "box/pool_stats.inc.h"

// Public API (boxed): alloc/free/lookup/free_fast
#include "box/pool_api.inc.h"