hakmem/core/hakmem_l25_pool.c
Moe Charm (CI) 52386401b3 Debug Counters Implementation - Clean History
Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-05 12:31:14 +09:00


// ============================================================================
// hakmem_l25_pool.c - L2.5 LargePool Implementation (64KB-1MB)
// ============================================================================
//
// Size class definitions:
// ┌──────────┬─────────┬──────────────┬─────────────┐
// │ Class    │ Size    │ Initial CAP  │ Page layout │
// ├──────────┼─────────┼──────────────┼─────────────┤
// │ Class 0  │ 64 KB   │ 8 bundles    │ 1 page/b    │
// │ Class 1  │ 128 KB  │ 8 bundles    │ 2 pages/b   │
// │ Class 2  │ 256 KB  │ 4 bundles    │ 4 pages/b   │
// │ Class 3  │ 512 KB  │ 2 bundles    │ 8 pages/b   │
// │ Class 4  │ 1 MB    │ 1 bundle     │ 16 pages/b  │
// └──────────┴─────────┴──────────────┴─────────────┘
//
// W_MAX_LARGE (round-up tolerance factor):
// - Meaning: how far above the requested size a class may be and still be used
// - Default: 1.30 (allows up to 30% round-up) - **conservative**
// - Recommended: 1.60 (allows up to 60% round-up) - mitigates the size gap
// - Example: 40KB request → 64KB class is allowed (64/40 = 1.6x, within 1.60)
// - Environment variable: HAKMEM_WMAX_LARGE=1.6
//
// Important: closing the 32-64KB gap
// - Requests above 32KB cannot be served by the L2 Mid Pool
// - With W_MAX_LARGE=1.30, a 32KB request is not rounded up to 64KB (2.0x > 1.30)
// - Relaxing to W_MAX_LARGE=1.60 lets requests of 40KB and above use the 64KB class
// - This closes part of the 32-64KB gap
//
// CAP (inventory):
// - Meaning: maximum number of bundles kept per class
// - Initial values: {8,8,4,2,1} - conservative (footprint first)
// - Recommended: {32,32,16,8,4} - 4x larger, performance first
// - Environment variable: HAKMEM_CAP_LARGE=32,32,16,8,4
//
// TLS structure:
// - Ring: POOL_L25_RING_CAP entries (default 16)
// - ActiveRun: bump-run carving out of a contiguous mapping
// - LIFO overflow: blocks that spill out of the ring
// - Remote-free: MPSC queue for cross-thread frees
//
// Performance tuning:
// 1. ⭐⭐⭐ Relax W_MAX_LARGE: HAKMEM_WMAX_LARGE=1.6
// 2. ⭐⭐ Quadruple the initial CAP: HAKMEM_CAP_LARGE=32,32,16,8,4
// 3. Enable BG drain: HAKMEM_L25_BG_DRAIN=1
//
// License: MIT
// Date: 2025-10-24 (Phase 6.x - clean-up campaign)
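//
// Worked example for W_MAX_LARGE (illustrative arithmetic only; the
// W_MAX_LARGE check itself is applied outside this file):
//   40KB request: 64KB / 40KB = 1.60 → within W_MAX_LARGE=1.60, Class 0 allowed
//   36KB request: 64KB / 36KB ≈ 1.78 → exceeds 1.60, not rounded up
//   32KB request: 64KB / 32KB = 2.00 → rejected even at 1.60 (the residual gap)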
#include "hakmem_l25_pool.h"
#include "hakmem_config.h"
#include "hakmem_internal.h" // For AllocHeader and HAKMEM_MAGIC
#include "hakmem_syscall.h" // Phase 6.X P0 Fix: Box 3 syscall layer (bypasses LD_PRELOAD)
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <pthread.h>
#include <stdatomic.h>
#include <sys/mman.h> // mmap / madvise (bump runs, bundle refill, demand-zero)
#include <time.h>     // nanosleep / struct timespec (BG drain thread)
#include "hakmem_prof.h"
#include "hakmem_debug.h"
#include "hakmem_policy.h" // FrozenPolicy caps (Soft CAP guidance)
#include <assert.h>
// False sharing mitigation: padded mutex type (64B)
typedef struct { pthread_mutex_t m; char _pad[64 - (sizeof(pthread_mutex_t) % 64)]; } PaddedMutex;
// ===========================================================================
// Internal Data Structures
// ===========================================================================
// Freelist node header (embedded in allocated bundle, same as L2 Pool pattern)
typedef struct L25Block {
struct L25Block* next; // Next block in freelist
} L25Block;
// Phase 6.17: TLS two-tier cache (ring + local LIFO)
#ifndef POOL_L25_RING_CAP
#define POOL_L25_RING_CAP 16
#endif
typedef struct { L25Block* items[POOL_L25_RING_CAP]; int top; } L25TLSRing;
typedef struct { L25TLSRing ring; L25Block* lo_head; size_t lo_count; } L25TLSBin;
static __thread L25TLSBin g_l25_tls_bin[L25_NUM_CLASSES];
// TLS ActiveRun (bump-run). We mmap a run that holds multiple class-sized blocks
// and hand out addresses by simple pointer arithmetic (no per-block linking).
typedef struct {
char* base; // start of run (raw, header at base)
char* cursor; // next header address to serve
char* end; // end of run (exclusive)
} L25ActiveRun;
static __thread L25ActiveRun g_l25_active[L25_NUM_CLASSES];
// Global L2.5 pool state (simplified: single-threaded for MVP)
static struct {
L25Block* freelist[L25_NUM_CLASSES][L25_NUM_SHARDS];
// Fine-grained locks per (class, shard) freelist (padded)
PaddedMutex freelist_locks[L25_NUM_CLASSES][L25_NUM_SHARDS];
// Phase 6.10.1 pattern: non-empty bitmap (O(1) empty class skip)
// Use atomic bit operations to avoid class-wide locks
atomic_uint_fast64_t nonempty_mask[L25_NUM_CLASSES]; // 1 bit per shard
// Statistics
uint64_t hits[L25_NUM_CLASSES] __attribute__((aligned(64)));
uint64_t misses[L25_NUM_CLASSES] __attribute__((aligned(64)));
uint64_t refills[L25_NUM_CLASSES] __attribute__((aligned(64)));
uint64_t frees[L25_NUM_CLASSES] __attribute__((aligned(64)));
uint64_t total_bytes_allocated __attribute__((aligned(64)));
uint64_t total_bundles_allocated __attribute__((aligned(64)));
// Per-class bundle accounting (for Soft CAP guidance)
uint64_t bundles_by_class[L25_NUM_CLASSES] __attribute__((aligned(64)));
int initialized;
int demand_zero; // env: HAKMEM_L25_DZ=1
// Remote-free MPSC stacks per (class, shard)
atomic_uintptr_t remote_head[L25_NUM_CLASSES][L25_NUM_SHARDS];
atomic_uint remote_count[L25_NUM_CLASSES][L25_NUM_SHARDS];
} g_l25_pool;
static int g_wrap_l25_enabled = 0; // env: HAKMEM_WRAP_L25=1 to allow in wrappers
static int g_l25_tls_ring_enabled = 1; // env: HAKMEM_POOL_TLS_RING
static int g_l25_trylock_probes = 3; // env: HAKMEM_TRYLOCK_PROBES
static int g_l25_tls_lo_max = 256; // env: HAKMEM_TLS_LO_MAX
static int g_l25_ring_return_div = 3; // env: HAKMEM_RING_RETURN_DIV
static int g_l25_ring_trigger = 2; // env: HAKMEM_L25_RING_TRIGGER
static int g_l25_owner_inbound = 0; // env: HAKMEM_L25_OWNER_INBOUND (0/1)
static int g_l25_in_slots = 512; // env: HAKMEM_L25_INBOUND_SLOTS (<= compiled max)
extern int g_hdr_light_enabled; // shared with Mid pool
static int g_l25_run_blocks_override = 0; // env: HAKMEM_L25_RUN_BLOCKS (0=per-class defaults)
static int g_l25_shard_mix = 1; // env: HAKMEM_SHARD_MIX (0/1, default ON)
static int g_l25_pref_remote_first = 1; // env: HAKMEM_L25_PREF=remote|run (default remote)
static int g_l25_tc_spill = 32; // env: HAKMEM_L25_TC_SPILL (spill threshold)
static int g_l25_bg_drain_enabled = 0; // env: HAKMEM_L25_BG_DRAIN=1 to enable BG drain of remote
static int g_l25_bg_interval_ms = 5; // env: HAKMEM_L25_BG_MS
static int g_l25_bg_remote_enable = 0; // env: HAKMEM_L25_BG_REMOTE
static int g_l25_probe_auto = 0; // env: HAKMEM_L25_PROBE_AUTO
static int g_l25_remote_threshold = 32; // env: HAKMEM_L25_REMOTE_THRESHOLD
static int g_l25_bg_remote_batch = 64; // env: HAKMEM_L25_BG_REMOTE_BATCH
static pthread_t g_l25_bg_thread;
// Size class table (for reference)
static const size_t g_class_sizes[L25_NUM_CLASSES] = {
L25_CLASS_64KB,
L25_CLASS_128KB,
L25_CLASS_256KB,
L25_CLASS_512KB,
L25_CLASS_1MB
};
// Phase 6.11.5 P0: Pre-initialized header templates for fast allocation
// Reduces AllocHeader reconstruction from 100-150 cycles to 40-50 cycles
static const AllocHeader g_header_templates[L25_NUM_CLASSES] = {
{HAKMEM_MAGIC, ALLOC_METHOD_L25_POOL, L25_CLASS_64KB, 0, 0, 0}, // 64KB
{HAKMEM_MAGIC, ALLOC_METHOD_L25_POOL, L25_CLASS_128KB, 0, 0, 0}, // 128KB
{HAKMEM_MAGIC, ALLOC_METHOD_L25_POOL, L25_CLASS_256KB, 0, 0, 0}, // 256KB
{HAKMEM_MAGIC, ALLOC_METHOD_L25_POOL, L25_CLASS_512KB, 0, 0, 0}, // 512KB
{HAKMEM_MAGIC, ALLOC_METHOD_L25_POOL, L25_CLASS_1MB, 0, 0, 0} // 1MB
};
// Pages per bundle (for each class)
static const int g_pages_per_bundle[L25_NUM_CLASSES] = {
1, // 64KB = 1 × 64KB page
2, // 128KB = 2 × 64KB pages
4, // 256KB = 4 × 64KB pages
8, // 512KB = 8 × 64KB pages
16 // 1MB = 16 × 64KB pages
};
// Default blocks per bump-run per class (≈2MB per run)
static const int g_blocks_per_run_default[L25_NUM_CLASSES] = { 32, 16, 8, 4, 2 };
static inline size_t l25_stride_bytes(int class_idx) {
return HEADER_SIZE + g_class_sizes[class_idx];
}
static int g_l25_run_factor = 1; // env: HAKMEM_L25_RUN_FACTOR (1..8)
static inline int l25_blocks_per_run(int class_idx) {
if (g_l25_run_blocks_override > 0) return g_l25_run_blocks_override;
int base = g_blocks_per_run_default[class_idx];
long long val = (long long)base * (long long)g_l25_run_factor;
if (val < 1) val = 1;
if (val > 1024) val = 1024;
return (int)val;
}
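// Example (illustrative): Class 0 defaults to 32 blocks per run, i.e. one run
// maps roughly 32 x (HEADER_SIZE + 64KB) ≈ 2MB. HAKMEM_L25_RUN_FACTOR=4 scales
// that to 128 blocks, while HAKMEM_L25_RUN_BLOCKS=16 bypasses the computation
// and forces 16 blocks per run for every class.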
static inline void l25_write_header(AllocHeader* hdr, int class_idx, uintptr_t site_id) {
if (g_hdr_light_enabled >= 2) {
return; // no writes
} else if (g_hdr_light_enabled >= 1) {
hdr->magic = HAKMEM_MAGIC;
hdr->method = ALLOC_METHOD_L25_POOL;
hdr->size = g_class_sizes[class_idx];
hdr->owner_tid = (uintptr_t)(uintptr_t)pthread_self();
} else {
memcpy(hdr, &g_header_templates[class_idx], sizeof(AllocHeader));
hdr->alloc_site = site_id;
hdr->owner_tid = (uintptr_t)(uintptr_t)pthread_self();
}
}
// ===========================================================================
// Helper Functions - inline clean-up pass
// ===========================================================================
// Phase 6.10.1 pattern: branchless LUT (Lookup Table) for O(1) class determination
// SIZE_TO_CLASS[i] = class for (i * 64KB)
// Index 0: invalid, 1: 64KB (class 0), 2: 128KB (class 1), 4: 256KB (class 2), etc.
static const int8_t SIZE_TO_CLASS[] = {
-1, // index 0: 0KB - invalid
0, // index 1: 64KB → Class 0
1, // index 2: 128KB → Class 1
-1, // index 3: 192KB (between 128KB and 256KB)
2, // index 4: 256KB → Class 2
-1, -1, -1, // index 5-7: 320KB-448KB
3, // index 8: 512KB → Class 3
-1, -1, -1, -1, -1, -1, -1, // index 9-15: 576KB-960KB
4 // index 16: 1MB → Class 4
};
// Get size class index from size (0-4, or -1 if out of range)
// Inlined: O(1) branchless lookup, zero function-call overhead
static inline int hak_l25_pool_get_class_index(size_t size) {
// Round to 64KB units
size_t kb64 = (size + L25_PAGE_SIZE - 1) / L25_PAGE_SIZE;
// Direct LUT lookup (O(1), branchless)
if (kb64 == 0 || kb64 > 16) return -1;
return SIZE_TO_CLASS[kb64];
}
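// Examples (illustrative): 64KB → 1 page → Class 0; 100KB → 2 pages → Class 1
// (128KB); 200KB → 4 pages → Class 2 (256KB). Sizes that round to 3, 5-7 or
// 9-15 pages (e.g. 150KB → 3 pages) hit a -1 entry and are rejected here.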
// Get shard index from site_id (0-63)
// Inlined: same pattern as L2 Pool
static inline uint64_t splitmix64(uint64_t x) {
x += 0x9e3779b97f4a7c15ULL;
x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL;
x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL;
return x ^ (x >> 31);
}
int hak_l25_pool_get_shard_index(uintptr_t site_id) {
if (g_l25_shard_mix) {
uint64_t h = splitmix64((uint64_t)site_id);
return (int)(h & (L25_NUM_SHARDS - 1));
}
// Shift by 4 to reduce collisions (call-site addresses are instruction-aligned)
return (int)((site_id >> 4) & (L25_NUM_SHARDS - 1));
}
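// Example (illustrative): with HAKMEM_SHARD_MIX=0, site_id 0x401234 maps to
// shard (0x401234 >> 4) & 63 = 35; with mixing on (the default) the site_id is
// first scrambled by splitmix64, spreading nearby call sites across shards.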
// Phase 6.10.1 pattern: Bitmap helpers (O(1) empty class detection)
// Inlined: zero overhead, suitable for the hot path
static inline void set_nonempty_bit(int class_idx, int shard_idx) {
// Atomic OR with release semantics (ensures freelist write is visible)
atomic_fetch_or_explicit(&g_l25_pool.nonempty_mask[class_idx],
(uint64_t)(1ULL << shard_idx),
memory_order_release);
}
static inline void clear_nonempty_bit(int class_idx, int shard_idx) {
// Atomic AND with release semantics (ensures freelist clear is visible)
atomic_fetch_and_explicit(&g_l25_pool.nonempty_mask[class_idx],
~(uint64_t)(1ULL << shard_idx),
memory_order_release);
}
static inline int is_shard_nonempty(int class_idx, int shard_idx) {
// Atomic load with acquire semantics (ensures freelist read is valid)
uint64_t mask = atomic_load_explicit(&g_l25_pool.nonempty_mask[class_idx],
memory_order_acquire);
return (mask & (1ULL << shard_idx)) != 0;
}
// Choose a non-empty shard near preferred using the nonempty mask. If none, return preferred.
static inline int l25_choose_nonempty_shard(int class_idx, int preferred) {
uint64_t mask = atomic_load_explicit(&g_l25_pool.nonempty_mask[class_idx], memory_order_acquire);
if (!mask) return preferred;
int shift = preferred & 63;
// Guard shift==0: a 64-bit shift by 64 is undefined behaviour
uint64_t rot = (shift == 0) ? mask : ((mask >> shift) | (mask << (64 - shift)));
if (!rot) return preferred;
int off = __builtin_ctzll(rot);
return (preferred + off) & (L25_NUM_SHARDS - 1);
}
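// Worked example (illustrative): preferred=5 and only shard 9 is non-empty
// (mask = 1<<9). Rotating the mask right by 5 leaves bit 4 set, ctz returns 4,
// and (5 + 4) & 63 = 9: the nearest non-empty shard at or after the preferred
// one, wrapping around when necessary.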
// Drain remote-free MPSC stack into freelist under the shard lock
static inline void l25_drain_remote_locked(int class_idx, int shard_idx) {
uintptr_t head = atomic_exchange_explicit(&g_l25_pool.remote_head[class_idx][shard_idx], (uintptr_t)0, memory_order_acq_rel);
int drained = 0;
L25Block* list = (L25Block*)head;
if (list) {
// Tail-link pattern: find tail, then link entire chain at once (MT-safe)
L25Block* tail = list;
int count = 1;
while (tail->next) {
tail = tail->next;
count++;
}
// Single atomic write to freelist (prevents race with concurrent alloc)
tail->next = g_l25_pool.freelist[class_idx][shard_idx];
g_l25_pool.freelist[class_idx][shard_idx] = list;
drained = count;
}
if (drained) {
atomic_fetch_sub_explicit(&g_l25_pool.remote_count[class_idx][shard_idx], drained, memory_order_relaxed);
if (g_l25_pool.freelist[class_idx][shard_idx]) set_nonempty_bit(class_idx, shard_idx);
}
}
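// Example (illustrative): if remote threads pushed blocks A, B, C (C last),
// the exchange grabs the whole chain C->B->A in one shot, the walk finds the
// tail A, and A->next is spliced onto the current freelist head, so all three
// blocks join the freelist under a single lock acquisition.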
// =====================
// Bump-run TLS helpers
// =====================
static inline int l25_refill_tls_from_active(int class_idx, L25TLSRing* ring, int need) {
if (need <= 0) need = POOL_L25_RING_CAP;
L25ActiveRun* ar = &g_l25_active[class_idx];
if (!ar->base) return 0;
size_t stride = l25_stride_bytes(class_idx);
size_t avail = (size_t)((ar->end - ar->cursor) / (ptrdiff_t)stride);
if (avail == 0) { ar->base = ar->cursor = ar->end = NULL; return 0; }
int k = (int)((size_t)need < avail ? (size_t)need : avail);
int pushed = 0;
while (pushed < k && ring->top < POOL_L25_RING_CAP) {
L25Block* b = (L25Block*)ar->cursor;
ring->items[ring->top++] = b;
ar->cursor += stride;
pushed++;
}
if (ar->cursor >= ar->end) { ar->base = ar->cursor = ar->end = NULL; }
return pushed;
}
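// Example (illustrative): for Class 0 the stride is HEADER_SIZE + 64KB. If the
// active run has 10 blocks left and the ring wants 16, only 10 are pushed and
// the exhausted run is dropped (nothing is unmapped here; its pages are
// already tracked by the TLS caches and the page descriptors below).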
// Forward decl for descriptor registration
static void l25_desc_insert_range(void* base, void* end, int class_idx);
static inline int l25_alloc_new_run(int class_idx) {
int blocks = l25_blocks_per_run(class_idx);
size_t stride = l25_stride_bytes(class_idx);
size_t run_bytes = (size_t)blocks * stride;
void* raw = mmap(NULL, run_bytes, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (raw == MAP_FAILED || raw == NULL) return 0;
L25ActiveRun* ar = &g_l25_active[class_idx];
ar->base = (char*)raw;
ar->cursor = (char*)raw;
ar->end = ar->base + run_bytes;
// Register page descriptors for headerless free
l25_desc_insert_range(ar->base, ar->end, class_idx);
// Stats (best-effort)
g_l25_pool.total_bytes_allocated += run_bytes;
g_l25_pool.total_bundles_allocated += blocks;
return 1;
}
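// Example (illustrative): Class 0 with the default 32 blocks per run mmaps
// 32 x (HEADER_SIZE + 65536) bytes in a single call, then registers one
// L25PageDesc per 64KB page so headerless free can recover the class later.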
// =====================
// L2.5 Page Descriptors
// =====================
typedef struct L25PageDesc {
void* page; // 64KB-aligned page base
int class_idx; // L2.5 class index (0..4)
uintptr_t owner_tid; // Hint: owning thread (at run allocation)
struct L25PageDesc* next;
} L25PageDesc;
#define L25_DESC_BUCKETS 4096
static L25PageDesc* g_l25_desc_head[L25_DESC_BUCKETS];
static inline size_t l25_desc_hash(void* page) {
return ((((uintptr_t)page) >> 16) & (L25_DESC_BUCKETS - 1));
}
static inline void* l25_page_base(void* addr) {
return (void*)((uintptr_t)addr & ~((uintptr_t)L25_PAGE_SIZE - 1));
}
static void l25_desc_insert_range(void* base, void* end, int class_idx) {
char* p = (char*)(((uintptr_t)base) & ~((uintptr_t)L25_PAGE_SIZE - 1));
char* e = (char*)end;
uintptr_t owner = (uintptr_t)(uintptr_t)pthread_self();
for (; p < e; p += L25_PAGE_SIZE) {
size_t h = l25_desc_hash(p);
L25PageDesc* d = (L25PageDesc*)hkm_libc_malloc(sizeof(L25PageDesc)); // Phase 6.X P0 Fix
if (!d) continue; // best-effort
d->page = p;
d->class_idx = class_idx;
d->owner_tid = owner;
d->next = g_l25_desc_head[h];
g_l25_desc_head[h] = d;
}
}
static inline L25PageDesc* l25_desc_lookup_ptr(void* ptr) {
void* page = l25_page_base(ptr);
size_t h = l25_desc_hash(page);
for (L25PageDesc* d = g_l25_desc_head[h]; d; d = d->next) {
if (d->page == page) return d;
}
return NULL;
}
static inline void l25_desc_update_owner(void* ptr, uintptr_t owner) {
L25PageDesc* d = l25_desc_lookup_ptr(ptr);
if (d) d->owner_tid = owner;
}
// ------------------------------
// Owner inbound registry (per-owner MPSC stacks)
// ------------------------------
#ifndef L25_INBOUND_SLOTS
#define L25_INBOUND_SLOTS 512
#endif
typedef struct {
atomic_uintptr_t head[L25_NUM_CLASSES];
atomic_uintptr_t tid; // 0 = empty
} L25InboundSlot;
static L25InboundSlot g_l25_inbound[L25_INBOUND_SLOTS];
static inline size_t inb_hash(uintptr_t tid) {
return (size_t)((tid ^ (tid >> 17) ^ (tid << 9)));
}
static int inbound_get_slot(uintptr_t tid) {
if (tid == 0) return -1;
int limit = g_l25_in_slots;
if (limit <= 0 || limit > L25_INBOUND_SLOTS) limit = L25_INBOUND_SLOTS;
size_t h = inb_hash(tid) % (size_t)limit;
for (int i = 0; i < limit; i++) {
int idx = (int)((h + (size_t)i) % (size_t)limit);
uintptr_t cur = atomic_load_explicit(&g_l25_inbound[idx].tid, memory_order_acquire);
if (cur == tid) return idx;
if (cur == 0) {
uintptr_t zero = 0;
if (atomic_compare_exchange_weak_explicit(&g_l25_inbound[idx].tid, &zero, tid, memory_order_acq_rel, memory_order_relaxed)) {
// initialize heads lazily (they start at 0 by BSS)
return idx;
}
}
}
return -1;
}
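// Example (illustrative): a freeing thread locates the owner tid's slot by
// hashing the tid and probing linearly until it either finds the tid or claims
// an empty slot via CAS; if the table is full (-1), callers fall back to the
// transfer-cache / remote-free path instead.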
static inline void inbound_push_block(int slot, int class_idx, L25Block* b) {
if (slot < 0) return;
uintptr_t old_head;
do {
old_head = atomic_load_explicit(&g_l25_inbound[slot].head[class_idx], memory_order_acquire);
b->next = (L25Block*)old_head;
} while (!atomic_compare_exchange_weak_explicit(&g_l25_inbound[slot].head[class_idx], &old_head, (uintptr_t)b, memory_order_release, memory_order_relaxed));
}
static inline int inbound_drain_to_tls(uintptr_t self_tid, int class_idx, L25TLSRing* ring) {
if (!g_l25_owner_inbound) return 0;
int slot = inbound_get_slot(self_tid);
if (slot < 0) return 0;
uintptr_t head = atomic_exchange_explicit(&g_l25_inbound[slot].head[class_idx], (uintptr_t)0, memory_order_acq_rel);
int moved = 0;
L25Block* cur = (L25Block*)head;
while (cur) {
L25Block* nxt = cur->next;
if (g_l25_tls_ring_enabled && ring->top < POOL_L25_RING_CAP) { ring->items[ring->top++] = cur; }
else { cur->next = g_l25_tls_bin[class_idx].lo_head; g_l25_tls_bin[class_idx].lo_head = cur; g_l25_tls_bin[class_idx].lo_count++; }
moved++;
cur = nxt;
}
return moved;
}
// Exposed to hak_free_at(): headerless lookup (returns 1 on success)
int hak_l25_lookup(void* user_ptr, size_t* out_size) {
L25PageDesc* d = l25_desc_lookup_ptr(user_ptr);
if (!d) return 0;
if (out_size) *out_size = g_class_sizes[d->class_idx];
return 1;
}
// ------------------------------
// Transfer Cache (per-thread)
// ------------------------------
// Per-thread Transfer Cache as array ring (no block writes on fast-path)
#ifndef L25_TC_CAP
#define L25_TC_CAP 64
#endif
typedef struct {
L25Block* items[L25_TC_CAP];
int count; // 0..cap
} L25TCRing;
static __thread L25TCRing g_l25_tc[L25_NUM_CLASSES];
static int g_l25_tc_cap = L25_TC_CAP; // env: HAKMEM_L25_TC_CAP
static inline void l25_tc_append(int class_idx, L25Block* b) {
L25TCRing* tc = &g_l25_tc[class_idx];
if (tc->count < g_l25_tc_cap) {
tc->items[tc->count++] = b;
} else {
// overflow handled by caller via l25_tc_flush
}
}
static inline int l25_tc_flush(int class_idx, int shard_idx) {
L25TCRing* tc = &g_l25_tc[class_idx];
int n = tc->count;
if (n <= 0) return 0;
// Build a linked list from ring (LIFO order) only during flush
L25Block* head = NULL;
for (int i = 0; i < n; i++) {
L25Block* b = tc->items[i];
b->next = head;
head = b;
}
tc->count = 0;
// CRITICAL FIX: Find tail ONCE before CAS loop (prevents list corruption)
// Bug: Previous code found tail inside CAS loop, overwriting tail->next on each retry
L25Block* tail = head;
while (tail && tail->next) tail = tail->next;
// Single CAS to remote_head
uintptr_t old_head;
HKM_TIME_START(t_l25_remote_push_tc);
do {
old_head = atomic_load_explicit(&g_l25_pool.remote_head[class_idx][shard_idx], memory_order_acquire);
// Link tail to current remote_head (safe to update on each retry)
if (tail) tail->next = (L25Block*)old_head;
} while (!atomic_compare_exchange_weak_explicit(&g_l25_pool.remote_head[class_idx][shard_idx], &old_head, (uintptr_t)head, memory_order_release, memory_order_relaxed));
atomic_fetch_add_explicit(&g_l25_pool.remote_count[class_idx][shard_idx], n, memory_order_relaxed);
HKM_TIME_END(HKM_CAT_L25_REMOTE_PUSH, t_l25_remote_push_tc);
set_nonempty_bit(class_idx, shard_idx);
return n;
}
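// Example (illustrative): with the default HAKMEM_L25_TC_SPILL=32, the 32nd
// cross-thread free of a class triggers a flush: the 32 cached blocks are
// linked into a LIFO chain and published to the shard's remote_head with a
// single CAS, so the owning shard can drain them in one batch later.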
// Exposed to hak_free_at(): headerless fast free using descriptor
void hak_l25_pool_free_fast(void* user_ptr, uintptr_t site_id) {
L25PageDesc* d = l25_desc_lookup_ptr(user_ptr);
if (!d) return; // unknown → drop
int class_idx = d->class_idx;
void* raw = (char*)user_ptr - HEADER_SIZE;
// Optional: demand-zero for larger classes
if (g_l25_pool.demand_zero && class_idx >= 3) {
madvise((char*)raw, HEADER_SIZE + g_class_sizes[class_idx], MADV_DONTNEED);
}
// Same-thread hint: prefer per-block owner if header present (HDR_LIGHT>=1), else page owner
uintptr_t self = (uintptr_t)(uintptr_t)pthread_self();
uintptr_t owner_hint = d->owner_tid;
if (g_hdr_light_enabled >= 1) {
AllocHeader* hdr = (AllocHeader*)raw;
owner_hint = hdr->owner_tid;
}
if (owner_hint == self) {
L25TLSRing* ring = &g_l25_tls_bin[class_idx].ring;
if (g_l25_tls_ring_enabled && ring->top < POOL_L25_RING_CAP) {
ring->items[ring->top++] = (L25Block*)raw;
} else {
L25Block* b = (L25Block*)raw;
b->next = g_l25_tls_bin[class_idx].lo_head;
g_l25_tls_bin[class_idx].lo_head = b;
g_l25_tls_bin[class_idx].lo_count++;
}
} else {
// Remote: push to per-thread TC; spill in batch when threshold reached
int shard = hak_l25_pool_get_shard_index(site_id);
L25Block* block = (L25Block*)raw;
l25_tc_append(class_idx, block);
if (g_l25_tc_spill > 0 && g_l25_tc[class_idx].count >= g_l25_tc_spill) {
l25_tc_flush(class_idx, shard);
}
}
g_l25_pool.frees[class_idx]++;
}
// =====================
// BG Drain (remote → freelist)
// =====================
static void* l25_bg_main(void* arg) {
(void)arg;
struct timespec ts; ts.tv_sec = g_l25_bg_interval_ms / 1000; ts.tv_nsec = (long)(g_l25_bg_interval_ms % 1000) * 1000000L;
while (g_l25_pool.initialized) {
for (int c = 0; c < L25_NUM_CLASSES; c++) {
for (int s = 0; s < L25_NUM_SHARDS; s++) {
if (atomic_load_explicit(&g_l25_pool.remote_count[c][s], memory_order_relaxed) != 0) {
pthread_mutex_t* l = &g_l25_pool.freelist_locks[c][s].m;
pthread_mutex_lock(l);
if (atomic_load_explicit(&g_l25_pool.remote_count[c][s], memory_order_relaxed) != 0) {
l25_drain_remote_locked(c, s);
}
pthread_mutex_unlock(l);
}
}
}
nanosleep(&ts, NULL);
}
return NULL;
}
// ===========================================================================
// Page Bundle Management (L2 Pool pattern)
// ===========================================================================
// Refill freelist by allocating a new page bundle
// Args: class_idx - size class index (0-4)
// shard_idx - shard index (0-63)
// Returns: 1 on success, 0 on failure
//
// Pattern: Same as L2 Pool - allocate raw memory, write header, return to freelist
static int refill_freelist(int class_idx, int shard_idx) {
if (class_idx < 0 || class_idx >= L25_NUM_CLASSES) return 0;
if (shard_idx < 0 || shard_idx >= L25_NUM_SHARDS) return 0;
size_t user_size = g_class_sizes[class_idx];
size_t bundle_size = HEADER_SIZE + user_size; // Header + user data
// Soft CAP guidance: decide how many bundles to allocate (1..2)
int bundles = 1;
const FrozenPolicy* pol = hkm_policy_get();
static int g_l25_min_bundle = -1; // lazy init from env
if (g_l25_min_bundle < 0) {
const char* e = getenv("HAKMEM_L25_MIN_BUNDLE");
int v = (e ? atoi(e) : 1);
if (v < 1) v = 1; if (v > 2) v = 2; // L2.5 is large; keep conservative
g_l25_min_bundle = v;
}
if (pol) {
uint16_t cap = pol->large_cap[class_idx];
if (cap > 0) {
uint64_t have = g_l25_pool.bundles_by_class[class_idx];
if (have >= cap) {
bundles = 1; // over cap: allocate minimally
} else {
uint64_t deficit = cap - have;
bundles = (deficit >= (uint64_t)g_l25_min_bundle) ? g_l25_min_bundle : 1;
if (bundles > 2) bundles = 2;
}
}
}
int ok_any = 0;
for (int b = 0; b < bundles; b++) {
// Allocate bundle via mmap to avoid malloc contention and allow THP policy later
void* raw = mmap(NULL, bundle_size, PROT_READ | PROT_WRITE,
MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (raw == MAP_FAILED || raw == NULL) {
if (ok_any) break; else return 0;
}
// Write AllocHeader at start
AllocHeader* hdr = (AllocHeader*)raw;
hdr->magic = HAKMEM_MAGIC;
hdr->method = ALLOC_METHOD_L25_POOL;
hdr->size = user_size;
hdr->alloc_site = 0; // Set by hak_l25_pool_try_alloc
hdr->class_bytes = 0; // L2.5 blocks not cacheable
// Freelist uses raw pointer (header start)
L25Block* block = (L25Block*)raw;
block->next = g_l25_pool.freelist[class_idx][shard_idx];
g_l25_pool.freelist[class_idx][shard_idx] = block;
ok_any = 1;
// Set non-empty bit (freelist now has blocks)
set_nonempty_bit(class_idx, shard_idx);
// Update statistics
g_l25_pool.refills[class_idx]++;
g_l25_pool.total_bundles_allocated++;
g_l25_pool.total_bytes_allocated += bundle_size;
g_l25_pool.bundles_by_class[class_idx]++;
}
return ok_any ? 1 : 0;
}
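// Example (illustrative): if the frozen policy caps Class 0 at 8 bundles, the
// pool currently holds 3, and HAKMEM_L25_MIN_BUNDLE=2, the deficit (5) lets
// the refill mmap 2 bundles in one call; at or above the cap it falls back to
// a single bundle per refill.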
// ===========================================================================
// Public API
// ===========================================================================
void hak_l25_pool_init(void) {
if (g_l25_pool.initialized) return;
memset(&g_l25_pool, 0, sizeof(g_l25_pool));
for (int c = 0; c < L25_NUM_CLASSES; c++) {
atomic_store(&g_l25_pool.nonempty_mask[c], 0);
for (int s = 0; s < L25_NUM_SHARDS; s++) {
pthread_mutex_init(&g_l25_pool.freelist_locks[c][s].m, NULL);
atomic_store(&g_l25_pool.remote_head[c][s], (uintptr_t)0);
atomic_store(&g_l25_pool.remote_count[c][s], 0);
}
g_l25_pool.bundles_by_class[c] = 0;
}
// Demand-zero toggle
char* dz = getenv("HAKMEM_L25_DZ");
g_l25_pool.demand_zero = (dz && atoi(dz) != 0) ? 1 : 0;
const char* e_wrap = getenv("HAKMEM_WRAP_L25");
g_wrap_l25_enabled = (e_wrap && atoi(e_wrap) != 0) ? 1 : 0;
const char* e_ring = getenv("HAKMEM_POOL_TLS_RING");
if (e_ring) g_l25_tls_ring_enabled = (atoi(e_ring) != 0);
const char* e_probe = getenv("HAKMEM_TRYLOCK_PROBES");
if (e_probe) { int v = atoi(e_probe); if (v>=1 && v<=8) g_l25_trylock_probes = v; }
const char* e_lo = getenv("HAKMEM_TLS_LO_MAX");
if (e_lo) { int v = atoi(e_lo); if (v>=32 && v<=16384) g_l25_tls_lo_max = v; }
const char* e_div = getenv("HAKMEM_RING_RETURN_DIV");
if (e_div) { int v = atoi(e_div); if (v>=2 && v<=4) g_l25_ring_return_div = v; }
const char* e_run = getenv("HAKMEM_L25_RUN_BLOCKS");
if (e_run) { int v = atoi(e_run); if (v>=1 && v<=1024) g_l25_run_blocks_override = v; }
const char* e_rfac = getenv("HAKMEM_L25_RUN_FACTOR");
if (e_rfac) { int v = atoi(e_rfac); if (v>=1 && v<=8) g_l25_run_factor = v; }
const char* e_mix = getenv("HAKMEM_SHARD_MIX");
if (e_mix) g_l25_shard_mix = (atoi(e_mix) != 0);
const char* e_pref = getenv("HAKMEM_L25_PREF");
if (e_pref) {
if (strcmp(e_pref, "remote") == 0) g_l25_pref_remote_first = 1;
else if (strcmp(e_pref, "run") == 0) g_l25_pref_remote_first = 0;
}
const char* e_tc = getenv("HAKMEM_L25_TC_SPILL");
if (e_tc) { int v = atoi(e_tc); if (v>=0 && v<=4096) g_l25_tc_spill = v; }
const char* e_tcc = getenv("HAKMEM_L25_TC_CAP");
if (e_tcc) { int v = atoi(e_tcc); if (v>=8 && v<=L25_TC_CAP) g_l25_tc_cap = v; }
const char* e_bg = getenv("HAKMEM_L25_BG_DRAIN");
if (e_bg) g_l25_bg_drain_enabled = (atoi(e_bg) != 0);
const char* e_bgms = getenv("HAKMEM_L25_BG_MS");
if (e_bgms) { int v = atoi(e_bgms); if (v>=1 && v<=1000) g_l25_bg_interval_ms = v; }
const char* e_rtr = getenv("HAKMEM_L25_RING_TRIGGER");
if (e_rtr) { int v = atoi(e_rtr); if (v>=0 && v<=POOL_L25_RING_CAP) g_l25_ring_trigger = v; }
const char* e_inb = getenv("HAKMEM_L25_OWNER_INBOUND");
if (e_inb) g_l25_owner_inbound = (atoi(e_inb) != 0);
const char* e_ins = getenv("HAKMEM_L25_INBOUND_SLOTS");
if (e_ins) { int v = atoi(e_ins); if (v>=64 && v<=L25_INBOUND_SLOTS) g_l25_in_slots = v; }
// Safe-mode: disable aggressive remote/inbound features by default.
// Set HAKMEM_L25_REMOTE_SAFE=0 to re-enable legacy behaviour.
const char* e_safe = getenv("HAKMEM_L25_REMOTE_SAFE");
int safe_mode = 1;
if (e_safe && atoi(e_safe) == 0) safe_mode = 0;
if (safe_mode) {
g_l25_owner_inbound = 0;
g_l25_pref_remote_first = 0;
g_l25_trylock_probes = 0;
g_l25_bg_drain_enabled = 0;
g_l25_bg_remote_enable = 0;
g_l25_probe_auto = 0;
}
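// Note: safe mode (the default) force-disables owner-inbound, remote-first
// preference, trylock probes and the BG drain thread even if their individual
// env vars were set above. For example, actually running the drain thread
// requires both HAKMEM_L25_BG_DRAIN=1 and HAKMEM_L25_REMOTE_SAFE=0.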
// init inbound table tid=0
for (int i = 0; i < g_l25_in_slots && i < L25_INBOUND_SLOTS; i++) {
atomic_store(&g_l25_inbound[i].tid, (uintptr_t)0);
for (int c = 0; c < L25_NUM_CLASSES; c++) atomic_store(&g_l25_inbound[i].head[c], (uintptr_t)0);
}
g_l25_pool.initialized = 1;
HAKMEM_LOG("[L2.5] Initialized (LargePool)\n");
HAKMEM_LOG("[L2.5] Classes: 64KB, 128KB, 256KB, 512KB, 1MB\n");
HAKMEM_LOG("[L2.5] Page size: %d KB\n", L25_PAGE_SIZE / 1024);
HAKMEM_LOG("[L2.5] Shards: %d (site-based)\n", L25_NUM_SHARDS);
if (g_l25_bg_drain_enabled) {
pthread_create(&g_l25_bg_thread, NULL, l25_bg_main, NULL);
HAKMEM_LOG("[L2.5] BG drain enabled (interval=%d ms)\n", g_l25_bg_interval_ms);
}
}
void hak_l25_pool_shutdown(void) {
if (!g_l25_pool.initialized) return;
hak_l25_pool_print_stats();
// Free all blocks (L2 Pool pattern: just free raw pointers)
for (int class_idx = 0; class_idx < L25_NUM_CLASSES; class_idx++) {
for (int shard_idx = 0; shard_idx < L25_NUM_SHARDS; shard_idx++) {
L25Block* block = g_l25_pool.freelist[class_idx][shard_idx];
while (block) {
L25Block* next = block->next;
// Blocks were obtained via mmap (refill_freelist / bump runs), not malloc,
// so release them with munmap (best-effort: run-carved blocks may be rejected
// if not page-aligned, in which case they are simply leaked at shutdown).
munmap(block, HEADER_SIZE + g_class_sizes[class_idx]);
block = next;
}
}
}
g_l25_pool.initialized = 0;
}
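// Allocation path overview (descriptive summary of hak_l25_pool_try_alloc):
//   0. Optional owner-inbound drain into TLS when the ring runs low
//   1. TLS fast path: ring pop, then local LIFO pop (no locks)
//   2. HAKMEM_L25_PREF order: either trylock-probe shards to drain remote frees
//      into the TLS ring first ("remote"), or go straight to the bump-run
//      ActiveRun ("run"), mmapping a new run when the current one is exhausted
//   3. Global freelist slow path: trylock batch-steal, then lock the home shard,
//      drain remote frees, steal from neighbour shards or refill via mmap, and
//      batch-pop the shard's freelist into the TLS ring/LIFO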
void* hak_l25_pool_try_alloc(size_t size, uintptr_t site_id) {
if (!g_l25_pool.initialized) hak_l25_pool_init();
// P1.7 approach: Avoid using L2.5 during ALL wrapper calls (conservative but safe)
extern int hak_in_wrapper(void);
if (hak_in_wrapper() && !g_wrap_l25_enabled) return NULL;
if (!hak_l25_pool_is_poolable(size)) return NULL;
// Get class index (inlined O(1) lookup)
int class_idx = hak_l25_pool_get_class_index(size);
if (class_idx < 0) return NULL;
// Inbound drain (owner inbound → TLS) when ring low
if (g_l25_owner_inbound && g_l25_tls_ring_enabled && (&g_l25_tls_bin[class_idx].ring)->top <= g_l25_ring_trigger) {
inbound_drain_to_tls((uintptr_t)(uintptr_t)pthread_self(), class_idx, &g_l25_tls_bin[class_idx].ring);
}
// TLS two-tier fast path
L25TLSRing* ring = &g_l25_tls_bin[class_idx].ring;
if (g_l25_tls_ring_enabled && ring->top > 0) {
HKM_TIME_START(t_l25_ring_pop0);
L25Block* tlsb = ring->items[--ring->top];
HKM_TIME_END(HKM_CAT_L25_TLS_RING_POP, t_l25_ring_pop0);
void* raw = (void*)tlsb;
AllocHeader* hdr = (AllocHeader*)raw;
l25_write_header(hdr, class_idx, site_id);
g_l25_pool.hits[class_idx]++;
return (char*)raw + HEADER_SIZE;
}
L25Block* block = g_l25_tls_bin[class_idx].lo_head;
if (block) {
g_l25_tls_bin[class_idx].lo_head = block->next;
if (g_l25_tls_bin[class_idx].lo_count) g_l25_tls_bin[class_idx].lo_count--;
void* raw = (void*)block; AllocHeader* hdr = (AllocHeader*)raw;
l25_write_header(hdr, class_idx, site_id);
g_l25_pool.hits[class_idx]++;
return (char*)raw + HEADER_SIZE;
}
if (!block) {
// TLS cache empty: choose order by preference (remote-first or run-first)
if (g_l25_pref_remote_first) {
// Remote-first: only if ring below trigger and remote likely non-empty
int shard_idx = hak_l25_pool_get_shard_index(site_id);
if (g_l25_tls_ring_enabled && ring->top <= g_l25_ring_trigger) {
// prefetch remote head
__builtin_prefetch((const void*)&g_l25_pool.remote_head[class_idx][shard_idx], 0, 1);
int s0 = l25_choose_nonempty_shard(class_idx, shard_idx);
for (int probe = 0; probe < g_l25_trylock_probes; ++probe) {
int s = (s0 + probe) & (L25_NUM_SHARDS - 1);
pthread_mutex_t* l = &g_l25_pool.freelist_locks[class_idx][s].m;
if (pthread_mutex_trylock(l) == 0) {
if (atomic_load_explicit(&g_l25_pool.remote_count[class_idx][s], memory_order_relaxed) != 0) {
l25_drain_remote_locked(class_idx, s);
}
L25Block* head = g_l25_pool.freelist[class_idx][s];
int to_ring = POOL_L25_RING_CAP - ring->top; if (to_ring < 0) to_ring = 0;
while (head && to_ring-- > 0) { L25Block* nxt = head->next; ring->items[ring->top++] = head; head = nxt; }
while (head) { L25Block* nxt = head->next; head->next = g_l25_tls_bin[class_idx].lo_head; g_l25_tls_bin[class_idx].lo_head = head; g_l25_tls_bin[class_idx].lo_count++; head = nxt; }
g_l25_pool.freelist[class_idx][s] = head;
if (!head) clear_nonempty_bit(class_idx, s);
pthread_mutex_unlock(l);
if (ring->top > 0) {
L25Block* tlsb = ring->items[--ring->top];
void* rawA = (void*)tlsb; AllocHeader* hdrA = (AllocHeader*)rawA;
l25_write_header(hdrA, class_idx, site_id);
g_l25_pool.hits[class_idx]++;
return (char*)rawA + HEADER_SIZE;
}
}
}
}
// Fall back to bump-run ActiveRun
if (g_l25_tls_ring_enabled) {
HKM_TIME_START(t_l25_alloc_page0);
int need = POOL_L25_RING_CAP - ring->top; if (need < 1) need = POOL_L25_RING_CAP;
int pushed = l25_refill_tls_from_active(class_idx, ring, need);
if (pushed == 0) {
if (l25_alloc_new_run(class_idx)) {
pushed = l25_refill_tls_from_active(class_idx, ring, need);
}
}
HKM_TIME_END(HKM_CAT_L25_ALLOC_TLS_PAGE, t_l25_alloc_page0);
if (g_l25_tls_ring_enabled && ring->top > 0) {
L25Block* tlsb = ring->items[--ring->top];
void* raw0 = (void*)tlsb; AllocHeader* hdr0 = (AllocHeader*)raw0;
l25_write_header(hdr0, class_idx, site_id);
g_l25_pool.hits[class_idx]++;
return (char*)raw0 + HEADER_SIZE;
}
}
} else {
// Run-first (previous behavior)
if (g_l25_tls_ring_enabled) {
HKM_TIME_START(t_l25_alloc_page0);
int need = POOL_L25_RING_CAP - ring->top; if (need < 1) need = POOL_L25_RING_CAP;
int pushed = l25_refill_tls_from_active(class_idx, ring, need);
if (pushed == 0) {
if (l25_alloc_new_run(class_idx)) {
pushed = l25_refill_tls_from_active(class_idx, ring, need);
}
}
HKM_TIME_END(HKM_CAT_L25_ALLOC_TLS_PAGE, t_l25_alloc_page0);
if (g_l25_tls_ring_enabled && ring->top > 0) {
L25Block* tlsb = ring->items[--ring->top];
void* raw0 = (void*)tlsb; AllocHeader* hdr0 = (AllocHeader*)raw0;
l25_write_header(hdr0, class_idx, site_id);
g_l25_pool.hits[class_idx]++;
return (char*)raw0 + HEADER_SIZE;
}
}
}
// TLS cache still empty, refill from global freelist (slow path)
int shard_idx = hak_l25_pool_get_shard_index(site_id);
// Try batch-steal via trylock to fill TLS ring; drain remote under lock
if (g_l25_tls_ring_enabled) {
int s0 = l25_choose_nonempty_shard(class_idx, shard_idx);
for (int probe = 0; probe < g_l25_trylock_probes; ++probe) {
int s = (s0 + probe) & (L25_NUM_SHARDS - 1);
pthread_mutex_t* l = &g_l25_pool.freelist_locks[class_idx][s].m;
if (pthread_mutex_trylock(l) == 0) {
if (atomic_load_explicit(&g_l25_pool.remote_count[class_idx][s], memory_order_relaxed) != 0) {
l25_drain_remote_locked(class_idx, s);
}
L25Block* head = g_l25_pool.freelist[class_idx][s];
int to_ring = POOL_L25_RING_CAP - ring->top; if (to_ring < 0) to_ring = 0;
while (head && to_ring-- > 0) { L25Block* nxt = head->next; ring->items[ring->top++] = head; head = nxt; }
while (head) { L25Block* nxt = head->next; head->next = g_l25_tls_bin[class_idx].lo_head; g_l25_tls_bin[class_idx].lo_head = head; g_l25_tls_bin[class_idx].lo_count++; head = nxt; }
g_l25_pool.freelist[class_idx][s] = head;
if (!head) clear_nonempty_bit(class_idx, s);
pthread_mutex_unlock(l);
if (ring->top > 0) {
L25Block* tlsb = ring->items[--ring->top];
void* raw = (void*)tlsb; AllocHeader* hdr = (AllocHeader*)raw;
memcpy(hdr, &g_header_templates[class_idx], sizeof(AllocHeader));
if (!g_hdr_light_enabled) { hdr->alloc_site = site_id; hdr->owner_tid = (uintptr_t)(uintptr_t)pthread_self(); }
g_l25_pool.hits[class_idx]++;
return (char*)raw + HEADER_SIZE;
}
}
}
}
// Try to pop from global freelist (lock shard)
pthread_mutex_t* lock = &g_l25_pool.freelist_locks[class_idx][shard_idx].m;
struct timespec ts_lk1; int lk1 = hkm_prof_begin(&ts_lk1);
HKM_TIME_START(t_l25_lock);
pthread_mutex_lock(lock);
HKM_TIME_END(HKM_CAT_L25_LOCK, t_l25_lock);
hkm_prof_end(lk1, HKP_L25_LOCK, &ts_lk1);
if (atomic_load_explicit(&g_l25_pool.remote_count[class_idx][shard_idx], memory_order_relaxed) != 0) {
l25_drain_remote_locked(class_idx, shard_idx);
}
block = g_l25_pool.freelist[class_idx][shard_idx];
if (!block) {
// Try simple shard steal if over Soft CAP (avoid over-refill)
int stole = 0;
const FrozenPolicy* pol = hkm_policy_get();
if (pol) {
uint16_t cap = pol->large_cap[class_idx];
if (cap > 0 && g_l25_pool.bundles_by_class[class_idx] >= cap) {
// probe ±1..2 neighboring shards
for (int d = 1; d <= 2 && !stole; d++) {
int s1 = (shard_idx + d) & (L25_NUM_SHARDS - 1);
int s2 = (shard_idx - d) & (L25_NUM_SHARDS - 1);
if (is_shard_nonempty(class_idx, s1)) {
pthread_mutex_t* l2 = &g_l25_pool.freelist_locks[class_idx][s1].m;
pthread_mutex_lock(l2);
L25Block* b2 = g_l25_pool.freelist[class_idx][s1];
if (b2) {
g_l25_pool.freelist[class_idx][s1] = b2->next;
if (!g_l25_pool.freelist[class_idx][s1]) clear_nonempty_bit(class_idx, s1);
block = b2;
stole = 1;
}
pthread_mutex_unlock(l2);
}
if (!stole && is_shard_nonempty(class_idx, s2)) {
pthread_mutex_t* l3 = &g_l25_pool.freelist_locks[class_idx][s2].m;
pthread_mutex_lock(l3);
L25Block* b3 = g_l25_pool.freelist[class_idx][s2];
if (b3) {
g_l25_pool.freelist[class_idx][s2] = b3->next;
if (!g_l25_pool.freelist[class_idx][s2]) clear_nonempty_bit(class_idx, s2);
block = b3;
stole = 1;
}
pthread_mutex_unlock(l3);
}
}
}
}
if (!stole && !block) {
// Global freelist empty or no steal, allocate new bundle
{
struct timespec ts_rf; int rf = hkm_prof_begin(&ts_rf);
HKM_TIME_START(t_l25_refill);
int ok = refill_freelist(class_idx, shard_idx);
HKM_TIME_END(HKM_CAT_L25_REFILL, t_l25_refill);
hkm_prof_end(rf, HKP_L25_REFILL, &ts_rf);
if (!ok) {
g_l25_pool.misses[class_idx]++;
pthread_mutex_unlock(lock);
return NULL; // Out of memory
}
}
// Try again after refill
block = g_l25_pool.freelist[class_idx][shard_idx];
if (!block) {
g_l25_pool.misses[class_idx]++;
pthread_mutex_unlock(lock);
return NULL; // Refill failed
}
}
}
// Batch-pop under lock: move many blocks to TLS (ring first, then LIFO)
L25Block* head2 = g_l25_pool.freelist[class_idx][shard_idx];
if (head2) {
int to_ring2 = POOL_L25_RING_CAP - ring->top; if (to_ring2 < 0) to_ring2 = 0;
L25Block* h = head2;
// Fill ring
while (h && to_ring2-- > 0) {
L25Block* nxt = h->next;
// update owner for same-thread hint
l25_desc_update_owner((void*)h, (uintptr_t)(uintptr_t)pthread_self());
ring->items[ring->top++] = h;
h = nxt;
}
// Fill local LIFO
while (h) { L25Block* nxt = h->next; h->next = g_l25_tls_bin[class_idx].lo_head; g_l25_tls_bin[class_idx].lo_head = h; g_l25_tls_bin[class_idx].lo_count++; h = nxt; }
// Shard freelist becomes empty after batch-pop
g_l25_pool.freelist[class_idx][shard_idx] = NULL;
clear_nonempty_bit(class_idx, shard_idx);
}
pthread_mutex_unlock(lock);
// Fast return if ring gained items
if (g_l25_tls_ring_enabled && ring->top > 0) {
L25Block* tlsb = ring->items[--ring->top];
void* raw2 = (void*)tlsb; AllocHeader* hdr2 = (AllocHeader*)raw2;
memcpy(hdr2, &g_header_templates[class_idx], sizeof(AllocHeader));
if (!g_hdr_light_enabled) { hdr2->alloc_site = site_id; hdr2->owner_tid = (uintptr_t)(uintptr_t)pthread_self(); }
g_l25_pool.hits[class_idx]++;
return (char*)raw2 + HEADER_SIZE;
}
// Or pop from local LIFO if available
if (g_l25_tls_bin[class_idx].lo_head) {
L25Block* b2 = g_l25_tls_bin[class_idx].lo_head; g_l25_tls_bin[class_idx].lo_head = b2->next; if (g_l25_tls_bin[class_idx].lo_count) g_l25_tls_bin[class_idx].lo_count--;
void* raw3 = (void*)b2; AllocHeader* hdr3 = (AllocHeader*)raw3;
l25_write_header(hdr3, class_idx, site_id);
g_l25_pool.hits[class_idx]++;
return (char*)raw3 + HEADER_SIZE;
}
}
// Push to TLS and return one
if (g_l25_tls_ring_enabled && ring->top < POOL_L25_RING_CAP) { ring->items[ring->top++] = block; }
else { block->next = g_l25_tls_bin[class_idx].lo_head; g_l25_tls_bin[class_idx].lo_head = block; g_l25_tls_bin[class_idx].lo_count++; }
L25Block* take;
if (g_l25_tls_ring_enabled && ring->top > 0) { HKM_TIME_START(t_l25_ring_pop1); take = ring->items[--ring->top]; HKM_TIME_END(HKM_CAT_L25_TLS_RING_POP, t_l25_ring_pop1); }
else { HKM_TIME_START(t_l25_lifo_pop0); take = g_l25_tls_bin[class_idx].lo_head; if (take) { g_l25_tls_bin[class_idx].lo_head = take->next; if (g_l25_tls_bin[class_idx].lo_count) g_l25_tls_bin[class_idx].lo_count--; } HKM_TIME_END(HKM_CAT_L25_TLS_LIFO_POP, t_l25_lifo_pop0); }
void* raw = (void*)take; AllocHeader* hdr = (AllocHeader*)raw; l25_write_header(hdr, class_idx, site_id);
g_l25_pool.hits[class_idx]++;
return (char*)raw + HEADER_SIZE;
}
void hak_l25_pool_free(void* ptr, size_t size, uintptr_t site_id) {
if (!ptr) return;
if (!g_l25_pool.initialized) return;
if (!hak_l25_pool_is_poolable(size)) return;
// ptr is user pointer, get raw pointer (header start) - L2 Pool pattern
void* raw = (char*)ptr - HEADER_SIZE;
// Validate header
AllocHeader* hdr = (AllocHeader*)raw;
if (hdr->magic != HAKMEM_MAGIC) {
extern int g_invalid_free_log; // from hakmem.c
if (g_invalid_free_log) {
fprintf(stderr, "[L2.5] ERROR: Invalid magic 0x%X in l25_pool_free, expected 0x%X\n",
hdr->magic, HAKMEM_MAGIC);
}
return; // Skip free (corruption detected)
}
if (hdr->method != ALLOC_METHOD_L25_POOL) {
extern int g_invalid_free_log; // from hakmem.c
if (g_invalid_free_log) {
fprintf(stderr, "[L2.5] ERROR: Wrong method %d in l25_pool_free, expected L25_POOL\n",
hdr->method);
}
return; // Skip free (not an L2.5 allocation)
}
// Get class index
int class_idx = hak_l25_pool_get_class_index(size);
if (class_idx < 0) return;
// Optional: demand-zero large classes (512KB/1MB) to reduce future soft-fault cost
if (g_l25_pool.demand_zero && class_idx >= 3) {
madvise((char*)raw, HEADER_SIZE + size, MADV_DONTNEED);
}
// Same-thread hint via header owner (if light header present)
uintptr_t self = (uintptr_t)(uintptr_t)pthread_self();
if (g_hdr_light_enabled >= 1 && hdr->owner_tid == self) {
L25TLSRing* ring = &g_l25_tls_bin[class_idx].ring;
if (g_l25_tls_ring_enabled && ring->top < POOL_L25_RING_CAP) { ring->items[ring->top++] = (L25Block*)raw; }
else { L25Block* b = (L25Block*)raw; b->next = g_l25_tls_bin[class_idx].lo_head; g_l25_tls_bin[class_idx].lo_head = b; g_l25_tls_bin[class_idx].lo_count++; }
} else {
// Cross-thread path: owner inbound or TC
uintptr_t owner = 0;
if (g_hdr_light_enabled >= 1) owner = hdr->owner_tid;
if (g_l25_owner_inbound && owner != 0) {
int slot = inbound_get_slot(owner);
if (slot >= 0) {
inbound_push_block(slot, class_idx, (L25Block*)raw);
} else {
int shard = hak_l25_pool_get_shard_index(site_id);
L25Block* block = (L25Block*)raw;
l25_tc_append(class_idx, block);
if (g_l25_tc_spill > 0 && g_l25_tc[class_idx].count >= g_l25_tc_spill) {
l25_tc_flush(class_idx, shard);
}
}
} else {
int shard = hak_l25_pool_get_shard_index(site_id);
L25Block* block = (L25Block*)raw;
l25_tc_append(class_idx, block);
if (g_l25_tc_spill > 0 && g_l25_tc[class_idx].count >= g_l25_tc_spill) {
l25_tc_flush(class_idx, shard);
}
}
}
g_l25_pool.frees[class_idx]++;
}
// ---------------------------------------------------------------------------
// Runtime tuning setters (exposed via HAKX tuner)
// ---------------------------------------------------------------------------
void hak_l25_set_run_factor(int v) {
if (v >= 1 && v <= 8) g_l25_run_factor = v;
}
void hak_l25_set_remote_threshold(int v) {
if (v >= 1 && v <= 4096) g_l25_remote_threshold = v;
}
void hak_l25_set_bg_remote_batch(int v) {
if (v >= 1 && v <= 4096) g_l25_bg_remote_batch = v;
}
void hak_l25_set_bg_remote_enable(int on) {
g_l25_bg_remote_enable = (on != 0);
}
void hak_l25_set_pref_remote_first(int remote_first) {
g_l25_pref_remote_first = (remote_first != 0);
}
void hak_l25_pool_print_stats(void) {
if (!g_l25_pool.initialized) return;
printf("\n");
printf("========================================\n");
printf("L2.5 Pool Statistics (LargePool)\n");
printf("========================================\n");
const char* class_names[L25_NUM_CLASSES] = {
"64KB", "128KB", "256KB", "512KB", "1MB"
};
for (int i = 0; i < L25_NUM_CLASSES; i++) {
uint64_t total = g_l25_pool.hits[i] + g_l25_pool.misses[i];
double hit_rate = (total > 0) ? (100.0 * g_l25_pool.hits[i] / total) : 0.0;
printf("Class %-6s: hits=%7lu misses=%7lu refills=%7lu frees=%7lu (%.1f%% hit)\n",
class_names[i],
(unsigned long)g_l25_pool.hits[i],
(unsigned long)g_l25_pool.misses[i],
(unsigned long)g_l25_pool.refills[i],
(unsigned long)g_l25_pool.frees[i],
hit_rate);
}
printf("----------------------------------------\n");
printf("Total bytes allocated: %lu MB\n",
(unsigned long)(g_l25_pool.total_bytes_allocated / (1024 * 1024)));
printf("Total bundles allocated: %lu\n",
(unsigned long)g_l25_pool.total_bundles_allocated);
printf("========================================\n");
}
void hak_l25_pool_stats_snapshot(uint64_t hits[], uint64_t misses[], uint64_t refills[], uint64_t frees[]) {
if (!g_l25_pool.initialized) {
for (int i = 0; i < L25_NUM_CLASSES; i++) {
if (hits) hits[i] = 0;
if (misses) misses[i] = 0;
if (refills) refills[i] = 0;
if (frees) frees[i] = 0;
}
return;
}
for (int i = 0; i < L25_NUM_CLASSES; i++) {
if (hits) hits[i] = g_l25_pool.hits[i];
if (misses) misses[i] = g_l25_pool.misses[i];
if (refills) refills[i] = g_l25_pool.refills[i];
if (frees) frees[i] = g_l25_pool.frees[i];
}
}