Files
hakmem/core/box/pool_core_api.inc.h
Moe Charm (CI) 1da8754d45 CRITICAL FIX: eliminate the 4T SEGV caused by uninitialized TLS
**Problem:**
- Larson with 4 threads SEGVs 100% of the time (1 thread completes at 2.09M ops/s)
- System malloc / mimalloc run fine at 4T (33.52M ops/s)
- Still SEGVs at 4T even with SS OFF + Remote OFF

**Root cause (Task agent ultrathink investigation):**
```
CRASH: mov (%r15),%r13
R15 = 0x6261  ← ASCII "ba" (garbage value, uninitialized TLS)
```

TLS variables in worker threads were uninitialized:
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];`  ← no initializer
- Not zero-initialized in threads spawned via pthread_create()
- The NULL check passes (0x6261 != NULL) → dereference → SEGV
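
A minimal sketch of that failure mode, for illustration only (names and types are simplified stand-ins for the real arrays in core/hakmem_tiny.c):

```c
/* Sketch: a non-zero garbage value in an uninitialized TLS slot defeats the NULL check. */
#define TINY_NUM_CLASSES 8   /* placeholder value for the sketch */
typedef struct Block { struct Block* next; } Block;

/* Without "= {0}" the worker threads were observed to see garbage here (e.g. 0x6261). */
static __thread Block* g_tls_sll_head[TINY_NUM_CLASSES];

void* pop_fast(int cls) {
    Block* b = g_tls_sll_head[cls];
    if (b == NULL) return NULL;      /* 0x6261 != NULL, so the check passes...    */
    g_tls_sll_head[cls] = b->next;   /* ...and this load faults: mov (%r15),%r13  */
    return b;
}
```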

**Fix:**
Added an explicit initializer `= {0}` to every TLS array (before/after sketch follows the list):

1. **core/hakmem_tiny.c:**
   - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
   - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
   - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bend[TINY_NUM_CLASSES] = {0}`

2. **core/tiny_fastcache.c:**
   - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`

3. **core/hakmem_tiny_magazine.c:**
   - `g_tls_mags[TINY_NUM_CLASSES] = {0}`

4. **core/tiny_sticky.c:**
   - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`
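
The change itself is mechanical; a before/after sketch of the pattern (the array length macro value and element types other than `g_tls_sll_head` are illustrative assumptions):

```c
#include <stdint.h>
#define TINY_NUM_CLASSES 8   /* placeholder value for the sketch */

/* Before (crashed at 4T): __thread void* g_tls_sll_head[TINY_NUM_CLASSES];    */
/* After: every TLS array gets an explicit zero initializer, so threads spawned */
/* with pthread_create() start with all-NULL heads and zero counts.             */
__thread void*    g_tls_sll_head[TINY_NUM_CLASSES]  = {0};
__thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES] = {0};   /* element type assumed */
```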

**Result:**
```
Before: 1T: 2.09M   |  4T: SEGV 💀
After:  1T: 2.41M   |  4T: 4.19M   (+15% 1T, SEGV eliminated)
```

**Testing:**
```bash
# 1 thread: completes
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s 

# 4 threads: completes (previously SEGV)
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s 
```

**Investigation credit:** root cause pinpointed by the Task agent (ultrathink mode)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-07 01:27:04 +09:00

328 lines
26 KiB
C

// pool_core_api.inc.h — Box: L2 Pool core state and basic config
#ifndef POOL_CORE_API_INC_H
#define POOL_CORE_API_INC_H
// Global knobs (env-configurable)
static int g_wrap_l2_enabled = 0; // env: HAKMEM_WRAP_L2=1 to allow in wrappers
static int g_shard_mix_enabled = 0; // env: HAKMEM_SHARD_MIX=1 to enable stronger hashing
static int g_tls_ring_enabled = 1; // env: HAKMEM_POOL_TLS_RING=1 to enable TLS ring
static int g_trylock_probes = 3; // env: HAKMEM_TRYLOCK_PROBES (1..8)
static int g_ring_return_div = 2; // env: HAKMEM_RING_RETURN_DIV (2=half, 3=third)
static int g_tls_lo_max = 256; // env: HAKMEM_TLS_LO_MAX (LIFO size cap)
int g_hdr_light_enabled = 0; // env: HAKMEM_HDR_LIGHT=1/2
static int g_pool_min_bundle = 2; // env: HAKMEM_POOL_MIN_BUNDLE
static int g_count_sample_exp = 10; // env: HAKMEM_POOL_COUNT_SAMPLE (0..16)
static __thread uint32_t t_pool_rng = 0x243f6a88u; // per-thread RNG for sampling
// Size class table (for O(1) lookup). Index 5/6 are Bridge classes for 32-64KB gap.
static size_t g_class_sizes[POOL_NUM_CLASSES] = {
POOL_CLASS_2KB, POOL_CLASS_4KB, POOL_CLASS_8KB, POOL_CLASS_16KB,
POOL_CLASS_32KB, POOL_CLASS_40KB, POOL_CLASS_52KB
};
__attribute__((unused)) static const int g_blocks_per_page[POOL_NUM_CLASSES] = {
POOL_PAGE_SIZE / POOL_CLASS_2KB,
POOL_PAGE_SIZE / POOL_CLASS_4KB,
POOL_PAGE_SIZE / POOL_CLASS_8KB,
POOL_PAGE_SIZE / POOL_CLASS_16KB,
POOL_PAGE_SIZE / POOL_CLASS_32KB,
POOL_PAGE_SIZE / POOL_CLASS_40KB,
POOL_PAGE_SIZE / POOL_CLASS_52KB
};
// Global pool state
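// Layout notes:
//  - freelist[class][shard]    : per-class, per-shard singly linked freelists, guarded by freelist_locks
//  - nonempty_mask[class]      : one bit per shard hinting which shard freelists are non-empty
//  - remote_head/remote_count  : lock-free MPSC stacks for cross-thread frees, drained under the shard lock
//  - hits/misses/refills/frees : counters, 64-byte aligned to reduce false sharing
//  - bundle_factor/last_*      : adaptive refill-bundling state (see pool_update_bundle_factor below)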
typedef struct {
PoolBlock* freelist[POOL_NUM_CLASSES][POOL_NUM_SHARDS];
PaddedMutex freelist_locks[POOL_NUM_CLASSES][POOL_NUM_SHARDS];
atomic_uint_fast64_t nonempty_mask[POOL_NUM_CLASSES];
atomic_uintptr_t remote_head[POOL_NUM_CLASSES][POOL_NUM_SHARDS];
atomic_uint remote_count[POOL_NUM_CLASSES][POOL_NUM_SHARDS];
uint64_t hits[POOL_NUM_CLASSES] __attribute__((aligned(64)));
uint64_t misses[POOL_NUM_CLASSES] __attribute__((aligned(64)));
uint64_t refills[POOL_NUM_CLASSES] __attribute__((aligned(64)));
uint64_t frees[POOL_NUM_CLASSES] __attribute__((aligned(64)));
uint64_t total_bytes_allocated __attribute__((aligned(64)));
uint64_t total_pages_allocated __attribute__((aligned(64)));
uint64_t pages_by_class[POOL_NUM_CLASSES] __attribute__((aligned(64)));
int bundle_factor[POOL_NUM_CLASSES];
uint64_t last_hits[POOL_NUM_CLASSES];
uint64_t last_misses[POOL_NUM_CLASSES];
int initialized;
int tls_free_enabled;
atomic_uint_fast64_t trylock_attempts __attribute__((aligned(64)));
atomic_uint_fast64_t trylock_success __attribute__((aligned(64)));
atomic_uint_fast64_t ring_underflow __attribute__((aligned(64)));
} PoolGlobal;
static PoolGlobal g_pool;
// --- Boxed Public/Core API implementations moved from hakmem_pool.c ---
// Adjust bundle factor based on window stats
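// Policy (as implemented below): evaluate only after a window of >= 256 hit+miss events.
// If the window hit rate falls below 60% and misses exceed hits by more than 16, grow the
// bundle factor (up to 4 pages per refill); if it rises above 90% and hits exceed misses
// by more than 32, shrink it back toward 1.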
static inline void pool_update_bundle_factor(int class_idx) {
uint64_t h = g_pool.hits[class_idx];
uint64_t m = g_pool.misses[class_idx];
uint64_t dh = h - g_pool.last_hits[class_idx];
uint64_t dm = m - g_pool.last_misses[class_idx];
uint64_t dt = dh + dm;
if (dt < 256) return;
int bf = g_pool.bundle_factor[class_idx];
if (bf <= 0) bf = 1;
if (dt > 0) {
double hit_rate = (double)dh / (double)dt;
if (hit_rate < 0.60 && dm > (dh + 16)) { if (bf < 4) bf++; }
else if (hit_rate > 0.90 && dh > (dm + 32)) { if (bf > 1) bf--; }
}
g_pool.bundle_factor[class_idx] = bf;
g_pool.last_hits[class_idx] = h;
g_pool.last_misses[class_idx] = m;
}
// Refill freelist by allocating a new 64KiB page and splitting to blocks
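// Each refill mmap()s one POOL_PAGE_SIZE page per bundle, carves it into
// (HEADER_SIZE + class size) blocks, prepends them to the shard freelist, and registers
// the page via mid_desc_register(). The bundle count comes from pool_update_bundle_factor()
// and is further capped by the FrozenPolicy per-class page caps (mid_cap / mid_cap_dyn1/2).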
static int refill_freelist(int class_idx, int shard_idx) {
if (class_idx < 0 || class_idx >= POOL_NUM_CLASSES) return 0;
if (shard_idx < 0 || shard_idx >= POOL_NUM_SHARDS) return 0;
size_t user_size = g_class_sizes[class_idx];
size_t block_size = HEADER_SIZE + user_size;
int blocks_per_page = POOL_PAGE_SIZE / block_size;
if (blocks_per_page == 0) return 0;
void* page = mmap(NULL, POOL_PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (page == MAP_FAILED) return 0;
pool_update_bundle_factor(class_idx);
int bundles = g_pool.bundle_factor[class_idx];
if (bundles < 1) bundles = 1; if (bundles > 4) bundles = 4;
const FrozenPolicy* pol = hkm_policy_get();
if (pol) {
uint16_t cap = 0;
if (class_idx < 5) cap = pol->mid_cap[class_idx];
else if (class_idx == 5 && pol->mid_dyn1_bytes != 0) cap = pol->mid_cap_dyn1;
else if (class_idx == 6 && pol->mid_dyn2_bytes != 0) cap = pol->mid_cap_dyn2;
if (cap > 0) {
uint64_t have = g_pool.pages_by_class[class_idx];
if (have >= cap) bundles = 1; else {
uint64_t deficit = (cap - have);
if (deficit < (uint64_t)bundles) bundles = (int)deficit;
if (bundles < 1) bundles = 1; if (bundles > 4) bundles = 4;
if (deficit >= (uint64_t)g_pool_min_bundle && bundles < g_pool_min_bundle) bundles = g_pool_min_bundle;
}
}
}
int pages_allocated_this_call = 0;
for (int b = 0; b < bundles; b++) {
PoolBlock* freelist_head = NULL;
for (int i = 0; i < blocks_per_page; i++) {
void* raw_block = (char*)page + (i * block_size);
__builtin_prefetch((char*)raw_block + block_size, 1, 1);
PoolBlock* block = (PoolBlock*)raw_block;
block->next = freelist_head; freelist_head = block;
}
if (g_pool.freelist[class_idx][shard_idx]) {
PoolBlock* tail = freelist_head; while (tail->next) tail = tail->next;
tail->next = g_pool.freelist[class_idx][shard_idx];
}
g_pool.freelist[class_idx][shard_idx] = freelist_head;
mid_desc_register(page, class_idx, 0);
pages_allocated_this_call++;
if (b + 1 < bundles) {
page = mmap(NULL, POOL_PAGE_SIZE, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
if (page == MAP_FAILED) break; // stop bundling; blocks carved so far are already on the freelist
}
}
set_nonempty_bit(class_idx, shard_idx);
g_pool.refills[class_idx]++;
g_pool.total_pages_allocated += pages_allocated_this_call;
g_pool.pages_by_class[class_idx] += pages_allocated_this_call;
g_pool.total_bytes_allocated += (uint64_t)pages_allocated_this_call * (uint64_t)POOL_PAGE_SIZE;
return 1;
}
// Initialization and teardown
#ifndef HAKMEM_POOL_API_NO_PUBLIC
static pthread_once_t hak_pool_init_once_control = PTHREAD_ONCE_INIT;
static void hak_pool_init_impl(void) {
const FrozenPolicy* pol = hkm_policy_get();
if (pol && pol->mid_dyn1_bytes >= POOL_MIN_SIZE && pol->mid_dyn1_bytes <= POOL_MAX_SIZE) g_class_sizes[5] = pol->mid_dyn1_bytes; else g_class_sizes[5] = 0;
if (pol && pol->mid_dyn2_bytes >= POOL_MIN_SIZE && pol->mid_dyn2_bytes <= POOL_MAX_SIZE) g_class_sizes[6] = pol->mid_dyn2_bytes; else g_class_sizes[6] = 0;
for (int c = 0; c < POOL_NUM_CLASSES; c++) {
for (int s = 0; s < POOL_NUM_SHARDS; s++) { g_pool.freelist[c][s] = NULL; }
atomic_store(&g_pool.nonempty_mask[c], 0);
for (int s = 0; s < POOL_NUM_SHARDS; s++) {
pthread_mutex_init(&g_pool.freelist_locks[c][s].m, NULL);
atomic_store(&g_pool.remote_head[c][s], (uintptr_t)0);
atomic_store(&g_pool.remote_count[c][s], 0);
}
g_pool.hits[c] = 0; g_pool.misses[c] = 0; g_pool.refills[c] = 0; g_pool.frees[c] = 0; g_pool.pages_by_class[c] = 0;
g_pool.bundle_factor[c] = 1; g_pool.last_hits[c] = 0; g_pool.last_misses[c] = 0;
}
g_pool.total_bytes_allocated = 0; g_pool.total_pages_allocated = 0;
atomic_store(&g_pool.trylock_attempts, 0); atomic_store(&g_pool.trylock_success, 0); atomic_store(&g_pool.ring_underflow, 0);
const char* e_tls = getenv("HAKMEM_POOL_TLS_FREE"); g_pool.tls_free_enabled = (e_tls == NULL) ? 1 : (atoi(e_tls) != 0);
const char* e_wrap = getenv("HAKMEM_WRAP_L2"); g_wrap_l2_enabled = (e_wrap && atoi(e_wrap) != 0) ? 1 : 0;
const char* e_minb = getenv("HAKMEM_POOL_MIN_BUNDLE"); if (e_minb) { int v = atoi(e_minb); if (v>=1 && v<=8) g_pool_min_bundle = v; }
const char* e_mix = getenv("HAKMEM_SHARD_MIX"); g_shard_mix_enabled = (e_mix && atoi(e_mix) != 0) ? 1 : 0;
const char* e_ring = getenv("HAKMEM_POOL_TLS_RING"); if (e_ring) g_tls_ring_enabled = (atoi(e_ring) != 0);
const char* e_hdr = getenv("HAKMEM_HDR_LIGHT"); if (e_hdr) g_hdr_light_enabled = atoi(e_hdr);
const char* e_probe = getenv("HAKMEM_TRYLOCK_PROBES"); if (e_probe) { int v = atoi(e_probe); if (v>=1 && v<=8) g_trylock_probes = v; }
const char* e_div = getenv("HAKMEM_RING_RETURN_DIV"); if (e_div) { int v = atoi(e_div); if (v>=2 && v<=4) g_ring_return_div = v; }
const char* e_lo = getenv("HAKMEM_TLS_LO_MAX"); if (e_lo) { int v = atoi(e_lo); if (v>=32 && v<=16384) g_tls_lo_max = v; }
const char* e_cs = getenv("HAKMEM_POOL_COUNT_SAMPLE"); if (e_cs) { int v = atoi(e_cs); if (v>=0 && v<=16) g_count_sample_exp = v; }
const char* e_tc = getenv("HAKMEM_TC_ENABLE"); if (e_tc) g_tc_enabled = (atoi(e_tc) != 0);
const char* e_tcu = getenv("HAKMEM_TC_UNBOUNDED"); if (e_tcu) g_tc_drain_unbounded = (atoi(e_tcu) != 0);
const char* e_tcm = getenv("HAKMEM_TC_DRAIN_MAX"); if (e_tcm) { int v = atoi(e_tcm); if (v>=0 && v<=65536) g_tc_drain_max = v; }
const char* e_tct = getenv("HAKMEM_TC_DRAIN_TRIGGER"); if (e_tct) { int v = atoi(e_tct); if (v>=0 && v<=POOL_L2_RING_CAP) g_tc_drain_trigger = v; }
const char* e_mf2 = getenv("HAKMEM_MF2_ENABLE");
if (e_mf2 && atoi(e_mf2) != 0) {
g_mf2_enabled = 1; mf2_page_registry_init();
const char* e_maxq = getenv("HAKMEM_MF2_MAX_QUEUES"); if (e_maxq) { int v = atoi(e_maxq); if (v>=1 && v<=256) g_mf2_max_queues = v; }
const char* e_lease = getenv("HAKMEM_MF2_LEASE_MS"); if (e_lease) { int v = atoi(e_lease); if (v>=0 && v<=1000) g_mf2_lease_ms = v; }
const char* e_idle = getenv("HAKMEM_MF2_IDLE_THRESHOLD_US"); if (e_idle) { int v = atoi(e_idle); if (v>=0 && v<=10000) g_mf2_idle_threshold_us = v; }
HAKMEM_LOG("[Pool] MF2 Per-Page Sharding enabled\n");
HAKMEM_LOG("[MF2] max_queues=%d, lease_ms=%d, idle_threshold_us=%d\n", g_mf2_max_queues, g_mf2_lease_ms, g_mf2_idle_threshold_us);
}
g_pool.initialized = 1;
HAKMEM_LOG("[Pool] Initialized (L2 Hybrid Pool)\n");
if (g_class_sizes[5] != 0 || g_class_sizes[6] != 0) {
HAKMEM_LOG("[Pool] Classes: 2KB, 4KB, 8KB, 16KB, 32KB, dyn1=%zu B, dyn2=%zu B\n", g_class_sizes[5], g_class_sizes[6]);
} else {
HAKMEM_LOG("[Pool] Classes: 2KB, 4KB, 8KB, 16KB, 32KB\n");
}
HAKMEM_LOG("[Pool] Page size: %d KB\n", POOL_PAGE_SIZE/1024);
HAKMEM_LOG("[Pool] Shards: %d (site-based)\n", POOL_NUM_SHARDS);
}
static void mf2_print_debug_stats(void) {
if (!g_mf2_enabled) return;
fprintf(stderr, "\n[MF2 DEBUG STATS]\n");
fprintf(stderr, "Alloc fast hits: %12lu\n", (unsigned long)atomic_load(&g_mf2_alloc_fast_hit));
fprintf(stderr, "Alloc slow hits: %12lu\n", (unsigned long)atomic_load(&g_mf2_alloc_slow_hit));
fprintf(stderr, "Page reuses: %12lu\n", (unsigned long)atomic_load(&g_mf2_page_reuse_count));
fprintf(stderr, "New pages: %12lu\n", (unsigned long)atomic_load(&g_mf2_new_page_count));
fprintf(stderr, "Owner frees: %12lu\n", (unsigned long)atomic_load(&g_mf2_free_owner_count));
fprintf(stderr, "Remote frees: %12lu\n", (unsigned long)atomic_load(&g_mf2_free_remote_count));
fprintf(stderr, "Slow checked: %12lu\n", (unsigned long)atomic_load(&g_mf2_slow_checked_drain));
fprintf(stderr, "Slow found rem: %12lu\n", (unsigned long)atomic_load(&g_mf2_slow_found_remote));
fprintf(stderr, "Full scan chk: %12lu\n", (unsigned long)atomic_load(&g_mf2_full_scan_checked));
fprintf(stderr, "Full scan rem: %12lu\n", (unsigned long)atomic_load(&g_mf2_full_scan_found_remote));
fprintf(stderr, "Eager scan: %12lu\n", (unsigned long)atomic_load(&g_mf2_eager_drain_scanned));
fprintf(stderr, "Eager found: %12lu\n", (unsigned long)atomic_load(&g_mf2_eager_drain_found));
fprintf(stderr, "Drain attempts: %12lu\n", (unsigned long)atomic_load(&g_mf2_drain_attempts));
fprintf(stderr, "Drain successes: %12lu\n", (unsigned long)atomic_load(&g_mf2_drain_success));
fprintf(stderr, "Remote drains: %12lu (blocks: %lu)\n",
(unsigned long)atomic_load(&g_mf2_drain_count), (unsigned long)atomic_load(&g_mf2_drain_blocks));
fprintf(stderr, "\n[PENDING QUEUE]\n");
fprintf(stderr, "Pending enqueued: %12lu\n", (unsigned long)atomic_load(&g_mf2_pending_enqueued));
fprintf(stderr, "Pending drained: %12lu\n", (unsigned long)atomic_load(&g_mf2_pending_drained));
fprintf(stderr, "Pending requeued: %12lu\n", (unsigned long)atomic_load(&g_mf2_pending_requeued));
uint64_t total_allocs = atomic_load(&g_mf2_alloc_fast_hit) + atomic_load(&g_mf2_alloc_slow_hit);
uint64_t total_frees = atomic_load(&g_mf2_free_owner_count) + atomic_load(&g_mf2_free_remote_count);
if (total_allocs > 0) fprintf(stderr, "\nFast path hit rate: %.2f%%\n", 100.0 * atomic_load(&g_mf2_alloc_fast_hit) / total_allocs);
if (total_frees > 0) fprintf(stderr, "Owner free rate: %.2f%%\n", 100.0 * atomic_load(&g_mf2_free_owner_count) / total_frees);
fflush(stderr);
}
__attribute__((destructor)) static void mf2_destructor(void) { mf2_print_debug_stats(); }
void hak_pool_init(void) { pthread_once(&hak_pool_init_once_control, hak_pool_init_impl); }
void hak_pool_shutdown(void) {
if (!g_pool.initialized) return; extern void hak_pool_print_stats(void); hak_pool_print_stats(); mf2_print_debug_stats(); g_pool.initialized = 0;
}
// Try-alloc: legacy TLS path or MF2
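// Fast-path order on the legacy (non-MF2) path: (1) optional TC drain into the TLS ring,
// (2) TLS ring pop, (3) TLS LIFO pop, (4) trylock probes over non-empty shards, draining
// remote frees and batch-stealing blocks into the ring, (5) TLS active-page bump allocation,
// (6) blocking shard lock with neighbor steal and refill_freelist() as the last resort.
// Most counter updates are sampled via the per-thread xorshift RNG (HAKMEM_POOL_COUNT_SAMPLE).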
void* hak_pool_try_alloc(size_t size, uintptr_t site_id) {
hak_pool_init(); extern int hak_in_wrapper(void); if (hak_in_wrapper() && !g_wrap_l2_enabled) return NULL; if (!hak_pool_is_poolable(size)) return NULL;
int class_idx = hak_pool_get_class_index(size); if (class_idx < 0) return NULL;
if (g_mf2_enabled) { return mf2_alloc_fast(class_idx, size, site_id); }
PoolTLSRing* ring = &g_tls_bin[class_idx].ring;
if (g_tc_enabled && ring->top < g_tc_drain_trigger && mid_tc_has_items(class_idx)) {
HKM_TIME_START(t_tc_drain); if (mid_tc_drain_into_tls(class_idx, ring, &g_tls_bin[class_idx])) { HKM_TIME_END(HKM_CAT_TC_DRAIN, t_tc_drain); if (ring->top > 0) { HKM_TIME_START(t_ring_pop0); PoolBlock* tlsb = ring->items[--ring->top]; HKM_TIME_END(HKM_CAT_POOL_TLS_RING_POP, t_ring_pop0); void* raw = (void*)tlsb; AllocHeader* hdr = (AllocHeader*)raw; mid_set_header(hdr, g_class_sizes[class_idx], site_id); mid_page_inuse_inc(raw); t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; if ((t_pool_rng & ((1u<<g_count_sample_exp)-1u)) == 0u) g_pool.hits[class_idx]++; return (char*)raw + HEADER_SIZE; } } else { HKM_TIME_END(HKM_CAT_TC_DRAIN, t_tc_drain); } }
if (g_tls_ring_enabled) { if (ring->top == 0) { atomic_fetch_add_explicit(&g_pool.ring_underflow, 1, memory_order_relaxed); } if (ring->top > 0) { HKM_TIME_START(t_ring_pop1); PoolBlock* tlsb = ring->items[--ring->top]; HKM_TIME_END(HKM_CAT_POOL_TLS_RING_POP, t_ring_pop1); void* raw = (void*)tlsb; AllocHeader* hdr = (AllocHeader*)raw; mid_set_header(hdr, g_class_sizes[class_idx], site_id); t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; if ((t_pool_rng & ((1u<<g_count_sample_exp)-1u)) == 0u) g_pool.hits[class_idx]++; return (char*)raw + HEADER_SIZE; } }
if (g_tls_bin[class_idx].lo_head) { HKM_TIME_START(t_lifo_pop0); PoolBlock* b = g_tls_bin[class_idx].lo_head; g_tls_bin[class_idx].lo_head = b->next; if (g_tls_bin[class_idx].lo_count) g_tls_bin[class_idx].lo_count--; HKM_TIME_END(HKM_CAT_POOL_TLS_LIFO_POP, t_lifo_pop0); void* raw = (void*)b; AllocHeader* hdr = (AllocHeader*)raw; mid_set_header(hdr, g_class_sizes[class_idx], site_id); mid_page_inuse_inc(raw); t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; if ((t_pool_rng & ((1u<<g_count_sample_exp)-1u)) == 0u) g_pool.hits[class_idx]++; return (char*)raw + HEADER_SIZE; }
int shard_idx = hak_pool_get_shard_index(site_id);
if (g_tls_ring_enabled) {
int s0 = choose_nonempty_shard(class_idx, shard_idx);
for (int probe = 0; probe < g_trylock_probes; ++probe) {
int s = (s0 + probe) & (POOL_NUM_SHARDS - 1);
pthread_mutex_t* l = &g_pool.freelist_locks[class_idx][s].m;
atomic_fetch_add_explicit(&g_pool.trylock_attempts, 1, memory_order_relaxed);
if (pthread_mutex_trylock(l) == 0) {
atomic_fetch_add_explicit(&g_pool.trylock_success, 1, memory_order_relaxed);
if (atomic_load_explicit(&g_pool.remote_count[class_idx][s], memory_order_relaxed) != 0) drain_remote_locked(class_idx, s);
PoolBlock* head = g_pool.freelist[class_idx][s];
int to_ring = POOL_L2_RING_CAP - ring->top; if (to_ring < 0) to_ring = 0;
while (head && to_ring-- > 0) { PoolBlock* nxt = head->next; ring->items[ring->top++] = head; head = nxt; }
while (head) { PoolBlock* nxt = head->next; head->next = g_tls_bin[class_idx].lo_head; g_tls_bin[class_idx].lo_head = head; g_tls_bin[class_idx].lo_count++; head = nxt; }
g_pool.freelist[class_idx][s] = head; if (!head) clear_nonempty_bit(class_idx, s);
pthread_mutex_unlock(l);
if (ring->top > 0) { PoolBlock* tlsb = ring->items[--ring->top]; void* raw = (void*)tlsb; AllocHeader* hdr = (AllocHeader*)raw; mid_set_header(hdr, g_class_sizes[class_idx], site_id); mid_page_inuse_inc(raw); t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; if ((t_pool_rng & ((1u<<g_count_sample_exp)-1u)) == 0u) g_pool.hits[class_idx]++; return (char*)raw + HEADER_SIZE; }
}
}
}
PoolTLSPage* ap = NULL;
if (g_tls_active_page_a[class_idx].page && g_tls_active_page_a[class_idx].count > 0 && g_tls_active_page_a[class_idx].bump < g_tls_active_page_a[class_idx].end) ap = &g_tls_active_page_a[class_idx];
else if (g_tls_active_page_b[class_idx].page && g_tls_active_page_b[class_idx].count > 0 && g_tls_active_page_b[class_idx].bump < g_tls_active_page_b[class_idx].end) ap = &g_tls_active_page_b[class_idx];
else if (g_tls_active_page_c[class_idx].page && g_tls_active_page_c[class_idx].count > 0 && g_tls_active_page_c[class_idx].bump < g_tls_active_page_c[class_idx].end) ap = &g_tls_active_page_c[class_idx];
if (ap) {
if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) { int need = POOL_L2_RING_CAP - ring->top; (void)refill_tls_from_active_page(class_idx, ring, &g_tls_bin[class_idx], ap, need); }
PoolBlock* b = NULL; if (ring->top > 0) { b = ring->items[--ring->top]; } else if (ap->page && ap->count > 0 && ap->bump < ap->end) { b = (PoolBlock*)(void*)ap->bump; ap->bump += (HEADER_SIZE + g_class_sizes[class_idx]); ap->count--; if (ap->bump >= ap->end || ap->count<=0){ ap->page=NULL; ap->count=0; } }
if (b) { void* raw = (void*)b; AllocHeader* hdr = (AllocHeader*)raw; mid_set_header(hdr, g_class_sizes[class_idx], site_id); mid_page_inuse_inc(raw); g_pool.hits[class_idx]++; return (char*)raw + HEADER_SIZE; }
}
pthread_mutex_t* lock = &g_pool.freelist_locks[class_idx][shard_idx].m; HKM_TIME_START(t_lock); struct timespec ts_lk1; int lk1 = hkm_prof_begin(&ts_lk1); (void)ts_lk1; (void)lk1; pthread_mutex_lock(lock); HKM_TIME_END(HKM_CAT_POOL_LOCK, t_lock); hkm_prof_end(lk1, HKP_POOL_LOCK, &ts_lk1);
PoolBlock* block = g_pool.freelist[class_idx][shard_idx];
if (!block) {
int stole = 0; const FrozenPolicy* pol2 = hkm_policy_get();
if (pol2) {
uint16_t cap = 0; if (class_idx < 5) cap = pol2->mid_cap[class_idx]; else if (class_idx == 5 && pol2->mid_dyn1_bytes != 0) cap = pol2->mid_cap_dyn1; else if (class_idx == 6 && pol2->mid_dyn2_bytes != 0) cap = pol2->mid_cap_dyn2;
if (atomic_load_explicit(&g_pool.remote_count[class_idx][shard_idx], memory_order_relaxed) != 0) { drain_remote_locked(class_idx, shard_idx); }
int neighbor = (shard_idx + 1) & (POOL_NUM_SHARDS - 1);
if (is_shard_nonempty(class_idx, neighbor)) {
PoolBlock* nb = g_pool.freelist[class_idx][neighbor]; if (nb) { g_pool.freelist[class_idx][neighbor] = nb->next; nb->next = NULL; block = nb; stole = 1; }
if (!g_pool.freelist[class_idx][neighbor]) clear_nonempty_bit(class_idx, neighbor);
}
}
if (!stole && !block) { (void)refill_freelist(class_idx, shard_idx); block = g_pool.freelist[class_idx][shard_idx]; }
}
if (!block) { pthread_mutex_unlock(lock); g_pool.misses[class_idx]++; return NULL; }
g_pool.freelist[class_idx][shard_idx] = block->next; if (!g_pool.freelist[class_idx][shard_idx]) clear_nonempty_bit(class_idx, shard_idx); pthread_mutex_unlock(lock);
void* raw = (void*)block; AllocHeader* hdr = (AllocHeader*)raw; mid_set_header(hdr, g_class_sizes[class_idx], site_id); mid_page_inuse_inc(raw); t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; if ((t_pool_rng & ((1u<<g_count_sample_exp)-1u)) == 0u) g_pool.hits[class_idx]++;
return (char*)raw + HEADER_SIZE;
}
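// Free path on the legacy (non-MF2) path: same-thread frees (owner check via page descriptor
// or block header) go to the TLS ring or, when the ring is full, to the TLS LIFO, spilling half
// of an oversized LIFO onto the shard's remote MPSC stack. Cross-thread frees go to the owner's
// MidTC queue when HAKMEM_TC_ENABLE is set, otherwise onto the remote stack. With
// HAKMEM_POOL_TLS_FREE=0 every free takes the locked shard freelist.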
void hak_pool_free(void* ptr, size_t size, uintptr_t site_id) {
if (!ptr) return; hak_pool_init(); if (!hak_pool_is_poolable(size)) return;
if (g_mf2_enabled) { mf2_free(ptr); return; }
void* raw = (char*)ptr - HEADER_SIZE; AllocHeader* hdr = (AllocHeader*)raw; int mid_by_desc = 0; MidPageDesc* d_desc = mid_desc_lookup(ptr); if (d_desc) mid_by_desc = 1;
if (!mid_by_desc && g_hdr_light_enabled < 2) { if (hdr->magic != HAKMEM_MAGIC) { MF2_ERROR_LOG("Invalid magic 0x%X in pool_free, expected 0x%X", hdr->magic, HAKMEM_MAGIC); return; } if (hdr->method != ALLOC_METHOD_POOL) { MF2_ERROR_LOG("Wrong method %d in pool_free, expected POOL (%d)", hdr->method, ALLOC_METHOD_POOL); return; } }
int class_idx = mid_by_desc ? (int)d_desc->class_idx : hak_pool_get_class_index(size); if (class_idx < 0) return;
PoolBlock* block = (PoolBlock*)raw;
if (g_pool.tls_free_enabled) {
int same_thread = 0;
if (g_hdr_light_enabled >= 1) { MidPageDesc* d = mid_desc_lookup(raw); if (d && d->owner_tid != 0 && d->owner_tid == (uint64_t)(uintptr_t)pthread_self()) { same_thread = 1; } }
else if (hdr->owner_tid != 0 && hdr->owner_tid == (uintptr_t)pthread_self()) { same_thread = 1; }
if (same_thread) {
PoolTLSRing* ring = &g_tls_bin[class_idx].ring;
if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) { ring->items[ring->top++] = block; }
else { block->next = g_tls_bin[class_idx].lo_head; g_tls_bin[class_idx].lo_head = block; g_tls_bin[class_idx].lo_count++; if ((int)g_tls_bin[class_idx].lo_count > g_tls_lo_max) { size_t spill = g_tls_bin[class_idx].lo_count / 2; int shard = hak_pool_get_shard_index(site_id); while (spill-- && g_tls_bin[class_idx].lo_head) { PoolBlock* b = g_tls_bin[class_idx].lo_head; g_tls_bin[class_idx].lo_head = b->next; g_tls_bin[class_idx].lo_count--; HKM_TIME_START(t_remote_push1); uintptr_t old_head; do { old_head = atomic_load_explicit(&g_pool.remote_head[class_idx][shard], memory_order_acquire); b->next = (PoolBlock*)old_head; } while (!atomic_compare_exchange_weak_explicit(&g_pool.remote_head[class_idx][shard], &old_head, (uintptr_t)b, memory_order_release, memory_order_relaxed)); atomic_fetch_add_explicit(&g_pool.remote_count[class_idx][shard], 1, memory_order_relaxed); HKM_TIME_END(HKM_CAT_POOL_REMOTE_PUSH, t_remote_push1); } set_nonempty_bit(class_idx, shard); } }
} else {
if (g_tc_enabled) { uint64_t owner_tid = 0; if (g_hdr_light_enabled < 2) owner_tid = hdr->owner_tid; if (owner_tid == 0) { MidPageDesc* d = mid_desc_lookup(raw); if (d) owner_tid = d->owner_tid; } if (owner_tid != 0) { MidTC* otc = mid_tc_lookup_by_tid(owner_tid); if (otc) { mid_tc_push(otc, class_idx, block); return; } } }
int shard = hak_pool_get_shard_index(site_id); uintptr_t old_head; HKM_TIME_START(t_remote_push2);
do { old_head = atomic_load_explicit(&g_pool.remote_head[class_idx][shard], memory_order_acquire); block->next = (PoolBlock*)old_head; } while (!atomic_compare_exchange_weak_explicit(&g_pool.remote_head[class_idx][shard], &old_head, (uintptr_t)block, memory_order_release, memory_order_relaxed));
atomic_fetch_add_explicit(&g_pool.remote_count[class_idx][shard], 1, memory_order_relaxed); HKM_TIME_END(HKM_CAT_POOL_REMOTE_PUSH, t_remote_push2); set_nonempty_bit(class_idx, shard);
}
} else {
int shard_idx2 = hak_pool_get_shard_index(site_id); pthread_mutex_t* lock = &g_pool.freelist_locks[class_idx][shard_idx2].m; pthread_mutex_lock(lock); block->next = g_pool.freelist[class_idx][shard_idx2]; g_pool.freelist[class_idx][shard_idx2] = block; set_nonempty_bit(class_idx, shard_idx2); pthread_mutex_unlock(lock);
}
t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; if ((t_pool_rng & ((1u<<g_count_sample_exp)-1u)) == 0u) g_pool.frees[class_idx]++;
mid_page_inuse_dec_and_maybe_dn(raw);
}
// Mid lookups (MF2-aware) and fast free wrapper
int hak_pool_mid_lookup(void* ptr, size_t* out_size) {
if (g_mf2_enabled) { MidPage* page = mf2_addr_to_page(ptr); if (page) { int c = (int)page->class_idx; if (c < 0 || c >= POOL_NUM_CLASSES) return 0; size_t sz = g_class_sizes[c]; if (sz == 0) return 0; if (out_size) *out_size = sz; return 1; } }
MidPageDesc* d = mid_desc_lookup(ptr); if (!d) return 0; int c = (int)d->class_idx; if (c < 0 || c >= POOL_NUM_CLASSES) return 0; size_t sz = g_class_sizes[c]; if (sz == 0) return 0; if (out_size) *out_size = sz; return 1;
}
void hak_pool_free_fast(void* ptr, uintptr_t site_id) {
if (!ptr || !g_pool.initialized) return; if (g_mf2_enabled) { MidPage* page = mf2_addr_to_page(ptr); if (page) { mf2_free(ptr); return; } }
MidPageDesc* d = mid_desc_lookup(ptr); if (!d) return; size_t sz = g_class_sizes[(int)d->class_idx]; if (sz == 0) return; hak_pool_free(ptr, sz, site_id);
}
#endif // HAKMEM_POOL_API_NO_PUBLIC
#endif // POOL_CORE_API_INC_H