hakmem/core/box/pool_api.inc.h
CRITICAL FIX: Completely eliminate the 4T SEGV caused by uninitialized TLS

**Problem:**
- Larson 4T hit SEGV 100% of the time (1T completed at 2.09M ops/s)
- System/mimalloc ran 4T normally at 33.52M ops/s
- The 4T SEGV persisted even with SS OFF + Remote OFF

**Root cause (Task agent ultrathink investigation):**
```
CRASH: mov (%r15),%r13
R15 = 0x6261 ← ASCII "ba" (garbage value, uninitialized TLS)
```
TLS variables in worker threads were uninitialized:
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];` ← no initializer
- Threads spawned via pthread_create() did not see these arrays zero-initialized
- The NULL check passed (0x6261 != NULL) → dereference → SEGV

**Fix:**
Add an explicit `= {0}` initializer to every TLS array:

1. **core/hakmem_tiny.c:**
   - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
   - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
   - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bend[TINY_NUM_CLASSES] = {0}`

2. **core/tiny_fastcache.c:**
   - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`

3. **core/hakmem_tiny_magazine.c:**
   - `g_tls_mags[TINY_NUM_CLASSES] = {0}`

4. **core/tiny_sticky.c:**
   - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`

**Result:**
```
Before: 1T: 2.09M ✅ | 4T: SEGV 💀
After:  1T: 2.41M ✅ | 4T: 4.19M ✅ (+15% 1T, SEGV eliminated)
```

**Tests:**
```bash
# 1 thread: completes
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s ✅

# 4 threads: completes (previously SEGV)
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s ✅
```

**Investigation credit:** Root cause pinpointed by the Task agent (ultrathink mode)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-07 01:27:04 +09:00
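For reference, a minimal standalone sketch of the initialization pattern the commit message describes. `TINY_NUM_CLASSES` and the array names come from the message; the value 8 and the pthread harness are hypothetical and are not hakmem code.

```c
#include <pthread.h>
#include <stdio.h>

#define TINY_NUM_CLASSES 8   // placeholder value for this sketch

// Before the fix the arrays had no initializer:
//   __thread void* g_tls_sll_head[TINY_NUM_CLASSES];
// The fix adds an explicit zero initializer to every TLS array:
__thread void*    g_tls_sll_head[TINY_NUM_CLASSES]  = {0};
__thread unsigned g_tls_sll_count[TINY_NUM_CLASSES] = {0};

static void* worker(void* arg) {
    (void)arg;
    // With the explicit initializer every slot is NULL/0 in this thread,
    // so the NULL check below reliably skips the dereference.
    for (int i = 0; i < TINY_NUM_CLASSES; i++) {
        if (g_tls_sll_head[i] != NULL) {
            // the real allocator would follow the free-list link here
        }
    }
    printf("worker: TLS arrays are zeroed\n");
    return NULL;
}

int main(void) {
    pthread_t t[4];
    for (int i = 0; i < 4; i++) pthread_create(&t[i], NULL, worker, NULL);
    for (int i = 0; i < 4; i++) pthread_join(t[i], NULL);
    return 0;
}
```

Built with `-pthread`, every worker observes zeroed TLS arrays; the explicit `= {0}` makes that guarantee unambiguous regardless of how the thread was created.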
// pool_api.inc.h — Box: L2 Pool public API (alloc/free/lookup)
#ifndef POOL_API_INC_H
#define POOL_API_INC_H
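// hak_pool_try_alloc: allocate a pool block that can hold `size` bytes, tagged with `site_id`.
// Fast-path order: MF2 per-page sharding (if enabled) -> transfer-cache drain into the TLS ring
// -> TLS ring pop -> TLS local LIFO pop -> trylock batch-pop from a non-empty shard freelist
// -> TLS active bump pages -> locked shard freelist (drain remote frees, steal from neighbor
// shards when over cap, allocate a fresh TLS page, or refill). Returns NULL if the size is not
// poolable or no block can be supplied.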
void* hak_pool_try_alloc(size_t size, uintptr_t site_id) {
hak_pool_init(); // pthread_once() ensures thread-safe init (no data race!)
// P1.7 approach: Avoid using pool during ALL wrapper calls (conservative but safe)
extern int hak_in_wrapper(void);
if (hak_in_wrapper() && !g_wrap_l2_enabled) return NULL;
if (!hak_pool_is_poolable(size)) return NULL;
// Get class and shard indices
int class_idx = hak_pool_get_class_index(size);
if (class_idx < 0) return NULL;
// MF2: Per-Page Sharding path
if (g_mf2_enabled) {
return mf2_alloc_fast(class_idx, size, site_id);
}
// OLD PATH: TLS fast path (ring then local LIFO); drain TC only when needed
PoolTLSRing* ring = &g_tls_bin[class_idx].ring;
if (g_tc_enabled && ring->top < g_tc_drain_trigger && mid_tc_has_items(class_idx)) {
HKM_TIME_START(t_tc_drain);
if (mid_tc_drain_into_tls(class_idx, ring, &g_tls_bin[class_idx])) {
HKM_TIME_END(HKM_CAT_TC_DRAIN, t_tc_drain);
if (ring->top > 0) {
HKM_TIME_START(t_ring_pop0);
PoolBlock* tlsb = ring->items[--ring->top];
HKM_TIME_END(HKM_CAT_POOL_TLS_RING_POP, t_ring_pop0);
void* raw = (void*)tlsb;
AllocHeader* hdr = (AllocHeader*)raw;
mid_set_header(hdr, g_class_sizes[class_idx], site_id);
mid_page_inuse_inc(raw);
// xorshift32 PRNG: update stats counters only on a sampled 1-in-2^g_count_sample_exp basis to keep them cheap
t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5;
if ((t_pool_rng & ((1u<<g_count_sample_exp)-1u)) == 0u) g_pool.hits[class_idx]++;
return (char*)raw + HEADER_SIZE;
}
} else { HKM_TIME_END(HKM_CAT_TC_DRAIN, t_tc_drain); }
}
if (g_tls_ring_enabled) {
if (ring->top == 0) {
atomic_fetch_add_explicit(&g_pool.ring_underflow, 1, memory_order_relaxed);
}
if (ring->top > 0) {
HKM_TIME_START(t_ring_pop1);
PoolBlock* tlsb = ring->items[--ring->top];
HKM_TIME_END(HKM_CAT_POOL_TLS_RING_POP, t_ring_pop1);
void* raw = (void*)tlsb;
AllocHeader* hdr = (AllocHeader*)raw;
mid_set_header(hdr, g_class_sizes[class_idx], site_id);
t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5;
if ((t_pool_rng & ((1u<<g_count_sample_exp)-1u)) == 0u) g_pool.hits[class_idx]++;
return (char*)raw + HEADER_SIZE;
}
}
if (g_tls_bin[class_idx].lo_head) {
HKM_TIME_START(t_lifo_pop0);
PoolBlock* b = g_tls_bin[class_idx].lo_head;
g_tls_bin[class_idx].lo_head = b->next;
if (g_tls_bin[class_idx].lo_count) g_tls_bin[class_idx].lo_count--;
HKM_TIME_END(HKM_CAT_POOL_TLS_LIFO_POP, t_lifo_pop0);
void* raw = (void*)b; AllocHeader* hdr = (AllocHeader*)raw;
mid_set_header(hdr, g_class_sizes[class_idx], site_id);
mid_page_inuse_inc(raw);
t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5;
if ((t_pool_rng & ((1u<<g_count_sample_exp)-1u)) == 0u) g_pool.hits[class_idx]++;
return (char*)raw + HEADER_SIZE;
}
// Compute shard only when we need to access shared structures
int shard_idx = hak_pool_get_shard_index(site_id);
// Try to batch-pop from a non-empty shard using trylock to fill TLS ring
if (g_tls_ring_enabled) {
int s0 = choose_nonempty_shard(class_idx, shard_idx);
for (int probe = 0; probe < g_trylock_probes; ++probe) {
int s = (s0 + probe) & (POOL_NUM_SHARDS - 1);
pthread_mutex_t* l = &g_pool.freelist_locks[class_idx][s].m;
atomic_fetch_add_explicit(&g_pool.trylock_attempts, 1, memory_order_relaxed);
if (pthread_mutex_trylock(l) == 0) {
atomic_fetch_add_explicit(&g_pool.trylock_success, 1, memory_order_relaxed);
// First, drain any remote frees into freelist
if (atomic_load_explicit(&g_pool.remote_count[class_idx][s], memory_order_relaxed) != 0) {
drain_remote_locked(class_idx, s);
}
PoolBlock* head = g_pool.freelist[class_idx][s];
int to_ring = POOL_L2_RING_CAP - ring->top; if (to_ring < 0) to_ring = 0;
while (head && to_ring-- > 0) { PoolBlock* nxt = head->next; ring->items[ring->top++] = head; head = nxt; }
while (head) { PoolBlock* nxt = head->next; head->next = g_tls_bin[class_idx].lo_head; g_tls_bin[class_idx].lo_head = head; g_tls_bin[class_idx].lo_count++; head = nxt; }
g_pool.freelist[class_idx][s] = head;
if (!head) clear_nonempty_bit(class_idx, s);
pthread_mutex_unlock(l);
if (ring->top > 0) {
PoolBlock* tlsb = ring->items[--ring->top];
void* raw = (void*)tlsb;
AllocHeader* hdr = (AllocHeader*)raw;
mid_set_header(hdr, g_class_sizes[class_idx], site_id);
mid_page_inuse_inc(raw);
t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5;
if ((t_pool_rng & ((1u<<g_count_sample_exp)-1u)) == 0u) g_pool.hits[class_idx]++;
return (char*)raw + HEADER_SIZE;
}
}
}
}
// Try TLS active pages (owner-only local bump-run, up to 3)
PoolTLSPage* ap = NULL;
if (g_tls_active_page_a[class_idx].page && g_tls_active_page_a[class_idx].count > 0 && g_tls_active_page_a[class_idx].bump < g_tls_active_page_a[class_idx].end) ap = &g_tls_active_page_a[class_idx];
else if (g_tls_active_page_b[class_idx].page && g_tls_active_page_b[class_idx].count > 0 && g_tls_active_page_b[class_idx].bump < g_tls_active_page_b[class_idx].end) ap = &g_tls_active_page_b[class_idx];
else if (g_tls_active_page_c[class_idx].page && g_tls_active_page_c[class_idx].count > 0 && g_tls_active_page_c[class_idx].bump < g_tls_active_page_c[class_idx].end) ap = &g_tls_active_page_c[class_idx];
if (ap) {
if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) {
int need = POOL_L2_RING_CAP - ring->top;
(void)refill_tls_from_active_page(class_idx, ring, &g_tls_bin[class_idx], ap, need);
}
PoolBlock* b = NULL;
if (ring->top > 0) { b = ring->items[--ring->top]; }
else if (ap->page && ap->count > 0 && ap->bump < ap->end) {
b = (PoolBlock*)(void*)ap->bump; ap->bump += (HEADER_SIZE + g_class_sizes[class_idx]); ap->count--; if (ap->bump >= ap->end || ap->count<=0){ ap->page=NULL; ap->count=0; }
}
if (b) {
void* raw = (void*)b; AllocHeader* hdr = (AllocHeader*)raw;
mid_set_header(hdr, g_class_sizes[class_idx], site_id);
mid_page_inuse_inc(raw);
g_pool.hits[class_idx]++;
return (char*)raw + HEADER_SIZE;
}
}
// Lock the shard freelist for this (class, shard)
pthread_mutex_t* lock = &g_pool.freelist_locks[class_idx][shard_idx].m;
HKM_TIME_START(t_lock);
struct timespec ts_lk1; int lk1 = hkm_prof_begin(&ts_lk1);
(void)ts_lk1; (void)lk1; // Silence unused-variable warnings when profiling is compiled out
pthread_mutex_lock(lock);
HKM_TIME_END(HKM_CAT_POOL_LOCK, t_lock);
hkm_prof_end(lk1, HKP_POOL_LOCK, &ts_lk1);
// Try to pop from freelist
PoolBlock* block = g_pool.freelist[class_idx][shard_idx];
if (!block) {
// Before refilling, try draining remote stack and simple shard steal
int stole = 0;
const FrozenPolicy* pol = hkm_policy_get();
if (pol) {
uint16_t cap = 0;
if (class_idx < 5) cap = pol->mid_cap[class_idx];
else if (class_idx == 5 && pol->mid_dyn1_bytes != 0) cap = pol->mid_cap_dyn1;
else if (class_idx == 6 && pol->mid_dyn2_bytes != 0) cap = pol->mid_cap_dyn2;
// Drain remotes
if (atomic_load_explicit(&g_pool.remote_count[class_idx][shard_idx], memory_order_relaxed) != 0) {
drain_remote_locked(class_idx, shard_idx);
block = g_pool.freelist[class_idx][shard_idx];
}
// Light shard steal when over cap
if (!block && cap > 0 && g_pool.pages_by_class[class_idx] >= cap) {
HKM_TIME_START(t_steal);
for (int d = 1; d <= 4 && !stole; d++) {
int s1 = (shard_idx + d) & (POOL_NUM_SHARDS - 1);
int s2 = (shard_idx - d) & (POOL_NUM_SHARDS - 1);
if (is_shard_nonempty(class_idx, s1)) {
pthread_mutex_t* l2 = &g_pool.freelist_locks[class_idx][s1].m;
pthread_mutex_lock(l2);
PoolBlock* b2 = g_pool.freelist[class_idx][s1];
if (b2) {
g_pool.freelist[class_idx][s1] = b2->next;
if (!g_pool.freelist[class_idx][s1]) clear_nonempty_bit(class_idx, s1);
block = b2; stole = 1;
}
pthread_mutex_unlock(l2);
}
if (!stole && is_shard_nonempty(class_idx, s2)) {
pthread_mutex_t* l3 = &g_pool.freelist_locks[class_idx][s2].m;
pthread_mutex_lock(l3);
PoolBlock* b3 = g_pool.freelist[class_idx][s2];
if (b3) {
g_pool.freelist[class_idx][s2] = b3->next;
if (!g_pool.freelist[class_idx][s2]) clear_nonempty_bit(class_idx, s2);
block = b3; stole = 1;
}
pthread_mutex_unlock(l3);
}
}
HKM_TIME_END(HKM_CAT_SHARD_STEAL, t_steal);
}
}
if (!stole && !block) {
// Freelist empty, refill page
PoolTLSPage* tap = NULL;
if (g_tls_active_page_a[class_idx].page == NULL || g_tls_active_page_a[class_idx].count == 0) tap = &g_tls_active_page_a[class_idx];
else if (g_tls_active_page_b[class_idx].page == NULL || g_tls_active_page_b[class_idx].count == 0) tap = &g_tls_active_page_b[class_idx];
else if (g_tls_active_page_c[class_idx].page == NULL || g_tls_active_page_c[class_idx].count == 0) tap = &g_tls_active_page_c[class_idx];
else tap = &g_tls_active_page_a[class_idx];
HKM_TIME_START(t_alloc_page);
if (alloc_tls_page(class_idx, tap)) {
HKM_TIME_END(HKM_CAT_POOL_ALLOC_TLS_PAGE, t_alloc_page);
pthread_mutex_unlock(lock);
// Top-up ring and return
ap = tap;
if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) {
int need = POOL_L2_RING_CAP - ring->top;
(void)refill_tls_from_active_page(class_idx, ring, &g_tls_bin[class_idx], ap, need);
}
PoolBlock* takeb = NULL;
if (ring->top > 0) { HKM_TIME_START(t_ring_pop2); takeb = ring->items[--ring->top]; HKM_TIME_END(HKM_CAT_POOL_TLS_RING_POP, t_ring_pop2);}
else if (ap->page && ap->count > 0 && ap->bump < ap->end) { takeb = (PoolBlock*)(void*)ap->bump; ap->bump += (HEADER_SIZE + g_class_sizes[class_idx]); ap->count--; if (ap->bump >= ap->end || ap->count==0){ ap->page=NULL; ap->count=0; } }
void* raw2 = (void*)takeb; AllocHeader* hdr2 = (AllocHeader*)raw2;
mid_set_header(hdr2, g_class_sizes[class_idx], site_id);
mid_page_inuse_inc(raw2);
g_pool.hits[class_idx]++;
return (char*)raw2 + HEADER_SIZE;
}
HKM_TIME_START(t_refill);
struct timespec ts_rf; int rf = hkm_prof_begin(&ts_rf);
int ok = refill_freelist(class_idx, shard_idx);
HKM_TIME_END(HKM_CAT_POOL_REFILL, t_refill);
hkm_prof_end(rf, HKP_POOL_REFILL, &ts_rf);
if (!ok) {
t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5;
if ((t_pool_rng & ((1u<<g_count_sample_exp)-1u)) == 0u) g_pool.misses[class_idx]++;
pthread_mutex_unlock(lock);
return NULL;
}
}
}
// Pop block and adopt page
g_pool.freelist[class_idx][shard_idx] = block->next;
mid_desc_adopt(block, class_idx, (uint64_t)(uintptr_t)pthread_self());
t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5;
if ((t_pool_rng & ((1u<<g_count_sample_exp)-1u)) == 0u) g_pool.hits[class_idx]++;
if (g_pool.freelist[class_idx][shard_idx] == NULL) clear_nonempty_bit(class_idx, shard_idx);
pthread_mutex_unlock(lock);
// Store to TLS then pop
PoolBlock* take;
if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) { ring->items[ring->top++] = block; take = ring->items[--ring->top]; }
else { block->next = g_tls_bin[class_idx].lo_head; g_tls_bin[class_idx].lo_head = block; g_tls_bin[class_idx].lo_count++;
if (g_tls_ring_enabled && ring->top > 0) { take = ring->items[--ring->top]; }
else { take = g_tls_bin[class_idx].lo_head; g_tls_bin[class_idx].lo_head = take->next; if (g_tls_bin[class_idx].lo_count) g_tls_bin[class_idx].lo_count--; } }
void* raw = (void*)take; AllocHeader* hdr = (AllocHeader*)raw;
mid_set_header(hdr, g_class_sizes[class_idx], site_id);
mid_page_inuse_inc(raw);
return (char*)raw + HEADER_SIZE;
}
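// hak_pool_free: return a pool block to the allocator (MF2 frees are routed to mf2_free()).
// Same-thread frees go to the TLS ring or local LIFO, spilling half of the LIFO to the remote
// stack when it exceeds g_tls_lo_max. Cross-thread frees prefer the owner thread's transfer
// cache and otherwise push onto the lock-free remote stack of the (class, shard). With TLS
// freeing disabled, the block is pushed onto the shard freelist under its lock.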
void hak_pool_free(void* ptr, size_t size, uintptr_t site_id) {
if (!ptr) return;
hak_pool_init();
if (!hak_pool_is_poolable(size)) return;
if (g_mf2_enabled) { mf2_free(ptr); return; }
void* raw = (char*)ptr - HEADER_SIZE;
AllocHeader* hdr = (AllocHeader*)raw;
int mid_by_desc = 0; MidPageDesc* d_desc = mid_desc_lookup(ptr);
if (d_desc) mid_by_desc = 1;
if (!mid_by_desc && g_hdr_light_enabled < 2) {
if (hdr->magic != HAKMEM_MAGIC) { MF2_ERROR_LOG("Invalid magic 0x%X in pool_free, expected 0x%X", hdr->magic, HAKMEM_MAGIC); return; }
if (hdr->method != ALLOC_METHOD_POOL) { MF2_ERROR_LOG("Wrong method %d in pool_free, expected POOL (%d)", hdr->method, ALLOC_METHOD_POOL); return; }
}
int class_idx = mid_by_desc ? (int)d_desc->class_idx : hak_pool_get_class_index(size);
if (class_idx < 0) return;
PoolBlock* block = (PoolBlock*)raw;
if (g_pool.tls_free_enabled) {
int same_thread = 0;
if (g_hdr_light_enabled >= 1) {
MidPageDesc* d = mid_desc_lookup(raw);
if (d && d->owner_tid != 0 && d->owner_tid == (uint64_t)(uintptr_t)pthread_self()) same_thread = 1;
}
else if (hdr->owner_tid != 0 && hdr->owner_tid == (uintptr_t)pthread_self()) { same_thread = 1; }
if (same_thread) {
PoolTLSRing* ring = &g_tls_bin[class_idx].ring;
if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) { ring->items[ring->top++] = block; }
else {
// Push onto the TLS LIFO; once it grows past g_tls_lo_max, spill half of it to the remote stack
block->next = g_tls_bin[class_idx].lo_head; g_tls_bin[class_idx].lo_head = block; g_tls_bin[class_idx].lo_count++;
if ((int)g_tls_bin[class_idx].lo_count > g_tls_lo_max) {
size_t spill = g_tls_bin[class_idx].lo_count / 2;
int shard = hak_pool_get_shard_index(site_id);
while (spill-- && g_tls_bin[class_idx].lo_head) {
PoolBlock* b = g_tls_bin[class_idx].lo_head;
g_tls_bin[class_idx].lo_head = b->next; g_tls_bin[class_idx].lo_count--;
HKM_TIME_START(t_remote_push1);
uintptr_t old_head;
do {
old_head = atomic_load_explicit(&g_pool.remote_head[class_idx][shard], memory_order_acquire);
b->next = (PoolBlock*)old_head;
} while (!atomic_compare_exchange_weak_explicit(&g_pool.remote_head[class_idx][shard], &old_head, (uintptr_t)b, memory_order_release, memory_order_relaxed));
atomic_fetch_add_explicit(&g_pool.remote_count[class_idx][shard], 1, memory_order_relaxed);
HKM_TIME_END(HKM_CAT_POOL_REMOTE_PUSH, t_remote_push1);
}
set_nonempty_bit(class_idx, shard);
}
}
} else {
// Cross-thread free: prefer handing the block to the owner thread's transfer cache (TC)
if (g_tc_enabled) {
uint64_t owner_tid = 0;
if (g_hdr_light_enabled < 2) owner_tid = hdr->owner_tid;
if (owner_tid == 0) { MidPageDesc* d = mid_desc_lookup(raw); if (d) owner_tid = d->owner_tid; }
if (owner_tid != 0) {
MidTC* otc = mid_tc_lookup_by_tid(owner_tid);
if (otc) { mid_tc_push(otc, class_idx, block); return; }
}
}
// Otherwise push onto the lock-free remote stack for this (class, shard)
int shard = hak_pool_get_shard_index(site_id);
uintptr_t old_head;
HKM_TIME_START(t_remote_push2);
do {
old_head = atomic_load_explicit(&g_pool.remote_head[class_idx][shard], memory_order_acquire);
block->next = (PoolBlock*)old_head;
} while (!atomic_compare_exchange_weak_explicit(&g_pool.remote_head[class_idx][shard], &old_head, (uintptr_t)block, memory_order_release, memory_order_relaxed));
atomic_fetch_add_explicit(&g_pool.remote_count[class_idx][shard], 1, memory_order_relaxed);
HKM_TIME_END(HKM_CAT_POOL_REMOTE_PUSH, t_remote_push2);
set_nonempty_bit(class_idx, shard);
}
} else {
// TLS free path disabled: push onto the shard freelist under its lock
int shard_idx2 = hak_pool_get_shard_index(site_id);
pthread_mutex_t* lock = &g_pool.freelist_locks[class_idx][shard_idx2].m;
pthread_mutex_lock(lock);
block->next = g_pool.freelist[class_idx][shard_idx2];
g_pool.freelist[class_idx][shard_idx2] = block;
set_nonempty_bit(class_idx, shard_idx2);
pthread_mutex_unlock(lock);
}
t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; if ((t_pool_rng & ((1u<<g_count_sample_exp)-1u)) == 0u) g_pool.frees[class_idx]++;
mid_page_inuse_dec_and_maybe_dn(raw);
}
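// hak_pool_mid_lookup: return 1 and write the class block size to *out_size if `ptr` belongs
// to an L2 pool page (MF2 page or MidPageDesc), otherwise return 0.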
int hak_pool_mid_lookup(void* ptr, size_t* out_size) {
if (g_mf2_enabled) {
MidPage* page = mf2_addr_to_page(ptr);
if (page) {
int c = (int)page->class_idx;
if (c < 0 || c >= POOL_NUM_CLASSES) return 0;
size_t sz = g_class_sizes[c]; if (sz == 0) return 0;
if (out_size) *out_size = sz;
return 1;
}
}
MidPageDesc* d = mid_desc_lookup(ptr);
if (!d) return 0;
int c = (int)d->class_idx;
if (c < 0 || c >= POOL_NUM_CLASSES) return 0;
size_t sz = g_class_sizes[c]; if (sz == 0) return 0;
if (out_size) *out_size = sz;
return 1;
}
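// hak_pool_free_fast: free without a caller-supplied size. The size class is recovered from
// the MF2 page or the page descriptor, then the free is forwarded to mf2_free()/hak_pool_free().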
void hak_pool_free_fast(void* ptr, uintptr_t site_id) {
if (!ptr || !g_pool.initialized) return;
if (g_mf2_enabled) {
MidPage* page = mf2_addr_to_page(ptr);
if (page) { mf2_free(ptr); return; }
}
MidPageDesc* d = mid_desc_lookup(ptr);
if (!d) return;
size_t sz = g_class_sizes[(int)d->class_idx];
if (sz == 0) return;
hak_pool_free(ptr, sz, site_id);
}
#endif // POOL_API_INC_H