Phase: Pool API Modularization - Step 3: Extract pool_free_v1_box.h

Extracted pool v1 free implementation into a separate box module: - hak_pool_free_v1_fast_impl(): L1-FastBox (TLS-only path, no mid_desc_lookup) - hak_pool_free_v1_slow_impl(): L1-SlowBox (full impl with lookup) - hak_pool_free_v1_impl(): L0-SplitBox (fast predicate router) Benefits: - Reduced pool_api.inc.h from ~950 to ~840 lines - Clear separation of concerns (fast vs slow paths) - Enables future phase extensions (e.g., POOL-MID-DN-BATCH) - Maintains zero-cost abstraction (all inline) Testing: - Build: ✓ (no errors) - Benchmark: ✓ (7.99M ops/s, consistent with baseline) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-12 21:46:26 +09:00
parent b01c99f209
commit a92f3e52c3
2 changed files with 216 additions and 175 deletions
--- a/core/box/pool_api.inc.h
+++ b/core/box/pool_api.inc.h
@ -9,6 +9,7 @@
 #include "box/pool_config_box.h"      // Pool configuration & ENV gates
 #include "box/pool_stats_box.h"       // Pool statistics & monitoring
 #include "box/pool_mid_desc_cache_box.h"  // Mid descriptor TLS cache
 #include "box/pool_free_v1_box.h"     // Pool v1 free implementation (L0-SplitBox + L1-FastBox/SlowBox)
 #include <stdint.h>
 // Thin helper to keep the hot path straight-line when converting a PoolBlock to
@ -736,181 +737,6 @@ static inline void* hak_pool_try_alloc_v1_impl(size_t size, uintptr_t site_id) {
    return user4;
 }
 // ============================================================================
 // Phase POOL-FREE-V1-OPT: L1-FastBox (same-thread TLS free, no mid_desc_lookup)
 // ============================================================================
 // Precondition: fast predicate already verified by caller
 //   - g_pool.tls_free_enabled == true
 //   - g_hdr_light_enabled == 0 (header owner_tid trusted)
 //   - hdr->owner_tid == self (same-thread)
 // Effect: Skips 2x mid_desc_lookup calls vs slow path
 static inline void hak_pool_free_v1_fast_impl(void* raw, int class_idx, uintptr_t site_id) {
    PoolBlock* block = (PoolBlock*)raw;
    // Same-thread TLS free path (ring → lo_head → spill)
    PoolTLSRing* ring = &g_tls_bin[class_idx].ring;
    if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) {
        ring->items[ring->top++] = block;
    } else {
        block->next = g_tls_bin[class_idx].lo_head;
        g_tls_bin[class_idx].lo_head = block;
        g_tls_bin[class_idx].lo_count++;
        if ((int)g_tls_bin[class_idx].lo_count > g_tls_lo_max) {
            size_t spill = g_tls_bin[class_idx].lo_count / 2;
            int shard = hak_pool_get_shard_index(site_id);
            while (spill-- && g_tls_bin[class_idx].lo_head) {
                PoolBlock* b = g_tls_bin[class_idx].lo_head;
                g_tls_bin[class_idx].lo_head = b->next;
                g_tls_bin[class_idx].lo_count--;
                HKM_TIME_START(t_remote_push1);
                uintptr_t old_head;
                do {
                    old_head = atomic_load_explicit(&g_pool.remote_head[class_idx][shard], memory_order_acquire);
                    b->next = (PoolBlock*)old_head;
                } while (!atomic_compare_exchange_weak_explicit(&g_pool.remote_head[class_idx][shard],
                                                                 &old_head, (uintptr_t)b,
                                                                 memory_order_release, memory_order_relaxed));
                atomic_fetch_add_explicit(&g_pool.remote_count[class_idx][shard], 1, memory_order_relaxed);
                HKM_TIME_END(HKM_CAT_POOL_REMOTE_PUSH, t_remote_push1);
            }
            set_nonempty_bit(class_idx, shard);
        }
    }
    // Common tail: sample counter + inuse dec
    t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5;
    if ((t_pool_rng & ((1u<<g_count_sample_exp)-1u)) == 0u) g_pool.frees[class_idx]++;
    mid_page_inuse_dec_and_maybe_dn(raw);
 }
 // ============================================================================
 // Phase POOL-FREE-V1-OPT: L1-SlowBox (full v1 impl with mid_desc_lookup)
 // ============================================================================
 // General v1 free for user pointer `ptr`.
 //   ptr     - user pointer; the block header sits HEADER_SIZE bytes before it
 //   size    - caller-supplied size; used for the class index only when no
 //             mid descriptor is found for `ptr`
 //   site_id - selects the shard for remote-list / freelist pushes
 // Returns early, without freeing, when header validation fails or the class
 // index comes back negative.
 // NOTE(review): the same-thread branch below repeats the ring/LIFO/spill body
 // of hak_pool_free_v1_fast_impl() statement for statement — consider calling
 // that helper instead of keeping two copies in sync.
 static inline void hak_pool_free_v1_slow_impl(void* ptr, size_t size, uintptr_t site_id) {
    void* raw = (char*)ptr - HEADER_SIZE;
    AllocHeader* hdr = (AllocHeader*)raw;
    // Mid desc lookup for validation and class_idx
    // NOTE(review): this lookup keys on `ptr`, the later ones key on `raw` —
    // confirm both resolve to the same descriptor.
    int mid_by_desc = 0;
    MidPageDesc* d_desc = mid_desc_lookup_cached(ptr);
    if (d_desc) mid_by_desc = 1;
    // Header magic/method are checked only when no descriptor was found and
    // g_hdr_light_enabled < 2; a mismatch is logged and the free is dropped.
    if (!mid_by_desc && g_hdr_light_enabled < 2) {
        if (hdr->magic != HAKMEM_MAGIC) { MF2_ERROR_LOG("Invalid magic 0x%X in pool_free, expected 0x%X", hdr->magic, HAKMEM_MAGIC); return; }
        if (hdr->method != ALLOC_METHOD_POOL) { MF2_ERROR_LOG("Wrong method %d in pool_free, expected POOL (%d)", hdr->method, ALLOC_METHOD_POOL); return; }
    }
    // The descriptor's class wins over the size-derived class when both exist.
    int class_idx = mid_by_desc ? (int)d_desc->class_idx : hak_pool_get_class_index(size);
    if (class_idx < 0) return;
    PoolBlock* block = (PoolBlock*)raw;
    if (g_pool.tls_free_enabled) {
        // Ownership source: the descriptor when g_hdr_light_enabled >= 1,
        // the block header otherwise. An owner_tid of 0 never matches.
        int same_thread = 0;
        if (g_hdr_light_enabled >= 1) {
            MidPageDesc* d = mid_desc_lookup_cached(raw);
            if (d && d->owner_tid != 0 && d->owner_tid == (uint64_t)(uintptr_t)pthread_self()) { same_thread = 1; }
        // NOTE(review): (uintptr_t)(uintptr_t) looks like a typo for the
        // (uint64_t)(uintptr_t) form used in the descriptor comparison — confirm.
        } else if (hdr->owner_tid != 0 && hdr->owner_tid == (uintptr_t)(uintptr_t)pthread_self()) {
            same_thread = 1;
        }
        if (same_thread) {
            // Same-thread free: TLS ring first, then the TLS LIFO list.
            PoolTLSRing* ring = &g_tls_bin[class_idx].ring;
            if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) { ring->items[ring->top++] = block; }
            else {
                block->next = g_tls_bin[class_idx].lo_head;
                g_tls_bin[class_idx].lo_head = block;
                g_tls_bin[class_idx].lo_count++;
                // LIFO list over g_tls_lo_max: CAS-push half of it onto the
                // shared remote list for this class/shard.
                if ((int)g_tls_bin[class_idx].lo_count > g_tls_lo_max) {
                    size_t spill = g_tls_bin[class_idx].lo_count / 2;
                    int shard = hak_pool_get_shard_index(site_id);
                    while (spill-- && g_tls_bin[class_idx].lo_head) {
                        PoolBlock* b = g_tls_bin[class_idx].lo_head;
                        g_tls_bin[class_idx].lo_head = b->next;
                        g_tls_bin[class_idx].lo_count--;
                        HKM_TIME_START(t_remote_push1);
                        uintptr_t old_head;
                        // The head is re-read on every CAS attempt.
                        do {
                            old_head = atomic_load_explicit(&g_pool.remote_head[class_idx][shard], memory_order_acquire);
                            b->next = (PoolBlock*)old_head;
                        } while (!atomic_compare_exchange_weak_explicit(&g_pool.remote_head[class_idx][shard],
                                                                         &old_head, (uintptr_t)b,
                                                                         memory_order_release, memory_order_relaxed));
                        atomic_fetch_add_explicit(&g_pool.remote_count[class_idx][shard], 1, memory_order_relaxed);
                        HKM_TIME_END(HKM_CAT_POOL_REMOTE_PUSH, t_remote_push1);
                    }
                    set_nonempty_bit(class_idx, shard);
                }
            }
        } else {
            // Not proven same-thread. With g_tc_enabled, resolve the owner tid
            // (header first unless g_hdr_light_enabled >= 2, then descriptor)
            // and push to that owner's MidTC if one is found; that path skips
            // the remote push and jumps straight to the shared tail.
            if (g_tc_enabled) {
                uint64_t owner_tid = 0;
                if (g_hdr_light_enabled < 2) owner_tid = hdr->owner_tid;
                if (owner_tid == 0) { MidPageDesc* d = mid_desc_lookup_cached(raw); if (d) owner_tid = d->owner_tid; }
                if (owner_tid != 0) { MidTC* otc = mid_tc_lookup_by_tid(owner_tid); if (otc) { mid_tc_push(otc, class_idx, block); goto tail; } }
            }
            // Otherwise CAS-push the single block onto the shared remote list.
            int shard = hak_pool_get_shard_index(site_id);
            uintptr_t old_head;
            HKM_TIME_START(t_remote_push2);
            do {
                old_head = atomic_load_explicit(&g_pool.remote_head[class_idx][shard], memory_order_acquire);
                block->next = (PoolBlock*)old_head;
            } while (!atomic_compare_exchange_weak_explicit(&g_pool.remote_head[class_idx][shard], &old_head, (uintptr_t)block, memory_order_release, memory_order_relaxed));
            atomic_fetch_add_explicit(&g_pool.remote_count[class_idx][shard], 1, memory_order_relaxed);
            HKM_TIME_END(HKM_CAT_POOL_REMOTE_PUSH, t_remote_push2);
            set_nonempty_bit(class_idx, shard);
        }
    } else {
        // TLS free disabled: push onto the shard freelist while holding its mutex.
        int shard_idx2 = hak_pool_get_shard_index(site_id);
        pthread_mutex_t* lock = &g_pool.freelist_locks[class_idx][shard_idx2].m;
        pthread_mutex_lock(lock);
        block->next = g_pool.freelist[class_idx][shard_idx2];
        g_pool.freelist[class_idx][shard_idx2] = block;
        set_nonempty_bit(class_idx, shard_idx2);
        pthread_mutex_unlock(lock);
    }
 // Shared tail for every path that actually released the block: step the
 // t_pool_rng xorshift state, bump the sampled free counter when its low
 // g_count_sample_exp bits are zero, then call the page accounting helper.
 tail:
    t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5;
    if ((t_pool_rng & ((1u<<g_count_sample_exp)-1u)) == 0u) g_pool.frees[class_idx]++;
    mid_page_inuse_dec_and_maybe_dn(raw);
 }
 // ============================================================================
 // Phase POOL-FREE-V1-OPT: L0-SplitBox (routes to fast or slow)
 // ============================================================================
 // Entry point for a v1 pool free of user pointer `ptr` with caller-supplied
 // `size`; `site_id` is forwarded for shard selection.
 //
 // Does nothing for a null `ptr` or a `size` that hak_pool_is_poolable()
 // rejects, and delegates entirely to mf2_free() when g_mf2_enabled is set.
 // Otherwise the block goes to the L1-FastBox when the split is enabled, TLS
 // free is on, headers are full (g_hdr_light_enabled == 0), the header names
 // this thread as owner and `size` maps to a valid class. Every other case
 // goes to the L1-SlowBox.
 static inline void hak_pool_free_v1_impl(void* ptr, size_t size, uintptr_t site_id) {
    if (!ptr) {
        return;
    }
    hak_pool_init();
    if (!hak_pool_is_poolable(size)) {
        return;
    }
    if (g_mf2_enabled) {
        mf2_free(ptr);
        return;
    }

    const int split_active = hak_pool_v1_free_fastsplit_enabled()
                             && g_pool.tls_free_enabled
                             && g_hdr_light_enabled == 0;
    if (split_active) {
        void* raw = (char*)ptr - HEADER_SIZE;
        AllocHeader* hdr = (AllocHeader*)raw;
        uint64_t self = (uint64_t)(uintptr_t)pthread_self();

        // Stays negative unless the header identifies a pool block owned by
        // this thread; only then is the class derived from `size`.
        int class_idx = -1;
        if (hdr->magic == HAKMEM_MAGIC && hdr->method == ALLOC_METHOD_POOL) {
            if (hdr->owner_tid != 0 && hdr->owner_tid == self) {
                class_idx = hak_pool_get_class_index(size);
            }
        }

        if (class_idx >= 0) {
            if (__builtin_expect(hak_pool_v1_flatten_stats_enabled(), 0)) {
                atomic_fetch_add_explicit(&g_pool_v1_flat_stats.fastsplit_fast_hit, 1, memory_order_relaxed);
            }
            hak_pool_free_v1_fast_impl(raw, class_idx, site_id);
            return;
        }

        // Split was eligible but the predicate (or class lookup) failed.
        if (__builtin_expect(hak_pool_v1_flatten_stats_enabled(), 0)) {
            atomic_fetch_add_explicit(&g_pool_v1_flat_stats.fastsplit_slow_hit, 1, memory_order_relaxed);
        }
    }

    hak_pool_free_v1_slow_impl(ptr, size, site_id);
 }
 // --- v1 flatten (opt-in) ----------------------------------------------------
 static inline void* hak_pool_try_alloc_v1_flat(size_t size, uintptr_t site_id) {
--- a/core/box/pool_free_v1_box.h
+++ b/core/box/pool_free_v1_box.h
@ -0,0 +1,215 @@
 // pool_free_v1_box.h — Box: Pool V1 Free Implementation (Fast/Slow Split)
 //
 // Purpose: Pool v1 free path with L0-SplitBox + L1-FastBox/SlowBox
 // Pattern: Header-based predicate for same-thread fast path
 // Phase: POOL-FREE-V1-OPT Steps 1-2 (reject stats + fast split)
 // Dependencies: Assumes pool_api.inc.h includes this after hakmem_internal.h
 //               (provides AllocHeader, PoolBlock, PoolTLSRing, g_pool, etc.)
 #ifndef POOL_FREE_V1_BOX_H
 #define POOL_FREE_V1_BOX_H
 #include "pool_config_box.h"      // For hak_pool_v1_free_fastsplit_enabled, etc
 #include "pool_stats_box.h"       // For g_pool_v1_flat_stats
 #include "pool_mid_desc_cache_box.h"  // For mid_desc_lookup_cached
 #include <stdint.h>
 #include <stdatomic.h>
 // Forward declaration only (full definitions available from hakmem_internal.h).
 // The slow path below reads d->class_idx and d->owner_tid, so the complete
 // struct definition must already be visible where this header is included.
 struct MidPageDesc;
 typedef struct MidPageDesc MidPageDesc;
 // External functions called by the free paths in this header; declared here,
 // defined elsewhere.
 // NOTE(review): size_t appears in these prototypes but <stddef.h> is not
 // included directly — presumably an earlier include supplies it; confirm.
 extern void hak_pool_init(void);
 extern int hak_pool_is_poolable(size_t size);
 extern int hak_pool_get_class_index(size_t size);
 extern int hak_pool_get_shard_index(uintptr_t site_id);
 extern void set_nonempty_bit(int class_idx, int shard);
 extern void mid_page_inuse_dec_and_maybe_dn(void* raw);
 extern void mf2_free(void* ptr);
 // Assumed available from caller includes:
 // - AllocHeader (from hakmem_internal.h)
 // - PoolBlock (from pool_tls_types.inc.h or hakmem_pool.c)
 // - PoolTLSRing (from pool_tls_types.inc.h)
 // - g_pool, g_tls_bin, g_pool_v1_flat_stats (from hakmem_pool.c)
 // - g_tls_ring_enabled, g_tls_lo_max, g_hdr_light_enabled, g_mf2_enabled, t_pool_rng
 // - g_count_sample_exp, g_tc_enabled, MidTC, mid_tc_lookup_by_tid, mid_tc_push
 // - HKM_TIME_START / HKM_TIME_END, HKM_CAT_POOL_REMOTE_PUSH, MF2_ERROR_LOG
 // - pthread_self, pthread_mutex_t, pthread_mutex_lock / pthread_mutex_unlock
 // - HEADER_SIZE, ALLOC_METHOD_POOL, POOL_L2_RING_CAP, HAKMEM_MAGIC
 // ============================================================================
 // Phase POOL-FREE-V1-OPT: L1-FastBox (same-thread TLS free, no mid_desc_lookup)
 // ============================================================================
 // Precondition: fast predicate already verified by caller
 //   - g_pool.tls_free_enabled == true
 //   - g_hdr_light_enabled == 0 (header owner_tid trusted)
 //   - hdr->owner_tid == self (same-thread)
 // Effect: Skips 2x mid_desc_lookup calls vs slow path
 static inline void hak_pool_free_v1_fast_impl(void* raw, int class_idx, uintptr_t site_id) {
    PoolBlock* block = (PoolBlock*)raw;
    // Same-thread TLS free path (ring → lo_head → spill)
    PoolTLSRing* ring = &g_tls_bin[class_idx].ring;
    if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) {
        ring->items[ring->top++] = block;
    } else {
        block->next = g_tls_bin[class_idx].lo_head;
        g_tls_bin[class_idx].lo_head = block;
        g_tls_bin[class_idx].lo_count++;
        if ((int)g_tls_bin[class_idx].lo_count > g_tls_lo_max) {
            size_t spill = g_tls_bin[class_idx].lo_count / 2;
            int shard = hak_pool_get_shard_index(site_id);
            while (spill-- && g_tls_bin[class_idx].lo_head) {
                PoolBlock* b = g_tls_bin[class_idx].lo_head;
                g_tls_bin[class_idx].lo_head = b->next;
                g_tls_bin[class_idx].lo_count--;
                HKM_TIME_START(t_remote_push1);
                uintptr_t old_head;
                do {
                    old_head = atomic_load_explicit(&g_pool.remote_head[class_idx][shard], memory_order_acquire);
                    b->next = (PoolBlock*)old_head;
                } while (!atomic_compare_exchange_weak_explicit(&g_pool.remote_head[class_idx][shard],
                                                                 &old_head, (uintptr_t)b,
                                                                 memory_order_release, memory_order_relaxed));
                atomic_fetch_add_explicit(&g_pool.remote_count[class_idx][shard], 1, memory_order_relaxed);
                HKM_TIME_END(HKM_CAT_POOL_REMOTE_PUSH, t_remote_push1);
            }
            set_nonempty_bit(class_idx, shard);
        }
    }
    // Common tail: sample counter + inuse dec
    t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5;
    if ((t_pool_rng & ((1u<<g_count_sample_exp)-1u)) == 0u) g_pool.frees[class_idx]++;
    mid_page_inuse_dec_and_maybe_dn(raw);
 }
 // ============================================================================
 // Phase POOL-FREE-V1-OPT: L1-SlowBox (full v1 impl with mid_desc_lookup)
 // ============================================================================
 // General v1 free for user pointer `ptr`.
 //   ptr     - user pointer; the block header sits HEADER_SIZE bytes before it
 //   size    - caller-supplied size; used for the class index only when no
 //             mid descriptor is found for `ptr`
 //   site_id - selects the shard for remote-list / freelist pushes
 // Returns early, without freeing, when header validation fails or the class
 // index comes back negative.
 // NOTE(review): the same-thread branch below repeats the ring/LIFO/spill body
 // of hak_pool_free_v1_fast_impl() statement for statement — consider calling
 // that helper instead of keeping two copies in sync.
 static inline void hak_pool_free_v1_slow_impl(void* ptr, size_t size, uintptr_t site_id) {
    void* raw = (char*)ptr - HEADER_SIZE;
    AllocHeader* hdr = (AllocHeader*)raw;
    // Mid desc lookup for validation and class_idx
    // NOTE(review): this lookup keys on `ptr`, the later ones key on `raw` —
    // confirm both resolve to the same descriptor.
    int mid_by_desc = 0;
    MidPageDesc* d_desc = mid_desc_lookup_cached(ptr);
    if (d_desc) mid_by_desc = 1;
    // Header magic/method are checked only when no descriptor was found and
    // g_hdr_light_enabled < 2; a mismatch is logged and the free is dropped.
    if (!mid_by_desc && g_hdr_light_enabled < 2) {
        if (hdr->magic != HAKMEM_MAGIC) { MF2_ERROR_LOG("Invalid magic 0x%X in pool_free, expected 0x%X", hdr->magic, HAKMEM_MAGIC); return; }
        if (hdr->method != ALLOC_METHOD_POOL) { MF2_ERROR_LOG("Wrong method %d in pool_free, expected POOL (%d)", hdr->method, ALLOC_METHOD_POOL); return; }
    }
    // The descriptor's class wins over the size-derived class when both exist.
    int class_idx = mid_by_desc ? (int)d_desc->class_idx : hak_pool_get_class_index(size);
    if (class_idx < 0) return;
    PoolBlock* block = (PoolBlock*)raw;
    if (g_pool.tls_free_enabled) {
        // Ownership source: the descriptor when g_hdr_light_enabled >= 1,
        // the block header otherwise. An owner_tid of 0 never matches.
        int same_thread = 0;
        if (g_hdr_light_enabled >= 1) {
            MidPageDesc* d = mid_desc_lookup_cached(raw);
            if (d && d->owner_tid != 0 && d->owner_tid == (uint64_t)(uintptr_t)pthread_self()) { same_thread = 1; }
        // NOTE(review): (uintptr_t)(uintptr_t) looks like a typo for the
        // (uint64_t)(uintptr_t) form used in the descriptor comparison — confirm.
        } else if (hdr->owner_tid != 0 && hdr->owner_tid == (uintptr_t)(uintptr_t)pthread_self()) {
            same_thread = 1;
        }
        if (same_thread) {
            // Same-thread free: TLS ring first, then the TLS LIFO list.
            PoolTLSRing* ring = &g_tls_bin[class_idx].ring;
            if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) { ring->items[ring->top++] = block; }
            else {
                block->next = g_tls_bin[class_idx].lo_head;
                g_tls_bin[class_idx].lo_head = block;
                g_tls_bin[class_idx].lo_count++;
                // LIFO list over g_tls_lo_max: CAS-push half of it onto the
                // shared remote list for this class/shard.
                if ((int)g_tls_bin[class_idx].lo_count > g_tls_lo_max) {
                    size_t spill = g_tls_bin[class_idx].lo_count / 2;
                    int shard = hak_pool_get_shard_index(site_id);
                    while (spill-- && g_tls_bin[class_idx].lo_head) {
                        PoolBlock* b = g_tls_bin[class_idx].lo_head;
                        g_tls_bin[class_idx].lo_head = b->next;
                        g_tls_bin[class_idx].lo_count--;
                        HKM_TIME_START(t_remote_push1);
                        uintptr_t old_head;
                        // The head is re-read on every CAS attempt.
                        do {
                            old_head = atomic_load_explicit(&g_pool.remote_head[class_idx][shard], memory_order_acquire);
                            b->next = (PoolBlock*)old_head;
                        } while (!atomic_compare_exchange_weak_explicit(&g_pool.remote_head[class_idx][shard],
                                                                         &old_head, (uintptr_t)b,
                                                                         memory_order_release, memory_order_relaxed));
                        atomic_fetch_add_explicit(&g_pool.remote_count[class_idx][shard], 1, memory_order_relaxed);
                        HKM_TIME_END(HKM_CAT_POOL_REMOTE_PUSH, t_remote_push1);
                    }
                    set_nonempty_bit(class_idx, shard);
                }
            }
        } else {
            // Not proven same-thread. With g_tc_enabled, resolve the owner tid
            // (header first unless g_hdr_light_enabled >= 2, then descriptor)
            // and push to that owner's MidTC if one is found; that path skips
            // the remote push and jumps straight to the shared tail.
            if (g_tc_enabled) {
                uint64_t owner_tid = 0;
                if (g_hdr_light_enabled < 2) owner_tid = hdr->owner_tid;
                if (owner_tid == 0) { MidPageDesc* d = mid_desc_lookup_cached(raw); if (d) owner_tid = d->owner_tid; }
                if (owner_tid != 0) { MidTC* otc = mid_tc_lookup_by_tid(owner_tid); if (otc) { mid_tc_push(otc, class_idx, block); goto tail; } }
            }
            // Otherwise CAS-push the single block onto the shared remote list.
            int shard = hak_pool_get_shard_index(site_id);
            uintptr_t old_head;
            HKM_TIME_START(t_remote_push2);
            do {
                old_head = atomic_load_explicit(&g_pool.remote_head[class_idx][shard], memory_order_acquire);
                block->next = (PoolBlock*)old_head;
            } while (!atomic_compare_exchange_weak_explicit(&g_pool.remote_head[class_idx][shard], &old_head, (uintptr_t)block, memory_order_release, memory_order_relaxed));
            atomic_fetch_add_explicit(&g_pool.remote_count[class_idx][shard], 1, memory_order_relaxed);
            HKM_TIME_END(HKM_CAT_POOL_REMOTE_PUSH, t_remote_push2);
            set_nonempty_bit(class_idx, shard);
        }
    } else {
        // TLS free disabled: push onto the shard freelist while holding its mutex.
        int shard_idx2 = hak_pool_get_shard_index(site_id);
        pthread_mutex_t* lock = &g_pool.freelist_locks[class_idx][shard_idx2].m;
        pthread_mutex_lock(lock);
        block->next = g_pool.freelist[class_idx][shard_idx2];
        g_pool.freelist[class_idx][shard_idx2] = block;
        set_nonempty_bit(class_idx, shard_idx2);
        pthread_mutex_unlock(lock);
    }
 // Shared tail for every path that actually released the block: step the
 // t_pool_rng xorshift state, bump the sampled free counter when its low
 // g_count_sample_exp bits are zero, then call the page accounting helper.
 tail:
    t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5;
    if ((t_pool_rng & ((1u<<g_count_sample_exp)-1u)) == 0u) g_pool.frees[class_idx]++;
    mid_page_inuse_dec_and_maybe_dn(raw);
 }
 // ============================================================================
 // Phase POOL-FREE-V1-OPT: L0-SplitBox (routes to fast or slow)
 // ============================================================================
 // Entry point for a v1 pool free of user pointer `ptr` with caller-supplied
 // `size`; `site_id` is forwarded for shard selection.
 //
 // Does nothing for a null `ptr` or a `size` that hak_pool_is_poolable()
 // rejects, and delegates entirely to mf2_free() when g_mf2_enabled is set.
 // Otherwise the block goes to the L1-FastBox when the split is enabled, TLS
 // free is on, headers are full (g_hdr_light_enabled == 0), the header names
 // this thread as owner and `size` maps to a valid class. Every other case
 // goes to the L1-SlowBox.
 static inline void hak_pool_free_v1_impl(void* ptr, size_t size, uintptr_t site_id) {
    if (!ptr) {
        return;
    }
    hak_pool_init();
    if (!hak_pool_is_poolable(size)) {
        return;
    }
    if (g_mf2_enabled) {
        mf2_free(ptr);
        return;
    }

    const int split_active = hak_pool_v1_free_fastsplit_enabled()
                             && g_pool.tls_free_enabled
                             && g_hdr_light_enabled == 0;
    if (split_active) {
        void* raw = (char*)ptr - HEADER_SIZE;
        AllocHeader* hdr = (AllocHeader*)raw;
        uint64_t self = (uint64_t)(uintptr_t)pthread_self();

        // Stays negative unless the header identifies a pool block owned by
        // this thread; only then is the class derived from `size`.
        int class_idx = -1;
        if (hdr->magic == HAKMEM_MAGIC && hdr->method == ALLOC_METHOD_POOL) {
            if (hdr->owner_tid != 0 && hdr->owner_tid == self) {
                class_idx = hak_pool_get_class_index(size);
            }
        }

        if (class_idx >= 0) {
            if (__builtin_expect(hak_pool_v1_flatten_stats_enabled(), 0)) {
                atomic_fetch_add_explicit(&g_pool_v1_flat_stats.fastsplit_fast_hit, 1, memory_order_relaxed);
            }
            hak_pool_free_v1_fast_impl(raw, class_idx, site_id);
            return;
        }

        // Split was eligible but the predicate (or class lookup) failed.
        if (__builtin_expect(hak_pool_v1_flatten_stats_enabled(), 0)) {
            atomic_fetch_add_explicit(&g_pool_v1_flat_stats.fastsplit_slow_hit, 1, memory_order_relaxed);
        }
    }

    hak_pool_free_v1_slow_impl(ptr, size, site_id);
 }
 #endif  // POOL_FREE_V1_BOX_H