diff --git a/core/box/pool_api.inc.h b/core/box/pool_api.inc.h
index 35692e90..485423f1 100644
--- a/core/box/pool_api.inc.h
+++ b/core/box/pool_api.inc.h
@@ -75,6 +75,19 @@ static inline int hak_pool_free_v1_reject_stats_enabled(void) {
     return g;
 }
 
+// Phase POOL-FREE-V1-OPT Step 2: Fast/Slow split for v1 free
+// When enabled, same-thread TLS free skips mid_desc_lookup (1 lookup → 0)
+// Requires g_hdr_light_enabled == 0 for header-based owner_tid
+// Default OFF for safety
+static inline int hak_pool_v1_free_fastsplit_enabled(void) {
+    static int g = -1;
+    if (__builtin_expect(g == -1, 0)) {
+        const char* e = getenv("HAKMEM_POOL_V1_FREE_FASTSPLIT");
+        g = (e && *e == '1') ? 1 : 0;  // default OFF
+    }
+    return g;
+}
+
 // Mid desc lookup TLS cache (mid bench opt-in; default OFF)
 static inline int hak_mid_desc_cache_enabled(void) {
     static int g = -1;
@@ -119,6 +132,10 @@ typedef struct PoolV1FlattenStats {
     // Phase POOL-FREE-V1-OPT Step 1: v2 reject reasons
     _Atomic uint64_t v2_reject_total;     // Total v2 free rejects (fell through to v1)
     _Atomic uint64_t v2_reject_ptr_null;  // ptr == NULL
+
+    // Phase POOL-FREE-V1-OPT Step 2: fast split stats
+    _Atomic uint64_t fastsplit_fast_hit;  // Fast path taken
+    _Atomic uint64_t fastsplit_slow_hit;  // Slow path taken (fast predicate failed)
     _Atomic uint64_t v2_reject_not_init;  // pool not initialized
     _Atomic uint64_t v2_reject_desc_null; // mid_desc_lookup returned NULL
     _Atomic uint64_t v2_reject_mf2_null;  // MF2 path but mf2_addr_to_page returned NULL
@@ -161,6 +178,15 @@ static inline void pool_v1_flat_stats_dump(void) {
                 (unsigned long long)atomic_load_explicit(&g_pool_v1_flat_stats.v2_reject_mf2_null,
                                                          memory_order_relaxed));
     }
+    // Phase POOL-FREE-V1-OPT Step 2: fastsplit stats
+    if (hak_pool_v1_flatten_stats_enabled() && hak_pool_v1_free_fastsplit_enabled()) {
+        fprintf(stderr,
+                "[POOL_V1_FASTSPLIT] fast_hit=%llu slow_hit=%llu\n",
+                (unsigned long long)atomic_load_explicit(&g_pool_v1_flat_stats.fastsplit_fast_hit,
+                                                         memory_order_relaxed),
+                (unsigned long long)atomic_load_explicit(&g_pool_v1_flat_stats.fastsplit_slow_hit,
+                                                         memory_order_relaxed));
+    }
 }
 
 __attribute__((destructor)) static void pool_v1_flatten_stats_destructor(void) {
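The new toggle reuses the lazily-cached getenv() pattern that the neighboring toggles (hak_pool_free_v1_reject_stats_enabled, hak_mid_desc_cache_enabled) already follow. A minimal standalone sketch of the pattern, with a hypothetical DEMO_FLAG standing in for HAKMEM_POOL_V1_FREE_FASTSPLIT:

#include <stdlib.h>

/* Lazily-cached env toggle: getenv() runs once per process; afterwards the
 * check is a branch-predictable load-and-compare. -1 means "not yet
 * resolved". */
static inline int demo_flag_enabled(void) {
    static int g = -1;
    if (__builtin_expect(g == -1, 0)) {
        const char* e = getenv("DEMO_FLAG");
        g = (e && *e == '1') ? 1 : 0;  /* default OFF */
    }
    return g;
}

Strictly speaking, two threads resolving g concurrently is a data race under C11, but both writers store the same value and the flag never changes afterwards, so the file-wide convention treats it as benign.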
@@ -892,16 +918,63 @@ static inline void* hak_pool_try_alloc_v1_impl(size_t size, uintptr_t site_id) {
     return user4;
 }
 
-static inline void hak_pool_free_v1_impl(void* ptr, size_t size, uintptr_t site_id) {
-    if (!ptr) return;
-    hak_pool_init();
-    if (!hak_pool_is_poolable(size)) return;
+// ============================================================================
+// Phase POOL-FREE-V1-OPT: L1-FastBox (same-thread TLS free, no mid_desc_lookup)
+// ============================================================================
+// Precondition: fast predicate already verified by caller
+//   - g_pool.tls_free_enabled == true
+//   - g_hdr_light_enabled == 0 (header owner_tid trusted)
+//   - hdr->owner_tid == self (same-thread)
+// Effect: Skips 2x mid_desc_lookup calls vs slow path
+static inline void hak_pool_free_v1_fast_impl(void* raw, int class_idx, uintptr_t site_id) {
+    PoolBlock* block = (PoolBlock*)raw;
 
-    if (g_mf2_enabled) { mf2_free(ptr); return; }
+    // Same-thread TLS free path (ring → lo_head → spill)
+    PoolTLSRing* ring = &g_tls_bin[class_idx].ring;
+    if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) {
+        ring->items[ring->top++] = block;
+    } else {
+        block->next = g_tls_bin[class_idx].lo_head;
+        g_tls_bin[class_idx].lo_head = block;
+        g_tls_bin[class_idx].lo_count++;
+        if ((int)g_tls_bin[class_idx].lo_count > g_tls_lo_max) {
+            size_t spill = g_tls_bin[class_idx].lo_count / 2;
+            int shard = hak_pool_get_shard_index(site_id);
+            while (spill-- && g_tls_bin[class_idx].lo_head) {
+                PoolBlock* b = g_tls_bin[class_idx].lo_head;
+                g_tls_bin[class_idx].lo_head = b->next;
+                g_tls_bin[class_idx].lo_count--;
+                HKM_TIME_START(t_remote_push1);
+                uintptr_t old_head;
+                do {
+                    old_head = atomic_load_explicit(&g_pool.remote_head[class_idx][shard], memory_order_acquire);
+                    b->next = (PoolBlock*)old_head;
+                } while (!atomic_compare_exchange_weak_explicit(&g_pool.remote_head[class_idx][shard],
+                                                                &old_head, (uintptr_t)b,
+                                                                memory_order_release, memory_order_relaxed));
+                atomic_fetch_add_explicit(&g_pool.remote_count[class_idx][shard], 1, memory_order_relaxed);
+                HKM_TIME_END(HKM_CAT_POOL_REMOTE_PUSH, t_remote_push1);
+            }
+            set_nonempty_bit(class_idx, shard);
+        }
+    }
+    // Common tail: sample counter + inuse dec
+    t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5;
+    if ((t_pool_rng & ((1u << POOL_FREE_SAMPLE_SHIFT) - 1)) == 0) pool_v1_free_sample_tick(class_idx);
+}
+
+// Slow path: the original hak_pool_free_v1_impl body (mid_desc_lookup based)
+static inline void hak_pool_free_v1_slow_impl(void* ptr, size_t size, uintptr_t site_id) {
     void* raw = (char*)ptr - sizeof(AllocHeader);
     AllocHeader* hdr = (AllocHeader*)raw;
     if (hdr->magic != HAKMEM_MAGIC) {
         MF2_ERROR_LOG("Invalid magic 0x%X in pool_free, expected 0x%X", hdr->magic, HAKMEM_MAGIC);
         return;
     }
@@ -909,11 +982,16 @@ static inline void hak_pool_free_v1_impl(void* ptr, size_t size, uintptr_t site_
     }
     int class_idx = mid_by_desc ? (int)d_desc->class_idx : hak_pool_get_class_index(size);
     if (class_idx < 0) return;
+    PoolBlock* block = (PoolBlock*)raw;
     if (g_pool.tls_free_enabled) {
         int same_thread = 0;
-        if (g_hdr_light_enabled >= 1) { MidPageDesc* d = mid_desc_lookup_cached(raw); if (d && d->owner_tid != 0 && d->owner_tid == (uint64_t)(uintptr_t)pthread_self()) { same_thread = 1; } }
-        else if (hdr->owner_tid != 0 && hdr->owner_tid == (uintptr_t)(uintptr_t)pthread_self()) { same_thread = 1; }
+        if (g_hdr_light_enabled >= 1) {
+            MidPageDesc* d = mid_desc_lookup_cached(raw);
+            if (d && d->owner_tid != 0 && d->owner_tid == (uint64_t)(uintptr_t)pthread_self()) { same_thread = 1; }
+        } else if (hdr->owner_tid != 0 && hdr->owner_tid == (uint64_t)(uintptr_t)pthread_self()) {
+            same_thread = 1;
+        }
         if (same_thread) {
             PoolTLSRing* ring = &g_tls_bin[class_idx].ring;
             if (g_tls_ring_enabled && ring->top < POOL_L2_RING_CAP) { ring->items[ring->top++] = block; }
@@ -924,7 +1002,6 @@ static inline void hak_pool_free_v1_impl(void* ptr, size_t size, uintptr_t site_
             if ((int)g_tls_bin[class_idx].lo_count > g_tls_lo_max) {
                 size_t spill = g_tls_bin[class_idx].lo_count / 2;
                 int shard = hak_pool_get_shard_index(site_id);
-                // Spill half of local freelist to remote freelist
                 while (spill-- && g_tls_bin[class_idx].lo_head) {
                     PoolBlock* b = g_tls_bin[class_idx].lo_head;
                     g_tls_bin[class_idx].lo_head = b->next;
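The spill loops above (in the fast impl and in the same-thread slow branch) and the cross-thread push below share one lock-free protocol: a Treiber-stack push onto g_pool.remote_head[class_idx][shard]. A self-contained sketch of that push, with illustrative Node/remote_push names standing in for PoolBlock and the per-class, per-shard head array:

#include <stdatomic.h>
#include <stdint.h>

typedef struct Node { struct Node* next; } Node;

/* Treiber-stack push: link the new node to the observed head, then CAS the
 * head to the new node. Release ordering on success publishes n->next to
 * whoever later acquires the head; relaxed on failure is enough because the
 * loop reloads the head before retrying. */
static void remote_push(_Atomic uintptr_t* head, Node* n) {
    uintptr_t old;
    do {
        old = atomic_load_explicit(head, memory_order_acquire);
        n->next = (Node*)old;
    } while (!atomic_compare_exchange_weak_explicit(
                 head, &old, (uintptr_t)n,
                 memory_order_release, memory_order_relaxed));
}

Producers that only push cannot hit ABA, and a consumer that drains the whole list with a single exchange of the head (rather than popping node by node) sidesteps it as well.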
@@ -944,18 +1021,78 @@ static inline void hak_pool_free_v1_impl(void* ptr, size_t size, uintptr_t site_
             }
         } else {
-            if (g_tc_enabled) { uint64_t owner_tid = 0; if (g_hdr_light_enabled < 2) owner_tid = hdr->owner_tid; if (owner_tid == 0) { MidPageDesc* d = mid_desc_lookup_cached(raw); if (d) owner_tid = d->owner_tid; } if (owner_tid != 0) { MidTC* otc = mid_tc_lookup_by_tid(owner_tid); if (otc) { mid_tc_push(otc, class_idx, block); return; } } }
-            int shard = hak_pool_get_shard_index(site_id); uintptr_t old_head; HKM_TIME_START(t_remote_push2);
-            do { old_head = atomic_load_explicit(&g_pool.remote_head[class_idx][shard], memory_order_acquire); block->next = (PoolBlock*)old_head; } while (!atomic_compare_exchange_weak_explicit(&g_pool.remote_head[class_idx][shard], &old_head, (uintptr_t)block, memory_order_release, memory_order_relaxed));
-            atomic_fetch_add_explicit(&g_pool.remote_count[class_idx][shard], 1, memory_order_relaxed); HKM_TIME_END(HKM_CAT_POOL_REMOTE_PUSH, t_remote_push2); set_nonempty_bit(class_idx, shard);
+            if (g_tc_enabled) {
+                uint64_t owner_tid = 0;
+                if (g_hdr_light_enabled < 2) owner_tid = hdr->owner_tid;
+                if (owner_tid == 0) { MidPageDesc* d = mid_desc_lookup_cached(raw); if (d) owner_tid = d->owner_tid; }
+                if (owner_tid != 0) { MidTC* otc = mid_tc_lookup_by_tid(owner_tid); if (otc) { mid_tc_push(otc, class_idx, block); goto tail; } }
+            }
+            int shard = hak_pool_get_shard_index(site_id);
+            uintptr_t old_head;
+            HKM_TIME_START(t_remote_push2);
+            do {
+                old_head = atomic_load_explicit(&g_pool.remote_head[class_idx][shard], memory_order_acquire);
+                block->next = (PoolBlock*)old_head;
+            } while (!atomic_compare_exchange_weak_explicit(&g_pool.remote_head[class_idx][shard], &old_head, (uintptr_t)block, memory_order_release, memory_order_relaxed));
+            atomic_fetch_add_explicit(&g_pool.remote_count[class_idx][shard], 1, memory_order_relaxed);
+            HKM_TIME_END(HKM_CAT_POOL_REMOTE_PUSH, t_remote_push2);
+            set_nonempty_bit(class_idx, shard);
         }
     } else {
-        int shard_idx2 = hak_pool_get_shard_index(site_id); pthread_mutex_t* lock = &g_pool.freelist_locks[class_idx][shard_idx2].m; pthread_mutex_lock(lock); block->next = g_pool.freelist[class_idx][shard_idx2]; g_pool.freelist[class_idx][shard_idx2] = block; set_nonempty_bit(class_idx, shard_idx2); pthread_mutex_unlock(lock);
+        int shard_idx2 = hak_pool_get_shard_index(site_id);
+        pthread_mutex_t* lock = &g_pool.freelist_locks[class_idx][shard_idx2].m;
+        pthread_mutex_lock(lock);
+        block->next = g_pool.freelist[class_idx][shard_idx2];
+        g_pool.freelist[class_idx][shard_idx2] = block;
+        set_nonempty_bit(class_idx, shard_idx2);
+        pthread_mutex_unlock(lock);
     }
-    t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5; if ((t_pool_rng & ((1u << POOL_FREE_SAMPLE_SHIFT) - 1)) == 0) pool_v1_free_sample_tick(class_idx);
+tail:
+    // Common tail: sample counter + inuse dec
+    t_pool_rng ^= t_pool_rng << 13; t_pool_rng ^= t_pool_rng >> 17; t_pool_rng ^= t_pool_rng << 5;
+    if ((t_pool_rng & ((1u << POOL_FREE_SAMPLE_SHIFT) - 1)) == 0) pool_v1_free_sample_tick(class_idx);
 }
 
+// ============================================================================
+// Phase POOL-FREE-V1-OPT: v1 free entry (fast/slow dispatcher)
+// ============================================================================
+static inline void hak_pool_free_v1_impl(void* ptr, size_t size, uintptr_t site_id) {
+    if (!ptr) return;
+    hak_pool_init();
+    if (!hak_pool_is_poolable(size)) return;
+    if (g_mf2_enabled) { mf2_free(ptr); return; }
+
+    if (hak_pool_v1_free_fastsplit_enabled() &&
+        g_pool.tls_free_enabled && g_hdr_light_enabled == 0) {
+        void* raw = (char*)ptr - sizeof(AllocHeader);
+        AllocHeader* hdr = (AllocHeader*)raw;
+        uint64_t self = (uint64_t)(uintptr_t)pthread_self();
+        if (hdr->magic == HAKMEM_MAGIC &&
+            hdr->method == ALLOC_METHOD_POOL &&
+            hdr->owner_tid != 0 &&
+            hdr->owner_tid == self) {
+            // Derive class_idx from size; the fast impl takes it pre-computed
+            int class_idx = hak_pool_get_class_index(size);
+            if (class_idx >= 0) {
+                if (__builtin_expect(hak_pool_v1_flatten_stats_enabled(), 0)) {
+                    atomic_fetch_add_explicit(&g_pool_v1_flat_stats.fastsplit_fast_hit, 1, memory_order_relaxed);
+                }
+                hak_pool_free_v1_fast_impl(raw, class_idx, site_id);
+                return;
+            }
+        }
+        // Fast predicate failed, fall through to slow
+        if (__builtin_expect(hak_pool_v1_flatten_stats_enabled(), 0)) {
+            atomic_fetch_add_explicit(&g_pool_v1_flat_stats.fastsplit_slow_hit, 1, memory_order_relaxed);
+        }
+    }
+
+    // Fallback to slow path
+    hak_pool_free_v1_slow_impl(ptr, size, site_id);
+}
+
 // --- v1 flatten (opt-in) ----------------------------------------------------
 static inline void* hak_pool_try_alloc_v1_flat(size_t size, uintptr_t site_id) {
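Both free paths now end in the same sampled-statistics tail: advance the thread-local xorshift32 state t_pool_rng and touch shared counters only when the low bits are all zero, i.e. on roughly one free in 2^shift. A minimal sketch of that test; free_sampled, the shift parameter, and the POOL_FREE_SAMPLE_SHIFT / pool_v1_free_sample_tick spellings in the tail above are illustrative stand-ins, not identifiers confirmed by this patch:

#include <stdint.h>

/* xorshift32 (Marsaglia): three shift/xor steps give a full-period 2^32-1
 * generator; the state must be seeded nonzero. Returns 1 on roughly one
 * call in 2^shift, so the stats update amortizes to near-zero cost on the
 * free hot path. */
static inline int free_sampled(uint32_t* rng, unsigned shift) {
    uint32_t x = *rng;
    x ^= x << 13;
    x ^= x >> 17;
    x ^= x << 5;
    *rng = x;
    return (x & ((1u << shift) - 1)) == 0;
}

One behavioral point worth flagging: the fast path trusts hdr->owner_tid without consulting MidPageDesc, so the dispatcher must stay gated on g_hdr_light_enabled == 0, where the header fields are authoritative; the [POOL_V1_FASTSPLIT] fast_hit/slow_hit counters exist to verify that the predicate actually hits in practice.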