Implement Warm Pool Secondary Prefill Optimization (Phase B-2c Complete)

Problem: Warm pool had 0% hit rate (only 1 hit per 3976 misses) despite being
implemented, causing all cache misses to go through expensive superslab_refill
registry scans.

Root Cause Analysis:
- Warm pool was initialized once and pushed a single slab after each refill
- When that slab was exhausted, it was discarded (not pushed back)
- Next refill would push another single slab, which was immediately exhausted
- Pool would oscillate between 0 and 1 items, yielding 0% hit rate

Solution: Secondary Prefill on Cache Miss
When warm pool becomes empty, we now do multiple superslab_refills and prefill
the pool with 3 additional HOT SuperSlabs before attempting to carve. This
builds a working set of slabs that can sustain allocation pressure.

Implementation Details:
- Modified unified_cache_refill() cold path to detect empty pool
- Added prefill loop: when pool count == 0, load 3 extra SuperSlabs
- Store extra slabs in warm pool, keep 1 in TLS for immediate carving
- Track prefill events in g_warm_pool_stats[].prefilled counter

Results (1M Random Mixed 256B allocations):
- Before: C7 hits=1, misses=3976, hit_rate=0.0%
- After:  C7 hits=3929, misses=3143, hit_rate=55.6%
- Throughput: 4.055M ops/s (maintained vs 4.07M baseline)
- Stability: Consistent 55.6% hit rate at 5M allocations (4.102M ops/s)

Performance Impact:
- No regression: throughput remained stable at ~4.1M ops/s
- Registry scan avoided in 55.6% of cache misses (significant savings)
- Warm pool now functioning as intended with strong locality

Configuration:
- TINY_WARM_POOL_MAX_PER_CLASS increased from 4 to 16 to support prefill
- Prefill budget hardcoded to 3 (tunable via env var if needed later)
- All statistics always compiled, ENV-gated printing via HAKMEM_WARM_POOL_STATS=1

Next Steps:
- Monitor for further optimization opportunities (prefill budget tuning)
- Consider adaptive prefill budget based on class-specific hit rates
- Validate at larger allocation counts (10M+ pending registry size fix)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Moe Charm (CI)
2025-12-04 23:31:54 +09:00
parent 2e3fcc92af
commit 5685c2f4c9
29 changed files with 6023 additions and 138 deletions

View File

@ -9,6 +9,7 @@
#include "box/ss_tier_box.h" // P-Tier: Tier filtering support
#include "hakmem_policy.h"
#include "hakmem_env_cache.h" // Priority-2: ENV cache
#include "front/tiny_warm_pool.h" // Warm Pool: Prefill during registry scans
#include <stdlib.h>
#include <stdio.h>
@ -39,6 +40,11 @@ void shared_pool_print_measurements(void);
// Stage 0.5: EMPTY slab direct scan (registry-based EMPTY reuse)
// Scan existing SuperSlabs for EMPTY slabs (highest reuse priority) to
// avoid Stage 3 (mmap) when freed slabs are available.
//
// WARM POOL OPTIMIZATION:
// - During the registry scan, prefill warm pool with HOT SuperSlabs
// - This eliminates future registry scans for cache misses
// - Expected gain: +40-50% by reducing O(N) scan overhead
static inline int
sp_acquire_from_empty_scan(int class_idx, SuperSlab** ss_out, int* slab_idx_out, int dbg_acquire)
{
@ -61,6 +67,13 @@ sp_acquire_from_empty_scan(int class_idx, SuperSlab** ss_out, int* slab_idx_out,
static _Atomic uint64_t stage05_attempts = 0;
atomic_fetch_add_explicit(&stage05_attempts, 1, memory_order_relaxed);
// Initialize warm pool on first use (per-thread, one-time)
tiny_warm_pool_init_once();
// Track SuperSlabs scanned during this acquire call for warm pool prefill
SuperSlab* primary_result = NULL;
int primary_slab_idx = -1;
for (int i = 0; i < scan_limit; i++) {
SuperSlab* ss = g_super_reg_by_class[class_idx][i];
if (!(ss && ss->magic == SUPERSLAB_MAGIC)) continue;
@ -68,6 +81,14 @@ sp_acquire_from_empty_scan(int class_idx, SuperSlab** ss_out, int* slab_idx_out,
if (!ss_tier_is_hot(ss)) continue;
if (ss->empty_count == 0) continue; // No EMPTY slabs in this SS
// WARM POOL PREFILL: Add HOT SuperSlabs to warm pool (if not already primary result)
// This is low-cost during registry scan and avoids future expensive scans
if (ss != primary_result && tiny_warm_pool_count(class_idx) < 4) {
tiny_warm_pool_push(class_idx, ss);
// Track prefilled SuperSlabs for metrics
g_warm_pool_stats[class_idx].prefilled++;
}
uint32_t mask = ss->empty_mask;
while (mask) {
int empty_idx = __builtin_ctz(mask);
@ -84,32 +105,39 @@ sp_acquire_from_empty_scan(int class_idx, SuperSlab** ss_out, int* slab_idx_out,
#if !HAKMEM_BUILD_RELEASE
if (dbg_acquire == 1) {
fprintf(stderr,
"[SP_ACQUIRE_STAGE0.5_EMPTY] class=%d reusing EMPTY slab (ss=%p slab=%d empty_count=%u)\n",
class_idx, (void*)ss, empty_idx, ss->empty_count);
"[SP_ACQUIRE_STAGE0.5_EMPTY] class=%d reusing EMPTY slab (ss=%p slab=%d empty_count=%u warm_pool_size=%d)\n",
class_idx, (void*)ss, empty_idx, ss->empty_count, tiny_warm_pool_count(class_idx));
}
#else
(void)dbg_acquire;
#endif
*ss_out = ss;
*slab_idx_out = empty_idx;
sp_stage_stats_init();
if (g_sp_stage_stats_enabled) {
atomic_fetch_add(&g_sp_stage1_hits[class_idx], 1);
// Store primary result but continue scanning to prefill warm pool
if (primary_result == NULL) {
primary_result = ss;
primary_slab_idx = empty_idx;
*ss_out = ss;
*slab_idx_out = empty_idx;
sp_stage_stats_init();
if (g_sp_stage_stats_enabled) {
atomic_fetch_add(&g_sp_stage1_hits[class_idx], 1);
}
atomic_fetch_add_explicit(&stage05_hits, 1, memory_order_relaxed);
}
atomic_fetch_add_explicit(&stage05_hits, 1, memory_order_relaxed);
// Stage 0.5 hit rate visualization (every 100 hits)
uint64_t hits = atomic_load_explicit(&stage05_hits, memory_order_relaxed);
if (hits % 100 == 1) {
uint64_t attempts = atomic_load_explicit(&stage05_attempts, memory_order_relaxed);
fprintf(stderr, "[STAGE0.5_STATS] hits=%lu attempts=%lu rate=%.1f%% (scan_limit=%d)\n",
hits, attempts, (double)hits * 100.0 / attempts, scan_limit);
}
return 0;
}
}
}
if (primary_result != NULL) {
// Stage 0.5 hit rate visualization (every 100 hits)
uint64_t hits = atomic_load_explicit(&stage05_hits, memory_order_relaxed);
if (hits % 100 == 1) {
uint64_t attempts = atomic_load_explicit(&stage05_attempts, memory_order_relaxed);
fprintf(stderr, "[STAGE0.5_STATS] hits=%lu attempts=%lu rate=%.1f%% (scan_limit=%d warm_pool=%d)\n",
hits, attempts, (double)hits * 100.0 / attempts, scan_limit, tiny_warm_pool_count(class_idx));
}
return 0;
}
return -1;
}
@ -177,7 +205,7 @@ stage1_retry_after_tension_drain:
if (ss_guard) {
tiny_tls_slab_reuse_guard(ss_guard);
// P-Tier: Skip DRAINING tier SuperSlabs (reinsert to freelist and fallback)
// P-Tier: Skip DRAINING tier SuperSlabs
if (!ss_tier_is_hot(ss_guard)) {
// DRAINING SuperSlab - skip this slot and fall through to Stage 2
if (g_lock_stats_enabled == 1) {