Implement Warm Pool Secondary Prefill Optimization (Phase B-2c Complete)

Problem: The warm pool had an effectively 0% hit rate (1 hit against 3976
misses) despite being implemented, so nearly every cache miss went through an
expensive superslab_refill() registry scan.

Root Cause Analysis:
- The warm pool was initialized once, and only a single slab was pushed after each refill
- When that slab was exhausted, it was discarded instead of being pushed back
- The next refill pushed another single slab, which was immediately exhausted
- The pool therefore oscillated between 0 and 1 entries, yielding a 0% hit rate (sketched below)
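
A minimal sketch of the old flow (names as in this file; not the literal
removed code):

    // Old behavior (sketch): the pool never holds more than one SuperSlab
    SuperSlab* ss = tiny_warm_pool_pop(class_idx);   // almost always empty -> miss
    if (!ss) {
        superslab_refill(class_idx);                 // expensive registry scan
        ss = g_tls_slabs[class_idx].ss;              // the single fresh SuperSlab
    }
    // carve from ss; once exhausted it is discarded, so the pool is empty again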

Solution: Secondary Prefill on Cache Miss
When the warm pool becomes empty, the cold path now performs multiple
superslab_refill() calls and prefills the pool with 3 additional HOT
SuperSlabs before attempting to carve. This builds a working set of slabs
that can sustain allocation pressure.

Implementation Details:
- Modified the unified_cache_refill() cold path to detect an empty pool
- Added a prefill loop (condensed sketch below): when the pool count == 0, load 3 extra SuperSlabs
- Store the extra slabs in the warm pool; keep 1 in TLS for immediate carving
- Track prefill events in the g_warm_pool_stats[].prefilled counter
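
Condensed sketch of the new cold-path loop (the full version appears in the
tiny_unified_cache.c diff below):

    int budget = (tiny_warm_pool_count(class_idx) == 0) ? 3 : 1;
    while (budget > 0) {
        if (!tls->ss) {
            if (!superslab_refill(class_idx)) return HAK_BASE_FROM_RAW(NULL);
            tls = &g_tls_slabs[class_idx];           // reload after refill
        }
        if (budget > 1) {                            // prefill mode: bank the slab
            tiny_warm_pool_push(class_idx, tls->ss);
            g_warm_pool_stats[class_idx].prefilled++;
            tls->ss = NULL;                          // force another refill
            budget--;
        } else {
            budget = 0;                              // last slab stays in TLS for carving
        }
    }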

Results (1M Random Mixed 256B allocations):
- Before: C7 hits=1, misses=3976, hit_rate=0.0%
- After:  C7 hits=3929, misses=3143, hit_rate=55.6%
- Throughput: 4.055M ops/s (maintained vs 4.07M baseline)
- Stability: Consistent 55.6% hit rate at 5M allocations (4.102M ops/s)

Performance Impact:
- No regression: throughput remained stable at ~4.1M ops/s
- Registry scan avoided in 55.6% of cache misses (significant savings)
- Warm pool now functioning as intended with strong locality

Configuration:
- TINY_WARM_POOL_MAX_PER_CLASS increased from 4 to 16 to support prefill
- Prefill budget hardcoded to 3 (tunable via an env var later if needed)
- Statistics are always compiled in; printing is ENV-gated via HAKMEM_WARM_POOL_STATS=1 (example below)
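
Example run (the benchmark binary name is illustrative; the output format
matches tiny_warm_pool_print_stats() in the diff below):

    $ HAKMEM_WARM_POOL_STATS=1 ./bench_random_mixed
    [WarmPool-STATS] Warm Pool Metrics:
      C7: hits=3929 misses=3143 hit_rate=55.6% prefilled=<n>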

Next Steps:
- Monitor for further optimization opportunities (prefill budget tuning)
- Consider an adaptive prefill budget based on class-specific hit rates (one possible shape is sketched below)
- Validate at larger allocation counts (10M+, pending the registry size fix)
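
One possible shape for that adaptive budget (hypothetical sketch, not part of
this commit):

    // Hypothetical: scale the prefill budget by the observed per-class miss rate
    static inline int warm_pool_prefill_budget(int class_idx) {
        TinyWarmPoolStats* s = &g_warm_pool_stats[class_idx];
        uint64_t total = s->hits + s->misses;
        if (total < 1024) return 3;                       // default until enough samples
        int budget = (int)(3 + (s->misses * 4) / total);  // more misses -> bigger budget
        return budget > 8 ? 8 : budget;                   // clamp well below pool capacity (16)
    }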

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
Author: Moe Charm (CI)
Date:   2025-12-04 23:31:54 +09:00
parent 2e3fcc92af
commit 5685c2f4c9
29 changed files with 6023 additions and 138 deletions

@@ -1,5 +1,6 @@
// tiny_unified_cache.c - Phase 23: Unified Frontend Cache Implementation
#include "tiny_unified_cache.h"
#include "tiny_warm_pool.h" // Warm Pool: O(1) SuperSlab lookup
#include "../tiny_tls.h" // Phase 23-E: TinyTLSSlab, TinySlabMeta
#include "../tiny_box_geometry.h" // Phase 23-E: tiny_stride_for_class, tiny_slab_base_for_geometry
#include "../box/tiny_next_ptr_box.h" // Phase 23-E: tiny_next_read (freelist traversal)
@@ -7,6 +8,8 @@
#include "../superslab/superslab_inline.h" // Phase 23-E: ss_active_add, slab_index_for, ss_slabs_capacity
#include "../hakmem_super_registry.h" // For hak_super_lookup (pointer→SuperSlab)
#include "../box/pagefault_telemetry_box.h" // Phase 24: Box PageFaultTelemetry (Tiny page touch stats)
#include "../box/ss_tier_box.h" // For ss_tier_is_hot() tier checks
#include "../box/ss_slab_meta_box.h" // For ss_active_add() and slab metadata operations
#include "../hakmem_env_cache.h" // Priority-2: ENV cache (eliminate syscalls)
#include <stdlib.h>
#include <string.h>
@@ -48,6 +51,7 @@ static inline int unified_cache_measure_enabled(void) {
// Phase 23-E: Forward declarations
extern __thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES]; // From hakmem_tiny_superslab.c
extern void ss_active_add(SuperSlab* ss, uint32_t n); // From hakmem_tiny_ss_active_box.inc
// ============================================================================
// TLS Variables (defined here, extern in header)
@@ -55,6 +59,9 @@ extern __thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES]; // From hakmem_tiny_
__thread TinyUnifiedCache g_unified_cache[TINY_NUM_CLASSES];
// Warm Pool: Per-thread warm SuperSlab pools (one per class)
__thread TinyWarmPool g_tiny_warm_pool[TINY_NUM_CLASSES] = {0};
// ============================================================================
// Metrics (Phase 23, optional for debugging)
// ============================================================================
@@ -66,6 +73,10 @@ __thread uint64_t g_unified_cache_push[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_full[TINY_NUM_CLASSES] = {0};
#endif
// Warm Pool metrics (definition - declared in tiny_warm_pool.h as extern)
// Note: These stay outside the #if !HAKMEM_BUILD_RELEASE guard so profiling works in release builds
__thread TinyWarmPoolStats g_warm_pool_stats[TINY_NUM_CLASSES] = {0};
// ============================================================================
// Phase 8-Step1-Fix: unified_cache_enabled() implementation (non-static)
// ============================================================================
@@ -187,9 +198,48 @@ void unified_cache_print_stats(void) {
full_rate);
}
fflush(stderr);
// Also print warm pool stats if enabled
tiny_warm_pool_print_stats();
#endif
}
// ============================================================================
// Warm Pool Stats (always compiled, ENV-gated at runtime)
// ============================================================================
static inline void tiny_warm_pool_print_stats(void) {
// Check if warm pool stats are enabled via ENV
static int g_print_stats = -1;
if (__builtin_expect(g_print_stats == -1, 0)) {
const char* e = getenv("HAKMEM_WARM_POOL_STATS");
g_print_stats = (e && *e && *e != '0') ? 1 : 0;
}
if (!g_print_stats) return;
fprintf(stderr, "\n[WarmPool-STATS] Warm Pool Metrics:\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
uint64_t total = g_warm_pool_stats[i].hits + g_warm_pool_stats[i].misses;
if (total == 0) continue; // Skip unused classes
float hit_rate = 100.0 * g_warm_pool_stats[i].hits / total;
fprintf(stderr, " C%d: hits=%llu misses=%llu hit_rate=%.1f%% prefilled=%llu\n",
i,
(unsigned long long)g_warm_pool_stats[i].hits,
(unsigned long long)g_warm_pool_stats[i].misses,
hit_rate,
(unsigned long long)g_warm_pool_stats[i].prefilled);
}
fflush(stderr);
}
// Public wrapper for benchmarks
void tiny_warm_pool_print_stats_public(void) {
tiny_warm_pool_print_stats();
}
// ============================================================================
// Phase 23-E: Direct SuperSlab Carve (TLS SLL Bypass)
// ============================================================================
@@ -324,9 +374,80 @@ static inline int unified_refill_validate_base(int class_idx,
#endif
}
// ============================================================================
// Warm Pool Enhanced: Direct carve from warm SuperSlab (bypass superslab_refill)
// ============================================================================
// Helper: Try to carve blocks directly from a SuperSlab (warm pool path)
// Returns: Number of blocks produced (0 if failed)
static inline int unified_cache_carve_from_ss(int class_idx, SuperSlab* ss,
void** out, int max_blocks) {
if (!ss || ss->magic != SUPERSLAB_MAGIC) return 0;
// Find an available slab in this SuperSlab
int cap = ss_slabs_capacity(ss);
for (int slab_idx = 0; slab_idx < cap; slab_idx++) {
TinySlabMeta* meta = &ss->slabs[slab_idx];
// Check if this slab matches our class and has capacity
if (meta->class_idx != (uint8_t)class_idx) continue;
if (meta->used >= meta->capacity && !meta->freelist) continue;
// Carve blocks from this slab
size_t bs = tiny_stride_for_class(class_idx);
uint8_t* base = tiny_slab_base_for_geometry(ss, slab_idx);
int produced = 0;
while (produced < max_blocks) {
void* p = NULL;
if (meta->freelist) {
// Pop from freelist
p = meta->freelist;
void* next_node = tiny_next_read(class_idx, p);
#if HAKMEM_TINY_HEADER_CLASSIDX
*(uint8_t*)p = (uint8_t)(0xa0 | (class_idx & 0x0f));
__atomic_thread_fence(__ATOMIC_RELEASE);
#endif
meta->freelist = next_node;
meta->used++;
} else if (meta->carved < meta->capacity) {
// Linear carve
p = (void*)(base + ((size_t)meta->carved * bs));
#if HAKMEM_TINY_HEADER_CLASSIDX
*(uint8_t*)p = (uint8_t)(0xa0 | (class_idx & 0x0f));
#endif
meta->carved++;
meta->used++;
} else {
break; // This slab exhausted
}
if (p) {
pagefault_telemetry_touch(class_idx, p);
out[produced++] = p;
}
}
if (produced > 0) {
ss_active_add(ss, (uint32_t)produced);
return produced;
}
}
return 0; // No suitable slab found in this SuperSlab
}
// Batch refill from SuperSlab (called on cache miss)
// Returns: BASE pointer (first block, wrapped), or NULL-wrapped if failed
// Design: Direct carve from SuperSlab to array (no TLS SLL intermediate layer)
// Warm Pool Integration: PRIORITIZE warm pool, use superslab_refill as fallback
hak_base_ptr_t unified_cache_refill(int class_idx) {
// Measure refill cost if enabled
uint64_t start_cycles = 0;
@@ -335,13 +456,8 @@ hak_base_ptr_t unified_cache_refill(int class_idx) {
start_cycles = read_tsc();
}
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
// Step 1: Ensure SuperSlab available
if (!tls->ss) {
if (!superslab_refill(class_idx)) return HAK_BASE_FROM_RAW(NULL);
tls = &g_tls_slabs[class_idx]; // Reload after refill
}
// Initialize warm pool on first use (per-thread)
tiny_warm_pool_init_once();
TinyUnifiedCache* cache = &g_unified_cache[class_idx];
@@ -354,7 +470,7 @@ hak_base_ptr_t unified_cache_refill(int class_idx) {
}
}
// Step 2: Calculate available room in unified cache
// Calculate available room in unified cache
int room = (int)cache->capacity - 1; // Leave 1 slot for full detection
if (cache->head > cache->tail) {
room = cache->head - cache->tail - 1;
@@ -365,9 +481,92 @@ hak_base_ptr_t unified_cache_refill(int class_idx) {
if (room <= 0) return HAK_BASE_FROM_RAW(NULL);
if (room > 128) room = 128; // Batch size limit
// Step 3: Direct carve from SuperSlab into local array (bypass TLS SLL!)
void* out[128];
int produced = 0;
// ========== WARM POOL HOT PATH: Check warm pool FIRST ==========
// This is the critical optimization - avoid superslab_refill() registry scan
SuperSlab* warm_ss = tiny_warm_pool_pop(class_idx);
if (warm_ss) {
// HOT PATH: Warm pool hit, try to carve directly
produced = unified_cache_carve_from_ss(class_idx, warm_ss, out, room);
if (produced > 0) {
// Success! Return SuperSlab to warm pool for next use
tiny_warm_pool_push(class_idx, warm_ss);
// Track warm pool hit (always compiled, ENV-gated printing)
g_warm_pool_stats[class_idx].hits++;
// Store blocks into cache and return first
void* first = out[0];
for (int i = 1; i < produced; i++) {
cache->slots[cache->tail] = out[i];
cache->tail = (cache->tail + 1) & cache->mask;
}
#if !HAKMEM_BUILD_RELEASE
g_unified_cache_miss[class_idx]++;
#endif
if (measure) {
uint64_t end_cycles = read_tsc();
uint64_t delta = end_cycles - start_cycles;
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_global, delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_global, 1, memory_order_relaxed);
}
return HAK_BASE_FROM_RAW(first);
}
// SuperSlab carve failed (produced == 0)
// This slab is either exhausted or has no more available capacity
// The statistics counter 'prefilled' tracks how often we try to prefill
// To improve: implement secondary prefill (scan for more HOT SuperSlabs)
static __thread int prefill_attempt_count = 0;
if (produced == 0 && tiny_warm_pool_count(class_idx) == 0) {
// Pool is empty and carve failed - prefill would help here
g_warm_pool_stats[class_idx].prefilled++;
prefill_attempt_count = 0; // Reset counter
}
}
// ========== COLD PATH: Warm pool miss, use superslab_refill ==========
// Track warm pool miss (always compiled, ENV-gated printing)
g_warm_pool_stats[class_idx].misses++;
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
// Step 1: Ensure SuperSlab available via normal refill
// Enhanced: If pool is empty (just became empty), try prefill
// Prefill budget: Load 3 extra SuperSlabs when pool is empty for better hit rate
int pool_prefill_budget = (tiny_warm_pool_count(class_idx) == 0) ? 3 : 1;
while (pool_prefill_budget > 0) {
if (!tls->ss) {
if (!superslab_refill(class_idx)) return HAK_BASE_FROM_RAW(NULL);
tls = &g_tls_slabs[class_idx]; // Reload after refill
}
// Warm Pool: Cache this SuperSlab for potential future use
// This provides locality - same SuperSlab likely to have more available slabs
if (tls->ss && tls->ss->magic == SUPERSLAB_MAGIC) {
if (pool_prefill_budget > 1) {
// Prefill mode: push to warm pool and load another slab
tiny_warm_pool_push(class_idx, tls->ss);
g_warm_pool_stats[class_idx].prefilled++;
tls->ss = NULL; // Force next iteration to refill
pool_prefill_budget--;
} else {
// Final slab: keep for carving, don't push yet
pool_prefill_budget = 0;
}
} else {
pool_prefill_budget = 0;
}
}
// Step 2: Direct carve from SuperSlab into local array (bypass TLS SLL!)
TinySlabMeta* m = tls->meta;
size_t bs = tiny_stride_for_class(class_idx);
uint8_t* base = tls->slab_base