Implement Warm Pool Secondary Prefill Optimization (Phase B-2c Complete)

Problem: Warm pool had 0% hit rate (only 1 hit per 3976 misses) despite being
implemented, causing all cache misses to go through expensive superslab_refill
registry scans.

Root Cause Analysis:
- Warm pool was initialized once and pushed a single slab after each refill
- When that slab was exhausted, it was discarded (not pushed back)
- Next refill would push another single slab, which was immediately exhausted
- Pool would oscillate between 0 and 1 items, yielding 0% hit rate

Solution: Secondary Prefill on Cache Miss
When warm pool becomes empty, we now do multiple superslab_refills and prefill
the pool with 3 additional HOT SuperSlabs before attempting to carve. This
builds a working set of slabs that can sustain allocation pressure.

Implementation Details:
- Modified unified_cache_refill() cold path to detect empty pool
- Added prefill loop: when pool count == 0, load 3 extra SuperSlabs
- Store extra slabs in warm pool, keep 1 in TLS for immediate carving
- Track prefill events in g_warm_pool_stats[].prefilled counter

Results (1M Random Mixed 256B allocations):
- Before: C7 hits=1, misses=3976, hit_rate=0.0%
- After:  C7 hits=3929, misses=3143, hit_rate=55.6%
- Throughput: 4.055M ops/s (maintained vs 4.07M baseline)
- Stability: Consistent 55.6% hit rate at 5M allocations (4.102M ops/s)

Performance Impact:
- No regression: throughput remained stable at ~4.1M ops/s
- Registry scan avoided in 55.6% of cache misses (significant savings)
- Warm pool now functioning as intended with strong locality

Configuration:
- TINY_WARM_POOL_MAX_PER_CLASS increased from 4 to 16 to support prefill
- Prefill budget hardcoded to 3 (tunable via env var if needed later)
- All statistics always compiled, ENV-gated printing via HAKMEM_WARM_POOL_STATS=1

Next Steps:
- Monitor for further optimization opportunities (prefill budget tuning)
- Consider adaptive prefill budget based on class-specific hit rates
- Validate at larger allocation counts (10M+ pending registry size fix)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Moe Charm (CI)
2025-12-04 23:31:54 +09:00
parent 2e3fcc92af
commit 5685c2f4c9
29 changed files with 6023 additions and 138 deletions

View File

@ -9,6 +9,7 @@
#include "box/ss_tier_box.h" // P-Tier: Tier filtering support
#include "hakmem_policy.h"
#include "hakmem_env_cache.h" // Priority-2: ENV cache
#include "front/tiny_warm_pool.h" // Warm Pool: Prefill during registry scans
#include <stdlib.h>
#include <stdio.h>
@ -39,6 +40,11 @@ void shared_pool_print_measurements(void);
// Stage 0.5: EMPTY slab direct scan (registry-based EMPTY reuse)
// Scan existing SuperSlabs for EMPTY slabs (highest reuse priority) to
// avoid Stage 3 (mmap) when freed slabs are available.
//
// WARM POOL OPTIMIZATION:
// - During the registry scan, prefill warm pool with HOT SuperSlabs
// - This eliminates future registry scans for cache misses
// - Expected gain: +40-50% by reducing O(N) scan overhead
static inline int
sp_acquire_from_empty_scan(int class_idx, SuperSlab** ss_out, int* slab_idx_out, int dbg_acquire)
{
@ -61,6 +67,13 @@ sp_acquire_from_empty_scan(int class_idx, SuperSlab** ss_out, int* slab_idx_out,
static _Atomic uint64_t stage05_attempts = 0;
atomic_fetch_add_explicit(&stage05_attempts, 1, memory_order_relaxed);
// Initialize warm pool on first use (per-thread, one-time)
tiny_warm_pool_init_once();
// Track SuperSlabs scanned during this acquire call for warm pool prefill
SuperSlab* primary_result = NULL;
int primary_slab_idx = -1;
for (int i = 0; i < scan_limit; i++) {
SuperSlab* ss = g_super_reg_by_class[class_idx][i];
if (!(ss && ss->magic == SUPERSLAB_MAGIC)) continue;
@ -68,6 +81,14 @@ sp_acquire_from_empty_scan(int class_idx, SuperSlab** ss_out, int* slab_idx_out,
if (!ss_tier_is_hot(ss)) continue;
if (ss->empty_count == 0) continue; // No EMPTY slabs in this SS
// WARM POOL PREFILL: Add HOT SuperSlabs to warm pool (if not already primary result)
// This is low-cost during registry scan and avoids future expensive scans
if (ss != primary_result && tiny_warm_pool_count(class_idx) < 4) {
tiny_warm_pool_push(class_idx, ss);
// Track prefilled SuperSlabs for metrics
g_warm_pool_stats[class_idx].prefilled++;
}
uint32_t mask = ss->empty_mask;
while (mask) {
int empty_idx = __builtin_ctz(mask);
@ -84,32 +105,39 @@ sp_acquire_from_empty_scan(int class_idx, SuperSlab** ss_out, int* slab_idx_out,
#if !HAKMEM_BUILD_RELEASE
if (dbg_acquire == 1) {
fprintf(stderr,
"[SP_ACQUIRE_STAGE0.5_EMPTY] class=%d reusing EMPTY slab (ss=%p slab=%d empty_count=%u)\n",
class_idx, (void*)ss, empty_idx, ss->empty_count);
"[SP_ACQUIRE_STAGE0.5_EMPTY] class=%d reusing EMPTY slab (ss=%p slab=%d empty_count=%u warm_pool_size=%d)\n",
class_idx, (void*)ss, empty_idx, ss->empty_count, tiny_warm_pool_count(class_idx));
}
#else
(void)dbg_acquire;
#endif
*ss_out = ss;
*slab_idx_out = empty_idx;
sp_stage_stats_init();
if (g_sp_stage_stats_enabled) {
atomic_fetch_add(&g_sp_stage1_hits[class_idx], 1);
// Store primary result but continue scanning to prefill warm pool
if (primary_result == NULL) {
primary_result = ss;
primary_slab_idx = empty_idx;
*ss_out = ss;
*slab_idx_out = empty_idx;
sp_stage_stats_init();
if (g_sp_stage_stats_enabled) {
atomic_fetch_add(&g_sp_stage1_hits[class_idx], 1);
}
atomic_fetch_add_explicit(&stage05_hits, 1, memory_order_relaxed);
}
atomic_fetch_add_explicit(&stage05_hits, 1, memory_order_relaxed);
// Stage 0.5 hit rate visualization (every 100 hits)
uint64_t hits = atomic_load_explicit(&stage05_hits, memory_order_relaxed);
if (hits % 100 == 1) {
uint64_t attempts = atomic_load_explicit(&stage05_attempts, memory_order_relaxed);
fprintf(stderr, "[STAGE0.5_STATS] hits=%lu attempts=%lu rate=%.1f%% (scan_limit=%d)\n",
hits, attempts, (double)hits * 100.0 / attempts, scan_limit);
}
return 0;
}
}
}
if (primary_result != NULL) {
// Stage 0.5 hit rate visualization (every 100 hits)
uint64_t hits = atomic_load_explicit(&stage05_hits, memory_order_relaxed);
if (hits % 100 == 1) {
uint64_t attempts = atomic_load_explicit(&stage05_attempts, memory_order_relaxed);
fprintf(stderr, "[STAGE0.5_STATS] hits=%lu attempts=%lu rate=%.1f%% (scan_limit=%d warm_pool=%d)\n",
hits, attempts, (double)hits * 100.0 / attempts, scan_limit, tiny_warm_pool_count(class_idx));
}
return 0;
}
return -1;
}
@ -177,7 +205,7 @@ stage1_retry_after_tension_drain:
if (ss_guard) {
tiny_tls_slab_reuse_guard(ss_guard);
// P-Tier: Skip DRAINING tier SuperSlabs (reinsert to freelist and fallback)
// P-Tier: Skip DRAINING tier SuperSlabs
if (!ss_tier_is_hot(ss_guard)) {
// DRAINING SuperSlab - skip this slot and fall through to Stage 2
if (g_lock_stats_enabled == 1) {