hakmem/core/front/tiny_warm_pool.h

// tiny_warm_pool.h - Warm Pool Optimization for Unified Cache
// Purpose: Eliminate registry O(N) scan on cache miss by using per-thread warm SuperSlab pools
// Expected Gain: +40-50% throughput (1.06M → 1.5M+ ops/s)
// License: MIT
// Date: 2025-12-04

#ifndef HAK_TINY_WARM_POOL_H
#define HAK_TINY_WARM_POOL_H

#include <stdint.h>
#include <stdlib.h>
#include "../hakmem_tiny_config.h"
#include "../superslab/superslab_types.h"

// ============================================================================
// Warm Pool Design
// ============================================================================
//
// PROBLEM:
// - unified_cache_refill() scans registry O(N) on every cache miss
// - Registry scan is expensive (~50-100 cycles per miss)
// - Cost grows with number of SuperSlabs per class
//
// SOLUTION:
// - Per-thread warm pool of pre-qualified HOT SuperSlabs
// - O(1) pop from warm pool (no registry scan needed)
// - Pool pre-filled during registry scan (look-ahead)
//
// DESIGN:
// - Thread-local array per class (no synchronization needed)
// - Fixed capacity per class (TINY_WARM_POOL_MAX_PER_CLASS, currently 16 SuperSlabs)
// - LIFO stack (simple pop/push operations)
//
// EXPECTED GAIN:
// - Eliminate registry scan from hot path
// - +40-50% throughput improvement
// - Memory overhead: ~256-512 KB per thread (acceptable)
//
// ============================================================================

// Maximum warm SuperSlabs per thread per class (compile-time constant).
// This value is also the length of the TinyWarmPool.slabs array, so it is the
// hard upper bound for any capacity used when pushing into a pool.
// Trade-off: Working set size vs warm pool effectiveness
//   - 4: Original value (90% hit rate expected, but the implementation was broken)
//   - 16: Current value, increased to compensate for suboptimal push logic
//   - Higher values: More memory but better locality
#define TINY_WARM_POOL_MAX_PER_CLASS 16

// Fixed-capacity LIFO stack of warm SuperSlab pointers for one size class.
typedef struct {
    // Stack storage. Entries [0, count) are occupied; the top of the stack is
    // slabs[count - 1]. Entries at index >= count are not meaningful.
    SuperSlab* slabs[TINY_WARM_POOL_MAX_PER_CLASS];
    // Number of occupied entries in slabs.
    int32_t count;
} TinyWarmPool;

// Per-thread warm pool, one TinyWarmPool per size class (indexed by class_idx).
// Declaration only: the storage is defined outside this header.
extern __thread TinyWarmPool g_tiny_warm_pool[TINY_NUM_CLASSES];

// Per-thread warm pool statistics structure (one instance per size class).
// None of the inline helpers in this header update these counters; they are
// maintained by the code that uses the pool.
typedef struct {
    uint64_t hits;         // Warm pool hit count
    uint64_t misses;       // Warm pool miss count
    uint64_t prefilled;    // Total SuperSlabs prefilled during registry scans
} TinyWarmPoolStats;

// Per-thread warm pool statistics, one TinyWarmPoolStats per size class
// (for tracking prefill effectiveness).
// Declaration only: the storage is defined outside this header.
extern __thread TinyWarmPoolStats g_warm_pool_stats[TINY_NUM_CLASSES];

// ============================================================================
// API: Warm Pool Operations
// ============================================================================

// Lazily reset the calling thread's warm pools.
// The first call on a thread zeroes the count of every class; a thread-local
// flag turns later calls on that thread into no-ops. Only the counts are
// written - the slabs[] entries themselves are left as they are.
// NOTE(review): the flag is a static local of a static inline function, so
// every translation unit that includes this header carries its own flag and
// the reset can run once per including unit per thread. Confirm that no pool
// has been populated before a later unit makes its first call.
static inline void tiny_warm_pool_init_once(void) {
    static __thread int tls_ready = 0;
    if (tls_ready) {
        return;
    }
    for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
        g_tiny_warm_pool[cls].count = 0;
    }
    tls_ready = 1;
}

// O(1) pop from warm pool
// Returns: SuperSlab* (pre-qualified HOT SuperSlab), or NULL if pool empty
static inline SuperSlab* tiny_warm_pool_pop(int class_idx) {
    if (g_tiny_warm_pool[class_idx].count > 0) {
        return g_tiny_warm_pool[class_idx].slabs[--g_tiny_warm_pool[class_idx].count];
    }
    return NULL;
}

// Push ss onto the warm pool stack for class_idx (constant time).
// Returns: 1 when ss was stored, 0 when the pool already holds
//          TINY_WARM_POOL_MAX_PER_CLASS entries (caller should free to LRU).
// class_idx is used as an array index without any range check.
static inline int tiny_warm_pool_push(int class_idx, SuperSlab* ss) {
    TinyWarmPool* pool = &g_tiny_warm_pool[class_idx];
    if (pool->count >= TINY_WARM_POOL_MAX_PER_CLASS) {
        return 0;
    }
    pool->slabs[pool->count] = ss;
    pool->count += 1;
    return 1;
}

// Get current count (for metrics/debugging)
static inline int tiny_warm_pool_count(int class_idx) {
    return g_tiny_warm_pool[class_idx].count;
}

// ============================================================================
// Optional: Environment Variable Tuning
// ============================================================================

// Get warm pool capacity from environment (configurable at runtime)
// ENV: HAKMEM_WARM_POOL_SIZE=N (default: TINY_WARM_POOL_MAX_PER_CLASS)
//
// The environment is consulted on the first call only. The result is cached
// in a function-local static (not thread-local; one copy per translation unit
// that includes this header), so later changes to the variable are ignored.
//
// Returns: capacity in [1, TINY_WARM_POOL_MAX_PER_CLASS].
//   - Unset, empty, or non-numeric value: TINY_WARM_POOL_MAX_PER_CLASS
//   - Numeric value outside the range: clamped to the nearest bound
//   - Leading number followed by other characters ("8x"): the number is used
static inline int warm_pool_max_per_class(void) {
    static int g_max = -1;
    if (__builtin_expect(g_max == -1, 0)) {
        long capacity = TINY_WARM_POOL_MAX_PER_CLASS;
        const char* env = getenv("HAKMEM_WARM_POOL_SIZE");
        if (env && *env) {
            // strtol rather than atoi: atoi is undefined when the value does
            // not fit in an int and cannot signal that no digits were found.
            // On overflow strtol saturates to LONG_MIN/LONG_MAX, which the
            // clamp below handles.
            char* end = NULL;
            long parsed = strtol(env, &end, 10);
            if (end != env) {
                // Clamp against the macro that sizes TinyWarmPool.slabs, not a
                // literal, so the bound can never exceed the array length.
                if (parsed < 1) parsed = 1;
                if (parsed > TINY_WARM_POOL_MAX_PER_CLASS) parsed = TINY_WARM_POOL_MAX_PER_CLASS;
                capacity = parsed;
            }
        }
        // Publish with a single store once the value is fully resolved.
        g_max = (int)capacity;
    }
    return g_max;
}

// Push ss onto the warm pool for class_idx, using the capacity reported by
// warm_pool_max_per_class() instead of the compile-time maximum.
// Returns: 1 when ss was stored, 0 when the pool already holds that many
//          entries.
// class_idx is used as an array index without any range check.
static inline int tiny_warm_pool_push_tunable(int class_idx, SuperSlab* ss) {
    const int limit = warm_pool_max_per_class();
    TinyWarmPool* pool = &g_tiny_warm_pool[class_idx];
    if (pool->count >= limit) {
        return 0;
    }
    pool->slabs[pool->count] = ss;
    pool->count += 1;
    return 1;
}

#endif // HAK_TINY_WARM_POOL_H
Implement Warm Pool Secondary Prefill Optimization (Phase B-2c Complete) Problem: Warm pool had 0% hit rate (only 1 hit per 3976 misses) despite being implemented, causing all cache misses to go through expensive superslab_refill registry scans. Root Cause Analysis: - Warm pool was initialized once and pushed a single slab after each refill - When that slab was exhausted, it was discarded (not pushed back) - Next refill would push another single slab, which was immediately exhausted - Pool would oscillate between 0 and 1 items, yielding 0% hit rate Solution: Secondary Prefill on Cache Miss When warm pool becomes empty, we now do multiple superslab_refills and prefill the pool with 3 additional HOT SuperSlabs before attempting to carve. This builds a working set of slabs that can sustain allocation pressure. Implementation Details: - Modified unified_cache_refill() cold path to detect empty pool - Added prefill loop: when pool count == 0, load 3 extra SuperSlabs - Store extra slabs in warm pool, keep 1 in TLS for immediate carving - Track prefill events in g_warm_pool_stats[].prefilled counter Results (1M Random Mixed 256B allocations): - Before: C7 hits=1, misses=3976, hit_rate=0.0% - After: C7 hits=3929, misses=3143, hit_rate=55.6% - Throughput: 4.055M ops/s (maintained vs 4.07M baseline) - Stability: Consistent 55.6% hit rate at 5M allocations (4.102M ops/s) Performance Impact: - No regression: throughput remained stable at ~4.1M ops/s - Registry scan avoided in 55.6% of cache misses (significant savings) - Warm pool now functioning as intended with strong locality Configuration: - TINY_WARM_POOL_MAX_PER_CLASS increased from 4 to 16 to support prefill - Prefill budget hardcoded to 3 (tunable via env var if needed later) - All statistics always compiled, ENV-gated printing via HAKMEM_WARM_POOL_STATS=1 Next Steps: - Monitor for further optimization opportunities (prefill budget tuning) - Consider adaptive prefill budget based on class-specific hit rates - 
Validate at larger allocation counts (10M+ pending registry size fix) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com> 2025-12-04 23:31:54 +09:00			`// tiny_warm_pool.h - Warm Pool Optimization for Unified Cache`
			`// Purpose: Eliminate registry O(N) scan on cache miss by using per-thread warm SuperSlab pools`
			`// Expected Gain: +40-50% throughput (1.06M → 1.5M+ ops/s)`
			`// License: MIT`
			`// Date: 2025-12-04`

			`#ifndef HAK_TINY_WARM_POOL_H`
			`#define HAK_TINY_WARM_POOL_H`

			`#include <stdint.h>`
			`#include "../hakmem_tiny_config.h"`
			`#include "../superslab/superslab_types.h"`

			`// ============================================================================`
			`// Warm Pool Design`
			`// ============================================================================`
			`//`
			`// PROBLEM:`
			`// - unified_cache_refill() scans registry O(N) on every cache miss`
			`// - Registry scan is expensive (~50-100 cycles per miss)`
			`// - Cost grows with number of SuperSlabs per class`
			`//`
			`// SOLUTION:`
			`// - Per-thread warm pool of pre-qualified HOT SuperSlabs`
			`// - O(1) pop from warm pool (no registry scan needed)`
			`// - Pool pre-filled during registry scan (look-ahead)`
			`//`
			`// DESIGN:`
			`// - Thread-local array per class (no synchronization needed)`
			`// - Fixed capacity per class (default: 4 SuperSlabs)`
			`// - LIFO stack (simple pop/push operations)`
			`//`
			`// EXPECTED GAIN:`
			`// - Eliminate registry scan from hot path`
			`// - +40-50% throughput improvement`
			`// - Memory overhead: ~256-512 KB per thread (acceptable)`
			`//`
			`// ============================================================================`

			`// Maximum warm SuperSlabs per thread per class (tunable)`
			`// Trade-off: Working set size vs warm pool effectiveness`
			`// - 4: Original (90% hit rate expected, but broken implementation)`
			`// - 16: Increased to compensate for suboptimal push logic`
			`// - Higher values: More memory but better locality`
			`#define TINY_WARM_POOL_MAX_PER_CLASS 16`

			`typedef struct {`
			`SuperSlab* slabs[TINY_WARM_POOL_MAX_PER_CLASS];`
			`int32_t count;`
			`} TinyWarmPool;`

			`// Per-thread warm pool (one per class)`
			`extern __thread TinyWarmPool g_tiny_warm_pool[TINY_NUM_CLASSES];`

			`// Per-thread warm pool statistics structure`
			`typedef struct {`
			`uint64_t hits; // Warm pool hit count`
			`uint64_t misses; // Warm pool miss count`
			`uint64_t prefilled; // Total SuperSlabs prefilled during registry scans`
			`} TinyWarmPoolStats;`

			`// Per-thread warm pool statistics (for tracking prefill effectiveness)`
			`extern __thread TinyWarmPoolStats g_warm_pool_stats[TINY_NUM_CLASSES];`

			`// ============================================================================`
			`// API: Warm Pool Operations`
			`// ============================================================================`

			`// Initialize warm pool once per thread (lazy)`
			`// Called on first access, sets all counts to 0`
			`static inline void tiny_warm_pool_init_once(void) {`
			`static __thread int initialized = 0;`
			`if (!initialized) {`
			`for (int i = 0; i < TINY_NUM_CLASSES; i++) {`
			`g_tiny_warm_pool[i].count = 0;`
			`}`
			`initialized = 1;`
			`}`
			`}`

			`// O(1) pop from warm pool`
			`// Returns: SuperSlab* (pre-qualified HOT SuperSlab), or NULL if pool empty`
			`static inline SuperSlab* tiny_warm_pool_pop(int class_idx) {`
			`if (g_tiny_warm_pool[class_idx].count > 0) {`
			`return g_tiny_warm_pool[class_idx].slabs[--g_tiny_warm_pool[class_idx].count];`
			`}`
			`return NULL;`
			`}`

			`// O(1) push to warm pool`
			`// Returns: 1 if pushed successfully, 0 if pool full (caller should free to LRU)`
			`static inline int tiny_warm_pool_push(int class_idx, SuperSlab* ss) {`
			`if (g_tiny_warm_pool[class_idx].count < TINY_WARM_POOL_MAX_PER_CLASS) {`
			`g_tiny_warm_pool[class_idx].slabs[g_tiny_warm_pool[class_idx].count++] = ss;`
			`return 1;`
			`}`
			`return 0;`
			`}`

			`// Get current count (for metrics/debugging)`
			`static inline int tiny_warm_pool_count(int class_idx) {`
			`return g_tiny_warm_pool[class_idx].count;`
			`}`

			`// ============================================================================`
			`// Optional: Environment Variable Tuning`
			`// ============================================================================`

			`// Get warm pool capacity from environment (configurable at runtime)`
			`// ENV: HAKMEM_WARM_POOL_SIZE=N (default: 4)`
			`static inline int warm_pool_max_per_class(void) {`
			`static int g_max = -1;`
			`if (__builtin_expect(g_max == -1, 0)) {`
			`const char* env = getenv("HAKMEM_WARM_POOL_SIZE");`
			`if (env && *env) {`
			`int v = atoi(env);`
			`// Clamp to valid range [1, 16]`
			`if (v < 1) v = 1;`
			`if (v > 16) v = 16;`
			`g_max = v;`
			`} else {`
			`g_max = TINY_WARM_POOL_MAX_PER_CLASS;`
			`}`
			`}`
			`return g_max;`
			`}`

			`// Push with environment-configured capacity`
			`static inline int tiny_warm_pool_push_tunable(int class_idx, SuperSlab* ss) {`
			`int capacity = warm_pool_max_per_class();`
			`if (g_tiny_warm_pool[class_idx].count < capacity) {`
			`g_tiny_warm_pool[class_idx].slabs[g_tiny_warm_pool[class_idx].count++] = ss;`
			`return 1;`
			`}`
			`return 0;`
			`}`

			`#endif // HAK_TINY_WARM_POOL_H`