// tiny_warm_pool.h - Warm Pool Optimization for Unified Cache // Purpose: Eliminate registry O(N) scan on cache miss by using per-thread warm SuperSlab pools // Expected Gain: +40-50% throughput (1.06M → 1.5M+ ops/s) // License: MIT // Date: 2025-12-04 #ifndef HAK_TINY_WARM_POOL_H #define HAK_TINY_WARM_POOL_H #include #include "../hakmem_tiny_config.h" #include "../superslab/superslab_types.h" // ============================================================================ // Warm Pool Design // ============================================================================ // // PROBLEM: // - unified_cache_refill() scans registry O(N) on every cache miss // - Registry scan is expensive (~50-100 cycles per miss) // - Cost grows with number of SuperSlabs per class // // SOLUTION: // - Per-thread warm pool of pre-qualified HOT SuperSlabs // - O(1) pop from warm pool (no registry scan needed) // - Pool pre-filled during registry scan (look-ahead) // // DESIGN: // - Thread-local array per class (no synchronization needed) // - Fixed capacity per class (default: 4 SuperSlabs) // - LIFO stack (simple pop/push operations) // // EXPECTED GAIN: // - Eliminate registry scan from hot path // - +40-50% throughput improvement // - Memory overhead: ~256-512 KB per thread (acceptable) // // ============================================================================ // Maximum warm SuperSlabs per thread per class (tunable) // Trade-off: Working set size vs warm pool effectiveness // - 4: Original (90% hit rate expected, but broken implementation) // - 16: Increased to compensate for suboptimal push logic // - Higher values: More memory but better locality #define TINY_WARM_POOL_MAX_PER_CLASS 16 typedef struct { SuperSlab* slabs[TINY_WARM_POOL_MAX_PER_CLASS]; int32_t count; } TinyWarmPool; // Per-thread warm pool (one per class) extern __thread TinyWarmPool g_tiny_warm_pool[TINY_NUM_CLASSES]; // Per-thread warm pool statistics structure typedef struct { uint64_t hits; // Warm pool hit count uint64_t misses; // Warm pool miss count uint64_t prefilled; // Total SuperSlabs prefilled during registry scans } TinyWarmPoolStats; // Per-thread warm pool statistics (for tracking prefill effectiveness) extern __thread TinyWarmPoolStats g_warm_pool_stats[TINY_NUM_CLASSES]; // ============================================================================ // API: Warm Pool Operations // ============================================================================ // Initialize warm pool once per thread (lazy) // Called on first access, sets all counts to 0 static inline void tiny_warm_pool_init_once(void) { static __thread int initialized = 0; if (!initialized) { for (int i = 0; i < TINY_NUM_CLASSES; i++) { g_tiny_warm_pool[i].count = 0; } initialized = 1; } } // O(1) pop from warm pool // Returns: SuperSlab* (pre-qualified HOT SuperSlab), or NULL if pool empty static inline SuperSlab* tiny_warm_pool_pop(int class_idx) { if (g_tiny_warm_pool[class_idx].count > 0) { return g_tiny_warm_pool[class_idx].slabs[--g_tiny_warm_pool[class_idx].count]; } return NULL; } // O(1) push to warm pool // Returns: 1 if pushed successfully, 0 if pool full (caller should free to LRU) static inline int tiny_warm_pool_push(int class_idx, SuperSlab* ss) { if (g_tiny_warm_pool[class_idx].count < TINY_WARM_POOL_MAX_PER_CLASS) { g_tiny_warm_pool[class_idx].slabs[g_tiny_warm_pool[class_idx].count++] = ss; return 1; } return 0; } // Get current count (for metrics/debugging) static inline int tiny_warm_pool_count(int class_idx) { return g_tiny_warm_pool[class_idx].count; } // ============================================================================ // Optional: Environment Variable Tuning // ============================================================================ // Get warm pool capacity from environment (configurable at runtime) // ENV: HAKMEM_WARM_POOL_SIZE=N (default: 4) static inline int warm_pool_max_per_class(void) { static int g_max = -1; if (__builtin_expect(g_max == -1, 0)) { const char* env = getenv("HAKMEM_WARM_POOL_SIZE"); if (env && *env) { int v = atoi(env); // Clamp to valid range [1, 16] if (v < 1) v = 1; if (v > 16) v = 16; g_max = v; } else { g_max = TINY_WARM_POOL_MAX_PER_CLASS; } } return g_max; } // Push with environment-configured capacity static inline int tiny_warm_pool_push_tunable(int class_idx, SuperSlab* ss) { int capacity = warm_pool_max_per_class(); if (g_tiny_warm_pool[class_idx].count < capacity) { g_tiny_warm_pool[class_idx].slabs[g_tiny_warm_pool[class_idx].count++] = ss; return 1; } return 0; } #endif // HAK_TINY_WARM_POOL_H