// tiny_warm_pool.h - Warm Pool Optimization for Unified Cache // Purpose: Eliminate registry O(N) scan on cache miss by using per-thread warm SuperSlab pools // Expected Gain: +40-50% throughput (1.06M → 1.5M+ ops/s) // License: MIT // Date: 2025-12-04 #ifndef HAK_TINY_WARM_POOL_H #define HAK_TINY_WARM_POOL_H #include #include "../hakmem_tiny_config.h" #include "../superslab/superslab_types.h" #include "../box/tiny_mem_stats_box.h" // ============================================================================ // Warm Pool Design // ============================================================================ // // PROBLEM: // - unified_cache_refill() scans registry O(N) on every cache miss // - Registry scan is expensive (~50-100 cycles per miss) // - Cost grows with number of SuperSlabs per class // // SOLUTION: // - Per-thread warm pool of pre-qualified HOT SuperSlabs // - O(1) pop from warm pool (no registry scan needed) // - Pool pre-filled during registry scan (look-ahead) // // DESIGN: // - Thread-local array per class (no synchronization needed) // - Fixed capacity per class (default: 4 SuperSlabs) // - LIFO stack (simple pop/push operations) // // EXPECTED GAIN: // - Eliminate registry scan from hot path // - +40-50% throughput improvement // - Memory overhead: ~256-512 KB per thread (acceptable) // // ============================================================================ // Maximum warm SuperSlabs per thread per class (tunable) // Trade-off: Working set size vs warm pool effectiveness // - 4: Original (90% hit rate expected, but broken implementation - hardcoded prefill threshold) // - 12: Optimized capacity with matching prefill threshold (Phase 1) // - Higher values: More memory but better locality #define TINY_WARM_POOL_MAX_PER_CLASS 12 typedef struct { SuperSlab* slabs[TINY_WARM_POOL_MAX_PER_CLASS]; int32_t count; } TinyWarmPool; // Per-thread warm pool (one per class) extern __thread TinyWarmPool g_tiny_warm_pool[TINY_NUM_CLASSES]; // Per-thread warm pool statistics structure typedef struct { uint64_t hits; // Warm pool hit count uint64_t misses; // Warm pool miss count uint64_t prefilled; // Total SuperSlabs prefilled during registry scans } TinyWarmPoolStats; // Per-thread warm pool statistics (for tracking prefill effectiveness) extern __thread TinyWarmPoolStats g_warm_pool_stats[TINY_NUM_CLASSES]; // ============================================================================ // API: Warm Pool Operations // ============================================================================ // Initialize warm pool once per thread (lazy) // Called on first access, sets all counts to 0 static inline void tiny_warm_pool_init_once(void) { static __thread int initialized = 0; if (!initialized) { for (int i = 0; i < TINY_NUM_CLASSES; i++) { g_tiny_warm_pool[i].count = 0; } tiny_mem_stats_add_warm((ssize_t)(sizeof(g_tiny_warm_pool) + sizeof(g_warm_pool_stats))); initialized = 1; } } // O(1) pop from warm pool // Returns: SuperSlab* (pre-qualified HOT SuperSlab), or NULL if pool empty static inline SuperSlab* tiny_warm_pool_pop(int class_idx) { if (g_tiny_warm_pool[class_idx].count > 0) { return g_tiny_warm_pool[class_idx].slabs[--g_tiny_warm_pool[class_idx].count]; } return NULL; } // Get current count (for metrics/debugging) static inline int tiny_warm_pool_count(int class_idx) { return g_tiny_warm_pool[class_idx].count; } // ============================================================================ // Optional: Environment Variable Tuning // ============================================================================ // Get warm pool capacity from environment (configurable at runtime) // ENV: HAKMEM_WARM_POOL_SIZE=N (default: 12) static inline int warm_pool_max_per_class(void) { static int g_max = -1; if (__builtin_expect(g_max == -1, 0)) { const char* env = getenv("HAKMEM_WARM_POOL_SIZE"); if (env && *env) { int v = atoi(env); // Clamp to valid range [1, 12] if (v < 1) v = 1; if (v > 12) v = 12; g_max = v; } else { g_max = TINY_WARM_POOL_MAX_PER_CLASS; } } return g_max; } // O(1) push to warm pool (cap-aware) // cap_hint <=0 → use warm_pool_max_per_class() clamped to TINY_WARM_POOL_MAX_PER_CLASS static inline int tiny_warm_pool_push_with_cap(int class_idx, SuperSlab* ss, int cap_hint) { int limit = cap_hint; if (limit <= 0 || limit > TINY_WARM_POOL_MAX_PER_CLASS) { limit = warm_pool_max_per_class(); if (limit <= 0) { limit = TINY_WARM_POOL_MAX_PER_CLASS; } if (limit > TINY_WARM_POOL_MAX_PER_CLASS) { limit = TINY_WARM_POOL_MAX_PER_CLASS; } } if (g_tiny_warm_pool[class_idx].count < limit) { g_tiny_warm_pool[class_idx].slabs[g_tiny_warm_pool[class_idx].count++] = ss; return 1; } return 0; } // Default push (uses ENV/default cap) static inline int tiny_warm_pool_push(int class_idx, SuperSlab* ss) { return tiny_warm_pool_push_with_cap(class_idx, ss, -1); } // Push with environment-configured capacity (legacy name) static inline int tiny_warm_pool_push_tunable(int class_idx, SuperSlab* ss) { return tiny_warm_pool_push_with_cap(class_idx, ss, warm_pool_max_per_class()); } #endif // HAK_TINY_WARM_POOL_H