// File: hakmem/core/front/tiny_warm_pool.h

// tiny_warm_pool.h - Warm Pool Optimization for Unified Cache
// Purpose: Eliminate registry O(N) scan on cache miss by using per-thread warm SuperSlab pools
// Expected Gain: +40-50% throughput (1.06M → 1.5M+ ops/s)
// License: MIT
// Date: 2025-12-04
#ifndef HAK_TINY_WARM_POOL_H
#define HAK_TINY_WARM_POOL_H
#include <stdint.h>
#include <stdlib.h>
#include "../hakmem_tiny_config.h"
#include "../superslab/superslab_types.h"
#include "../box/tiny_mem_stats_box.h"
// ============================================================================
// Warm Pool Design
// ============================================================================
//
// PROBLEM:
// - unified_cache_refill() scans registry O(N) on every cache miss
// - Registry scan is expensive (~50-100 cycles per miss)
// - Cost grows with number of SuperSlabs per class
//
// SOLUTION:
// - Per-thread warm pool of pre-qualified HOT SuperSlabs
// - O(1) pop from warm pool (no registry scan needed)
// - Pool pre-filled during registry scan (look-ahead)
//
// DESIGN:
// - Thread-local array per class (no synchronization needed)
// - Fixed capacity per class (default: 12 SuperSlabs, see TINY_WARM_POOL_MAX_PER_CLASS)
// - LIFO stack (simple pop/push operations)
//
// EXPECTED GAIN:
// - Eliminate registry scan from hot path
// - +40-50% throughput improvement
// - Memory overhead: ~256-512 KB per thread (acceptable)
//
// ============================================================================
// Maximum warm SuperSlabs per thread per class (compile-time upper bound).
// Trade-off: working set size vs. warm pool effectiveness
// - 4: original value (90% hit rate expected, but the implementation was
//      broken by a hardcoded prefill threshold)
// - 12: optimized capacity with a matching prefill threshold (Phase 1)
// - Higher values: more memory but better locality
// This value also sizes TinyWarmPool::slabs; the runtime cap returned by
// warm_pool_max_per_class() is clamped so it never exceeds it.
#define TINY_WARM_POOL_MAX_PER_CLASS 12
// Fixed-capacity LIFO stack of SuperSlab pointers for one size class.
// tiny_warm_pool_push*() store at slabs[count] and then increment count;
// tiny_warm_pool_pop() decrements count and returns slabs[count].
typedef struct {
// Stack storage; only entries [0, count) are meaningful.
SuperSlab* slabs[TINY_WARM_POOL_MAX_PER_CLASS];
// Number of SuperSlabs currently stored (0 means the pool is empty).
int32_t count;
} TinyWarmPool;
// Per-thread warm pool, one entry per size class, indexed by class_idx.
// Declared here only; the storage is defined in another translation unit.
extern __thread TinyWarmPool g_tiny_warm_pool[TINY_NUM_CLASSES];
// Per-thread warm pool statistics for one size class.
// NOTE(review): none of these counters are updated in this header; the
// increments presumably live in the refill path -- confirm against callers.
typedef struct {
uint64_t hits; // Warm pool hit count
uint64_t misses; // Warm pool miss count
uint64_t prefilled; // Total SuperSlabs prefilled during registry scans
} TinyWarmPoolStats;
// Per-thread warm pool statistics, one entry per size class (for tracking
// prefill effectiveness). Declared here only; the storage is defined in
// another translation unit.
extern __thread TinyWarmPoolStats g_warm_pool_stats[TINY_NUM_CLASSES];
// ============================================================================
// API: Warm Pool Operations
// ============================================================================
// Lazily reset the calling thread's warm pools.
// The first call on a thread zeroes the count of every per-class pool and
// passes the combined size of the pool and stats arrays to
// tiny_mem_stats_add_warm(). Every later call on that thread is a no-op.
static inline void tiny_warm_pool_init_once(void) {
    static __thread int already_done = 0;  // per-thread "ran once" latch

    if (already_done) {
        return;
    }

    int cls = 0;
    while (cls < TINY_NUM_CLASSES) {
        g_tiny_warm_pool[cls].count = 0;
        cls++;
    }

    // Account for both thread-local arrays in one call.
    tiny_mem_stats_add_warm(
        (ssize_t)(sizeof(g_tiny_warm_pool) + sizeof(g_warm_pool_stats)));

    // Latch last, so the reset above has fully completed before we skip it.
    already_done = 1;
}
// Pop the most recently pushed SuperSlab for a size class (LIFO, O(1)).
// class_idx: index into g_tiny_warm_pool; it is not range-checked here.
// Returns: the SuperSlab* taken off the top of the stack, or NULL when the
//          calling thread's pool for this class is empty.
static inline SuperSlab* tiny_warm_pool_pop(int class_idx) {
    const int32_t held = g_tiny_warm_pool[class_idx].count;
    if (held <= 0) {
        return NULL;
    }

    // The top of the stack lives one slot below the current count.
    const int32_t top = held - 1;
    g_tiny_warm_pool[class_idx].count = top;
    return g_tiny_warm_pool[class_idx].slabs[top];
}
// Report how many SuperSlabs the calling thread's warm pool currently holds
// for a size class (intended for metrics/debugging).
// class_idx: index into g_tiny_warm_pool; it is not range-checked here.
static inline int tiny_warm_pool_count(int class_idx) {
    const int held = g_tiny_warm_pool[class_idx].count;
    return held;
}
// ============================================================================
// Optional: Environment Variable Tuning
// ============================================================================
// Resolve the per-class warm pool capacity, honoring an environment override.
// ENV: HAKMEM_WARM_POOL_SIZE=N (default: TINY_WARM_POOL_MAX_PER_CLASS)
// Returns: a capacity in [1, TINY_WARM_POOL_MAX_PER_CLASS].
//
// The environment is consulted on the first call only; the result is cached
// in a function-local static and returned directly afterwards.
//
// Parsing notes:
// - strtol() is used instead of atoi(): values that do not fit saturate to
//   LONG_MIN/LONG_MAX and are then clamped, rather than being undefined.
// - Text with no leading number parses as 0 and is therefore clamped to 1.
// - The upper clamp is TINY_WARM_POOL_MAX_PER_CLASS (the size of the slabs
//   array), not a separate literal, so the two cannot drift apart.
//
// The cache is a plain static (neither thread-local nor atomic). Concurrent
// first calls are not synchronized; each stores the same value as long as the
// environment is not modified at the same time.
static inline int warm_pool_max_per_class(void) {
    static int g_max = -1;  // -1 = not resolved yet

    if (__builtin_expect(g_max == -1, 0)) {
        int resolved = TINY_WARM_POOL_MAX_PER_CLASS;

        const char *env = getenv("HAKMEM_WARM_POOL_SIZE");
        if (env && *env) {
            long v = strtol(env, NULL, 10);
            // Clamp while still a long so the narrowing below is always safe.
            if (v < 1) {
                v = 1;
            }
            if (v > TINY_WARM_POOL_MAX_PER_CLASS) {
                v = TINY_WARM_POOL_MAX_PER_CLASS;
            }
            resolved = (int)v;
        }

        // Single store of the final value: readers never see a partial result.
        g_max = resolved;
    }
    return g_max;
}
// Push a SuperSlab onto the calling thread's warm pool with an explicit cap (O(1)).
// class_idx: index into g_tiny_warm_pool; it is not range-checked here.
// ss:        pointer to store; it is stored as-is (no NULL check).
// cap_hint:  a value in [1, TINY_WARM_POOL_MAX_PER_CLASS] is used directly as
//            the cap. Any other value (<= 0 or above the maximum) selects
//            warm_pool_max_per_class(), forced back to
//            TINY_WARM_POOL_MAX_PER_CLASS if that result is itself out of range.
// Returns: 1 if ss was stored, 0 if the pool already holds cap (or more) entries.
static inline int tiny_warm_pool_push_with_cap(int class_idx, SuperSlab* ss, int cap_hint) {
    const int hard_max = TINY_WARM_POOL_MAX_PER_CLASS;

    int cap = cap_hint;
    if (cap < 1 || cap > hard_max) {
        cap = warm_pool_max_per_class();
        // Defensive: never let the effective cap exceed the array size.
        if (cap < 1 || cap > hard_max) {
            cap = hard_max;
        }
    }

    const int32_t used = g_tiny_warm_pool[class_idx].count;
    if (used >= cap) {
        return 0;
    }

    g_tiny_warm_pool[class_idx].slabs[used] = ss;
    g_tiny_warm_pool[class_idx].count = used + 1;
    return 1;
}
// Push using the default cap.
// Forwards to tiny_warm_pool_push_with_cap() with a non-positive hint, which
// makes it fall back to warm_pool_max_per_class().
// Returns: 1 if ss was stored, 0 if the pool was already at its cap.
static inline int tiny_warm_pool_push(int class_idx, SuperSlab* ss) {
    const int use_default_cap = -1;  // any value <= 0 selects the ENV/default cap
    return tiny_warm_pool_push_with_cap(class_idx, ss, use_default_cap);
}
// Push using the environment-configured cap (legacy name).
// Looks up warm_pool_max_per_class() and passes it explicitly as the cap hint
// to tiny_warm_pool_push_with_cap().
// Returns: 1 if ss was stored, 0 if the pool was already at its cap.
static inline int tiny_warm_pool_push_tunable(int class_idx, SuperSlab* ss) {
    const int env_cap = warm_pool_max_per_class();
    return tiny_warm_pool_push_with_cap(class_idx, ss, env_cap);
}
#endif // HAK_TINY_WARM_POOL_H