Phase 70-0: Infrastructure prep for refill tuning (Observability + WarmPool Sizing)

- Observability Fix: Enabled unified cache hit/miss stats in release builds when HAKMEM_UNIFIED_CACHE_STATS_COMPILED is set.
- WarmPool Sizing: Decoupled hardcoded '12' from prefill logic; now uses TINY_WARM_POOL_DEFAULT_PER_CLASS macro and respects ENV overrides.
- Increased TINY_WARM_POOL_MAX_PER_CLASS to 32 to support wider ENV sweeps.
- Added unified_cache_auto_stats destructor to dump metrics at exit (replacing debug print hack).
This commit is contained in:
Moe Charm (CI)
2025-12-18 03:02:40 +09:00
parent b6212bbe31
commit a6ab262ad2
5 changed files with 32 additions and 26 deletions

View File

@@ -50,13 +50,13 @@
// Debug Metrics (Zero Overhead in Release)
// ============================================================================
#if !HAKMEM_BUILD_RELEASE
// Increment cache hit counter (debug only)
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
// Increment cache hit counter (debug/observe only; zero overhead when compiled-out)
#define TINY_HOT_METRICS_HIT(class_idx) \
do { extern __thread uint64_t g_unified_cache_hit[]; \
g_unified_cache_hit[class_idx]++; } while(0)
// Increment cache miss counter (debug only)
// Increment cache miss counter (debug/observe only; zero overhead when compiled-out)
#define TINY_HOT_METRICS_MISS(class_idx) \
do { extern __thread uint64_t g_unified_cache_miss[]; \
g_unified_cache_miss[class_idx]++; } while(0)

View File

@@ -97,7 +97,7 @@ __thread TinyWarmPool g_tiny_warm_pool[TINY_NUM_CLASSES] = {0};
// Metrics (Phase 23, optional for debugging)
// ============================================================================
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
__thread uint64_t g_unified_cache_hit[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_miss[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_push[TINY_NUM_CLASSES] = {0};
@@ -262,7 +262,7 @@ void unified_cache_shutdown(void) {
void unified_cache_print_stats(void) {
if (!unified_cache_enabled()) return;
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
fprintf(stderr, "\n[Unified-STATS] Unified Cache Metrics:\n");
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
@@ -296,6 +296,11 @@ void unified_cache_print_stats(void) {
#endif
}
// Auto-dump unified cache metrics at process exit (GCC/Clang destructor).
// Replaces the earlier debug-print hack. Registering unconditionally is safe:
// unified_cache_print_stats() returns early when the cache is disabled, and
// its stats body is compiled out unless debug or
// HAKMEM_UNIFIED_CACHE_STATS_COMPILED is set.
// NOTE(review): destructor ordering vs. other atexit/destructor hooks is
// unspecified across TUs — confirm no other teardown frees stderr buffering
// state before this runs.
__attribute__((destructor))
static void unified_cache_auto_stats(void) {
unified_cache_print_stats();
}
// ============================================================================
// Warm Pool Stats (always compiled, ENV-gated at runtime)
// ============================================================================

View File

@@ -92,7 +92,7 @@ extern __thread TinyUnifiedCache g_unified_cache[TINY_NUM_CLASSES];
// Metrics (Phase 23, optional for debugging)
// ============================================================================
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
extern __thread uint64_t g_unified_cache_hit[TINY_NUM_CLASSES]; // Alloc hits
extern __thread uint64_t g_unified_cache_miss[TINY_NUM_CLASSES]; // Alloc misses
extern __thread uint64_t g_unified_cache_push[TINY_NUM_CLASSES]; // Free pushes
@@ -143,7 +143,7 @@ static inline size_t unified_capacity(int class_idx) {
while (pow2 < g_cap[class_idx]) pow2 *= 2;
g_cap[class_idx] = pow2;
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
fprintf(stderr, "[Unified-INIT] C%d capacity = %zu (power of 2)\n", class_idx, g_cap[class_idx]);
fflush(stderr);
#endif
@@ -197,7 +197,7 @@ static inline hak_base_ptr_t unified_cache_pop(int class_idx) {
// Empty check
if (__builtin_expect(cache->head == cache->tail, 0)) {
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
g_unified_cache_miss[class_idx]++;
#endif
return HAK_BASE_FROM_RAW(NULL); // Empty
@@ -207,7 +207,7 @@ static inline hak_base_ptr_t unified_cache_pop(int class_idx) {
void* base = cache->slots[cache->head]; // 1 cache miss (array access)
cache->head = (cache->head + 1) & cache->mask; // Fast modulo (power of 2)
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
g_unified_cache_hit[class_idx]++;
#endif
@@ -251,7 +251,7 @@ static inline int unified_cache_push(int class_idx, hak_base_ptr_t base) {
// Full check (leave 1 slot empty to distinguish full/empty)
if (__builtin_expect(next_tail == cache->head, 0)) {
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
g_unified_cache_full[class_idx]++;
#endif
return 0; // Full
@@ -261,7 +261,7 @@ static inline int unified_cache_push(int class_idx, hak_base_ptr_t base) {
cache->slots[cache->tail] = base_raw; // 1 cache miss (array write)
cache->tail = next_tail;
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
g_unified_cache_push[class_idx]++;
#endif
@@ -300,7 +300,7 @@ static inline hak_base_ptr_t unified_cache_pop_or_refill(int class_idx) {
// Phase 22: Compile-out when disabled (default OFF)
void* tcache_base = tiny_tcache_try_pop(class_idx);
if (tcache_base != NULL) {
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
g_unified_cache_hit[class_idx]++;
#endif
#if HAKMEM_TINY_UNIFIED_CACHE_MEASURE_COMPILED
@@ -320,7 +320,7 @@ static inline hak_base_ptr_t unified_cache_pop_or_refill(int class_idx) {
if (__builtin_expect(cache->head != cache->tail, 1)) {
void* base = cache->slots[cache->head]; // 1 cache miss (array access)
cache->head = (cache->head + 1) & cache->mask;
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
g_unified_cache_hit[class_idx]++;
#endif
#if HAKMEM_TINY_UNIFIED_CACHE_MEASURE_COMPILED
@@ -337,7 +337,7 @@ static inline hak_base_ptr_t unified_cache_pop_or_refill(int class_idx) {
}
// Cache miss → Batch refill from SuperSlab
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
g_unified_cache_miss[class_idx]++;
#endif
return unified_cache_refill(class_idx); // Refill + return first block (BASE)

View File

@@ -38,12 +38,11 @@
//
// ============================================================================
// Maximum warm SuperSlabs per thread per class (tunable)
// Trade-off: Working set size vs warm pool effectiveness
// - 4: Original (90% hit rate expected, but broken implementation - hardcoded prefill threshold)
// - 12: Optimized capacity with matching prefill threshold (Phase 1)
// - Higher values: More memory but better locality
#define TINY_WARM_POOL_MAX_PER_CLASS 12
// Warm pool sizing policy:
// - DEFAULT is the value used when ENV is unset (keeps TLS footprint reasonable).
// - MAX is the absolute cap used for array sizing and clamping (allows safe ENV sweeps).
#define TINY_WARM_POOL_DEFAULT_PER_CLASS 12
#define TINY_WARM_POOL_MAX_PER_CLASS 32
typedef struct {
SuperSlab* slabs[TINY_WARM_POOL_MAX_PER_CLASS];
@@ -106,12 +105,12 @@ static inline int warm_pool_max_per_class(void) {
const char* env = getenv("HAKMEM_WARM_POOL_SIZE");
if (env && *env) {
int v = atoi(env);
// Clamp to valid range [1, 12]
// Clamp to valid range [1, TINY_WARM_POOL_MAX_PER_CLASS]
if (v < 1) v = 1;
if (v > 12) v = 12;
if (v > TINY_WARM_POOL_MAX_PER_CLASS) v = TINY_WARM_POOL_MAX_PER_CLASS;
g_max = v;
} else {
g_max = TINY_WARM_POOL_MAX_PER_CLASS;
g_max = TINY_WARM_POOL_DEFAULT_PER_CLASS;
}
}
return g_max;

View File

@@ -266,6 +266,9 @@ sp_acquire_from_empty_scan(int class_idx, SuperSlab** ss_out, int* slab_idx_out,
SuperSlab* primary_result = NULL;
int primary_slab_idx = -1;
// Cache warm pool cap once per acquire call (SSOT: same as unified_cache_refill()).
const int warm_cap = warm_pool_max_per_class();
for (int i = 0; i < scan_limit; i++) {
SuperSlab* ss = super_reg_by_class_at(class_idx, i);
if (!(ss && ss->magic == SUPERSLAB_MAGIC)) continue;
@@ -274,9 +277,8 @@
if (ss->empty_count == 0) continue; // No EMPTY slabs in this SS
// WARM POOL PREFILL: Add HOT SuperSlabs to warm pool (if not already primary result)
// This is low-cost during registry scan and avoids future expensive scans
// Phase 1: Increase threshold from 4 to 12 to match TINY_WARM_POOL_MAX_PER_CLASS
if (ss != primary_result && tiny_warm_pool_count(class_idx) < 12) {
// This is low-cost during registry scan and avoids future expensive scans.
if (ss != primary_result && tiny_warm_pool_count(class_idx) < warm_cap) {
tiny_warm_pool_push(class_idx, ss);
// Track prefilled SuperSlabs for metrics
g_warm_pool_stats[class_idx].prefilled++;