Phase 70-0: Infrastructure prep for refill tuning (Observability + WarmPool Sizing)

- Observability Fix: Enabled unified cache hit/miss stats in release builds when HAKMEM_UNIFIED_CACHE_STATS_COMPILED is set.
- WarmPool Sizing: Decoupled hardcoded '12' from prefill logic; now uses TINY_WARM_POOL_DEFAULT_PER_CLASS macro and respects ENV overrides.
- Increased TINY_WARM_POOL_MAX_PER_CLASS to 32 to support wider ENV sweeps.
- Added unified_cache_auto_stats destructor to dump metrics at exit (replacing debug print hack).
This commit is contained in:
Moe Charm (CI)
2025-12-18 03:02:40 +09:00
parent b6212bbe31
commit a6ab262ad2
5 changed files with 32 additions and 26 deletions

View File

@@ -50,13 +50,13 @@
// Debug Metrics (Zero Overhead in Release)
// ============================================================================
#if !HAKMEM_BUILD_RELEASE
// Increment cache hit counter (debug only)
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
// Increment cache hit counter (debug/observe only; zero overhead when compiled-out)
#define TINY_HOT_METRICS_HIT(class_idx) \
do { extern __thread uint64_t g_unified_cache_hit[]; \
g_unified_cache_hit[class_idx]++; } while(0)
// Increment cache miss counter (debug only)
// Increment cache miss counter (debug/observe only; zero overhead when compiled-out)
#define TINY_HOT_METRICS_MISS(class_idx) \
do { extern __thread uint64_t g_unified_cache_miss[]; \
g_unified_cache_miss[class_idx]++; } while(0)

View File

@@ -97,7 +97,7 @@ __thread TinyWarmPool g_tiny_warm_pool[TINY_NUM_CLASSES] = {0};
// Metrics (Phase 23, optional for debugging)
// ============================================================================
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
__thread uint64_t g_unified_cache_hit[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_miss[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_push[TINY_NUM_CLASSES] = {0};
@@ -262,7 +262,7 @@ void unified_cache_shutdown(void) {
void unified_cache_print_stats(void) {
if (!unified_cache_enabled()) return;
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
fprintf(stderr, "\n[Unified-STATS] Unified Cache Metrics:\n");
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
@@ -296,6 +296,11 @@ void unified_cache_print_stats(void) {
#endif
}
// Auto-dump unified cache metrics at process exit (GCC/Clang destructor).
// Replaces the earlier debug-print hack. Registering unconditionally is safe:
// unified_cache_print_stats() returns early when the cache is disabled, and
// its stats body is compiled out unless debug or
// HAKMEM_UNIFIED_CACHE_STATS_COMPILED is set.
// NOTE(review): destructor ordering vs. other atexit/destructor hooks is
// unspecified across TUs — confirm no other teardown frees stderr buffering
// state before this runs.
__attribute__((destructor))
static void unified_cache_auto_stats(void) {
unified_cache_print_stats();
}
// ============================================================================
// Warm Pool Stats (always compiled, ENV-gated at runtime)
// ============================================================================

View File

@@ -92,7 +92,7 @@ extern __thread TinyUnifiedCache g_unified_cache[TINY_NUM_CLASSES];
// Metrics (Phase 23, optional for debugging)
// ============================================================================
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
extern __thread uint64_t g_unified_cache_hit[TINY_NUM_CLASSES]; // Alloc hits
extern __thread uint64_t g_unified_cache_miss[TINY_NUM_CLASSES]; // Alloc misses
extern __thread uint64_t g_unified_cache_push[TINY_NUM_CLASSES]; // Free pushes
@@ -143,7 +143,7 @@ static inline size_t unified_capacity(int class_idx) {
while (pow2 < g_cap[class_idx]) pow2 *= 2;
g_cap[class_idx] = pow2;
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
fprintf(stderr, "[Unified-INIT] C%d capacity = %zu (power of 2)\n", class_idx, g_cap[class_idx]);
fflush(stderr);
#endif
@@ -197,7 +197,7 @@ static inline hak_base_ptr_t unified_cache_pop(int class_idx) {
// Empty check
if (__builtin_expect(cache->head == cache->tail, 0)) {
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
g_unified_cache_miss[class_idx]++;
#endif
return HAK_BASE_FROM_RAW(NULL); // Empty
@@ -207,7 +207,7 @@ static inline hak_base_ptr_t unified_cache_pop(int class_idx) {
void* base = cache->slots[cache->head]; // 1 cache miss (array access)
cache->head = (cache->head + 1) & cache->mask; // Fast modulo (power of 2)
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
g_unified_cache_hit[class_idx]++;
#endif
@@ -251,7 +251,7 @@ static inline int unified_cache_push(int class_idx, hak_base_ptr_t base) {
// Full check (leave 1 slot empty to distinguish full/empty)
if (__builtin_expect(next_tail == cache->head, 0)) {
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
g_unified_cache_full[class_idx]++;
#endif
return 0; // Full
@@ -261,7 +261,7 @@ static inline int unified_cache_push(int class_idx, hak_base_ptr_t base) {
cache->slots[cache->tail] = base_raw; // 1 cache miss (array write)
cache->tail = next_tail;
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
g_unified_cache_push[class_idx]++;
#endif
@@ -300,7 +300,7 @@ static inline hak_base_ptr_t unified_cache_pop_or_refill(int class_idx) {
// Phase 22: Compile-out when disabled (default OFF)
void* tcache_base = tiny_tcache_try_pop(class_idx);
if (tcache_base != NULL) {
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
g_unified_cache_hit[class_idx]++;
#endif
#if HAKMEM_TINY_UNIFIED_CACHE_MEASURE_COMPILED
@@ -320,7 +320,7 @@ static inline hak_base_ptr_t unified_cache_pop_or_refill(int class_idx) {
if (__builtin_expect(cache->head != cache->tail, 1)) {
void* base = cache->slots[cache->head]; // 1 cache miss (array access)
cache->head = (cache->head + 1) & cache->mask;
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
g_unified_cache_hit[class_idx]++;
#endif
#if HAKMEM_TINY_UNIFIED_CACHE_MEASURE_COMPILED
@@ -337,7 +337,7 @@ static inline hak_base_ptr_t unified_cache_pop_or_refill(int class_idx) {
}
// Cache miss → Batch refill from SuperSlab
#if !HAKMEM_BUILD_RELEASE
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
g_unified_cache_miss[class_idx]++;
#endif
return unified_cache_refill(class_idx); // Refill + return first block (BASE)

View File

@@ -38,12 +38,11 @@
//
// ============================================================================
// Maximum warm SuperSlabs per thread per class (tunable)
// Trade-off: Working set size vs warm pool effectiveness
// - 4: Original (90% hit rate expected, but broken implementation - hardcoded prefill threshold)
// - 12: Optimized capacity with matching prefill threshold (Phase 1)
// - Higher values: More memory but better locality
#define TINY_WARM_POOL_MAX_PER_CLASS 12
// Warm pool sizing policy:
// - DEFAULT is the value used when ENV is unset (keeps TLS footprint reasonable).
// - MAX is the absolute cap used for array sizing and clamping (allows safe ENV sweeps).
#define TINY_WARM_POOL_DEFAULT_PER_CLASS 12
#define TINY_WARM_POOL_MAX_PER_CLASS 32
typedef struct {
SuperSlab* slabs[TINY_WARM_POOL_MAX_PER_CLASS];
@@ -106,12 +105,12 @@ static inline int warm_pool_max_per_class(void) {
const char* env = getenv("HAKMEM_WARM_POOL_SIZE");
if (env && *env) {
int v = atoi(env);
// Clamp to valid range [1, 12]
// Clamp to valid range [1, TINY_WARM_POOL_MAX_PER_CLASS]
if (v < 1) v = 1;
if (v > 12) v = 12;
if (v > TINY_WARM_POOL_MAX_PER_CLASS) v = TINY_WARM_POOL_MAX_PER_CLASS;
g_max = v;
} else {
g_max = TINY_WARM_POOL_MAX_PER_CLASS;
g_max = TINY_WARM_POOL_DEFAULT_PER_CLASS;
}
}
return g_max;

View File

@@ -266,6 +266,9 @@ sp_acquire_from_empty_scan(int class_idx, SuperSlab** ss_out, int* slab_idx_out,
SuperSlab* primary_result = NULL;
int primary_slab_idx = -1;
// Cache warm pool cap once per acquire call (SSOT: same as unified_cache_refill()).
const int warm_cap = warm_pool_max_per_class();
for (int i = 0; i < scan_limit; i++) {
SuperSlab* ss = super_reg_by_class_at(class_idx, i);
if (!(ss && ss->magic == SUPERSLAB_MAGIC)) continue;
@@ -274,9 +277,8 @@
if (ss->empty_count == 0) continue; // No EMPTY slabs in this SS
// WARM POOL PREFILL: Add HOT SuperSlabs to warm pool (if not already primary result)
// This is low-cost during registry scan and avoids future expensive scans
// Phase 1: Increase threshold from 4 to 12 to match TINY_WARM_POOL_MAX_PER_CLASS
if (ss != primary_result && tiny_warm_pool_count(class_idx) < 12) {
// This is low-cost during registry scan and avoids future expensive scans.
if (ss != primary_result && tiny_warm_pool_count(class_idx) < warm_cap) {
tiny_warm_pool_push(class_idx, ss);
// Track prefilled SuperSlabs for metrics
g_warm_pool_stats[class_idx].prefilled++;