hakmem/core/front/tiny_unified_cache.c
Moe Charm (CI) 4c986fa9d1 Feat: Add experimental TLS Bind Box path in Unified Cache
- Added experimental path in unified_cache_refill to test ss_tls_bind_one for C7 class.
- Guarded by HAKMEM_WARM_TLS_BIND_C7 env var and debug build.
- Updated Page Box comments to clarify future TLS Bind Box integration.
2025-12-05 20:05:11 +09:00

// tiny_unified_cache.c - Phase 23: Unified Frontend Cache Implementation
#include "tiny_unified_cache.h"
#include "tiny_warm_pool.h" // Warm Pool: O(1) SuperSlab lookup
#include "../tiny_tls.h" // Phase 23-E: TinyTLSSlab, TinySlabMeta
#include "../tiny_box_geometry.h" // Phase 23-E: tiny_stride_for_class, tiny_slab_base_for_geometry
#include "../box/tiny_next_ptr_box.h" // Phase 23-E: tiny_next_read (freelist traversal)
#include "../hakmem_tiny_superslab.h" // Phase 23-E: SuperSlab, superslab_refill()
#include "../superslab/superslab_inline.h" // Phase 23-E: ss_active_add, slab_index_for, ss_slabs_capacity
#include "../hakmem_super_registry.h" // For hak_super_lookup (pointer→SuperSlab)
#include "../box/pagefault_telemetry_box.h" // Phase 24: Box PageFaultTelemetry (Tiny page touch stats)
#include "../box/ss_tier_box.h" // For ss_tier_is_hot() tier checks
#include "../box/ss_slab_meta_box.h" // For ss_active_add() and slab metadata operations
#include "../box/warm_pool_stats_box.h" // Box: Warm Pool Statistics Recording (inline)
#include "../box/slab_carve_box.h" // Box: Slab Carving (inline O(slabs) scan)
#include "../box/warm_pool_prefill_box.h" // Box: Warm Pool Prefill (secondary optimization)
#include "../hakmem_env_cache.h" // Priority-2: ENV cache (eliminate syscalls)
#include "../box/tiny_page_box.h" // Tiny-Plus Page Box (C5C7 initial hook)
#include "../box/ss_tls_bind_box.h" // Box: TLS Bind (SuperSlab -> TLS binding)
#include <stdlib.h>
#include <string.h>
#include <stdatomic.h>
#include <time.h>
#include <stdio.h>   // fprintf/fflush/stderr (diagnostics throughout this file)
#include <pthread.h> // pthread_self (debug-only TLS bind probe below)
// ============================================================================
// Performance Measurement: Unified Cache (ENV-gated)
// ============================================================================
// Global atomic counters for unified cache performance measurement
// ENV: HAKMEM_MEASURE_UNIFIED_CACHE=1 to enable (default: OFF)
_Atomic uint64_t g_unified_cache_hits_global = 0;
_Atomic uint64_t g_unified_cache_misses_global = 0;
_Atomic uint64_t g_unified_cache_refill_cycles_global = 0;
// Per-class counters (per-Tiny-class Unified Cache observation)
_Atomic uint64_t g_unified_cache_hits_by_class[TINY_NUM_CLASSES] = {0};
_Atomic uint64_t g_unified_cache_misses_by_class[TINY_NUM_CLASSES] = {0};
_Atomic uint64_t g_unified_cache_refill_cycles_by_class[TINY_NUM_CLASSES] = {0};
// Helper: Get cycle count (x86_64 rdtsc)
static inline uint64_t read_tsc(void) {
#if defined(__x86_64__) || defined(_M_X64)
uint32_t lo, hi;
__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
return ((uint64_t)hi << 32) | lo;
#else
// Fallback to clock_gettime for non-x86 platforms
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
#endif
}
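// Note (editorial): rdtsc is not a serializing instruction, so individual
// readings can be perturbed by out-of-order execution; for the coarse
// aggregate statistics collected here that imprecision is acceptable. TSC
// ticks are also not wall time; see the 1GHz conversion note in
// unified_cache_print_measurements() at the bottom of this file.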
// Check if measurement is enabled (cached)
static inline int unified_cache_measure_enabled(void) {
static int g_measure = -1;
if (__builtin_expect(g_measure == -1, 0)) {
const char* e = getenv("HAKMEM_MEASURE_UNIFIED_CACHE");
g_measure = (e && *e && *e != '0') ? 1 : 0;
}
return g_measure;
}
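// Usage sketch (hypothetical invocation; the binary name is illustrative):
//   HAKMEM_MEASURE_UNIFIED_CACHE=1 ./bench_tiny
// Hits/misses/cycles accumulate in the atomics above and are reported by
// unified_cache_print_measurements().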
// Phase 23-E: Forward declarations
extern __thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES]; // From hakmem_tiny_superslab.c
extern void ss_active_add(SuperSlab* ss, uint32_t n); // From hakmem_tiny_ss_active_box.inc
// ============================================================================
// TLS Variables (defined here, extern in header)
// ============================================================================
__thread TinyUnifiedCache g_unified_cache[TINY_NUM_CLASSES];
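// Fast-path sketch (assumed shape; the actual pop lives in tiny_unified_cache.h,
// this is editorial illustration of the head/tail ring semantics only):
//   TinyUnifiedCache* c = &g_unified_cache[cls];
//   if (c->head != c->tail) {                     // non-empty -> hit
//       void* p = c->slots[c->head];
//       c->head = (c->head + 1) & c->mask;
//       return p;
//   }
//   return unified_cache_refill(cls);             // miss -> slow path in this file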
// Warm Pool: Per-thread warm SuperSlab pools (one per class)
__thread TinyWarmPool g_tiny_warm_pool[TINY_NUM_CLASSES] = {0};
// ============================================================================
// Metrics (Phase 23, optional for debugging)
// ============================================================================
#if !HAKMEM_BUILD_RELEASE
__thread uint64_t g_unified_cache_hit[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_miss[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_push[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_full[TINY_NUM_CLASSES] = {0};
#endif
// Warm Pool metrics (definition - declared in tiny_warm_pool.h as extern)
// Note: These are kept outside !HAKMEM_BUILD_RELEASE for profiling in release builds
__thread TinyWarmPoolStats g_warm_pool_stats[TINY_NUM_CLASSES] = {0};
#if !HAKMEM_BUILD_RELEASE
// Debug-only diagnostics for Warm Pool effectiveness
_Atomic uint64_t g_dbg_warm_prefill_attempts = 0;
_Atomic uint64_t g_dbg_warm_prefill_refill_fail = 0;
_Atomic uint64_t g_dbg_warm_prefill_push_ok = 0;
_Atomic uint64_t g_dbg_warm_prefill_push_full = 0;
_Atomic uint64_t g_dbg_warm_pop_attempts = 0;
_Atomic uint64_t g_dbg_warm_pop_hits = 0;
_Atomic uint64_t g_dbg_warm_pop_empty = 0;
_Atomic uint64_t g_dbg_warm_pop_carve_zero = 0;
#endif
// Forward declaration for Warm Pool stats printer (defined later in this file)
static inline void tiny_warm_pool_print_stats(void);
// ============================================================================
// Phase 8-Step1-Fix: unified_cache_enabled() implementation (non-static)
// ============================================================================
// Enable flag (default: ON, disable with HAKMEM_TINY_UNIFIED_CACHE=0)
int unified_cache_enabled(void) {
// Priority-2: Use cached ENV (eliminate lazy-init static overhead)
static int g_enable = -1;
if (__builtin_expect(g_enable == -1, 0)) {
g_enable = HAK_ENV_TINY_UNIFIED_CACHE();
#if !HAKMEM_BUILD_RELEASE
if (g_enable) {
fprintf(stderr, "[Unified-INIT] unified_cache_enabled() = %d\n", g_enable);
fflush(stderr);
}
#endif
}
return g_enable;
}
// ============================================================================
// Init (called at thread start or lazy on first access)
// ============================================================================
void unified_cache_init(void) {
if (!unified_cache_enabled()) return;
// Layer 2 Defensive Fix: Use __libc_calloc for infrastructure allocations
// Rationale: Cache arrays are infrastructure (not workload), bypass HAKMEM entirely
// This prevents interaction with BenchFast mode and ensures clean separation
extern void* __libc_calloc(size_t, size_t);
// Initialize all classes (C0-C7)
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
if (g_unified_cache[cls].slots != NULL) continue; // Already initialized
size_t cap = unified_capacity(cls);
g_unified_cache[cls].slots = (void**)__libc_calloc(cap, sizeof(void*));
if (!g_unified_cache[cls].slots) {
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "[Unified-INIT] Failed to allocate C%d cache (%zu slots)\n", cls, cap);
fflush(stderr);
#endif
continue; // Skip this class, try others
}
g_unified_cache[cls].capacity = (uint16_t)cap;
g_unified_cache[cls].mask = (uint16_t)(cap - 1);
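// Assumption (editorial): unified_capacity() returns a power of two, so
// (cap - 1) is a valid wrap mask for the (index + 1) & mask advances used
// by the push/pop/refill paths below.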
g_unified_cache[cls].head = 0;
g_unified_cache[cls].tail = 0;
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "[Unified-INIT] C%d: %zu slots (%zu bytes)\n",
cls, cap, cap * sizeof(void*));
fflush(stderr);
#endif
}
}
// ============================================================================
// Shutdown (called at thread exit, optional)
// ============================================================================
void unified_cache_shutdown(void) {
if (!unified_cache_enabled()) return;
// TODO: Drain caches to SuperSlab before shutdown (prevent leak)
// Layer 2 Defensive Fix: Use __libc_free (symmetric with __libc_calloc in init)
extern void __libc_free(void*);
// Free cache buffers
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
if (g_unified_cache[cls].slots) {
__libc_free(g_unified_cache[cls].slots);
g_unified_cache[cls].slots = NULL;
}
}
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "[Unified-SHUTDOWN] All caches freed\n");
fflush(stderr);
#endif
}
// ============================================================================
// Stats (Phase 23 metrics)
// ============================================================================
void unified_cache_print_stats(void) {
if (!unified_cache_enabled()) return;
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "\n[Unified-STATS] Unified Cache Metrics:\n");
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
uint64_t total_allocs = g_unified_cache_hit[cls] + g_unified_cache_miss[cls];
uint64_t total_frees = g_unified_cache_push[cls] + g_unified_cache_full[cls];
if (total_allocs == 0 && total_frees == 0) continue; // Skip unused classes
double hit_rate = (total_allocs > 0) ? (100.0 * g_unified_cache_hit[cls] / total_allocs) : 0.0;
double full_rate = (total_frees > 0) ? (100.0 * g_unified_cache_full[cls] / total_frees) : 0.0;
// Current occupancy
uint16_t count = (g_unified_cache[cls].tail >= g_unified_cache[cls].head)
? (g_unified_cache[cls].tail - g_unified_cache[cls].head)
: (g_unified_cache[cls].capacity - g_unified_cache[cls].head + g_unified_cache[cls].tail);
fprintf(stderr, " C%d: %u/%u slots occupied, hit=%llu miss=%llu (%.1f%% hit), push=%llu full=%llu (%.1f%% full)\n",
cls,
count, g_unified_cache[cls].capacity,
(unsigned long long)g_unified_cache_hit[cls],
(unsigned long long)g_unified_cache_miss[cls],
hit_rate,
(unsigned long long)g_unified_cache_push[cls],
(unsigned long long)g_unified_cache_full[cls],
full_rate);
}
fflush(stderr);
// Also print warm pool stats if enabled
tiny_warm_pool_print_stats();
#endif
}
// ============================================================================
// Warm Pool Stats (always compiled, ENV-gated at runtime)
// ============================================================================
static inline void tiny_warm_pool_print_stats(void) {
// Check if warm pool stats are enabled via ENV
static int g_print_stats = -1;
if (__builtin_expect(g_print_stats == -1, 0)) {
const char* e = getenv("HAKMEM_WARM_POOL_STATS");
g_print_stats = (e && *e && *e != '0') ? 1 : 0;
}
if (!g_print_stats) return;
fprintf(stderr, "\n[WarmPool-STATS] Warm Pool Metrics:\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
uint64_t total = g_warm_pool_stats[i].hits + g_warm_pool_stats[i].misses;
float hit_rate = (total > 0)
? (100.0 * g_warm_pool_stats[i].hits / total)
: 0.0;
fprintf(stderr, " C%d: hits=%llu misses=%llu hit_rate=%.1f%% prefilled=%llu\n",
i,
(unsigned long long)g_warm_pool_stats[i].hits,
(unsigned long long)g_warm_pool_stats[i].misses,
hit_rate,
(unsigned long long)g_warm_pool_stats[i].prefilled);
}
#if !HAKMEM_BUILD_RELEASE
// Debug-only aggregated diagnostics for Warm Pool
fprintf(stderr,
" [DBG] prefill_attempts=%llu refill_fail=%llu push_ok=%llu push_full=%llu "
"pop_attempts=%llu pop_hits=%llu pop_empty=%llu pop_carve_zero=%llu\n",
(unsigned long long)atomic_load_explicit(&g_dbg_warm_prefill_attempts, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_prefill_refill_fail, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_prefill_push_ok, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_prefill_push_full, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_attempts, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_hits, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_empty, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_carve_zero, memory_order_relaxed));
#endif
fflush(stderr);
}
// Public wrapper for benchmarks
void tiny_warm_pool_print_stats_public(void) {
tiny_warm_pool_print_stats();
}
// ============================================================================
// Phase 23-E: Direct SuperSlab Carve (TLS SLL Bypass)
// ============================================================================
// Fail-fast helper: verify that a candidate BASE pointer belongs to a valid
// Tiny slab within a SuperSlab. This is intentionally defensive and only
// compiled in debug builds to avoid hot-path overhead in release.
static inline int unified_refill_validate_base(int class_idx,
TinyTLSSlab* tls,
TinySlabMeta* meta,
void* base,
const char* stage)
{
#if HAKMEM_BUILD_RELEASE
(void)class_idx; (void)tls; (void)base; (void)stage;
return 1;
#else
if (!base) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=NULL tls_ss=%p meta=%p\n",
stage ? stage : "unified_refill",
class_idx,
(void*)(tls ? tls->ss : NULL),
(void*)meta);
abort();
}
SuperSlab* tls_ss = tls ? tls->ss : NULL;
if (!tls_ss || tls_ss->magic != SUPERSLAB_MAGIC) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p meta=%p (invalid TLS ss)\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)tls_ss,
(void*)meta);
abort();
}
// Cross-check registry lookup for additional safety.
SuperSlab* ss_lookup = hak_super_lookup(base);
if (!ss_lookup || ss_lookup->magic != SUPERSLAB_MAGIC) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p lookup_ss=%p meta=%p\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)tls_ss,
(void*)ss_lookup,
(void*)meta);
abort();
}
if (ss_lookup != tls_ss) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p lookup_ss=%p (mismatch)\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)tls_ss,
(void*)ss_lookup);
abort();
}
int slab_idx = tls ? (int)tls->slab_idx : -1;
int cap = ss_slabs_capacity(tls_ss);
if (slab_idx < 0 || slab_idx >= cap) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p slab_idx=%d cap=%d meta_cap=%u meta_used=%u meta_carved=%u\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)tls_ss,
slab_idx,
cap,
meta ? meta->capacity : 0u,
meta ? (unsigned)meta->used : 0u,
meta ? (unsigned)meta->carved : 0u);
abort();
}
// Ensure meta matches TLS view for this slab.
TinySlabMeta* expected_meta = &tls_ss->slabs[slab_idx];
if (meta && meta != expected_meta) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p slab_idx=%d meta=%p expected_meta=%p\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)tls_ss,
slab_idx,
(void*)meta,
(void*)expected_meta);
abort();
}
uint8_t* slab_base = tiny_slab_base_for_geometry(tls_ss, slab_idx);
size_t stride = tiny_stride_for_class(class_idx);
size_t usable = tiny_usable_bytes_for_slab(slab_idx);
uint8_t* slab_end = slab_base + usable;
if ((uint8_t*)base < slab_base || (uint8_t*)base >= slab_end) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p range=[%p,%p) stride=%zu meta_cap=%u meta_used=%u meta_carved=%u\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)slab_base,
(void*)slab_end,
stride,
meta ? meta->capacity : 0u,
meta ? (unsigned)meta->used : 0u,
meta ? (unsigned)meta->carved : 0u);
abort();
}
ptrdiff_t offset = (uint8_t*)base - slab_base;
if (offset % (ptrdiff_t)stride != 0) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p offset=%td stride=%zu (misaligned) meta_cap=%u meta_used=%u meta_carved=%u\n",
stage ? stage : "unified_refill",
class_idx,
base,
offset,
stride,
meta ? meta->capacity : 0u,
meta ? (unsigned)meta->used : 0u,
meta ? (unsigned)meta->carved : 0u);
abort();
}
return 1;
#endif
}
// ============================================================================
// Warm Pool Enhanced: Direct carve from warm SuperSlab (bypass superslab_refill)
// ============================================================================
// ============================================================================
// Batch refill from SuperSlab (called on cache miss)
// ============================================================================
// Returns: BASE pointer (first block, wrapped), or NULL-wrapped if failed
// Design: Direct carve from SuperSlab to array (no TLS SLL intermediate layer)
// Warm Pool Integration: PRIORITIZE warm pool, use superslab_refill as fallback
hak_base_ptr_t unified_cache_refill(int class_idx) {
// Measure refill cost if enabled
uint64_t start_cycles = 0;
int measure = unified_cache_measure_enabled();
if (measure) {
start_cycles = read_tsc();
}
// Initialize warm pool on first use (per-thread)
tiny_warm_pool_init_once();
TinyUnifiedCache* cache = &g_unified_cache[class_idx];
// ✅ Phase 11+: Ensure cache is initialized (lazy init for cold path)
if (!cache->slots) {
unified_cache_init();
// Re-check after init (may fail due to alloc failure)
if (!cache->slots) {
return HAK_BASE_FROM_RAW(NULL);
}
}
// Calculate available room in unified cache
int room = (int)cache->capacity - 1; // Leave 1 slot for full detection
if (cache->head > cache->tail) {
room = cache->head - cache->tail - 1;
} else if (cache->head < cache->tail) {
room = cache->capacity - (cache->tail - cache->head) - 1;
}
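// Worked example: capacity=8, head=2, tail=1 (7 slots occupied).
//   head > tail -> room = 2 - 1 - 1 = 0, i.e. the ring is treated as full
//   while one slot stays empty, so head == tail can unambiguously mean "empty".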
if (room <= 0) return HAK_BASE_FROM_RAW(NULL);
// Batch size limit (per-class tuning)
// - Default: 128
// - C5-C6 (129B-512B): extended to 256
// - C7 (~1KB): extended to 512 to further reduce refill frequency
// - For safety, always keep this in sync with the out[] array size (512) below
int max_batch;
if (class_idx == 7) {
max_batch = 512;
} else if (class_idx >= 5 && class_idx <= 6) {
max_batch = 256;
} else {
max_batch = 128;
}
if (room > max_batch) room = max_batch;
// NOTE:
// - For C7, max_batch extends to 512, so the stack array also reserves 512 entries.
// - This keeps room <= max_batch <= 512 at all times, preventing out[] overruns.
void* out[512];
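// Compile-time guard (editorial sketch): keep out[] in sync with the largest
// max_batch so the invariant above cannot silently rot.
_Static_assert(sizeof(out) / sizeof(out[0]) >= 512,
               "out[] must cover the largest max_batch (C7: 512)");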
int produced = 0;
// ========== PAGE BOX HOT PATH (Tiny-Plus layer): Try page box FIRST ==========
// Page-level freelist management dedicated to C7 will eventually be integrated here.
// For now the stub always returns 0; only the Box boundary is wired up in advance.
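// Contract assumed here: tiny_page_box_refill(cls, out, room) writes up to
// 'room' BASE pointers into out[] and returns the count; 0 means "nothing
// produced", and we fall through to the warm pool / superslab paths below.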
if (tiny_page_box_is_enabled(class_idx)) {
int page_produced = tiny_page_box_refill(class_idx, out, room);
if (page_produced > 0) {
// Store blocks into cache and return first
void* first = out[0];
for (int i = 1; i < page_produced; i++) {
cache->slots[cache->tail] = out[i];
cache->tail = (cache->tail + 1) & cache->mask;
}
#if !HAKMEM_BUILD_RELEASE
g_unified_cache_miss[class_idx]++;
#endif
if (measure) {
uint64_t end_cycles = read_tsc();
uint64_t delta = end_cycles - start_cycles;
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_global,
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_global,
1, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_by_class[class_idx],
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_by_class[class_idx],
1, memory_order_relaxed);
}
return HAK_BASE_FROM_RAW(first);
}
}
// ========== WARM POOL HOT PATH: Check warm pool FIRST ==========
// This is the critical optimization - avoid superslab_refill() registry scan
#if !HAKMEM_BUILD_RELEASE
atomic_fetch_add_explicit(&g_dbg_warm_pop_attempts, 1, memory_order_relaxed);
#endif
SuperSlab* warm_ss = tiny_warm_pool_pop(class_idx);
if (warm_ss) {
#if !HAKMEM_BUILD_RELEASE
// FUTURE: TLS Bind Box Integration
// Currently we carve directly from warm_ss via slab_carve_from_ss().
// To unify logic, we should eventually:
// 1. Choose a slab index (via tiny_page_box or heuristic).
// 2. Bind it to TLS via ss_tls_bind_one(..., warm_ss, slab_idx, ...).
// 3. Fall through to TLS-based allocation.
// EXPERIMENTAL: Test TLS Bind Box connectivity for C7 (Debug only)
static int g_warm_tls_bind_c7 = -1;
if (g_warm_tls_bind_c7 == -1) {
const char* e = getenv("HAKMEM_WARM_TLS_BIND_C7");
g_warm_tls_bind_c7 = (e && *e && *e != '0') ? 1 : 0;
}
if (g_warm_tls_bind_c7 && class_idx == 7) {
// Find a slab index in this SuperSlab that matches our class
int cap = ss_slabs_capacity(warm_ss);
int slab_idx = -1;
// Simple heuristic: find first slab belonging to this class
// Note: In real logic, we should pick the *best* slab (e.g. from PageBox)
for (int i = 0; i < cap; i++) {
if (tiny_get_class_from_ss(warm_ss, i) == class_idx) {
slab_idx = i;
break;
}
}
if (slab_idx >= 0) {
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
// Try to bind. If successful, we have "connected" the path.
// For now, we still fall through to slab_carve_from_ss() to do the actual
// work, but the side effect (TLS updated) confirms connectivity.
// In a future step, we would 'break' here and let the TLS path handle it.
uint32_t tid = (uint32_t)(uintptr_t)pthread_self();
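// Caveat: pthread_t is opaque; truncating it to 32 bits is lossy on some
// platforms but adequate for this debug-only connectivity probe.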
if (ss_tls_bind_one(class_idx, tls, warm_ss, slab_idx, tid)) {
static int logged = 0;
if (!logged) {
fprintf(stderr, "[WARM_TLS_BIND] C7 bind success: ss=%p slab=%d\n",
(void*)warm_ss, slab_idx);
logged = 1;
}
}
}
}
atomic_fetch_add_explicit(&g_dbg_warm_pop_hits, 1, memory_order_relaxed);
#endif
// HOT PATH: Warm pool hit, try to carve directly
produced = slab_carve_from_ss(class_idx, warm_ss, out, room);
if (produced > 0) {
// Update active counter for carved blocks
ss_active_add(warm_ss, (uint32_t)produced);
// Success! Return SuperSlab to warm pool for next use
tiny_warm_pool_push(class_idx, warm_ss);
// Track warm pool hit (always compiled, ENV-gated printing)
warm_pool_record_hit(class_idx);
// Store blocks into cache and return first
void* first = out[0];
for (int i = 1; i < produced; i++) {
cache->slots[cache->tail] = out[i];
cache->tail = (cache->tail + 1) & cache->mask;
}
#if !HAKMEM_BUILD_RELEASE
g_unified_cache_miss[class_idx]++;
#endif
if (measure) {
uint64_t end_cycles = read_tsc();
uint64_t delta = end_cycles - start_cycles;
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_global,
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_global,
1, memory_order_relaxed);
// Per-class accumulation (visualize C5-C7 refill costs)
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_by_class[class_idx],
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_by_class[class_idx],
1, memory_order_relaxed);
}
return HAK_BASE_FROM_RAW(first);
}
// SuperSlab carve failed (produced == 0)
#if !HAKMEM_BUILD_RELEASE
atomic_fetch_add_explicit(&g_dbg_warm_pop_carve_zero, 1, memory_order_relaxed);
#endif
// This slab is either exhausted or has no usable capacity left.
// The 'prefilled' statistic tracks how often a prefill would have helped.
if (tiny_warm_pool_count(class_idx) == 0) { // produced == 0 is guaranteed here
// Pool is empty and the carve failed - prefill would help here
warm_pool_record_prefilled(class_idx);
}
} else {
#if !HAKMEM_BUILD_RELEASE
atomic_fetch_add_explicit(&g_dbg_warm_pop_empty, 1, memory_order_relaxed);
#endif
}
// ========== COLD PATH: Warm pool miss, use superslab_refill ==========
// Track warm pool miss (always compiled, ENV-gated printing)
warm_pool_record_miss(class_idx);
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
// Step 1: Ensure SuperSlab available via normal refill
// Enhanced: Use Warm Pool Prefill Box for secondary prefill when pool is empty
if (warm_pool_do_prefill(class_idx, tls) < 0) {
return HAK_BASE_FROM_RAW(NULL);
}
// After prefill: tls->ss has the final slab for carving
// tls = &g_tls_slabs[class_idx]; // Reload (already done in prefill box)
// Step 2: Direct carve from SuperSlab into local array (bypass TLS SLL!)
TinySlabMeta* m = tls->meta;
size_t bs = tiny_stride_for_class(class_idx);
uint8_t* base = tls->slab_base
? tls->slab_base
: tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
while (produced < room) {
if (m->freelist) {
// Freelist pop
void* p = m->freelist;
void* next_node = tiny_next_read(class_idx, p);
// ROOT CAUSE FIX: Write header BEFORE exposing block (but AFTER reading next)
// For Class 0 (offset 0), next overlaps header, so we must read next first.
#if HAKMEM_TINY_HEADER_CLASSIDX
*(uint8_t*)p = (uint8_t)(0xa0 | (class_idx & 0x0f));
// Prevent compiler from reordering header write after out[] assignment
__atomic_thread_fence(__ATOMIC_RELEASE);
#endif
m->freelist = next_node;
unified_refill_validate_base(class_idx, tls, m, p,
"unified_refill_freelist");
// PageFaultTelemetry: record page touch for this BASE
pagefault_telemetry_touch(class_idx, p);
m->used++;
out[produced++] = p;
} else if (m->carved < m->capacity) {
// Linear carve (fresh block, no freelist link)
void* p = (void*)(base + ((size_t)m->carved * bs));
unified_refill_validate_base(class_idx, tls, m, p,
"unified_refill_carve");
// PageFaultTelemetry: record page touch for this BASE
pagefault_telemetry_touch(class_idx, p);
// ✅ CRITICAL: Write header (new block)
#if HAKMEM_TINY_HEADER_CLASSIDX
*(uint8_t*)p = (uint8_t)(0xa0 | (class_idx & 0x0f));
#endif
m->carved++;
m->used++;
out[produced++] = p;
} else {
// SuperSlab exhausted → refill and retry
if (!superslab_refill(class_idx)) break;
// ✅ CRITICAL: Reload TLS pointers after refill (avoid stale pointer bug)
tls = &g_tls_slabs[class_idx];
m = tls->meta;
base = tls->slab_base
? tls->slab_base
: tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
}
}
if (produced == 0) return HAK_BASE_FROM_RAW(NULL);
// Step 3: Update active counter
// Guard: tls->ss can be NULL if all SuperSlab refills failed
if (tls->ss) {
ss_active_add(tls->ss, (uint32_t)produced);
}
// Step 4: Store blocks into unified cache (skip first, return it)
void* first = out[0];
for (int i = 1; i < produced; i++) {
cache->slots[cache->tail] = out[i];
cache->tail = (cache->tail + 1) & cache->mask;
}
#if !HAKMEM_BUILD_RELEASE
g_unified_cache_miss[class_idx]++;
#endif
// Measure refill cycles
if (measure) {
uint64_t end_cycles = read_tsc();
uint64_t delta = end_cycles - start_cycles;
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_global,
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_global,
1, memory_order_relaxed);
// Per-class accumulation
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_by_class[class_idx],
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_by_class[class_idx],
1, memory_order_relaxed);
}
return HAK_BASE_FROM_RAW(first); // Return first block (BASE pointer)
}
// ============================================================================
// Performance Measurement: Print Statistics
// ============================================================================
void unified_cache_print_measurements(void) {
if (!unified_cache_measure_enabled()) {
return; // Measurement disabled, nothing to print
}
uint64_t hits = atomic_load_explicit(&g_unified_cache_hits_global, memory_order_relaxed);
uint64_t misses = atomic_load_explicit(&g_unified_cache_misses_global, memory_order_relaxed);
uint64_t refill_cycles = atomic_load_explicit(&g_unified_cache_refill_cycles_global, memory_order_relaxed);
uint64_t total = hits + misses;
if (total == 0) {
fprintf(stderr, "\n========================================\n");
fprintf(stderr, "Unified Cache Statistics\n");
fprintf(stderr, "========================================\n");
fprintf(stderr, "No operations recorded (measurement may be disabled)\n");
fprintf(stderr, "========================================\n\n");
return;
}
double hit_rate = (100.0 * hits) / total;
double avg_refill_cycles = misses > 0 ? (double)refill_cycles / misses : 0.0;
// Estimate time at 1GHz (conservative, most modern CPUs are 2-4GHz)
double avg_refill_us = avg_refill_cycles / 1000.0;
fprintf(stderr, "\n========================================\n");
fprintf(stderr, "Unified Cache Statistics\n");
fprintf(stderr, "========================================\n");
fprintf(stderr, "Hits: %llu\n", (unsigned long long)hits);
fprintf(stderr, "Misses: %llu\n", (unsigned long long)misses);
fprintf(stderr, "Hit Rate: %.1f%%\n", hit_rate);
fprintf(stderr, "Avg Refill Cycles: %.0f (est. %.2fus @ 1GHz)\n",
avg_refill_cycles, avg_refill_us);
// Per-class breakdown (Tiny classes 0-7; C5-C7 are of particular interest)
fprintf(stderr, "\nPer-class Unified Cache (Tiny classes):\n");
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
uint64_t ch = atomic_load_explicit(&g_unified_cache_hits_by_class[cls],
memory_order_relaxed);
uint64_t cm = atomic_load_explicit(&g_unified_cache_misses_by_class[cls],
memory_order_relaxed);
uint64_t cc = atomic_load_explicit(&g_unified_cache_refill_cycles_by_class[cls],
memory_order_relaxed);
uint64_t ct = ch + cm;
if (ct == 0 && cc == 0) {
continue; // Skip unused classes
}
double cls_hit_rate = ct > 0 ? (100.0 * (double)ch / (double)ct) : 0.0;
double cls_avg_refill = cm > 0 ? (double)cc / (double)cm : 0.0;
double cls_avg_us = cls_avg_refill / 1000.0;
fprintf(stderr,
" C%d: hits=%llu miss=%llu hit=%.1f%% avg_refill=%.0f cyc (%.2fus @1GHz)\n",
cls,
(unsigned long long)ch,
(unsigned long long)cm,
cls_hit_rate,
cls_avg_refill,
cls_avg_us);
}
fprintf(stderr, "========================================\n\n");
}
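// Usage sketch (hypothetical; actual call sites live outside this file):
//   atexit(unified_cache_print_measurements); // global + per-class hit/miss/cycles
//   atexit(unified_cache_print_stats);        // debug builds: per-class ring stats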