// tiny_unified_cache.c - Phase 23: Unified Frontend Cache Implementation
#include "tiny_unified_cache.h"
#include "tiny_warm_pool.h"                 // Warm Pool: O(1) SuperSlab lookup
#include "../tiny_tls.h"                    // Phase 23-E: TinyTLSSlab, TinySlabMeta
#include "../tiny_box_geometry.h"           // Phase 23-E: tiny_stride_for_class, tiny_slab_base_for_geometry
#include "../box/tiny_next_ptr_box.h"       // Phase 23-E: tiny_next_read (freelist traversal)
#include "../hakmem_tiny_superslab.h"       // Phase 23-E: SuperSlab, superslab_refill()
#include "../superslab/superslab_inline.h"  // Phase 23-E: ss_active_add, slab_index_for, ss_slabs_capacity
#include "../hakmem_super_registry.h"       // For hak_super_lookup (pointer→SuperSlab)
#include "../box/pagefault_telemetry_box.h" // Phase 24: Box PageFaultTelemetry (Tiny page touch stats)
#include "../box/ss_tier_box.h"             // For ss_tier_is_hot() tier checks
#include "../box/ss_slab_meta_box.h"        // For ss_active_add() and slab metadata operations
#include "../box/warm_pool_stats_box.h"     // Box: Warm Pool Statistics Recording (inline)
#include "../box/slab_carve_box.h"          // Box: Slab Carving (inline O(slabs) scan)
#define WARM_POOL_REL_DEFINE
#include "../box/warm_pool_rel_counters_box.h" // Box: Release-side C7 counters
#undef WARM_POOL_REL_DEFINE
#include "../box/c7_meta_used_counter_box.h"   // Box: C7 meta->used increment counters
#include "../box/warm_pool_prefill_box.h"      // Box: Warm Pool Prefill (secondary optimization)
#include "../box/tiny_mem_stats_box.h"         // Box: Tiny front memory accounting
#include "../hakmem_env_cache.h"               // Priority-2: ENV cache (eliminate syscalls)
#include "../box/tiny_page_box.h"              // Tiny-Plus Page Box (C5–C7 initial hook)
#include "../box/ss_tls_bind_box.h"            // Box: TLS Bind (SuperSlab -> TLS binding)
#include "../box/tiny_tls_carve_one_block_box.h" // Box: TLS carve helper (shared)
#include "../box/tiny_class_policy_box.h"      // Box: per-class policy (Page/Warm caps)
#include "../box/tiny_class_stats_box.h"       // Box: lightweight per-class stats
#include "../box/warm_tls_bind_logger_box.h"   // Box: Warm TLS Bind logging (throttled)
#define WARM_POOL_DBG_DEFINE
#include "../box/warm_pool_dbg_box.h"          // Box: Warm Pool C7 debug counters
#undef WARM_POOL_DBG_DEFINE

// System headers (names inferred from usage in this file: fprintf, getenv/atoi,
// fixed-width integers, clock_gettime, atomic_*_explicit)
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <stdatomic.h>

// ============================================================================
// Performance Measurement: Unified Cache (ENV-gated)
// ============================================================================
// Global atomic counters for unified cache performance measurement
// ENV: HAKMEM_MEASURE_UNIFIED_CACHE=1 to enable (default: OFF)
_Atomic uint64_t g_unified_cache_hits_global = 0;
_Atomic uint64_t g_unified_cache_misses_global = 0;
_Atomic uint64_t g_unified_cache_refill_cycles_global = 0;

// Per-class counters (per-Tiny-class Unified Cache observation)
_Atomic uint64_t g_unified_cache_hits_by_class[TINY_NUM_CLASSES] = {0};
_Atomic uint64_t g_unified_cache_misses_by_class[TINY_NUM_CLASSES] = {0};
_Atomic uint64_t g_unified_cache_refill_cycles_by_class[TINY_NUM_CLASSES] = {0};

// Helper: Get cycle count (x86_64 rdtsc)
static inline uint64_t read_tsc(void) {
#if defined(__x86_64__) || defined(_M_X64)
    uint32_t lo, hi;
    __asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
    return ((uint64_t)hi << 32) | lo;
#else
    // Fallback to clock_gettime for non-x86 platforms
    struct timespec ts;
    clock_gettime(CLOCK_MONOTONIC, &ts);
    return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
#endif
}
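// Usage note (sketch): measurement is opt-in via HAKMEM_MEASURE_UNIFIED_CACHE=1; a
// benchmark harness would typically set it and call unified_cache_print_measurements()
// at teardown. Note that read_tsc() returns raw TSC ticks on x86_64 but CLOCK_MONOTONIC
// nanoseconds elsewhere, so refill-cycle numbers are only comparable within one platform.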
// Check if measurement is enabled (cached)
static inline int unified_cache_measure_enabled(void) {
    static int g_measure = -1;
    if (__builtin_expect(g_measure == -1, 0)) {
        const char* e = getenv("HAKMEM_MEASURE_UNIFIED_CACHE");
        g_measure = (e && *e && *e != '0') ? 1 : 0;
    }
    return g_measure;
}

// Phase 23-E: Forward declarations
extern __thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES]; // From hakmem_tiny_superslab.c
extern void ss_active_add(SuperSlab* ss, uint32_t n);      // From hakmem_tiny_ss_active_box.inc

// ============================================================================
// TLS Variables (defined here, extern in header)
// ============================================================================
__thread TinyUnifiedCache g_unified_cache[TINY_NUM_CLASSES];

// Phase 3 C2 Patch 2: First Page Inline Cache (TLS per-class)
#include "tiny_first_page_cache.h"
__thread TinyFirstPageCache g_first_page_cache[TINY_NUM_CLASSES] = {0};

// Warm Pool: Per-thread warm SuperSlab pools (one per class)
__thread TinyWarmPool g_tiny_warm_pool[TINY_NUM_CLASSES] = {0};

// ============================================================================
// Metrics (Phase 23, optional for debugging)
// ============================================================================
#if !HAKMEM_BUILD_RELEASE
__thread uint64_t g_unified_cache_hit[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_miss[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_push[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_full[TINY_NUM_CLASSES] = {0};
#endif

// Release-side lightweight telemetry (C7 Warm path only)
#if HAKMEM_BUILD_RELEASE
_Atomic uint64_t g_rel_c7_warm_pop = 0;
_Atomic uint64_t g_rel_c7_warm_push = 0;
#endif

// Warm Pool metrics (definition - declared in tiny_warm_pool.h as extern)
// Note: These are kept outside !HAKMEM_BUILD_RELEASE for profiling in release builds
__thread TinyWarmPoolStats g_warm_pool_stats[TINY_NUM_CLASSES] = {0};

#if !HAKMEM_BUILD_RELEASE
// Debug-only diagnostics for Warm Pool effectiveness
_Atomic uint64_t g_dbg_warm_prefill_attempts = 0;
_Atomic uint64_t g_dbg_warm_prefill_refill_fail = 0;
_Atomic uint64_t g_dbg_warm_prefill_push_ok = 0;
_Atomic uint64_t g_dbg_warm_prefill_push_full = 0;
_Atomic uint64_t g_dbg_warm_pop_attempts = 0;
_Atomic uint64_t g_dbg_warm_pop_hits = 0;
_Atomic uint64_t g_dbg_warm_pop_empty = 0;
_Atomic uint64_t g_dbg_warm_pop_carve_zero = 0;
#endif

// Warm TLS Bind (C7) mode selector
// mode 0: Legacy warm path (debug only; not recommended for C7)
// mode 1: Bind-only production path (C7 default)
// mode 2: Bind + TLS carve experimental path (debug only)
// Release builds always pin mode=1 and ignore the ENV.
static inline int warm_tls_bind_mode_c7(void) {
#if HAKMEM_BUILD_RELEASE
    return 1; // Bind-only, per the policy above
#else
    static int g_warm_tls_bind_mode_c7 = -1;
    if (__builtin_expect(g_warm_tls_bind_mode_c7 == -1, 0)) {
        const char* e = getenv("HAKMEM_WARM_TLS_BIND_C7");
        int mode = (e && *e) ? atoi(e) : 1; // default = Bind-only
        if (mode < 0) mode = 0;
        if (mode > 2) mode = 2;
        g_warm_tls_bind_mode_c7 = mode;
    }
    return g_warm_tls_bind_mode_c7;
#endif
}
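// Usage note (sketch): in debug builds HAKMEM_WARM_TLS_BIND_C7 selects the C7 warm-path
// mode above (0 = legacy, 1 = bind-only, 2 = bind + TLS carve); out-of-range values are
// clamped to [0, 2] and an unset/empty variable falls back to 1. Release builds pin the
// mode to 1 regardless of the environment.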
// Forward declaration for Warm Pool stats printer (defined later in this file)
static inline void tiny_warm_pool_print_stats(void);

// ============================================================================
// Phase 8-Step1-Fix: unified_cache_enabled() implementation (non-static)
// ============================================================================
// Enable flag (default: ON, disable with HAKMEM_TINY_UNIFIED_CACHE=0)
int unified_cache_enabled(void) {
    // Priority-2: Use cached ENV (eliminate lazy-init static overhead)
    static int g_enable = -1;
    if (__builtin_expect(g_enable == -1, 0)) {
        g_enable = HAK_ENV_TINY_UNIFIED_CACHE();
#if !HAKMEM_BUILD_RELEASE
        if (g_enable) {
            fprintf(stderr, "[Unified-INIT] unified_cache_enabled() = %d\n", g_enable);
            fflush(stderr);
        }
#else
        if (g_enable) {
            static int printed = 0;
            if (!printed) {
                fprintf(stderr, "[Rel-Unified] unified_cache_enabled() = %d\n", g_enable);
                fflush(stderr);
                printed = 1;
            }
        }
#endif
    }
    return g_enable;
}

// ============================================================================
// Init (called at thread start or lazy on first access)
// ============================================================================
void unified_cache_init(void) {
    if (!unified_cache_enabled()) return;

    // Layer 2 Defensive Fix: Use __libc_calloc for infrastructure allocations
    // Rationale: Cache arrays are infrastructure (not workload), bypass HAKMEM entirely
    // This prevents interaction with BenchFast mode and ensures clean separation
    extern void* __libc_calloc(size_t, size_t);

    // Initialize all classes (C0-C7)
    for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
        if (g_unified_cache[cls].slots != NULL) continue; // Already initialized
        size_t cap = unified_capacity(cls);
        g_unified_cache[cls].slots = (void**)__libc_calloc(cap, sizeof(void*));
        if (!g_unified_cache[cls].slots) {
#if !HAKMEM_BUILD_RELEASE
            fprintf(stderr, "[Unified-INIT] Failed to allocate C%d cache (%zu slots)\n", cls, cap);
            fflush(stderr);
#endif
            continue; // Skip this class, try others
        }
        tiny_mem_stats_add_unified((ssize_t)(cap * sizeof(void*)));
        g_unified_cache[cls].capacity = (uint16_t)cap;
        g_unified_cache[cls].mask = (uint16_t)(cap - 1);
        g_unified_cache[cls].head = 0;
        g_unified_cache[cls].tail = 0;
#if !HAKMEM_BUILD_RELEASE
        fprintf(stderr, "[Unified-INIT] C%d: %zu slots (%zu bytes)\n", cls, cap, cap * sizeof(void*));
        fflush(stderr);
#endif
    }
}

// ============================================================================
// Shutdown (called at thread exit, optional)
// ============================================================================
void unified_cache_shutdown(void) {
    if (!unified_cache_enabled()) return;

    // TODO: Drain caches to SuperSlab before shutdown (prevent leak)

    // Layer 2 Defensive Fix: Use __libc_free (symmetric with __libc_calloc in init)
    extern void __libc_free(void*);

    // Free cache buffers
    for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
        if (g_unified_cache[cls].slots) {
            __libc_free(g_unified_cache[cls].slots);
            g_unified_cache[cls].slots = NULL;
        }
    }
#if !HAKMEM_BUILD_RELEASE
    fprintf(stderr, "[Unified-SHUTDOWN] All caches freed\n");
    fflush(stderr);
#endif
}
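// Ring-buffer invariants used throughout this file (summary of the init code above):
// capacity must be a power of two for the "& mask" indexing to wrap correctly,
// mask = capacity - 1, and head == tail means "empty". One slot is always left unused
// so that "full" never aliases "empty", i.e. at most capacity - 1 blocks are resident:
//   occupied = (tail - head) & mask;      // blocks currently cached
//   room     = capacity - 1 - occupied;   // pushes accepted before "full"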
// ============================================================================
// Stats (Phase 23 metrics)
// ============================================================================
void unified_cache_print_stats(void) {
    if (!unified_cache_enabled()) return;
#if !HAKMEM_BUILD_RELEASE
    fprintf(stderr, "\n[Unified-STATS] Unified Cache Metrics:\n");
    for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
        uint64_t total_allocs = g_unified_cache_hit[cls] + g_unified_cache_miss[cls];
        uint64_t total_frees = g_unified_cache_push[cls] + g_unified_cache_full[cls];
        if (total_allocs == 0 && total_frees == 0) continue; // Skip unused classes
        double hit_rate = (total_allocs > 0) ? (100.0 * g_unified_cache_hit[cls] / total_allocs) : 0.0;
        double full_rate = (total_frees > 0) ? (100.0 * g_unified_cache_full[cls] / total_frees) : 0.0;
        // Current occupancy
        uint16_t count = (g_unified_cache[cls].tail >= g_unified_cache[cls].head)
            ? (g_unified_cache[cls].tail - g_unified_cache[cls].head)
            : (g_unified_cache[cls].capacity - g_unified_cache[cls].head + g_unified_cache[cls].tail);
        fprintf(stderr, " C%d: %u/%u slots occupied, hit=%llu miss=%llu (%.1f%% hit), push=%llu full=%llu (%.1f%% full)\n",
                cls, count, g_unified_cache[cls].capacity,
                (unsigned long long)g_unified_cache_hit[cls],
                (unsigned long long)g_unified_cache_miss[cls], hit_rate,
                (unsigned long long)g_unified_cache_push[cls],
                (unsigned long long)g_unified_cache_full[cls], full_rate);
    }
    fflush(stderr);
    // Also print warm pool stats if enabled
    tiny_warm_pool_print_stats();
#endif
}
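// Usage note: the warm-pool printer below is compiled into all builds but only prints
// when HAKMEM_WARM_POOL_STATS is set to a non-zero value; the unified-cache stats above
// are debug-build (!HAKMEM_BUILD_RELEASE) only.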
// ============================================================================
// Warm Pool Stats (always compiled, ENV-gated at runtime)
// ============================================================================
static inline void tiny_warm_pool_print_stats(void) {
    // Check if warm pool stats are enabled via ENV
    static int g_print_stats = -1;
    if (__builtin_expect(g_print_stats == -1, 0)) {
        const char* e = getenv("HAKMEM_WARM_POOL_STATS");
        g_print_stats = (e && *e && *e != '0') ? 1 : 0;
    }
    if (!g_print_stats) return;

    fprintf(stderr, "\n[WarmPool-STATS] Warm Pool Metrics:\n");
    for (int i = 0; i < TINY_NUM_CLASSES; i++) {
        uint64_t total = g_warm_pool_stats[i].hits + g_warm_pool_stats[i].misses;
        float hit_rate = (total > 0) ? (100.0 * g_warm_pool_stats[i].hits / total) : 0.0;
        fprintf(stderr, " C%d: hits=%llu misses=%llu hit_rate=%.1f%% prefilled=%llu\n",
                i,
                (unsigned long long)g_warm_pool_stats[i].hits,
                (unsigned long long)g_warm_pool_stats[i].misses,
                hit_rate,
                (unsigned long long)g_warm_pool_stats[i].prefilled);
    }
#if !HAKMEM_BUILD_RELEASE
    // Debug-only aggregated diagnostics for Warm Pool
    fprintf(stderr, " [DBG] prefill_attempts=%llu refill_fail=%llu push_ok=%llu push_full=%llu "
            "pop_attempts=%llu pop_hits=%llu pop_empty=%llu pop_carve_zero=%llu\n",
            (unsigned long long)atomic_load_explicit(&g_dbg_warm_prefill_attempts, memory_order_relaxed),
            (unsigned long long)atomic_load_explicit(&g_dbg_warm_prefill_refill_fail, memory_order_relaxed),
            (unsigned long long)atomic_load_explicit(&g_dbg_warm_prefill_push_ok, memory_order_relaxed),
            (unsigned long long)atomic_load_explicit(&g_dbg_warm_prefill_push_full, memory_order_relaxed),
            (unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_attempts, memory_order_relaxed),
            (unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_hits, memory_order_relaxed),
            (unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_empty, memory_order_relaxed),
            (unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_carve_zero, memory_order_relaxed));
    uint64_t c7_attempts = warm_pool_dbg_c7_attempts();
    uint64_t c7_hits = warm_pool_dbg_c7_hits();
    uint64_t c7_carve = warm_pool_dbg_c7_carves();
    uint64_t c7_tls_attempts = warm_pool_dbg_c7_tls_attempts();
    uint64_t c7_tls_success = warm_pool_dbg_c7_tls_successes();
    uint64_t c7_tls_fail = warm_pool_dbg_c7_tls_failures();
    uint64_t c7_uc_warm = warm_pool_dbg_c7_uc_miss_warm_refills();
    uint64_t c7_uc_tls = warm_pool_dbg_c7_uc_miss_tls_refills();
    uint64_t c7_uc_shared = warm_pool_dbg_c7_uc_miss_shared_refills();
    if (c7_attempts || c7_hits || c7_carve ||
        c7_tls_attempts || c7_tls_success || c7_tls_fail ||
        c7_uc_warm || c7_uc_tls || c7_uc_shared) {
        fprintf(stderr, " [DBG_C7] warm_pop_attempts=%llu warm_pop_hits=%llu warm_pop_carve=%llu "
                "tls_carve_attempts=%llu tls_carve_success=%llu tls_carve_fail=%llu "
                "uc_miss_warm=%llu uc_miss_tls=%llu uc_miss_shared=%llu\n",
                (unsigned long long)c7_attempts,
                (unsigned long long)c7_hits,
                (unsigned long long)c7_carve,
                (unsigned long long)c7_tls_attempts,
                (unsigned long long)c7_tls_success,
                (unsigned long long)c7_tls_fail,
                (unsigned long long)c7_uc_warm,
                (unsigned long long)c7_uc_tls,
                (unsigned long long)c7_uc_shared);
    }
#endif
    fflush(stderr);
}

// Public wrapper for benchmarks
void tiny_warm_pool_print_stats_public(void) { tiny_warm_pool_print_stats(); }
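// Reading the [DBG_C7] line above: uc_miss_warm / uc_miss_tls / uc_miss_shared split C7
// unified-cache misses by which path produced the blocks (warm-pool direct carve, TLS
// single-block carve, or the shared superslab_refill path in unified_cache_refill).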
// ============================================================================
// Phase 23-E: Direct SuperSlab Carve (TLS SLL Bypass)
// ============================================================================
// Fail-fast helper: verify that a candidate BASE pointer belongs to a valid
// Tiny slab within a SuperSlab. This is intentionally defensive and only
// compiled in debug builds to avoid hot-path overhead in release.
static inline int unified_refill_validate_base(int class_idx,
                                               TinyTLSSlab* tls,
                                               TinySlabMeta* meta,
                                               void* base,
                                               const char* stage) {
#if HAKMEM_BUILD_RELEASE
    (void)class_idx; (void)tls; (void)base; (void)stage; (void)meta;
    return 1;
#else
    if (!base) {
        fprintf(stderr, "[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=NULL tls_ss=%p meta=%p\n",
                stage ? stage : "unified_refill", class_idx,
                (void*)(tls ? tls->ss : NULL), (void*)meta);
        abort();
    }
    SuperSlab* tls_ss = tls ? tls->ss : NULL;
    if (!tls_ss || tls_ss->magic != SUPERSLAB_MAGIC) {
        fprintf(stderr, "[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p meta=%p (invalid TLS ss)\n",
                stage ? stage : "unified_refill", class_idx, base, (void*)tls_ss, (void*)meta);
        abort();
    }
    // Cross-check registry lookup for additional safety.
    SuperSlab* ss_lookup = hak_super_lookup(base);
    if (!ss_lookup || ss_lookup->magic != SUPERSLAB_MAGIC) {
        fprintf(stderr, "[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p lookup_ss=%p meta=%p\n",
                stage ? stage : "unified_refill", class_idx, base, (void*)tls_ss, (void*)ss_lookup, (void*)meta);
        abort();
    }
    if (ss_lookup != tls_ss) {
        fprintf(stderr, "[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p lookup_ss=%p (mismatch)\n",
                stage ? stage : "unified_refill", class_idx, base, (void*)tls_ss, (void*)ss_lookup);
        abort();
    }
    int slab_idx = tls ? (int)tls->slab_idx : -1;
    int cap = ss_slabs_capacity(tls_ss);
    if (slab_idx < 0 || slab_idx >= cap) {
        fprintf(stderr, "[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p slab_idx=%d cap=%d meta_cap=%u meta_used=%u meta_carved=%u\n",
                stage ? stage : "unified_refill", class_idx, base, (void*)tls_ss, slab_idx, cap,
                meta ? meta->capacity : 0u,
                meta ? (unsigned)meta->used : 0u,
                meta ? (unsigned)meta->carved : 0u);
        abort();
    }
    // Ensure meta matches TLS view for this slab.
    TinySlabMeta* expected_meta = &tls_ss->slabs[slab_idx];
    if (meta && meta != expected_meta) {
        fprintf(stderr, "[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p slab_idx=%d meta=%p expected_meta=%p\n",
                stage ? stage : "unified_refill", class_idx, base, (void*)tls_ss, slab_idx,
                (void*)meta, (void*)expected_meta);
        abort();
    }
    uint8_t* slab_base = tiny_slab_base_for_geometry(tls_ss, slab_idx);
    size_t stride = tiny_stride_for_class(class_idx);
    size_t usable = tiny_usable_bytes_for_slab(slab_idx);
    uint8_t* slab_end = slab_base + usable;
    if ((uint8_t*)base < slab_base || (uint8_t*)base >= slab_end) {
        fprintf(stderr, "[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p range=[%p,%p) stride=%zu meta_cap=%u meta_used=%u meta_carved=%u\n",
                stage ? stage : "unified_refill", class_idx, base, (void*)slab_base, (void*)slab_end, stride,
                meta ? meta->capacity : 0u,
                meta ? (unsigned)meta->used : 0u,
                meta ? (unsigned)meta->carved : 0u);
        abort();
    }
    ptrdiff_t offset = (uint8_t*)base - slab_base;
    if (offset % (ptrdiff_t)stride != 0) {
        fprintf(stderr, "[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p offset=%td stride=%zu (misaligned) meta_cap=%u meta_used=%u meta_carved=%u\n",
                stage ? stage : "unified_refill", class_idx, base, offset, stride,
                meta ? meta->capacity : 0u,
                meta ? (unsigned)meta->used : 0u,
                meta ? (unsigned)meta->carved : 0u);
        abort();
    }
    return 1;
#endif
}
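// The validator above checks, in order: non-NULL base, a live TLS SuperSlab (magic),
// registry agreement via hak_super_lookup(), slab_idx within ss_slabs_capacity(),
// meta identity with the TLS slab, base within the slab's usable byte range, and
// stride alignment of the offset. Any failure aborts with a [UNIFIED_REFILL_CORRUPT] line.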
// ============================================================================
// Warm Pool Enhanced: Direct carve from warm SuperSlab (bypass superslab_refill)
// ============================================================================

// ============================================================================
// Batch refill from SuperSlab (called on cache miss)
// ============================================================================
// Returns: BASE pointer (first block, wrapped), or NULL-wrapped if failed
// Design: Direct carve from SuperSlab to array (no TLS SLL intermediate layer)
// Warm Pool Integration: PRIORITIZE warm pool, use superslab_refill as fallback
hak_base_ptr_t unified_cache_refill(int class_idx) {
    // Measure refill cost if enabled
    uint64_t start_cycles = 0;
    int measure = unified_cache_measure_enabled();
    if (measure) {
        start_cycles = read_tsc();
    }

    // Initialize warm pool on first use (per-thread)
    tiny_warm_pool_init_once();

    TinyUnifiedCache* cache = &g_unified_cache[class_idx];
    const TinyClassPolicy* policy = tiny_policy_get(class_idx);
    int warm_enabled = policy ? policy->warm_enabled : 0;
    int warm_cap = policy ? policy->warm_cap : 0;
    int page_enabled = policy ? policy->page_box_enabled : 0;
    TinyTLSSlab* tls = &g_tls_slabs[class_idx];

    // ✅ Phase 11+: Ensure cache is initialized (lazy init for cold path)
    if (!cache->slots) {
        unified_cache_init();
        // Re-check after init (may fail due to alloc failure)
        if (!cache->slots) {
            return HAK_BASE_FROM_RAW(NULL);
        }
    }

    // Calculate available room in unified cache
    int room = (int)cache->capacity - 1; // Leave 1 slot for full detection
    if (cache->head > cache->tail) {
        room = cache->head - cache->tail - 1;
    } else if (cache->head < cache->tail) {
        room = cache->capacity - (cache->tail - cache->head) - 1;
    }
    if (room <= 0) return HAK_BASE_FROM_RAW(NULL);

    // Batch size limit (tuned per class)
    // - default: 128
    // - C5-C6 (129B-512B): extended up to 256
    // - C7 (≈1KB): extended up to 512 to further reduce refill frequency
    // - For safety, always keep this consistent with the out[] array size (512) below
    int max_batch;
    if (class_idx == 7) {
        max_batch = 512;
    } else if (class_idx >= 5 && class_idx <= 6) {
        max_batch = 256;
    } else {
        max_batch = 128;
    }
    if (room > max_batch) room = max_batch;

    // NOTE:
    // - Because C7 extends max_batch to 512, the stack array also reserves 512 entries.
    // - This keeps room <= max_batch <= 512 at all times and prevents out[] overruns.
    void* out[512];
    int produced = 0;
    int tls_carved = 0; // Debug bookkeeping: track TLS carve experiment hits
#if HAKMEM_BUILD_RELEASE
    (void)tls_carved;
#endif

    // ========== PAGE BOX HOT PATH (Tiny-Plus layer): Try page box FIRST ==========
    // Eventually C7-specific page-level freelist management will be integrated here.
    // For now the stub implementation always returns 0; only the Box boundary is wired up first.
    if (page_enabled && tiny_page_box_is_enabled(class_idx)) {
        int page_produced = tiny_page_box_refill(class_idx, tls, out, room);
        if (page_produced > 0) {
            // Store blocks into cache and return first
            void* first = out[0];
            for (int i = 1; i < page_produced; i++) {
                cache->slots[cache->tail] = out[i];
                cache->tail = (cache->tail + 1) & cache->mask;
            }
#if !HAKMEM_BUILD_RELEASE
            g_unified_cache_miss[class_idx]++;
#endif
            tiny_class_stats_on_uc_miss(class_idx);
            if (measure) {
                uint64_t end_cycles = read_tsc();
                uint64_t delta = end_cycles - start_cycles;
                atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_global, delta, memory_order_relaxed);
                atomic_fetch_add_explicit(&g_unified_cache_misses_global, 1, memory_order_relaxed);
                atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_by_class[class_idx], delta, memory_order_relaxed);
                atomic_fetch_add_explicit(&g_unified_cache_misses_by_class[class_idx], 1, memory_order_relaxed);
            }
            return HAK_BASE_FROM_RAW(first);
        }
    }
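    // Refill priority from here on: (1) the Page Box path above, (2) the Warm Pool hot
    // path below (optionally binding the SuperSlab to TLS and carving), and finally
    // (3) the cold path: warm-pool prefill or superslab_refill() plus a direct carve loop.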
    // ========== WARM POOL HOT PATH: Check warm pool FIRST ==========
    // This is the critical optimization - avoid superslab_refill() registry scan
    if (warm_enabled) {
        if (class_idx == 7) {
            const TinyClassPolicy* pol = tiny_policy_get(7);
            static _Atomic int g_c7_policy_logged = 0;
            if (atomic_exchange_explicit(&g_c7_policy_logged, 1, memory_order_acq_rel) == 0) {
                fprintf(stderr, "[C7_POLICY_AT_WARM] page=%u warm=%u cap=%u\n",
                        pol ? pol->page_box_enabled : 0,
                        pol ? pol->warm_enabled : 0,
                        pol ? pol->warm_cap : 0);
            }
        }
#if !HAKMEM_BUILD_RELEASE
        atomic_fetch_add_explicit(&g_dbg_warm_pop_attempts, 1, memory_order_relaxed);
        if (class_idx == 7) { warm_pool_dbg_c7_attempt(); }
#endif
#if HAKMEM_BUILD_RELEASE
        if (class_idx == 7) {
            atomic_fetch_add_explicit(&g_rel_c7_warm_pop, 1, memory_order_relaxed);
        }
#endif
        SuperSlab* warm_ss = tiny_warm_pool_pop(class_idx);
        if (warm_ss) {
            int allow_tls_bind = policy && policy->tls_carve_enabled;
            int allow_tls_carve = allow_tls_bind;
            int warm_mode = 0;
            if (class_idx == 7) {
#if !HAKMEM_BUILD_RELEASE
                warm_pool_dbg_c7_hit();
#endif
                warm_mode = warm_tls_bind_mode_c7();
                allow_tls_bind = (warm_mode >= 1);
                allow_tls_carve = (warm_mode == 2);
            }
            if (allow_tls_bind) {
                int cap = ss_slabs_capacity(warm_ss);
                int slab_idx = -1;
                // Simple heuristic: first slab matching class
                for (int i = 0; i < cap; i++) {
                    if (tiny_get_class_from_ss(warm_ss, i) == class_idx) { slab_idx = i; break; }
                }
                if (slab_idx >= 0) {
                    uint32_t tid = (uint32_t)(uintptr_t)pthread_self();
                    if (ss_tls_bind_one(class_idx, tls, warm_ss, slab_idx, tid)) {
                        if (class_idx == 7) {
                            warm_tls_bind_log_success(warm_ss, slab_idx);
                        }
                        // Mode 2: carve a single block via TLS fast path (policy enabled classes)
                        if (allow_tls_carve) {
#if !HAKMEM_BUILD_RELEASE
                            if (class_idx == 7) { warm_pool_dbg_c7_tls_attempt(); }
#endif
                            TinyTLSCarveOneResult tls_carve = tiny_tls_carve_one_block(tls, class_idx);
                            if (tls_carve.block) {
                                if (class_idx == 7) {
                                    warm_tls_bind_log_tls_carve(warm_ss, slab_idx, tls_carve.block);
#if !HAKMEM_BUILD_RELEASE
                                    warm_pool_dbg_c7_tls_success();
#endif
                                }
                                out[0] = tls_carve.block;
                                produced = 1;
                                tls_carved = 1;
                            } else {
                                if (class_idx == 7) {
                                    warm_tls_bind_log_tls_fail(warm_ss, slab_idx);
#if !HAKMEM_BUILD_RELEASE
                                    warm_pool_dbg_c7_tls_fail();
#endif
                                }
                            }
                        }
                    }
                }
            }
#if !HAKMEM_BUILD_RELEASE
            atomic_fetch_add_explicit(&g_dbg_warm_pop_hits, 1, memory_order_relaxed);
#endif
            // HOT PATH: Warm pool hit, try to carve directly
            if (produced == 0) {
#if HAKMEM_BUILD_RELEASE
                if (class_idx == 7) { warm_pool_rel_c7_carve_attempt(); }
#endif
                produced = slab_carve_from_ss(class_idx, warm_ss, out, room);
#if HAKMEM_BUILD_RELEASE
                if (class_idx == 7) {
                    if (produced > 0) { warm_pool_rel_c7_carve_success(); }
                    else { warm_pool_rel_c7_carve_zero(); }
                }
#endif
                if (produced > 0) {
                    // Update active counter for carved blocks
                    ss_active_add(warm_ss, (uint32_t)produced);
                }
            }
            if (produced > 0) {
#if !HAKMEM_BUILD_RELEASE
                if (class_idx == 7) {
                    warm_pool_dbg_c7_carve();
                    if (tls_carved) { warm_pool_dbg_c7_uc_miss_tls(); }
                    else { warm_pool_dbg_c7_uc_miss_warm(); }
                }
#endif
                // Success! Return SuperSlab to warm pool for next use
#if HAKMEM_BUILD_RELEASE
                if (class_idx == 7) {
                    atomic_fetch_add_explicit(&g_rel_c7_warm_push, 1, memory_order_relaxed);
                }
#endif
                tiny_warm_pool_push_with_cap(class_idx, warm_ss, warm_cap);

                // Track warm pool hit (always compiled, ENV-gated printing)
                warm_pool_record_hit(class_idx);
                tiny_class_stats_on_warm_hit(class_idx);

                // Store blocks into cache and return first
                void* first = out[0];
                for (int i = 1; i < produced; i++) {
                    cache->slots[cache->tail] = out[i];
                    cache->tail = (cache->tail + 1) & cache->mask;
                }
#if !HAKMEM_BUILD_RELEASE
                g_unified_cache_miss[class_idx]++;
#endif
                tiny_class_stats_on_uc_miss(class_idx);
                if (measure) {
                    uint64_t end_cycles = read_tsc();
                    uint64_t delta = end_cycles - start_cycles;
                    atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_global, delta, memory_order_relaxed);
                    atomic_fetch_add_explicit(&g_unified_cache_misses_global, 1, memory_order_relaxed);
                    // Per-class aggregation (to visualize C5-C7 refill cost)
                    atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_by_class[class_idx], delta, memory_order_relaxed);
                    atomic_fetch_add_explicit(&g_unified_cache_misses_by_class[class_idx], 1, memory_order_relaxed);
                }
                return HAK_BASE_FROM_RAW(first);
            }

            // SuperSlab carve failed (produced == 0)
#if !HAKMEM_BUILD_RELEASE
            atomic_fetch_add_explicit(&g_dbg_warm_pop_carve_zero, 1, memory_order_relaxed);
#endif
            // This slab is either exhausted or has no more available capacity
            // The statistics counter 'prefilled' tracks how often we try to prefill
            if (produced == 0 && tiny_warm_pool_count(class_idx) == 0) {
                // Pool is empty and carve failed - prefill would help here
                warm_pool_record_prefilled(class_idx);
            }
        } else {
#if !HAKMEM_BUILD_RELEASE
            atomic_fetch_add_explicit(&g_dbg_warm_pop_empty, 1, memory_order_relaxed);
#endif
        }

        // ========== COLD PATH: Warm pool miss, use superslab_refill ==========
        // Track warm pool miss (always compiled, ENV-gated printing)
        warm_pool_record_miss(class_idx);
    }

    // Step 1: Ensure SuperSlab available via normal refill
    // Enhanced: Use Warm Pool Prefill Box for secondary prefill when pool is empty
    if (warm_enabled) {
        if (warm_pool_do_prefill(class_idx, tls, warm_cap) < 0) {
            return HAK_BASE_FROM_RAW(NULL);
        }
        // After prefill: tls->ss has the final slab for carving
        tls = &g_tls_slabs[class_idx]; // Reload (already done in prefill box)
    } else {
        if (!tls->ss) {
            if (!superslab_refill(class_idx)) {
                return HAK_BASE_FROM_RAW(NULL);
            }
            tls = &g_tls_slabs[class_idx];
        }
    }
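    // Header-byte note for the carve loop below: when HAKMEM_TINY_HEADER_CLASSIDX is
    // enabled, byte 0 of each produced block is stamped with (0xa0 | class_idx) before
    // the block is published into out[]; for freelist pops the next pointer is read
    // first because it can overlap the header at offset 0 (Class 0).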
    // Step 2: Direct carve from SuperSlab into local array (bypass TLS SLL!)
    TinySlabMeta* m = tls->meta;
    size_t bs = tiny_stride_for_class(class_idx);
    uint8_t* base = tls->slab_base ? tls->slab_base
                                   : tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);

    while (produced < room) {
        if (m->freelist) {
            // Freelist pop
            void* p = m->freelist;
            void* next_node = tiny_next_read(class_idx, p);
            // ROOT CAUSE FIX: Write header BEFORE exposing block (but AFTER reading next)
            // For Class 0 (offset 0), next overlaps header, so we must read next first.
#if HAKMEM_TINY_HEADER_CLASSIDX
            *(uint8_t*)p = (uint8_t)(0xa0 | (class_idx & 0x0f));
            // Prevent compiler from reordering header write after out[] assignment
            __atomic_thread_fence(__ATOMIC_RELEASE);
#endif
            m->freelist = next_node;
            unified_refill_validate_base(class_idx, tls, m, p, "unified_refill_freelist");
            // PageFaultTelemetry: record page touch for this BASE
            pagefault_telemetry_touch(class_idx, p);
            m->used++;
            out[produced++] = p;
        } else if (m->carved < m->capacity) {
            // Linear carve (fresh block, no freelist link)
            void* p = (void*)(base + ((size_t)m->carved * bs));
            unified_refill_validate_base(class_idx, tls, m, p, "unified_refill_carve");
            // PageFaultTelemetry: record page touch for this BASE
            pagefault_telemetry_touch(class_idx, p);
            // ✅ CRITICAL: Write header (new block)
#if HAKMEM_TINY_HEADER_CLASSIDX
            *(uint8_t*)p = (uint8_t)(0xa0 | (class_idx & 0x0f));
#endif
            m->carved++;
            m->used++;
            out[produced++] = p;
        } else {
            // SuperSlab exhausted → refill and retry
            if (!superslab_refill(class_idx)) break;
            // ✅ CRITICAL: Reload TLS pointers after refill (avoid stale pointer bug)
            tls = &g_tls_slabs[class_idx];
            m = tls->meta;
            base = tls->slab_base ? tls->slab_base
                                  : tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
        }
    }

    if (produced == 0) return HAK_BASE_FROM_RAW(NULL);

    // Step 4: Update active counter
    // Guard: tls->ss can be NULL if all SuperSlab refills failed
    if (tls->ss) {
        ss_active_add(tls->ss, (uint32_t)produced);
    }

    // Step 5: Store blocks into unified cache (skip first, return it)
    void* first = out[0];
    for (int i = 1; i < produced; i++) {
        cache->slots[cache->tail] = out[i];
        cache->tail = (cache->tail + 1) & cache->mask;
    }
#if !HAKMEM_BUILD_RELEASE
    if (class_idx == 7) { warm_pool_dbg_c7_uc_miss_shared(); }
    g_unified_cache_miss[class_idx]++;
#endif
    tiny_class_stats_on_uc_miss(class_idx);

    // Measure refill cycles
    if (measure) {
        uint64_t end_cycles = read_tsc();
        uint64_t delta = end_cycles - start_cycles;
        atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_global, delta, memory_order_relaxed);
        atomic_fetch_add_explicit(&g_unified_cache_misses_global, 1, memory_order_relaxed);
        // Per-class aggregation
        atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_by_class[class_idx], delta, memory_order_relaxed);
        atomic_fetch_add_explicit(&g_unified_cache_misses_by_class[class_idx], 1, memory_order_relaxed);
    }

    return HAK_BASE_FROM_RAW(first); // Return first block (BASE pointer)
}
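// Interpreting the measurement output below (sketch): "Avg Refill Cycles" is
// refill_cycles / misses, and the microsecond figure simply divides by 1000 as a
// conservative 1GHz estimate. For example, 2,000 misses costing 10,000,000 cycles in
// total would report avg_refill = 5000 cycles (est. 5.00us @ 1GHz).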
// ============================================================================
// Performance Measurement: Print Statistics
// ============================================================================
void unified_cache_print_measurements(void) {
    if (!unified_cache_measure_enabled()) {
        return; // Measurement disabled, nothing to print
    }

    uint64_t hits = atomic_load_explicit(&g_unified_cache_hits_global, memory_order_relaxed);
    uint64_t misses = atomic_load_explicit(&g_unified_cache_misses_global, memory_order_relaxed);
    uint64_t refill_cycles = atomic_load_explicit(&g_unified_cache_refill_cycles_global, memory_order_relaxed);
    uint64_t total = hits + misses;

    if (total == 0) {
        fprintf(stderr, "\n========================================\n");
        fprintf(stderr, "Unified Cache Statistics\n");
        fprintf(stderr, "========================================\n");
        fprintf(stderr, "No operations recorded (measurement may be disabled)\n");
        fprintf(stderr, "========================================\n\n");
        return;
    }

    double hit_rate = (100.0 * hits) / total;
    double avg_refill_cycles = misses > 0 ? (double)refill_cycles / misses : 0.0;
    // Estimate time at 1GHz (conservative, most modern CPUs are 2-4GHz)
    double avg_refill_us = avg_refill_cycles / 1000.0;

    fprintf(stderr, "\n========================================\n");
    fprintf(stderr, "Unified Cache Statistics\n");
    fprintf(stderr, "========================================\n");
    fprintf(stderr, "Hits: %llu\n", (unsigned long long)hits);
    fprintf(stderr, "Misses: %llu\n", (unsigned long long)misses);
    fprintf(stderr, "Hit Rate: %.1f%%\n", hit_rate);
    fprintf(stderr, "Avg Refill Cycles: %.0f (est. %.2fus @ 1GHz)\n", avg_refill_cycles, avg_refill_us);

    // Per-class breakdown (Tiny classes 0-7, with particular focus on C5-C7)
    fprintf(stderr, "\nPer-class Unified Cache (Tiny classes):\n");
    for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
        uint64_t ch = atomic_load_explicit(&g_unified_cache_hits_by_class[cls], memory_order_relaxed);
        uint64_t cm = atomic_load_explicit(&g_unified_cache_misses_by_class[cls], memory_order_relaxed);
        uint64_t cc = atomic_load_explicit(&g_unified_cache_refill_cycles_by_class[cls], memory_order_relaxed);
        uint64_t ct = ch + cm;
        if (ct == 0 && cc == 0) {
            continue; // Skip unused classes
        }
        double cls_hit_rate = ct > 0 ? (100.0 * (double)ch / (double)ct) : 0.0;
        double cls_avg_refill = cm > 0 ? (double)cc / (double)cm : 0.0;
        double cls_avg_us = cls_avg_refill / 1000.0;
        fprintf(stderr, " C%d: hits=%llu miss=%llu hit=%.1f%% avg_refill=%.0f cyc (%.2fus @1GHz)\n",
                cls, (unsigned long long)ch, (unsigned long long)cm,
                cls_hit_rate, cls_avg_refill, cls_avg_us);
    }
    fprintf(stderr, "========================================\n\n");
}
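// Example wiring (sketch, not part of this file's API): a benchmark or test harness
// linking this translation unit could dump all three stat views at exit, e.g.:
//
//   static void dump_tiny_cache_stats(void) {
//       unified_cache_print_stats();          // per-class ring stats (debug builds)
//       unified_cache_print_measurements();   // ENV-gated hit/miss/cycle counters
//       tiny_warm_pool_print_stats_public();  // warm-pool hit rates (ENV-gated)
//   }
//   // ... atexit(dump_tiny_cache_stats);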