hakmem/core/front/tiny_unified_cache.c
Moe Charm (CI) 4c986fa9d1 Feat: Add experimental TLS Bind Box path in Unified Cache
- Added experimental path in unified_cache_refill to test ss_tls_bind_one for C7 class.
- Guarded by HAKMEM_WARM_TLS_BIND_C7 env var and debug build.
- Updated Page Box comments to clarify future TLS Bind Box integration.
2025-12-05 20:05:11 +09:00

// tiny_unified_cache.c - Phase 23: Unified Frontend Cache Implementation
#include "tiny_unified_cache.h"
#include "tiny_warm_pool.h" // Warm Pool: O(1) SuperSlab lookup
#include "../tiny_tls.h" // Phase 23-E: TinyTLSSlab, TinySlabMeta
#include "../tiny_box_geometry.h" // Phase 23-E: tiny_stride_for_class, tiny_slab_base_for_geometry
#include "../box/tiny_next_ptr_box.h" // Phase 23-E: tiny_next_read (freelist traversal)
#include "../hakmem_tiny_superslab.h" // Phase 23-E: SuperSlab, superslab_refill()
#include "../superslab/superslab_inline.h" // Phase 23-E: ss_active_add, slab_index_for, ss_slabs_capacity
#include "../hakmem_super_registry.h" // For hak_super_lookup (pointer→SuperSlab)
#include "../box/pagefault_telemetry_box.h" // Phase 24: Box PageFaultTelemetry (Tiny page touch stats)
#include "../box/ss_tier_box.h" // For ss_tier_is_hot() tier checks
#include "../box/ss_slab_meta_box.h" // For ss_active_add() and slab metadata operations
#include "../box/warm_pool_stats_box.h" // Box: Warm Pool Statistics Recording (inline)
#include "../box/slab_carve_box.h" // Box: Slab Carving (inline O(slabs) scan)
#include "../box/warm_pool_prefill_box.h" // Box: Warm Pool Prefill (secondary optimization)
#include "../hakmem_env_cache.h" // Priority-2: ENV cache (eliminate syscalls)
#include "../box/tiny_page_box.h" // Tiny-Plus Page Box (C5C7 initial hook)
#include "../box/ss_tls_bind_box.h" // Box: TLS Bind (SuperSlab -> TLS binding)
#include <stdlib.h>
#include <string.h>
#include <stdatomic.h>
#include <time.h>
#include <stdio.h>   // fprintf/fflush/stderr (diagnostics throughout this file)
#include <pthread.h> // pthread_self (debug-only TLS bind probe below)
// ============================================================================
// Performance Measurement: Unified Cache (ENV-gated)
// ============================================================================
// Global atomic counters for unified cache performance measurement
// ENV: HAKMEM_MEASURE_UNIFIED_CACHE=1 to enable (default: OFF)
_Atomic uint64_t g_unified_cache_hits_global = 0;
_Atomic uint64_t g_unified_cache_misses_global = 0;
_Atomic uint64_t g_unified_cache_refill_cycles_global = 0;
// Per-class counters (per-Tiny-class Unified Cache observation)
_Atomic uint64_t g_unified_cache_hits_by_class[TINY_NUM_CLASSES] = {0};
_Atomic uint64_t g_unified_cache_misses_by_class[TINY_NUM_CLASSES] = {0};
_Atomic uint64_t g_unified_cache_refill_cycles_by_class[TINY_NUM_CLASSES] = {0};
// Helper: Get cycle count (x86_64 rdtsc)
static inline uint64_t read_tsc(void) {
#if defined(__x86_64__) || defined(_M_X64)
uint32_t lo, hi;
__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
return ((uint64_t)hi << 32) | lo;
#else
// Fallback to clock_gettime for non-x86 platforms
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
#endif
}
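// Note (editorial): rdtsc is not a serializing instruction, so individual
// readings can be perturbed by out-of-order execution; for the coarse
// aggregate statistics collected here that imprecision is acceptable. TSC
// ticks are also not wall time; see the 1GHz conversion note in
// unified_cache_print_measurements() at the bottom of this file.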
// Check if measurement is enabled (cached)
static inline int unified_cache_measure_enabled(void) {
static int g_measure = -1;
if (__builtin_expect(g_measure == -1, 0)) {
const char* e = getenv("HAKMEM_MEASURE_UNIFIED_CACHE");
g_measure = (e && *e && *e != '0') ? 1 : 0;
}
return g_measure;
}
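// Usage sketch (hypothetical invocation; the binary name is illustrative):
//   HAKMEM_MEASURE_UNIFIED_CACHE=1 ./bench_tiny
// Hits/misses/cycles accumulate in the atomics above and are reported by
// unified_cache_print_measurements().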
// Phase 23-E: Forward declarations
extern __thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES]; // From hakmem_tiny_superslab.c
extern void ss_active_add(SuperSlab* ss, uint32_t n); // From hakmem_tiny_ss_active_box.inc
// ============================================================================
// TLS Variables (defined here, extern in header)
// ============================================================================
__thread TinyUnifiedCache g_unified_cache[TINY_NUM_CLASSES];
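// Fast-path sketch (assumed shape; the actual pop lives in tiny_unified_cache.h,
// this is editorial illustration of the head/tail ring semantics only):
//   TinyUnifiedCache* c = &g_unified_cache[cls];
//   if (c->head != c->tail) {                     // non-empty -> hit
//       void* p = c->slots[c->head];
//       c->head = (c->head + 1) & c->mask;
//       return p;
//   }
//   return unified_cache_refill(cls);             // miss -> slow path in this file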
// Warm Pool: Per-thread warm SuperSlab pools (one per class)
__thread TinyWarmPool g_tiny_warm_pool[TINY_NUM_CLASSES] = {0};
// ============================================================================
// Metrics (Phase 23, optional for debugging)
// ============================================================================
#if !HAKMEM_BUILD_RELEASE
__thread uint64_t g_unified_cache_hit[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_miss[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_push[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_full[TINY_NUM_CLASSES] = {0};
#endif
// Warm Pool metrics (definition - declared in tiny_warm_pool.h as extern)
// Note: These are kept outside !HAKMEM_BUILD_RELEASE for profiling in release builds
__thread TinyWarmPoolStats g_warm_pool_stats[TINY_NUM_CLASSES] = {0};
#if !HAKMEM_BUILD_RELEASE
// Debug-only diagnostics for Warm Pool effectiveness
_Atomic uint64_t g_dbg_warm_prefill_attempts = 0;
_Atomic uint64_t g_dbg_warm_prefill_refill_fail = 0;
_Atomic uint64_t g_dbg_warm_prefill_push_ok = 0;
_Atomic uint64_t g_dbg_warm_prefill_push_full = 0;
_Atomic uint64_t g_dbg_warm_pop_attempts = 0;
_Atomic uint64_t g_dbg_warm_pop_hits = 0;
_Atomic uint64_t g_dbg_warm_pop_empty = 0;
_Atomic uint64_t g_dbg_warm_pop_carve_zero = 0;
#endif
// Forward declaration for Warm Pool stats printer (defined later in this file)
static inline void tiny_warm_pool_print_stats(void);
// ============================================================================
// Phase 8-Step1-Fix: unified_cache_enabled() implementation (non-static)
// ============================================================================
// Enable flag (default: ON, disable with HAKMEM_TINY_UNIFIED_CACHE=0)
int unified_cache_enabled(void) {
// Priority-2: Use cached ENV (eliminate lazy-init static overhead)
static int g_enable = -1;
if (__builtin_expect(g_enable == -1, 0)) {
g_enable = HAK_ENV_TINY_UNIFIED_CACHE();
#if !HAKMEM_BUILD_RELEASE
if (g_enable) {
fprintf(stderr, "[Unified-INIT] unified_cache_enabled() = %d\n", g_enable);
fflush(stderr);
}
#endif
}
return g_enable;
}
// ============================================================================
// Init (called at thread start or lazy on first access)
// ============================================================================
void unified_cache_init(void) {
if (!unified_cache_enabled()) return;
// Layer 2 Defensive Fix: Use __libc_calloc for infrastructure allocations
// Rationale: Cache arrays are infrastructure (not workload), bypass HAKMEM entirely
// This prevents interaction with BenchFast mode and ensures clean separation
extern void* __libc_calloc(size_t, size_t);
// Initialize all classes (C0-C7)
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
if (g_unified_cache[cls].slots != NULL) continue; // Already initialized
size_t cap = unified_capacity(cls);
g_unified_cache[cls].slots = (void**)__libc_calloc(cap, sizeof(void*));
if (!g_unified_cache[cls].slots) {
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "[Unified-INIT] Failed to allocate C%d cache (%zu slots)\n", cls, cap);
fflush(stderr);
#endif
continue; // Skip this class, try others
}
g_unified_cache[cls].capacity = (uint16_t)cap;
g_unified_cache[cls].mask = (uint16_t)(cap - 1);
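// Assumption (editorial): unified_capacity() returns a power of two, so
// (cap - 1) is a valid wrap mask for the (index + 1) & mask advances used
// by the push/pop/refill paths below.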
g_unified_cache[cls].head = 0;
g_unified_cache[cls].tail = 0;
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "[Unified-INIT] C%d: %zu slots (%zu bytes)\n",
cls, cap, cap * sizeof(void*));
fflush(stderr);
#endif
}
}
// ============================================================================
// Shutdown (called at thread exit, optional)
// ============================================================================
void unified_cache_shutdown(void) {
if (!unified_cache_enabled()) return;
// TODO: Drain caches to SuperSlab before shutdown (prevent leak)
// Layer 2 Defensive Fix: Use __libc_free (symmetric with __libc_calloc in init)
extern void __libc_free(void*);
// Free cache buffers
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
if (g_unified_cache[cls].slots) {
__libc_free(g_unified_cache[cls].slots);
g_unified_cache[cls].slots = NULL;
}
}
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "[Unified-SHUTDOWN] All caches freed\n");
fflush(stderr);
#endif
}
// ============================================================================
// Stats (Phase 23 metrics)
// ============================================================================
void unified_cache_print_stats(void) {
if (!unified_cache_enabled()) return;
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "\n[Unified-STATS] Unified Cache Metrics:\n");
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
uint64_t total_allocs = g_unified_cache_hit[cls] + g_unified_cache_miss[cls];
uint64_t total_frees = g_unified_cache_push[cls] + g_unified_cache_full[cls];
if (total_allocs == 0 && total_frees == 0) continue; // Skip unused classes
double hit_rate = (total_allocs > 0) ? (100.0 * g_unified_cache_hit[cls] / total_allocs) : 0.0;
double full_rate = (total_frees > 0) ? (100.0 * g_unified_cache_full[cls] / total_frees) : 0.0;
// Current occupancy
uint16_t count = (g_unified_cache[cls].tail >= g_unified_cache[cls].head)
? (g_unified_cache[cls].tail - g_unified_cache[cls].head)
: (g_unified_cache[cls].capacity - g_unified_cache[cls].head + g_unified_cache[cls].tail);
fprintf(stderr, " C%d: %u/%u slots occupied, hit=%llu miss=%llu (%.1f%% hit), push=%llu full=%llu (%.1f%% full)\n",
cls,
count, g_unified_cache[cls].capacity,
(unsigned long long)g_unified_cache_hit[cls],
(unsigned long long)g_unified_cache_miss[cls],
hit_rate,
(unsigned long long)g_unified_cache_push[cls],
(unsigned long long)g_unified_cache_full[cls],
full_rate);
}
fflush(stderr);
// Also print warm pool stats if enabled
tiny_warm_pool_print_stats();
#endif
}
// ============================================================================
// Warm Pool Stats (always compiled, ENV-gated at runtime)
// ============================================================================
static inline void tiny_warm_pool_print_stats(void) {
// Check if warm pool stats are enabled via ENV
static int g_print_stats = -1;
if (__builtin_expect(g_print_stats == -1, 0)) {
const char* e = getenv("HAKMEM_WARM_POOL_STATS");
g_print_stats = (e && *e && *e != '0') ? 1 : 0;
}
if (!g_print_stats) return;
fprintf(stderr, "\n[WarmPool-STATS] Warm Pool Metrics:\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
uint64_t total = g_warm_pool_stats[i].hits + g_warm_pool_stats[i].misses;
float hit_rate = (total > 0)
? (100.0 * g_warm_pool_stats[i].hits / total)
: 0.0;
fprintf(stderr, " C%d: hits=%llu misses=%llu hit_rate=%.1f%% prefilled=%llu\n",
i,
(unsigned long long)g_warm_pool_stats[i].hits,
(unsigned long long)g_warm_pool_stats[i].misses,
hit_rate,
(unsigned long long)g_warm_pool_stats[i].prefilled);
}
#if !HAKMEM_BUILD_RELEASE
// Debug-only aggregated diagnostics for Warm Pool
fprintf(stderr,
" [DBG] prefill_attempts=%llu refill_fail=%llu push_ok=%llu push_full=%llu "
"pop_attempts=%llu pop_hits=%llu pop_empty=%llu pop_carve_zero=%llu\n",
(unsigned long long)atomic_load_explicit(&g_dbg_warm_prefill_attempts, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_prefill_refill_fail, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_prefill_push_ok, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_prefill_push_full, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_attempts, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_hits, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_empty, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_carve_zero, memory_order_relaxed));
#endif
fflush(stderr);
}
// Public wrapper for benchmarks
void tiny_warm_pool_print_stats_public(void) {
tiny_warm_pool_print_stats();
}
// ============================================================================
// Phase 23-E: Direct SuperSlab Carve (TLS SLL Bypass)
// ============================================================================
// Fail-fast helper: verify that a candidate BASE pointer belongs to a valid
// Tiny slab within a SuperSlab. This is intentionally defensive and only
// compiled in debug builds to avoid hot-path overhead in release.
static inline int unified_refill_validate_base(int class_idx,
TinyTLSSlab* tls,
TinySlabMeta* meta,
void* base,
const char* stage)
{
#if HAKMEM_BUILD_RELEASE
(void)class_idx; (void)tls; (void)base; (void)stage;
return 1;
#else
if (!base) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=NULL tls_ss=%p meta=%p\n",
stage ? stage : "unified_refill",
class_idx,
(void*)(tls ? tls->ss : NULL),
(void*)meta);
abort();
}
SuperSlab* tls_ss = tls ? tls->ss : NULL;
if (!tls_ss || tls_ss->magic != SUPERSLAB_MAGIC) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p meta=%p (invalid TLS ss)\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)tls_ss,
(void*)meta);
abort();
}
// Cross-check registry lookup for additional safety.
SuperSlab* ss_lookup = hak_super_lookup(base);
if (!ss_lookup || ss_lookup->magic != SUPERSLAB_MAGIC) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p lookup_ss=%p meta=%p\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)tls_ss,
(void*)ss_lookup,
(void*)meta);
abort();
}
if (ss_lookup != tls_ss) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p lookup_ss=%p (mismatch)\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)tls_ss,
(void*)ss_lookup);
abort();
}
int slab_idx = tls ? (int)tls->slab_idx : -1;
int cap = ss_slabs_capacity(tls_ss);
if (slab_idx < 0 || slab_idx >= cap) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p slab_idx=%d cap=%d meta_cap=%u meta_used=%u meta_carved=%u\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)tls_ss,
slab_idx,
cap,
meta ? meta->capacity : 0u,
meta ? (unsigned)meta->used : 0u,
meta ? (unsigned)meta->carved : 0u);
abort();
}
// Ensure meta matches TLS view for this slab.
TinySlabMeta* expected_meta = &tls_ss->slabs[slab_idx];
if (meta && meta != expected_meta) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p slab_idx=%d meta=%p expected_meta=%p\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)tls_ss,
slab_idx,
(void*)meta,
(void*)expected_meta);
abort();
}
uint8_t* slab_base = tiny_slab_base_for_geometry(tls_ss, slab_idx);
size_t stride = tiny_stride_for_class(class_idx);
size_t usable = tiny_usable_bytes_for_slab(slab_idx);
uint8_t* slab_end = slab_base + usable;
if ((uint8_t*)base < slab_base || (uint8_t*)base >= slab_end) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p range=[%p,%p) stride=%zu meta_cap=%u meta_used=%u meta_carved=%u\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)slab_base,
(void*)slab_end,
stride,
meta ? meta->capacity : 0u,
meta ? (unsigned)meta->used : 0u,
meta ? (unsigned)meta->carved : 0u);
abort();
}
ptrdiff_t offset = (uint8_t*)base - slab_base;
if (offset % (ptrdiff_t)stride != 0) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p offset=%td stride=%zu (misaligned) meta_cap=%u meta_used=%u meta_carved=%u\n",
stage ? stage : "unified_refill",
class_idx,
base,
offset,
stride,
meta ? meta->capacity : 0u,
meta ? (unsigned)meta->used : 0u,
meta ? (unsigned)meta->carved : 0u);
abort();
}
return 1;
#endif
}
// ============================================================================
// Warm Pool Enhanced: Direct carve from warm SuperSlab (bypass superslab_refill)
// ============================================================================
// ============================================================================
// Batch refill from SuperSlab (called on cache miss)
// ============================================================================
// Returns: BASE pointer (first block, wrapped), or NULL-wrapped if failed
// Design: Direct carve from SuperSlab to array (no TLS SLL intermediate layer)
// Warm Pool Integration: PRIORITIZE warm pool, use superslab_refill as fallback
hak_base_ptr_t unified_cache_refill(int class_idx) {
// Measure refill cost if enabled
uint64_t start_cycles = 0;
int measure = unified_cache_measure_enabled();
if (measure) {
start_cycles = read_tsc();
}
// Initialize warm pool on first use (per-thread)
tiny_warm_pool_init_once();
TinyUnifiedCache* cache = &g_unified_cache[class_idx];
// ✅ Phase 11+: Ensure cache is initialized (lazy init for cold path)
if (!cache->slots) {
unified_cache_init();
// Re-check after init (may fail due to alloc failure)
if (!cache->slots) {
return HAK_BASE_FROM_RAW(NULL);
}
}
// Calculate available room in unified cache
int room = (int)cache->capacity - 1; // Leave 1 slot for full detection
if (cache->head > cache->tail) {
room = cache->head - cache->tail - 1;
} else if (cache->head < cache->tail) {
room = cache->capacity - (cache->tail - cache->head) - 1;
}
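// Worked example: capacity=8, head=2, tail=1 (7 slots occupied).
//   head > tail -> room = 2 - 1 - 1 = 0, i.e. the ring is treated as full
//   while one slot stays empty, so head == tail can unambiguously mean "empty".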
if (room <= 0) return HAK_BASE_FROM_RAW(NULL);
// Batch size limit (per-class tuning)
// - Default: 128
// - C5-C6 (129B-512B): extended to 256
// - C7 (~1KB): extended to 512 to further reduce refill frequency
// - For safety, always keep this in sync with the out[] array size (512) below
int max_batch;
if (class_idx == 7) {
max_batch = 512;
} else if (class_idx >= 5 && class_idx <= 6) {
max_batch = 256;
} else {
max_batch = 128;
}
if (room > max_batch) room = max_batch;
// NOTE:
// - For C7, max_batch extends to 512, so the stack array also reserves 512 entries.
// - This keeps room <= max_batch <= 512 at all times, preventing out[] overruns.
void* out[512];
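// Compile-time guard (editorial sketch): keep out[] in sync with the largest
// max_batch so the invariant above cannot silently rot.
_Static_assert(sizeof(out) / sizeof(out[0]) >= 512,
               "out[] must cover the largest max_batch (C7: 512)");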
int produced = 0;
// ========== PAGE BOX HOT PATH (Tiny-Plus layer): Try page box FIRST ==========
// Page-level freelist management dedicated to C7 will eventually be integrated here.
// For now the stub always returns 0; only the Box boundary is wired up in advance.
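// Contract assumed here: tiny_page_box_refill(cls, out, room) writes up to
// 'room' BASE pointers into out[] and returns the count; 0 means "nothing
// produced", and we fall through to the warm pool / superslab paths below.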
if (tiny_page_box_is_enabled(class_idx)) {
int page_produced = tiny_page_box_refill(class_idx, out, room);
if (page_produced > 0) {
// Store blocks into cache and return first
void* first = out[0];
for (int i = 1; i < page_produced; i++) {
cache->slots[cache->tail] = out[i];
cache->tail = (cache->tail + 1) & cache->mask;
}
#if !HAKMEM_BUILD_RELEASE
g_unified_cache_miss[class_idx]++;
#endif
if (measure) {
uint64_t end_cycles = read_tsc();
uint64_t delta = end_cycles - start_cycles;
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_global,
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_global,
1, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_by_class[class_idx],
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_by_class[class_idx],
1, memory_order_relaxed);
}
return HAK_BASE_FROM_RAW(first);
}
}
// ========== WARM POOL HOT PATH: Check warm pool FIRST ==========
// This is the critical optimization - avoid superslab_refill() registry scan
#if !HAKMEM_BUILD_RELEASE
atomic_fetch_add_explicit(&g_dbg_warm_pop_attempts, 1, memory_order_relaxed);
#endif
SuperSlab* warm_ss = tiny_warm_pool_pop(class_idx);
if (warm_ss) {
#if !HAKMEM_BUILD_RELEASE
// FUTURE: TLS Bind Box Integration
// Currently we carve directly from warm_ss via slab_carve_from_ss().
// To unify logic, we should eventually:
// 1. Choose a slab index (via tiny_page_box or heuristic).
// 2. Bind it to TLS via ss_tls_bind_one(..., warm_ss, slab_idx, ...).
// 3. Fall through to TLS-based allocation.
// EXPERIMENTAL: Test TLS Bind Box connectivity for C7 (Debug only)
static int g_warm_tls_bind_c7 = -1;
if (g_warm_tls_bind_c7 == -1) {
const char* e = getenv("HAKMEM_WARM_TLS_BIND_C7");
g_warm_tls_bind_c7 = (e && *e && *e != '0') ? 1 : 0;
}
if (g_warm_tls_bind_c7 && class_idx == 7) {
// Find a slab index in this SuperSlab that matches our class
int cap = ss_slabs_capacity(warm_ss);
int slab_idx = -1;
// Simple heuristic: find first slab belonging to this class
// Note: In real logic, we should pick the *best* slab (e.g. from PageBox)
for (int i = 0; i < cap; i++) {
if (tiny_get_class_from_ss(warm_ss, i) == class_idx) {
slab_idx = i;
break;
}
}
if (slab_idx >= 0) {
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
// Try to bind. If successful, we have "connected" the path.
// For now, we still fall through to slab_carve_from_ss() to do the actual
// work, but the side effect (TLS updated) confirms connectivity.
// In a future step, we would 'break' here and let the TLS path handle it.
uint32_t tid = (uint32_t)(uintptr_t)pthread_self();
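// Caveat: pthread_t is opaque; truncating it to 32 bits is lossy on some
// platforms but adequate for this debug-only connectivity probe.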
if (ss_tls_bind_one(class_idx, tls, warm_ss, slab_idx, tid)) {
static int logged = 0;
if (!logged) {
fprintf(stderr, "[WARM_TLS_BIND] C7 bind success: ss=%p slab=%d\n",
(void*)warm_ss, slab_idx);
logged = 1;
}
}
}
}
atomic_fetch_add_explicit(&g_dbg_warm_pop_hits, 1, memory_order_relaxed);
#endif
// HOT PATH: Warm pool hit, try to carve directly
produced = slab_carve_from_ss(class_idx, warm_ss, out, room);
if (produced > 0) {
// Update active counter for carved blocks
ss_active_add(warm_ss, (uint32_t)produced);
// Success! Return SuperSlab to warm pool for next use
tiny_warm_pool_push(class_idx, warm_ss);
// Track warm pool hit (always compiled, ENV-gated printing)
warm_pool_record_hit(class_idx);
// Store blocks into cache and return first
void* first = out[0];
for (int i = 1; i < produced; i++) {
cache->slots[cache->tail] = out[i];
cache->tail = (cache->tail + 1) & cache->mask;
}
#if !HAKMEM_BUILD_RELEASE
g_unified_cache_miss[class_idx]++;
#endif
if (measure) {
uint64_t end_cycles = read_tsc();
uint64_t delta = end_cycles - start_cycles;
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_global,
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_global,
1, memory_order_relaxed);
// Per-class accumulation (visualize C5-C7 refill costs)
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_by_class[class_idx],
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_by_class[class_idx],
1, memory_order_relaxed);
}
return HAK_BASE_FROM_RAW(first);
}
// SuperSlab carve failed (produced == 0)
#if !HAKMEM_BUILD_RELEASE
atomic_fetch_add_explicit(&g_dbg_warm_pop_carve_zero, 1, memory_order_relaxed);
#endif
// This slab is either exhausted or has no usable capacity left.
// The 'prefilled' statistic tracks how often a prefill would have helped.
if (tiny_warm_pool_count(class_idx) == 0) { // produced == 0 is guaranteed here
// Pool is empty and the carve failed - prefill would help here
warm_pool_record_prefilled(class_idx);
}
} else {
#if !HAKMEM_BUILD_RELEASE
atomic_fetch_add_explicit(&g_dbg_warm_pop_empty, 1, memory_order_relaxed);
#endif
}
// ========== COLD PATH: Warm pool miss, use superslab_refill ==========
// Track warm pool miss (always compiled, ENV-gated printing)
warm_pool_record_miss(class_idx);
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
// Step 1: Ensure SuperSlab available via normal refill
// Enhanced: Use Warm Pool Prefill Box for secondary prefill when pool is empty
if (warm_pool_do_prefill(class_idx, tls) < 0) {
return HAK_BASE_FROM_RAW(NULL);
}
// After prefill: tls->ss has the final slab for carving
// tls = &g_tls_slabs[class_idx]; // Reload (already done in prefill box)
// Step 2: Direct carve from SuperSlab into local array (bypass TLS SLL!)
TinySlabMeta* m = tls->meta;
size_t bs = tiny_stride_for_class(class_idx);
uint8_t* base = tls->slab_base
? tls->slab_base
: tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
while (produced < room) {
if (m->freelist) {
// Freelist pop
void* p = m->freelist;
void* next_node = tiny_next_read(class_idx, p);
// ROOT CAUSE FIX: Write header BEFORE exposing block (but AFTER reading next)
// For Class 0 (offset 0), next overlaps header, so we must read next first.
#if HAKMEM_TINY_HEADER_CLASSIDX
*(uint8_t*)p = (uint8_t)(0xa0 | (class_idx & 0x0f));
// Prevent compiler from reordering header write after out[] assignment
__atomic_thread_fence(__ATOMIC_RELEASE);
#endif
m->freelist = next_node;
unified_refill_validate_base(class_idx, tls, m, p,
"unified_refill_freelist");
// PageFaultTelemetry: record page touch for this BASE
pagefault_telemetry_touch(class_idx, p);
m->used++;
out[produced++] = p;
} else if (m->carved < m->capacity) {
// Linear carve (fresh block, no freelist link)
void* p = (void*)(base + ((size_t)m->carved * bs));
unified_refill_validate_base(class_idx, tls, m, p,
"unified_refill_carve");
// PageFaultTelemetry: record page touch for this BASE
pagefault_telemetry_touch(class_idx, p);
// ✅ CRITICAL: Write header (new block)
#if HAKMEM_TINY_HEADER_CLASSIDX
*(uint8_t*)p = (uint8_t)(0xa0 | (class_idx & 0x0f));
#endif
m->carved++;
m->used++;
out[produced++] = p;
} else {
// SuperSlab exhausted → refill and retry
if (!superslab_refill(class_idx)) break;
// ✅ CRITICAL: Reload TLS pointers after refill (avoid stale pointer bug)
tls = &g_tls_slabs[class_idx];
m = tls->meta;
base = tls->slab_base
? tls->slab_base
: tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
}
}
if (produced == 0) return HAK_BASE_FROM_RAW(NULL);
// Step 3: Update active counter
// Guard: tls->ss can be NULL if all SuperSlab refills failed
if (tls->ss) {
ss_active_add(tls->ss, (uint32_t)produced);
}
// Step 4: Store blocks into unified cache (skip first, return it)
void* first = out[0];
for (int i = 1; i < produced; i++) {
cache->slots[cache->tail] = out[i];
cache->tail = (cache->tail + 1) & cache->mask;
}
#if !HAKMEM_BUILD_RELEASE
g_unified_cache_miss[class_idx]++;
#endif
// Measure refill cycles
if (measure) {
uint64_t end_cycles = read_tsc();
uint64_t delta = end_cycles - start_cycles;
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_global,
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_global,
1, memory_order_relaxed);
// Per-class accumulation
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_by_class[class_idx],
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_by_class[class_idx],
1, memory_order_relaxed);
}
return HAK_BASE_FROM_RAW(first); // Return first block (BASE pointer)
}
// ============================================================================
// Performance Measurement: Print Statistics
// ============================================================================
void unified_cache_print_measurements(void) {
if (!unified_cache_measure_enabled()) {
return; // Measurement disabled, nothing to print
}
uint64_t hits = atomic_load_explicit(&g_unified_cache_hits_global, memory_order_relaxed);
uint64_t misses = atomic_load_explicit(&g_unified_cache_misses_global, memory_order_relaxed);
uint64_t refill_cycles = atomic_load_explicit(&g_unified_cache_refill_cycles_global, memory_order_relaxed);
uint64_t total = hits + misses;
if (total == 0) {
fprintf(stderr, "\n========================================\n");
fprintf(stderr, "Unified Cache Statistics\n");
fprintf(stderr, "========================================\n");
fprintf(stderr, "No operations recorded (measurement may be disabled)\n");
fprintf(stderr, "========================================\n\n");
return;
}
double hit_rate = (100.0 * hits) / total;
double avg_refill_cycles = misses > 0 ? (double)refill_cycles / misses : 0.0;
// Estimate time at 1GHz (conservative, most modern CPUs are 2-4GHz)
double avg_refill_us = avg_refill_cycles / 1000.0;
fprintf(stderr, "\n========================================\n");
fprintf(stderr, "Unified Cache Statistics\n");
fprintf(stderr, "========================================\n");
fprintf(stderr, "Hits: %llu\n", (unsigned long long)hits);
fprintf(stderr, "Misses: %llu\n", (unsigned long long)misses);
fprintf(stderr, "Hit Rate: %.1f%%\n", hit_rate);
fprintf(stderr, "Avg Refill Cycles: %.0f (est. %.2fus @ 1GHz)\n",
avg_refill_cycles, avg_refill_us);
// Per-class breakdown (Tiny classes 0-7; C5-C7 are of particular interest)
fprintf(stderr, "\nPer-class Unified Cache (Tiny classes):\n");
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
uint64_t ch = atomic_load_explicit(&g_unified_cache_hits_by_class[cls],
memory_order_relaxed);
uint64_t cm = atomic_load_explicit(&g_unified_cache_misses_by_class[cls],
memory_order_relaxed);
uint64_t cc = atomic_load_explicit(&g_unified_cache_refill_cycles_by_class[cls],
memory_order_relaxed);
uint64_t ct = ch + cm;
if (ct == 0 && cc == 0) {
continue; // Skip unused classes
}
double cls_hit_rate = ct > 0 ? (100.0 * (double)ch / (double)ct) : 0.0;
double cls_avg_refill = cm > 0 ? (double)cc / (double)cm : 0.0;
double cls_avg_us = cls_avg_refill / 1000.0;
fprintf(stderr,
" C%d: hits=%llu miss=%llu hit=%.1f%% avg_refill=%.0f cyc (%.2fus @1GHz)\n",
cls,
(unsigned long long)ch,
(unsigned long long)cm,
cls_hit_rate,
cls_avg_refill,
cls_avg_us);
}
fprintf(stderr, "========================================\n\n");
}
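// Usage sketch (hypothetical; actual call sites live outside this file):
//   atexit(unified_cache_print_measurements); // global + per-class hit/miss/cycles
//   atexit(unified_cache_print_stats);        // debug builds: per-class ring stats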