hakmem/core/front/tiny_unified_cache.c
Moe Charm (CI) 8fdbc6d07e Phase 70-73: Route banner + observe stats consistency + WarmPool analysis SSOT
Observability infrastructure:
- Route Banner (ENV: HAKMEM_ROUTE_BANNER=1) for runtime configuration display
- Unified Cache consistency check (total_allocs vs total_frees)
- Verified counters are balanced (5.3M allocs = 5.3M frees)

WarmPool=16 comprehensive analysis:
- Phase 71: A/B test confirmed +1.31% throughput, 2.4x stability improvement
- Phase 73: Hardware profiling identified instruction reduction as root cause
  * -17.4M instructions (-0.38%)
  * -3.7M branches (-0.30%)
  * Trade-off: dTLB/cache misses increased, but instruction savings dominate
- Phase 72-0: Function-level perf record pinpointed unified_cache_push
  * Branches: -0.86% overhead (largest single-function improvement)
  * Instructions: -0.22% overhead

Key finding: WarmPool=16 optimization is control-flow based, not memory-hierarchy based.
Full analysis: docs/analysis/PHASE70_71_WARMPOOL16_ANALYSIS.md
2025-12-18 05:55:27 +09:00

// tiny_unified_cache.c - Phase 23: Unified Frontend Cache Implementation
#include "tiny_unified_cache.h"
#include "tiny_warm_pool.h" // Warm Pool: O(1) SuperSlab lookup
#include "../tiny_tls.h" // Phase 23-E: TinyTLSSlab, TinySlabMeta
#include "../tiny_box_geometry.h" // Phase 23-E: tiny_stride_for_class, tiny_slab_base_for_geometry
#include "../box/tiny_next_ptr_box.h" // Phase 23-E: tiny_next_read (freelist traversal)
#include "../hakmem_tiny_superslab.h" // Phase 23-E: SuperSlab, superslab_refill()
#include "../superslab/superslab_inline.h" // Phase 23-E: ss_active_add, slab_index_for, ss_slabs_capacity
#include "../hakmem_super_registry.h" // For hak_super_lookup (pointer→SuperSlab)
#include "../box/pagefault_telemetry_box.h" // Phase 24: Box PageFaultTelemetry (Tiny page touch stats)
#include "../box/ss_tier_box.h" // For ss_tier_is_hot() tier checks
#include "../box/ss_slab_meta_box.h" // For ss_active_add() and slab metadata operations
#include "../box/warm_pool_stats_box.h" // Box: Warm Pool Statistics Recording (inline)
#include "../box/slab_carve_box.h" // Box: Slab Carving (inline O(slabs) scan)
#define WARM_POOL_REL_DEFINE
#include "../box/warm_pool_rel_counters_box.h" // Box: Release-side C7 counters
#undef WARM_POOL_REL_DEFINE
#include "../box/c7_meta_used_counter_box.h" // Box: C7 meta->used increment counters
#include "../box/warm_pool_prefill_box.h" // Box: Warm Pool Prefill (secondary optimization)
#include "../box/tiny_mem_stats_box.h" // Box: Tiny front memory accounting
#include "../hakmem_env_cache.h" // Priority-2: ENV cache (eliminate syscalls)
#include "../box/tiny_page_box.h" // Tiny-Plus Page Box (C5C7 initial hook)
#include "../box/ss_tls_bind_box.h" // Box: TLS Bind (SuperSlab -> TLS binding)
#include "../box/tiny_tls_carve_one_block_box.h" // Box: TLS carve helper (shared)
#include "../box/tiny_class_policy_box.h" // Box: per-class policy (Page/Warm caps)
#include "../box/tiny_class_stats_box.h" // Box: lightweight per-class stats
#include "../box/warm_tls_bind_logger_box.h" // Box: Warm TLS Bind logging (throttled)
#define WARM_POOL_DBG_DEFINE
#include "../box/warm_pool_dbg_box.h" // Box: Warm Pool C7 debug counters
#undef WARM_POOL_DBG_DEFINE
#include "../box/tiny_header_write_once_env_box.h" // Phase 5 E5-2: Header write-once optimization
#include "../box/tiny_header_box.h" // Phase 5 E5-2: Header class preservation logic
#include <stdlib.h>
#include <string.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>
#include <pthread.h> // pthread_self() (warm TLS bind path)
// ============================================================================
// Performance Measurement: Unified Cache (ENV-gated)
// ============================================================================
// Global atomic counters for unified cache performance measurement
// ENV: HAKMEM_MEASURE_UNIFIED_CACHE=1 to enable (default: OFF)
#if HAKMEM_TINY_UNIFIED_CACHE_MEASURE_COMPILED
_Atomic uint64_t g_unified_cache_hits_global = 0;
_Atomic uint64_t g_unified_cache_misses_global = 0;
_Atomic uint64_t g_unified_cache_refill_cycles_global = 0;
// Per-class counters (Unified Cache observation per Tiny class)
_Atomic uint64_t g_unified_cache_hits_by_class[TINY_NUM_CLASSES] = {0};
_Atomic uint64_t g_unified_cache_misses_by_class[TINY_NUM_CLASSES] = {0};
_Atomic uint64_t g_unified_cache_refill_cycles_by_class[TINY_NUM_CLASSES] = {0};
// Helper: Get cycle count (x86_64 rdtsc)
static inline uint64_t read_tsc(void) {
#if defined(__x86_64__) || defined(_M_X64)
uint32_t lo, hi;
__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
return ((uint64_t)hi << 32) | lo;
#else
// Fallback to clock_gettime for non-x86 platforms
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
#endif
}
// Check if measurement is enabled (cached)
static inline int unified_cache_measure_enabled(void) {
static int g_measure = -1;
if (__builtin_expect(g_measure == -1, 0)) {
const char* e = getenv("HAKMEM_MEASURE_UNIFIED_CACHE");
g_measure = (e && *e && *e != '0') ? 1 : 0;
}
return g_measure;
}
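// Typical use (illustrative): run with HAKMEM_MEASURE_UNIFIED_CACHE=1 and call
// unified_cache_print_measurements() (defined at the bottom of this file) at
// teardown to dump the global and per-class hit/miss and refill-cycle totals.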
#endif
// Phase 23-E: Forward declarations
extern __thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES]; // From hakmem_tiny_superslab.c
extern void ss_active_add(SuperSlab* ss, uint32_t n); // From hakmem_tiny_ss_active_box.inc
// ============================================================================
// TLS Variables (defined here, extern in header)
// ============================================================================
__thread TinyUnifiedCache g_unified_cache[TINY_NUM_CLASSES];
// Phase 3 C2 Patch 2: First Page Inline Cache (TLS per-class)
#include "tiny_first_page_cache.h"
__thread TinyFirstPageCache g_first_page_cache[TINY_NUM_CLASSES] = {0};
// Warm Pool: Per-thread warm SuperSlab pools (one per class)
__thread TinyWarmPool g_tiny_warm_pool[TINY_NUM_CLASSES] = {0};
// ============================================================================
// Metrics (Phase 23, optional for debugging)
// ============================================================================
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
__thread uint64_t g_unified_cache_hit[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_miss[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_push[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_full[TINY_NUM_CLASSES] = {0};
#endif
// Release-side lightweight telemetry (C7 Warm path only)
#if HAKMEM_BUILD_RELEASE
_Atomic uint64_t g_rel_c7_warm_pop = 0;
_Atomic uint64_t g_rel_c7_warm_push = 0;
#endif
// Warm Pool metrics (definition - declared in tiny_warm_pool.h as extern)
// Note: These are kept outside !HAKMEM_BUILD_RELEASE for profiling in release builds
__thread TinyWarmPoolStats g_warm_pool_stats[TINY_NUM_CLASSES] = {0};
#if !HAKMEM_BUILD_RELEASE
// Debug-only diagnostics for Warm Pool effectiveness
_Atomic uint64_t g_dbg_warm_prefill_attempts = 0;
_Atomic uint64_t g_dbg_warm_prefill_refill_fail = 0;
_Atomic uint64_t g_dbg_warm_prefill_push_ok = 0;
_Atomic uint64_t g_dbg_warm_prefill_push_full = 0;
_Atomic uint64_t g_dbg_warm_pop_attempts = 0;
_Atomic uint64_t g_dbg_warm_pop_hits = 0;
_Atomic uint64_t g_dbg_warm_pop_empty = 0;
_Atomic uint64_t g_dbg_warm_pop_carve_zero = 0;
#endif
// Warm TLS Bind (C7) mode selector
// mode 0: Legacy warm path (debug only; deprecated for C7)
// mode 1: Bind-only, the production route (C7 default)
// mode 2: Bind + TLS carve, experimental route (debug builds only)
// Intended policy: release builds pin mode=1 and ignore the ENV; note that the
// current implementation still reads HAKMEM_WARM_TLS_BIND_C7 in all builds.
static inline int warm_tls_bind_mode_c7(void) {
static int g_warm_tls_bind_mode_c7 = -1;
if (__builtin_expect(g_warm_tls_bind_mode_c7 == -1, 0)) {
const char* e = getenv("HAKMEM_WARM_TLS_BIND_C7");
int mode = (e && *e) ? atoi(e) : 1; // default = Bind-only
if (mode < 0) mode = 0;
if (mode > 2) mode = 2;
g_warm_tls_bind_mode_c7 = mode;
}
return g_warm_tls_bind_mode_c7;
}
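// Example (illustrative, not a documented interface): running a debug build as
//   HAKMEM_WARM_TLS_BIND_C7=2 HAKMEM_WARM_POOL_STATS=1 <benchmark>
// selects the experimental Bind + TLS carve route for C7 and prints warm-pool
// stats at exit; leaving the variable unset keeps the Bind-only default.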
// Forward declaration for Warm Pool stats printer (defined later in this file)
static inline void tiny_warm_pool_print_stats(void);
// ============================================================================
// Phase 8-Step1-Fix: unified_cache_enabled() implementation (non-static)
// ============================================================================
// Enable flag (default: ON, disable with HAKMEM_TINY_UNIFIED_CACHE=0)
int unified_cache_enabled(void) {
// Priority-2: Use cached ENV (eliminate lazy-init static overhead)
static int g_enable = -1;
if (__builtin_expect(g_enable == -1, 0)) {
g_enable = HAK_ENV_TINY_UNIFIED_CACHE();
#if !HAKMEM_BUILD_RELEASE
if (g_enable) {
fprintf(stderr, "[Unified-INIT] unified_cache_enabled() = %d\n", g_enable);
fflush(stderr);
}
#else
if (g_enable) {
static int printed = 0;
if (!printed) {
fprintf(stderr, "[Rel-Unified] unified_cache_enabled() = %d\n", g_enable);
fflush(stderr);
printed = 1;
}
}
#endif
}
return g_enable;
}
// ============================================================================
// Init (called at thread start or lazy on first access)
// ============================================================================
void unified_cache_init(void) {
if (!unified_cache_enabled()) return;
// Layer 2 Defensive Fix: Use __libc_calloc for infrastructure allocations
// Rationale: Cache arrays are infrastructure (not workload), bypass HAKMEM entirely
// This prevents interaction with BenchFast mode and ensures clean separation
extern void* __libc_calloc(size_t, size_t);
// Initialize all classes (C0-C7)
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
if (g_unified_cache[cls].slots != NULL) continue; // Already initialized
size_t cap = unified_capacity(cls);
g_unified_cache[cls].slots = (void**)__libc_calloc(cap, sizeof(void*));
if (!g_unified_cache[cls].slots) {
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "[Unified-INIT] Failed to allocate C%d cache (%zu slots)\n", cls, cap);
fflush(stderr);
#endif
continue; // Skip this class, try others
}
tiny_mem_stats_add_unified((ssize_t)(cap * sizeof(void*)));
g_unified_cache[cls].capacity = (uint16_t)cap;
g_unified_cache[cls].mask = (uint16_t)(cap - 1);
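// NOTE: slot indices wrap with `(i + 1) & mask`, which is only correct when
// unified_capacity() returns a power of two (assumed throughout this file).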
g_unified_cache[cls].head = 0;
g_unified_cache[cls].tail = 0;
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "[Unified-INIT] C%d: %zu slots (%zu bytes)\n",
cls, cap, cap * sizeof(void*));
fflush(stderr);
#endif
}
}
// ============================================================================
// Shutdown (called at thread exit, optional)
// ============================================================================
void unified_cache_shutdown(void) {
if (!unified_cache_enabled()) return;
// TODO: Drain caches to SuperSlab before shutdown (prevent leak)
// Layer 2 Defensive Fix: Use __libc_free (symmetric with __libc_calloc in init)
extern void __libc_free(void*);
// Free cache buffers
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
if (g_unified_cache[cls].slots) {
__libc_free(g_unified_cache[cls].slots);
g_unified_cache[cls].slots = NULL;
}
}
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "[Unified-SHUTDOWN] All caches freed\n");
fflush(stderr);
#endif
}
// ============================================================================
// Stats (Phase 23 metrics)
// ============================================================================
void unified_cache_print_stats(void) {
if (!unified_cache_enabled()) return;
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
fprintf(stderr, "\n[Unified-STATS] Unified Cache Metrics:\n");
// Phase 70-3: Consistency Check - calculate totals across all classes
uint64_t total_allocs_all = 0;
uint64_t total_frees_all = 0;
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
total_allocs_all += g_unified_cache_hit[cls] + g_unified_cache_miss[cls];
total_frees_all += g_unified_cache_push[cls] + g_unified_cache_full[cls];
}
// Print consistency check BEFORE individual class stats
fprintf(stderr, "[Unified-STATS] Consistency Check:\n");
fprintf(stderr, "[Unified-STATS] total_allocs (hit+miss) = %llu\n",
(unsigned long long)total_allocs_all);
fprintf(stderr, "[Unified-STATS] total_frees (push+full) = %llu\n",
(unsigned long long)total_frees_all);
// Phase 70-3: WARNING logic for inconsistent counters
static int g_consistency_warned = 0;
if (!g_consistency_warned && total_allocs_all > 0 && total_frees_all > total_allocs_all * 2) {
fprintf(stderr, "[Unified-STATS-WARNING] total_frees >> total_allocs detected! "
"Alloc counters may not be wired.\n");
g_consistency_warned = 1;
}
fprintf(stderr, "\n");
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
uint64_t total_allocs = g_unified_cache_hit[cls] + g_unified_cache_miss[cls];
uint64_t total_frees = g_unified_cache_push[cls] + g_unified_cache_full[cls];
if (total_allocs == 0 && total_frees == 0) continue; // Skip unused classes
double hit_rate = (total_allocs > 0) ? (100.0 * g_unified_cache_hit[cls] / total_allocs) : 0.0;
double full_rate = (total_frees > 0) ? (100.0 * g_unified_cache_full[cls] / total_frees) : 0.0;
// Current occupancy
uint16_t count = (g_unified_cache[cls].tail >= g_unified_cache[cls].head)
? (g_unified_cache[cls].tail - g_unified_cache[cls].head)
: (g_unified_cache[cls].capacity - g_unified_cache[cls].head + g_unified_cache[cls].tail);
fprintf(stderr, " C%d: %u/%u slots occupied, hit=%llu miss=%llu (%.1f%% hit), push=%llu full=%llu (%.1f%% full)\n",
cls,
count, g_unified_cache[cls].capacity,
(unsigned long long)g_unified_cache_hit[cls],
(unsigned long long)g_unified_cache_miss[cls],
hit_rate,
(unsigned long long)g_unified_cache_push[cls],
(unsigned long long)g_unified_cache_full[cls],
full_rate);
}
fflush(stderr);
// Also print warm pool stats if enabled
tiny_warm_pool_print_stats();
#endif
}
__attribute__((destructor))
static void unified_cache_auto_stats(void) {
unified_cache_print_stats();
}
// ============================================================================
// Warm Pool Stats (always compiled, ENV-gated at runtime)
// ============================================================================
static inline void tiny_warm_pool_print_stats(void) {
// Check if warm pool stats are enabled via ENV
static int g_print_stats = -1;
if (__builtin_expect(g_print_stats == -1, 0)) {
const char* e = getenv("HAKMEM_WARM_POOL_STATS");
g_print_stats = (e && *e && *e != '0') ? 1 : 0;
}
if (!g_print_stats) return;
fprintf(stderr, "\n[WarmPool-STATS] Warm Pool Metrics:\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
uint64_t total = g_warm_pool_stats[i].hits + g_warm_pool_stats[i].misses;
float hit_rate = (total > 0)
? (100.0 * g_warm_pool_stats[i].hits / total)
: 0.0;
fprintf(stderr, " C%d: hits=%llu misses=%llu hit_rate=%.1f%% prefilled=%llu\n",
i,
(unsigned long long)g_warm_pool_stats[i].hits,
(unsigned long long)g_warm_pool_stats[i].misses,
hit_rate,
(unsigned long long)g_warm_pool_stats[i].prefilled);
}
#if !HAKMEM_BUILD_RELEASE
// Debug-only aggregated diagnostics for Warm Pool
fprintf(stderr,
" [DBG] prefill_attempts=%llu refill_fail=%llu push_ok=%llu push_full=%llu "
"pop_attempts=%llu pop_hits=%llu pop_empty=%llu pop_carve_zero=%llu\n",
(unsigned long long)atomic_load_explicit(&g_dbg_warm_prefill_attempts, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_prefill_refill_fail, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_prefill_push_ok, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_prefill_push_full, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_attempts, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_hits, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_empty, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_carve_zero, memory_order_relaxed));
uint64_t c7_attempts = warm_pool_dbg_c7_attempts();
uint64_t c7_hits = warm_pool_dbg_c7_hits();
uint64_t c7_carve = warm_pool_dbg_c7_carves();
uint64_t c7_tls_attempts = warm_pool_dbg_c7_tls_attempts();
uint64_t c7_tls_success = warm_pool_dbg_c7_tls_successes();
uint64_t c7_tls_fail = warm_pool_dbg_c7_tls_failures();
uint64_t c7_uc_warm = warm_pool_dbg_c7_uc_miss_warm_refills();
uint64_t c7_uc_tls = warm_pool_dbg_c7_uc_miss_tls_refills();
uint64_t c7_uc_shared = warm_pool_dbg_c7_uc_miss_shared_refills();
if (c7_attempts || c7_hits || c7_carve ||
c7_tls_attempts || c7_tls_success || c7_tls_fail ||
c7_uc_warm || c7_uc_tls || c7_uc_shared) {
fprintf(stderr,
" [DBG_C7] warm_pop_attempts=%llu warm_pop_hits=%llu warm_pop_carve=%llu "
"tls_carve_attempts=%llu tls_carve_success=%llu tls_carve_fail=%llu "
"uc_miss_warm=%llu uc_miss_tls=%llu uc_miss_shared=%llu\n",
(unsigned long long)c7_attempts,
(unsigned long long)c7_hits,
(unsigned long long)c7_carve,
(unsigned long long)c7_tls_attempts,
(unsigned long long)c7_tls_success,
(unsigned long long)c7_tls_fail,
(unsigned long long)c7_uc_warm,
(unsigned long long)c7_uc_tls,
(unsigned long long)c7_uc_shared);
}
#endif
fflush(stderr);
}
// Public wrapper for benchmarks
void tiny_warm_pool_print_stats_public(void) {
tiny_warm_pool_print_stats();
}
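// Example (illustrative): a benchmark can call tiny_warm_pool_print_stats_public()
// during teardown; output is emitted only when HAKMEM_WARM_POOL_STATS is set to a
// non-zero value (see the runtime gate above).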
// ============================================================================
// Phase 23-E: Direct SuperSlab Carve (TLS SLL Bypass)
// ============================================================================
// Fail-fast helper: verify that a candidate BASE pointer belongs to a valid
// Tiny slab within a SuperSlab. This is intentionally defensive and only
// compiled in debug builds to avoid hot-path overhead in release.
static inline int unified_refill_validate_base(int class_idx,
TinyTLSSlab* tls,
TinySlabMeta* meta,
void* base,
const char* stage)
{
#if HAKMEM_BUILD_RELEASE
(void)class_idx; (void)tls; (void)base; (void)stage; (void)meta;
return 1;
#else
if (!base) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=NULL tls_ss=%p meta=%p\n",
stage ? stage : "unified_refill",
class_idx,
(void*)(tls ? tls->ss : NULL),
(void*)meta);
abort();
}
SuperSlab* tls_ss = tls ? tls->ss : NULL;
if (!tls_ss || tls_ss->magic != SUPERSLAB_MAGIC) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p meta=%p (invalid TLS ss)\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)tls_ss,
(void*)meta);
abort();
}
// Cross-check registry lookup for additional safety.
SuperSlab* ss_lookup = hak_super_lookup(base);
if (!ss_lookup || ss_lookup->magic != SUPERSLAB_MAGIC) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p lookup_ss=%p meta=%p\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)tls_ss,
(void*)ss_lookup,
(void*)meta);
abort();
}
if (ss_lookup != tls_ss) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p lookup_ss=%p (mismatch)\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)tls_ss,
(void*)ss_lookup);
abort();
}
int slab_idx = tls ? (int)tls->slab_idx : -1;
int cap = ss_slabs_capacity(tls_ss);
if (slab_idx < 0 || slab_idx >= cap) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p slab_idx=%d cap=%d meta_cap=%u meta_used=%u meta_carved=%u\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)tls_ss,
slab_idx,
cap,
meta ? meta->capacity : 0u,
meta ? (unsigned)meta->used : 0u,
meta ? (unsigned)meta->carved : 0u);
abort();
}
// Ensure meta matches TLS view for this slab.
TinySlabMeta* expected_meta = &tls_ss->slabs[slab_idx];
if (meta && meta != expected_meta) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p slab_idx=%d meta=%p expected_meta=%p\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)tls_ss,
slab_idx,
(void*)meta,
(void*)expected_meta);
abort();
}
uint8_t* slab_base = tiny_slab_base_for_geometry(tls_ss, slab_idx);
size_t stride = tiny_stride_for_class(class_idx);
size_t usable = tiny_usable_bytes_for_slab(slab_idx);
uint8_t* slab_end = slab_base + usable;
if ((uint8_t*)base < slab_base || (uint8_t*)base >= slab_end) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p range=[%p,%p) stride=%zu meta_cap=%u meta_used=%u meta_carved=%u\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)slab_base,
(void*)slab_end,
stride,
meta ? meta->capacity : 0u,
meta ? (unsigned)meta->used : 0u,
meta ? (unsigned)meta->carved : 0u);
abort();
}
ptrdiff_t offset = (uint8_t*)base - slab_base;
if (offset % (ptrdiff_t)stride != 0) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p offset=%td stride=%zu (misaligned) meta_cap=%u meta_used=%u meta_carved=%u\n",
stage ? stage : "unified_refill",
class_idx,
base,
offset,
stride,
meta ? meta->capacity : 0u,
meta ? (unsigned)meta->used : 0u,
meta ? (unsigned)meta->carved : 0u);
abort();
}
return 1;
#endif
}
// ============================================================================
// Warm Pool Enhanced: Direct carve from warm SuperSlab (bypass superslab_refill)
// ============================================================================
// ============================================================================
// Phase 5 E5-2: Header Prefill at Refill Boundary
// ============================================================================
// Prefill headers for C1-C6 blocks stored in unified cache.
// Called after blocks are placed in cache->slots[] during refill.
//
// Strategy:
// - C1-C6: Write headers ONCE at refill (preserved in freelist)
// - C0, C7: Skip (headers will be overwritten by next pointer anyway)
//
// This eliminates redundant header writes in hot allocation path.
static inline void unified_cache_prefill_headers(int class_idx, TinyUnifiedCache* cache, int start_tail, int count) {
#if HAKMEM_TINY_HEADER_CLASSIDX && HAKMEM_TINY_HEADER_WRITE_ONCE_COMPILED
// Only prefill if write-once optimization is enabled
if (!tiny_header_write_once_enabled()) return;
// Only prefill for C1-C6 (classes that preserve headers)
if (!tiny_class_preserves_header(class_idx)) return;
// Prefill header byte (constant for this class)
const uint8_t header_byte = HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK);
// Prefill headers in cache slots (circular buffer)
int tail_idx = start_tail;
for (int i = 0; i < count; i++) {
void* base = cache->slots[tail_idx];
if (base) { // Safety: skip NULL slots
*(uint8_t*)base = header_byte;
}
tail_idx = (tail_idx + 1) & cache->mask;
}
#else
(void)class_idx;
(void)cache;
(void)start_tail;
(void)count;
#endif
}
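// Illustration (assumption: HEADER_MAGIC == 0xa0 and HEADER_CLASS_MASK == 0x0f,
// matching the raw header writes later in unified_cache_refill()): for
// class_idx == 3 the prefilled byte is 0xa0 | 0x03 == 0xa3, so the class index
// can presumably be recovered from the block's first byte on the free path
// without touching slab metadata.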
// ============================================================================
// Batch refill from SuperSlab (called on cache miss)
// ============================================================================
// Returns: BASE pointer (first block, wrapped), or NULL-wrapped if failed
// Design: Direct carve from SuperSlab to array (no TLS SLL intermediate layer)
// Warm Pool Integration: PRIORITIZE warm pool, use superslab_refill as fallback
hak_base_ptr_t unified_cache_refill(int class_idx) {
#if HAKMEM_TINY_UNIFIED_CACHE_MEASURE_COMPILED
// Measure refill cost if enabled
uint64_t start_cycles = 0;
int measure = unified_cache_measure_enabled();
if (measure) {
start_cycles = read_tsc();
}
#endif
// Initialize warm pool on first use (per-thread)
tiny_warm_pool_init_once();
TinyUnifiedCache* cache = &g_unified_cache[class_idx];
const TinyClassPolicy* policy = tiny_policy_get(class_idx);
int warm_enabled = policy ? policy->warm_enabled : 0;
int warm_cap = policy ? policy->warm_cap : 0;
int page_enabled = policy ? policy->page_box_enabled : 0;
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
// ✅ Phase 11+: Ensure cache is initialized (lazy init for cold path)
if (!cache->slots) {
unified_cache_init();
// Re-check after init (may fail due to alloc failure)
if (!cache->slots) {
return HAK_BASE_FROM_RAW(NULL);
}
}
// Calculate available room in unified cache
int room = (int)cache->capacity - 1; // Leave 1 slot for full detection
if (cache->head > cache->tail) {
room = cache->head - cache->tail - 1;
} else if (cache->head < cache->tail) {
room = cache->capacity - (cache->tail - cache->head) - 1;
}
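// Worked example (illustrative, capacity = 256):
//   head == tail         -> cache empty, room = 255 (one slot reserved for full detection)
//   head = 5, tail = 200 -> room = 256 - (200 - 5) - 1 = 60
//   head = 10, tail = 9  -> room = 10 - 9 - 1 = 0 (refill skipped below)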
if (room <= 0) return HAK_BASE_FROM_RAW(NULL);
// Batch size limit (per-class tuning)
// - Default: 128
// - C5-C6 (129B-512B): extended to 256
// - C7 (~1KB): extended to 512 to lower refill frequency further
// - For safety, always keep this consistent with the out[] array size (512) below
int max_batch;
if (class_idx == 7) {
max_batch = 512;
} else if (class_idx >= 5 && class_idx <= 6) {
max_batch = 256;
} else {
max_batch = 128;
}
if (room > max_batch) room = max_batch;
// NOTE:
// - Because max_batch is extended to 512 for C7, the stack array also reserves 512 entries.
// - This keeps room <= max_batch <= 512 at all times, preventing an out[] overrun.
void* out[512];
int produced = 0;
int tls_carved = 0; // Debug bookkeeping: track TLS carve experiment hits
#if HAKMEM_BUILD_RELEASE
(void)tls_carved;
#endif
// ========== PAGE BOX HOT PATH (Tiny-Plus layer): Try page box FIRST ==========
// C7-specific page-level freelist management will eventually be integrated here.
// For now the stub implementation always returns 0; only the Box-boundary wiring is done ahead of time.
if (page_enabled && tiny_page_box_is_enabled(class_idx)) {
int page_produced = tiny_page_box_refill(class_idx, tls, out, room);
if (page_produced > 0) {
// Store blocks into cache and return first
void* first = out[0];
int start_tail = cache->tail; // E5-2: Save tail position for header prefill
for (int i = 1; i < page_produced; i++) {
cache->slots[cache->tail] = out[i];
cache->tail = (cache->tail + 1) & cache->mask;
}
// E5-2: Prefill headers for C1-C6 (write-once optimization)
unified_cache_prefill_headers(class_idx, cache, start_tail, page_produced - 1);
#if !HAKMEM_BUILD_RELEASE
g_unified_cache_miss[class_idx]++;
#endif
tiny_class_stats_on_uc_miss(class_idx);
#if HAKMEM_TINY_UNIFIED_CACHE_MEASURE_COMPILED
if (measure) {
uint64_t end_cycles = read_tsc();
uint64_t delta = end_cycles - start_cycles;
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_global,
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_global,
1, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_by_class[class_idx],
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_by_class[class_idx],
1, memory_order_relaxed);
}
#endif
return HAK_BASE_FROM_RAW(first);
}
}
// ========== WARM POOL HOT PATH: Check warm pool FIRST ==========
// This is the critical optimization - avoid superslab_refill() registry scan
if (warm_enabled) {
if (class_idx == 7) {
const TinyClassPolicy* pol = tiny_policy_get(7);
static _Atomic int g_c7_policy_logged = 0;
if (atomic_exchange_explicit(&g_c7_policy_logged, 1, memory_order_acq_rel) == 0) {
fprintf(stderr,
"[C7_POLICY_AT_WARM] page=%u warm=%u cap=%u\n",
pol ? pol->page_box_enabled : 0,
pol ? pol->warm_enabled : 0,
pol ? pol->warm_cap : 0);
}
}
#if !HAKMEM_BUILD_RELEASE
atomic_fetch_add_explicit(&g_dbg_warm_pop_attempts, 1, memory_order_relaxed);
if (class_idx == 7) {
warm_pool_dbg_c7_attempt();
}
#endif
#if HAKMEM_BUILD_RELEASE
if (class_idx == 7) {
atomic_fetch_add_explicit(&g_rel_c7_warm_pop, 1, memory_order_relaxed);
}
#endif
SuperSlab* warm_ss = tiny_warm_pool_pop(class_idx);
if (warm_ss) {
int allow_tls_bind = policy && policy->tls_carve_enabled;
int allow_tls_carve = allow_tls_bind;
int warm_mode = 0;
if (class_idx == 7) {
#if !HAKMEM_BUILD_RELEASE
warm_pool_dbg_c7_hit();
#endif
warm_mode = warm_tls_bind_mode_c7();
allow_tls_bind = (warm_mode >= 1);
allow_tls_carve = (warm_mode == 2);
}
if (allow_tls_bind) {
int cap = ss_slabs_capacity(warm_ss);
int slab_idx = -1;
// Simple heuristic: first slab matching class
for (int i = 0; i < cap; i++) {
if (tiny_get_class_from_ss(warm_ss, i) == class_idx) {
slab_idx = i;
break;
}
}
if (slab_idx >= 0) {
uint32_t tid = (uint32_t)(uintptr_t)pthread_self();
if (ss_tls_bind_one(class_idx, tls, warm_ss, slab_idx, tid)) {
if (class_idx == 7) {
warm_tls_bind_log_success(warm_ss, slab_idx);
}
// Mode 2: carve a single block via TLS fast path (policy enabled classes)
if (allow_tls_carve) {
#if !HAKMEM_BUILD_RELEASE
if (class_idx == 7) {
warm_pool_dbg_c7_tls_attempt();
}
#endif
TinyTLSCarveOneResult tls_carve =
tiny_tls_carve_one_block(tls, class_idx);
if (tls_carve.block) {
if (class_idx == 7) {
warm_tls_bind_log_tls_carve(warm_ss, slab_idx, tls_carve.block);
#if !HAKMEM_BUILD_RELEASE
warm_pool_dbg_c7_tls_success();
#endif
}
out[0] = tls_carve.block;
produced = 1;
tls_carved = 1;
} else {
if (class_idx == 7) {
warm_tls_bind_log_tls_fail(warm_ss, slab_idx);
#if !HAKMEM_BUILD_RELEASE
warm_pool_dbg_c7_tls_fail();
#endif
}
}
}
}
}
}
#if !HAKMEM_BUILD_RELEASE
atomic_fetch_add_explicit(&g_dbg_warm_pop_hits, 1, memory_order_relaxed);
#endif
// HOT PATH: Warm pool hit, try to carve directly
if (produced == 0) {
#if HAKMEM_BUILD_RELEASE
if (class_idx == 7) {
warm_pool_rel_c7_carve_attempt();
}
#endif
produced = slab_carve_from_ss(class_idx, warm_ss, out, room);
#if HAKMEM_BUILD_RELEASE
if (class_idx == 7) {
if (produced > 0) {
warm_pool_rel_c7_carve_success();
} else {
warm_pool_rel_c7_carve_zero();
}
}
#endif
if (produced > 0) {
// Update active counter for carved blocks
ss_active_add(warm_ss, (uint32_t)produced);
}
}
if (produced > 0) {
#if !HAKMEM_BUILD_RELEASE
if (class_idx == 7) {
warm_pool_dbg_c7_carve();
if (tls_carved) {
warm_pool_dbg_c7_uc_miss_tls();
} else {
warm_pool_dbg_c7_uc_miss_warm();
}
}
#endif
// Success! Return SuperSlab to warm pool for next use
#if HAKMEM_BUILD_RELEASE
if (class_idx == 7) {
atomic_fetch_add_explicit(&g_rel_c7_warm_push, 1, memory_order_relaxed);
}
#endif
tiny_warm_pool_push_with_cap(class_idx, warm_ss, warm_cap);
// Track warm pool hit (always compiled, ENV-gated printing)
warm_pool_record_hit(class_idx);
tiny_class_stats_on_warm_hit(class_idx);
// Store blocks into cache and return first
void* first = out[0];
int start_tail = cache->tail; // E5-2: Save tail position for header prefill
for (int i = 1; i < produced; i++) {
cache->slots[cache->tail] = out[i];
cache->tail = (cache->tail + 1) & cache->mask;
}
// E5-2: Prefill headers for C1-C6 (write-once optimization)
unified_cache_prefill_headers(class_idx, cache, start_tail, produced - 1);
#if !HAKMEM_BUILD_RELEASE
g_unified_cache_miss[class_idx]++;
#endif
tiny_class_stats_on_uc_miss(class_idx);
#if HAKMEM_TINY_UNIFIED_CACHE_MEASURE_COMPILED
if (measure) {
uint64_t end_cycles = read_tsc();
uint64_t delta = end_cycles - start_cycles;
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_global,
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_global,
1, memory_order_relaxed);
// Per-class aggregation (visualize C5-C7 refill cost)
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_by_class[class_idx],
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_by_class[class_idx],
1, memory_order_relaxed);
}
#endif
return HAK_BASE_FROM_RAW(first);
}
// SuperSlab carve failed (produced == 0)
#if !HAKMEM_BUILD_RELEASE
atomic_fetch_add_explicit(&g_dbg_warm_pop_carve_zero, 1, memory_order_relaxed);
#endif
// This SuperSlab is exhausted (no freelist entries and no remaining carve capacity).
// The 'prefilled' statistics counter tracks how often this point is reached,
// i.e. where a prefill would have helped.
if (produced == 0 && tiny_warm_pool_count(class_idx) == 0) {
// Pool is empty and carve failed - prefill would help here
warm_pool_record_prefilled(class_idx);
}
} else {
#if !HAKMEM_BUILD_RELEASE
atomic_fetch_add_explicit(&g_dbg_warm_pop_empty, 1, memory_order_relaxed);
#endif
}
// ========== COLD PATH: Warm pool miss, use superslab_refill ==========
// Track warm pool miss (always compiled, ENV-gated printing)
warm_pool_record_miss(class_idx);
}
// Step 1: Ensure SuperSlab available via normal refill
// Enhanced: Use Warm Pool Prefill Box for secondary prefill when pool is empty
if (warm_enabled) {
if (warm_pool_do_prefill(class_idx, tls, warm_cap) < 0) {
return HAK_BASE_FROM_RAW(NULL);
}
// After prefill: tls->ss has the final slab for carving
tls = &g_tls_slabs[class_idx]; // Reload (already done in prefill box)
} else {
if (!tls->ss) {
if (!superslab_refill(class_idx)) {
return HAK_BASE_FROM_RAW(NULL);
}
tls = &g_tls_slabs[class_idx];
}
}
// Step 2: Direct carve from SuperSlab into local array (bypass TLS SLL!)
TinySlabMeta* m = tls->meta;
size_t bs = tiny_stride_for_class(class_idx);
uint8_t* base = tls->slab_base
? tls->slab_base
: tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
while (produced < room) {
if (m->freelist) {
// Freelist pop
void* p = m->freelist;
void* next_node = tiny_next_read(class_idx, p);
// ROOT CAUSE FIX: Write header BEFORE exposing block (but AFTER reading next)
// For Class 0 (offset 0), next overlaps header, so we must read next first.
#if HAKMEM_TINY_HEADER_CLASSIDX
*(uint8_t*)p = (uint8_t)(0xa0 | (class_idx & 0x0f));
// Prevent compiler from reordering header write after out[] assignment
__atomic_thread_fence(__ATOMIC_RELEASE);
#endif
m->freelist = next_node;
unified_refill_validate_base(class_idx, tls, m, p,
"unified_refill_freelist");
// PageFaultTelemetry: record page touch for this BASE
pagefault_telemetry_touch(class_idx, p);
m->used++;
out[produced++] = p;
} else if (m->carved < m->capacity) {
// Linear carve (fresh block, no freelist link)
void* p = (void*)(base + ((size_t)m->carved * bs));
unified_refill_validate_base(class_idx, tls, m, p,
"unified_refill_carve");
// PageFaultTelemetry: record page touch for this BASE
pagefault_telemetry_touch(class_idx, p);
// ✅ CRITICAL: Write header (new block)
#if HAKMEM_TINY_HEADER_CLASSIDX
*(uint8_t*)p = (uint8_t)(0xa0 | (class_idx & 0x0f));
#endif
m->carved++;
m->used++;
out[produced++] = p;
} else {
// SuperSlab exhausted → refill and retry
if (!superslab_refill(class_idx)) break;
// ✅ CRITICAL: Reload TLS pointers after refill (avoid stale pointer bug)
tls = &g_tls_slabs[class_idx];
m = tls->meta;
base = tls->slab_base
? tls->slab_base
: tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
}
}
if (produced == 0) return HAK_BASE_FROM_RAW(NULL);
// Step 4: Update active counter
// Guard: tls->ss can be NULL if all SuperSlab refills failed
if (tls->ss) {
ss_active_add(tls->ss, (uint32_t)produced);
}
// Step 5: Store blocks into unified cache (skip first, return it)
void* first = out[0];
int start_tail = cache->tail; // E5-2: Save tail position for header prefill
for (int i = 1; i < produced; i++) {
cache->slots[cache->tail] = out[i];
cache->tail = (cache->tail + 1) & cache->mask;
}
// E5-2: Prefill headers for C1-C6 (write-once optimization)
unified_cache_prefill_headers(class_idx, cache, start_tail, produced - 1);
#if !HAKMEM_BUILD_RELEASE
if (class_idx == 7) {
warm_pool_dbg_c7_uc_miss_shared();
}
g_unified_cache_miss[class_idx]++;
#endif
tiny_class_stats_on_uc_miss(class_idx);
// Measure refill cycles
#if HAKMEM_TINY_UNIFIED_CACHE_MEASURE_COMPILED
if (measure) {
uint64_t end_cycles = read_tsc();
uint64_t delta = end_cycles - start_cycles;
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_global,
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_global,
1, memory_order_relaxed);
// Per-class aggregation
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_by_class[class_idx],
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_by_class[class_idx],
1, memory_order_relaxed);
}
#endif
return HAK_BASE_FROM_RAW(first); // Return first block (BASE pointer)
}
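// ----------------------------------------------------------------------------
// Minimal caller sketch (illustrative only): the real pop fast path lives in the
// allocation front-end, not in this file. tiny_alloc_fast_example() is a
// hypothetical name, and hit-metric updates (g_unified_cache_hit etc.) are
// omitted; it only shows how the TLS ring above pairs with unified_cache_refill().
static inline hak_base_ptr_t tiny_alloc_fast_example(int class_idx) {
    TinyUnifiedCache* cache = &g_unified_cache[class_idx];
    if (cache->slots && cache->head != cache->tail) {
        void* base = cache->slots[cache->head];          // hit: O(1) ring-buffer pop
        cache->head = (cache->head + 1) & cache->mask;
        return HAK_BASE_FROM_RAW(base);
    }
    return unified_cache_refill(class_idx);              // miss: batch refill (above)
}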
// ============================================================================
// Performance Measurement: Print Statistics
// ============================================================================
void unified_cache_print_measurements(void) {
#if !HAKMEM_TINY_UNIFIED_CACHE_MEASURE_COMPILED
return;
#else
if (!unified_cache_measure_enabled()) {
return; // Measurement disabled, nothing to print
}
uint64_t hits = atomic_load_explicit(&g_unified_cache_hits_global, memory_order_relaxed);
uint64_t misses = atomic_load_explicit(&g_unified_cache_misses_global, memory_order_relaxed);
uint64_t refill_cycles = atomic_load_explicit(&g_unified_cache_refill_cycles_global, memory_order_relaxed);
uint64_t total = hits + misses;
if (total == 0) {
fprintf(stderr, "\n========================================\n");
fprintf(stderr, "Unified Cache Statistics\n");
fprintf(stderr, "========================================\n");
fprintf(stderr, "No operations recorded (measurement may be disabled)\n");
fprintf(stderr, "========================================\n\n");
return;
}
double hit_rate = (100.0 * hits) / total;
double avg_refill_cycles = misses > 0 ? (double)refill_cycles / misses : 0.0;
// Estimate time at 1GHz (conservative, most modern CPUs are 2-4GHz)
double avg_refill_us = avg_refill_cycles / 1000.0;
fprintf(stderr, "\n========================================\n");
fprintf(stderr, "Unified Cache Statistics\n");
fprintf(stderr, "========================================\n");
fprintf(stderr, "Hits: %llu\n", (unsigned long long)hits);
fprintf(stderr, "Misses: %llu\n", (unsigned long long)misses);
fprintf(stderr, "Hit Rate: %.1f%%\n", hit_rate);
fprintf(stderr, "Avg Refill Cycles: %.0f (est. %.2fus @ 1GHz)\n",
avg_refill_cycles, avg_refill_us);
// Per-class breakdown (Tiny classes 0-7; C5-C7 in particular)
fprintf(stderr, "\nPer-class Unified Cache (Tiny classes):\n");
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
uint64_t ch = atomic_load_explicit(&g_unified_cache_hits_by_class[cls],
memory_order_relaxed);
uint64_t cm = atomic_load_explicit(&g_unified_cache_misses_by_class[cls],
memory_order_relaxed);
uint64_t cc = atomic_load_explicit(&g_unified_cache_refill_cycles_by_class[cls],
memory_order_relaxed);
uint64_t ct = ch + cm;
if (ct == 0 && cc == 0) {
continue; // Skip unused classes
}
double cls_hit_rate = ct > 0 ? (100.0 * (double)ch / (double)ct) : 0.0;
double cls_avg_refill = cm > 0 ? (double)cc / (double)cm : 0.0;
double cls_avg_us = cls_avg_refill / 1000.0;
fprintf(stderr,
" C%d: hits=%llu miss=%llu hit=%.1f%% avg_refill=%.0f cyc (%.2fus @1GHz)\n",
cls,
(unsigned long long)ch,
(unsigned long long)cm,
cls_hit_rate,
cls_avg_refill,
cls_avg_us);
}
fprintf(stderr, "========================================\n\n");
#endif
}