hakmem/core/front/tiny_unified_cache.c
Moe Charm (CI) deecda7336 Phase 3 C2: Slab Metadata Cache Optimization (3 patches) - NEUTRAL
Patch 1: Policy Hot Cache (illustrative sketch after this list)
- Add TinyPolicyHot struct (route_kind[8] cached in TLS)
- Eliminate policy_snapshot() calls (~2 memory ops saved)
- Safety: disabled when learner v7 active
- Files: tiny_metadata_cache_env_box.h, tiny_metadata_cache_hot_box.{h,c}
- Integration: malloc_tiny_fast.h route selection
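A minimal sketch of the Patch 1 idea: only the TinyPolicyHot name and the route_kind[8] field come from the patch notes; the field layout, the valid flag, and the policy_hot_route() helper are assumptions for illustration, not the actual hakmem API.

#include <stdint.h>

// Cache the per-class routing decision in TLS so the allocation hot path reads
// one byte instead of calling policy_snapshot() (~2 memory ops saved).
typedef struct {
    uint8_t route_kind[8]; // one cached route per tiny class C0..C7
    uint8_t valid;         // 0 until filled; cleared while learner v7 is active
} TinyPolicyHot;

static __thread TinyPolicyHot g_policy_hot;

static inline int policy_hot_route(int class_idx) {
    if (__builtin_expect(!g_policy_hot.valid, 0)) return -1; // caller falls back to policy_snapshot()
    return g_policy_hot.route_kind[class_idx & 7];
}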

Patch 2: First Page Inline Cache (illustrative sketch after this list)
- Cache current slab page pointer in TLS per-class
- Avoid superslab metadata lookup (1-2 memory ops)
- Fast-path in tiny_legacy_fallback_free_base()
- Files: tiny_first_page_cache.h, tiny_unified_cache.c
- Integration: tiny_legacy_fallback_box.h
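A minimal sketch of the Patch 2 idea: the real TinyFirstPageCache lives in tiny_first_page_cache.h, so the fields, the page_of() helper, and the 4 KiB page assumption below are illustrative only. The point is that a free landing on the same page as the previous one can skip the superslab metadata lookup.

#include <stdint.h>
#include <stddef.h>

// Per-class TLS cache of the last slab page and its resolved metadata.
typedef struct {
    void* page; // slab page base that served the previous free
    void* meta; // metadata pointer resolved on that lookup
} TinyFirstPageCacheSketch;

static __thread TinyFirstPageCacheSketch g_fp_cache_sketch[8];

static inline void* page_of(void* p) { // assumed 4 KiB pages
    return (void*)((uintptr_t)p & ~(uintptr_t)0xFFF);
}

// Returns cached metadata on a same-page hit, NULL on a miss (caller does the full lookup).
static inline void* first_page_cache_try(int cls, void* base) {
    TinyFirstPageCacheSketch* c = &g_fp_cache_sketch[cls & 7];
    return (c->page == page_of(base)) ? c->meta : NULL;
}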

Patch 3: Bounds Check Compile-out (illustrative sketch after this list)
- Hardcode unified_cache capacity as MACRO constant
- Eliminate modulo operation (constant fold)
- Macros: TINY_UNIFIED_CACHE_CAPACITY_POW2=11, CAPACITY=2048, MASK=2047
- File: tiny_unified_cache.h
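A minimal sketch of the Patch 3 constant-fold: the macro names and values match the commit message, while the unified_ring_next() helper is illustrative. With the capacity fixed at a compile-time power of two, the ring-index wrap folds to an AND with a constant instead of a modulo by a runtime capacity field.

#include <stdint.h>

#define TINY_UNIFIED_CACHE_CAPACITY_POW2 11
#define TINY_UNIFIED_CACHE_CAPACITY (1u << TINY_UNIFIED_CACHE_CAPACITY_POW2) /* 2048 */
#define TINY_UNIFIED_CACHE_MASK     (TINY_UNIFIED_CACHE_CAPACITY - 1u)       /* 2047 */

static inline uint16_t unified_ring_next(uint16_t idx) {
    return (uint16_t)((idx + 1u) & TINY_UNIFIED_CACHE_MASK); // was: (idx + 1) % capacity
}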

A/B Test Results (Mixed, 10-run):
- Baseline (C2=0): 40.43M ops/s (avg), 40.72M ops/s (median)
- Optimized (C2=1): 40.25M ops/s (avg), 40.29M ops/s (median)
- Delta: -0.45% (avg), -1.06% (median)
- DECISION: NEUTRAL (within ±1.0% threshold)
- Action: Keep as research box (ENV gate OFF by default)

Cumulative Gain (Phase 2-3):
- B3 (Routing shape): +2.89%
- B4 (Wrapper split): +1.47%
- C3 (Static routing): +2.20%
- C2 (Metadata cache): -0.45%
- Total: ~6.1% (from baseline 37.5M → 39.8M ops/s; quick arithmetic check below)
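Consistency check, assuming the per-box deltas combine roughly additively:
2.89% + 1.47% + 2.20% - 0.45% ≈ 6.11%, and 37.5M ops/s × 1.061 ≈ 39.8M ops/s, matching the stated total.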

🤖 Generated with Claude Code

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-13 19:19:42 +09:00


// tiny_unified_cache.c - Phase 23: Unified Frontend Cache Implementation
#include "tiny_unified_cache.h"
#include "tiny_warm_pool.h" // Warm Pool: O(1) SuperSlab lookup
#include "../tiny_tls.h" // Phase 23-E: TinyTLSSlab, TinySlabMeta
#include "../tiny_box_geometry.h" // Phase 23-E: tiny_stride_for_class, tiny_slab_base_for_geometry
#include "../box/tiny_next_ptr_box.h" // Phase 23-E: tiny_next_read (freelist traversal)
#include "../hakmem_tiny_superslab.h" // Phase 23-E: SuperSlab, superslab_refill()
#include "../superslab/superslab_inline.h" // Phase 23-E: ss_active_add, slab_index_for, ss_slabs_capacity
#include "../hakmem_super_registry.h" // For hak_super_lookup (pointer→SuperSlab)
#include "../box/pagefault_telemetry_box.h" // Phase 24: Box PageFaultTelemetry (Tiny page touch stats)
#include "../box/ss_tier_box.h" // For ss_tier_is_hot() tier checks
#include "../box/ss_slab_meta_box.h" // For ss_active_add() and slab metadata operations
#include "../box/warm_pool_stats_box.h" // Box: Warm Pool Statistics Recording (inline)
#include "../box/slab_carve_box.h" // Box: Slab Carving (inline O(slabs) scan)
#define WARM_POOL_REL_DEFINE
#include "../box/warm_pool_rel_counters_box.h" // Box: Release-side C7 counters
#undef WARM_POOL_REL_DEFINE
#include "../box/c7_meta_used_counter_box.h" // Box: C7 meta->used increment counters
#include "../box/warm_pool_prefill_box.h" // Box: Warm Pool Prefill (secondary optimization)
#include "../box/tiny_mem_stats_box.h" // Box: Tiny front memory accounting
#include "../hakmem_env_cache.h" // Priority-2: ENV cache (eliminate syscalls)
#include "../box/tiny_page_box.h" // Tiny-Plus Page Box (C5C7 initial hook)
#include "../box/ss_tls_bind_box.h" // Box: TLS Bind (SuperSlab -> TLS binding)
#include "../box/tiny_tls_carve_one_block_box.h" // Box: TLS carve helper (shared)
#include "../box/tiny_class_policy_box.h" // Box: per-class policy (Page/Warm caps)
#include "../box/tiny_class_stats_box.h" // Box: lightweight per-class stats
#include "../box/warm_tls_bind_logger_box.h" // Box: Warm TLS Bind logging (throttled)
#define WARM_POOL_DBG_DEFINE
#include "../box/warm_pool_dbg_box.h" // Box: Warm Pool C7 debug counters
#undef WARM_POOL_DBG_DEFINE
#include <stdlib.h>
#include <string.h>
#include <stdatomic.h>
#include <stdio.h>
#include <time.h>
#include <pthread.h> // pthread_self() used in the warm TLS bind path below
// ============================================================================
// Performance Measurement: Unified Cache (ENV-gated)
// ============================================================================
// Global atomic counters for unified cache performance measurement
// ENV: HAKMEM_MEASURE_UNIFIED_CACHE=1 to enable (default: OFF)
_Atomic uint64_t g_unified_cache_hits_global = 0;
_Atomic uint64_t g_unified_cache_misses_global = 0;
_Atomic uint64_t g_unified_cache_refill_cycles_global = 0;
// Per-class counters (per Tiny class, for Unified Cache observation)
_Atomic uint64_t g_unified_cache_hits_by_class[TINY_NUM_CLASSES] = {0};
_Atomic uint64_t g_unified_cache_misses_by_class[TINY_NUM_CLASSES] = {0};
_Atomic uint64_t g_unified_cache_refill_cycles_by_class[TINY_NUM_CLASSES] = {0};
// Helper: Get cycle count (x86_64 rdtsc)
static inline uint64_t read_tsc(void) {
#if defined(__x86_64__) || defined(_M_X64)
uint32_t lo, hi;
__asm__ __volatile__("rdtsc" : "=a"(lo), "=d"(hi));
return ((uint64_t)hi << 32) | lo;
#else
// Fallback to clock_gettime for non-x86 platforms
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
return (uint64_t)ts.tv_sec * 1000000000ULL + (uint64_t)ts.tv_nsec;
#endif
}
// Check if measurement is enabled (cached)
static inline int unified_cache_measure_enabled(void) {
static int g_measure = -1;
if (__builtin_expect(g_measure == -1, 0)) {
const char* e = getenv("HAKMEM_MEASURE_UNIFIED_CACHE");
g_measure = (e && *e && *e != '0') ? 1 : 0;
}
return g_measure;
}
// Phase 23-E: Forward declarations
extern __thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES]; // From hakmem_tiny_superslab.c
extern void ss_active_add(SuperSlab* ss, uint32_t n); // From hakmem_tiny_ss_active_box.inc
// ============================================================================
// TLS Variables (defined here, extern in header)
// ============================================================================
__thread TinyUnifiedCache g_unified_cache[TINY_NUM_CLASSES];
// Phase 3 C2 Patch 2: First Page Inline Cache (TLS per-class)
#include "tiny_first_page_cache.h"
__thread TinyFirstPageCache g_first_page_cache[TINY_NUM_CLASSES] = {0};
// Warm Pool: Per-thread warm SuperSlab pools (one per class)
__thread TinyWarmPool g_tiny_warm_pool[TINY_NUM_CLASSES] = {0};
// ============================================================================
// Metrics (Phase 23, optional for debugging)
// ============================================================================
#if !HAKMEM_BUILD_RELEASE
__thread uint64_t g_unified_cache_hit[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_miss[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_push[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_full[TINY_NUM_CLASSES] = {0};
#endif
// Release-side lightweight telemetry (C7 Warm path only)
#if HAKMEM_BUILD_RELEASE
_Atomic uint64_t g_rel_c7_warm_pop = 0;
_Atomic uint64_t g_rel_c7_warm_push = 0;
#endif
// Warm Pool metrics (definition - declared in tiny_warm_pool.h as extern)
// Note: These are kept outside !HAKMEM_BUILD_RELEASE for profiling in release builds
__thread TinyWarmPoolStats g_warm_pool_stats[TINY_NUM_CLASSES] = {0};
#if !HAKMEM_BUILD_RELEASE
// Debug-only diagnostics for Warm Pool effectiveness
_Atomic uint64_t g_dbg_warm_prefill_attempts = 0;
_Atomic uint64_t g_dbg_warm_prefill_refill_fail = 0;
_Atomic uint64_t g_dbg_warm_prefill_push_ok = 0;
_Atomic uint64_t g_dbg_warm_prefill_push_full = 0;
_Atomic uint64_t g_dbg_warm_pop_attempts = 0;
_Atomic uint64_t g_dbg_warm_pop_hits = 0;
_Atomic uint64_t g_dbg_warm_pop_empty = 0;
_Atomic uint64_t g_dbg_warm_pop_carve_zero = 0;
#endif
// Warm TLS Bind (C7) mode selector
// mode 0: Legacy warm path (debug use only; not recommended for C7)
// mode 1: Bind-only (production path, C7 default)
// mode 2: Bind + TLS carve (experimental path, debug only)
// Default is mode=1 (Bind-only); the HAKMEM_WARM_TLS_BIND_C7 override is read once on first use.
static inline int warm_tls_bind_mode_c7(void) {
#if HAKMEM_BUILD_RELEASE
static int g_warm_tls_bind_mode_c7 = -1;
if (__builtin_expect(g_warm_tls_bind_mode_c7 == -1, 0)) {
const char* e = getenv("HAKMEM_WARM_TLS_BIND_C7");
int mode = (e && *e) ? atoi(e) : 1; // default = Bind-only
if (mode < 0) mode = 0;
if (mode > 2) mode = 2;
g_warm_tls_bind_mode_c7 = mode;
}
return g_warm_tls_bind_mode_c7;
#else
static int g_warm_tls_bind_mode_c7 = -1;
if (__builtin_expect(g_warm_tls_bind_mode_c7 == -1, 0)) {
const char* e = getenv("HAKMEM_WARM_TLS_BIND_C7");
int mode = (e && *e) ? atoi(e) : 1; // default = Bind-only
if (mode < 0) mode = 0;
if (mode > 2) mode = 2;
g_warm_tls_bind_mode_c7 = mode;
}
return g_warm_tls_bind_mode_c7;
#endif
}
// Forward declaration for Warm Pool stats printer (defined later in this file)
static inline void tiny_warm_pool_print_stats(void);
// ============================================================================
// Phase 8-Step1-Fix: unified_cache_enabled() implementation (non-static)
// ============================================================================
// Enable flag (default: ON, disable with HAKMEM_TINY_UNIFIED_CACHE=0)
int unified_cache_enabled(void) {
// Priority-2: Use cached ENV (eliminate lazy-init static overhead)
static int g_enable = -1;
if (__builtin_expect(g_enable == -1, 0)) {
g_enable = HAK_ENV_TINY_UNIFIED_CACHE();
#if !HAKMEM_BUILD_RELEASE
if (g_enable) {
fprintf(stderr, "[Unified-INIT] unified_cache_enabled() = %d\n", g_enable);
fflush(stderr);
}
#else
if (g_enable) {
static int printed = 0;
if (!printed) {
fprintf(stderr, "[Rel-Unified] unified_cache_enabled() = %d\n", g_enable);
fflush(stderr);
printed = 1;
}
}
#endif
}
return g_enable;
}
// ============================================================================
// Init (called at thread start or lazy on first access)
// ============================================================================
void unified_cache_init(void) {
if (!unified_cache_enabled()) return;
// Layer 2 Defensive Fix: Use __libc_calloc for infrastructure allocations
// Rationale: Cache arrays are infrastructure (not workload), bypass HAKMEM entirely
// This prevents interaction with BenchFast mode and ensures clean separation
extern void* __libc_calloc(size_t, size_t);
// Initialize all classes (C0-C7)
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
if (g_unified_cache[cls].slots != NULL) continue; // Already initialized
size_t cap = unified_capacity(cls);
g_unified_cache[cls].slots = (void**)__libc_calloc(cap, sizeof(void*));
if (!g_unified_cache[cls].slots) {
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "[Unified-INIT] Failed to allocate C%d cache (%zu slots)\n", cls, cap);
fflush(stderr);
#endif
continue; // Skip this class, try others
}
tiny_mem_stats_add_unified((ssize_t)(cap * sizeof(void*)));
g_unified_cache[cls].capacity = (uint16_t)cap;
g_unified_cache[cls].mask = (uint16_t)(cap - 1);
g_unified_cache[cls].head = 0;
g_unified_cache[cls].tail = 0;
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "[Unified-INIT] C%d: %zu slots (%zu bytes)\n",
cls, cap, cap * sizeof(void*));
fflush(stderr);
#endif
}
}
// ============================================================================
// Shutdown (called at thread exit, optional)
// ============================================================================
void unified_cache_shutdown(void) {
if (!unified_cache_enabled()) return;
// TODO: Drain caches to SuperSlab before shutdown (prevent leak)
// Layer 2 Defensive Fix: Use __libc_free (symmetric with __libc_calloc in init)
extern void __libc_free(void*);
// Free cache buffers
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
if (g_unified_cache[cls].slots) {
__libc_free(g_unified_cache[cls].slots);
g_unified_cache[cls].slots = NULL;
}
}
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "[Unified-SHUTDOWN] All caches freed\n");
fflush(stderr);
#endif
}
// ============================================================================
// Stats (Phase 23 metrics)
// ============================================================================
void unified_cache_print_stats(void) {
if (!unified_cache_enabled()) return;
#if !HAKMEM_BUILD_RELEASE
fprintf(stderr, "\n[Unified-STATS] Unified Cache Metrics:\n");
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
uint64_t total_allocs = g_unified_cache_hit[cls] + g_unified_cache_miss[cls];
uint64_t total_frees = g_unified_cache_push[cls] + g_unified_cache_full[cls];
if (total_allocs == 0 && total_frees == 0) continue; // Skip unused classes
double hit_rate = (total_allocs > 0) ? (100.0 * g_unified_cache_hit[cls] / total_allocs) : 0.0;
double full_rate = (total_frees > 0) ? (100.0 * g_unified_cache_full[cls] / total_frees) : 0.0;
// Current occupancy
uint16_t count = (g_unified_cache[cls].tail >= g_unified_cache[cls].head)
? (g_unified_cache[cls].tail - g_unified_cache[cls].head)
: (g_unified_cache[cls].capacity - g_unified_cache[cls].head + g_unified_cache[cls].tail);
fprintf(stderr, " C%d: %u/%u slots occupied, hit=%llu miss=%llu (%.1f%% hit), push=%llu full=%llu (%.1f%% full)\n",
cls,
count, g_unified_cache[cls].capacity,
(unsigned long long)g_unified_cache_hit[cls],
(unsigned long long)g_unified_cache_miss[cls],
hit_rate,
(unsigned long long)g_unified_cache_push[cls],
(unsigned long long)g_unified_cache_full[cls],
full_rate);
}
fflush(stderr);
// Also print warm pool stats if enabled
tiny_warm_pool_print_stats();
#endif
}
// ============================================================================
// Warm Pool Stats (always compiled, ENV-gated at runtime)
// ============================================================================
static inline void tiny_warm_pool_print_stats(void) {
// Check if warm pool stats are enabled via ENV
static int g_print_stats = -1;
if (__builtin_expect(g_print_stats == -1, 0)) {
const char* e = getenv("HAKMEM_WARM_POOL_STATS");
g_print_stats = (e && *e && *e != '0') ? 1 : 0;
}
if (!g_print_stats) return;
fprintf(stderr, "\n[WarmPool-STATS] Warm Pool Metrics:\n");
for (int i = 0; i < TINY_NUM_CLASSES; i++) {
uint64_t total = g_warm_pool_stats[i].hits + g_warm_pool_stats[i].misses;
float hit_rate = (total > 0)
? (100.0 * g_warm_pool_stats[i].hits / total)
: 0.0;
fprintf(stderr, " C%d: hits=%llu misses=%llu hit_rate=%.1f%% prefilled=%llu\n",
i,
(unsigned long long)g_warm_pool_stats[i].hits,
(unsigned long long)g_warm_pool_stats[i].misses,
hit_rate,
(unsigned long long)g_warm_pool_stats[i].prefilled);
}
#if !HAKMEM_BUILD_RELEASE
// Debug-only aggregated diagnostics for Warm Pool
fprintf(stderr,
" [DBG] prefill_attempts=%llu refill_fail=%llu push_ok=%llu push_full=%llu "
"pop_attempts=%llu pop_hits=%llu pop_empty=%llu pop_carve_zero=%llu\n",
(unsigned long long)atomic_load_explicit(&g_dbg_warm_prefill_attempts, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_prefill_refill_fail, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_prefill_push_ok, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_prefill_push_full, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_attempts, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_hits, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_empty, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_carve_zero, memory_order_relaxed));
uint64_t c7_attempts = warm_pool_dbg_c7_attempts();
uint64_t c7_hits = warm_pool_dbg_c7_hits();
uint64_t c7_carve = warm_pool_dbg_c7_carves();
uint64_t c7_tls_attempts = warm_pool_dbg_c7_tls_attempts();
uint64_t c7_tls_success = warm_pool_dbg_c7_tls_successes();
uint64_t c7_tls_fail = warm_pool_dbg_c7_tls_failures();
uint64_t c7_uc_warm = warm_pool_dbg_c7_uc_miss_warm_refills();
uint64_t c7_uc_tls = warm_pool_dbg_c7_uc_miss_tls_refills();
uint64_t c7_uc_shared = warm_pool_dbg_c7_uc_miss_shared_refills();
if (c7_attempts || c7_hits || c7_carve ||
c7_tls_attempts || c7_tls_success || c7_tls_fail ||
c7_uc_warm || c7_uc_tls || c7_uc_shared) {
fprintf(stderr,
" [DBG_C7] warm_pop_attempts=%llu warm_pop_hits=%llu warm_pop_carve=%llu "
"tls_carve_attempts=%llu tls_carve_success=%llu tls_carve_fail=%llu "
"uc_miss_warm=%llu uc_miss_tls=%llu uc_miss_shared=%llu\n",
(unsigned long long)c7_attempts,
(unsigned long long)c7_hits,
(unsigned long long)c7_carve,
(unsigned long long)c7_tls_attempts,
(unsigned long long)c7_tls_success,
(unsigned long long)c7_tls_fail,
(unsigned long long)c7_uc_warm,
(unsigned long long)c7_uc_tls,
(unsigned long long)c7_uc_shared);
}
#endif
fflush(stderr);
}
// Public wrapper for benchmarks
void tiny_warm_pool_print_stats_public(void) {
tiny_warm_pool_print_stats();
}
// ============================================================================
// Phase 23-E: Direct SuperSlab Carve (TLS SLL Bypass)
// ============================================================================
// Fail-fast helper: verify that a candidate BASE pointer belongs to a valid
// Tiny slab within a SuperSlab. This is intentionally defensive and only
// compiled in debug builds to avoid hot-path overhead in release.
static inline int unified_refill_validate_base(int class_idx,
TinyTLSSlab* tls,
TinySlabMeta* meta,
void* base,
const char* stage)
{
#if HAKMEM_BUILD_RELEASE
(void)class_idx; (void)tls; (void)base; (void)stage; (void)meta;
return 1;
#else
if (!base) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=NULL tls_ss=%p meta=%p\n",
stage ? stage : "unified_refill",
class_idx,
(void*)(tls ? tls->ss : NULL),
(void*)meta);
abort();
}
SuperSlab* tls_ss = tls ? tls->ss : NULL;
if (!tls_ss || tls_ss->magic != SUPERSLAB_MAGIC) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p meta=%p (invalid TLS ss)\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)tls_ss,
(void*)meta);
abort();
}
// Cross-check registry lookup for additional safety.
SuperSlab* ss_lookup = hak_super_lookup(base);
if (!ss_lookup || ss_lookup->magic != SUPERSLAB_MAGIC) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p lookup_ss=%p meta=%p\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)tls_ss,
(void*)ss_lookup,
(void*)meta);
abort();
}
if (ss_lookup != tls_ss) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p lookup_ss=%p (mismatch)\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)tls_ss,
(void*)ss_lookup);
abort();
}
int slab_idx = tls ? (int)tls->slab_idx : -1;
int cap = ss_slabs_capacity(tls_ss);
if (slab_idx < 0 || slab_idx >= cap) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p slab_idx=%d cap=%d meta_cap=%u meta_used=%u meta_carved=%u\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)tls_ss,
slab_idx,
cap,
meta ? meta->capacity : 0u,
meta ? (unsigned)meta->used : 0u,
meta ? (unsigned)meta->carved : 0u);
abort();
}
// Ensure meta matches TLS view for this slab.
TinySlabMeta* expected_meta = &tls_ss->slabs[slab_idx];
if (meta && meta != expected_meta) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p tls_ss=%p slab_idx=%d meta=%p expected_meta=%p\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)tls_ss,
slab_idx,
(void*)meta,
(void*)expected_meta);
abort();
}
uint8_t* slab_base = tiny_slab_base_for_geometry(tls_ss, slab_idx);
size_t stride = tiny_stride_for_class(class_idx);
size_t usable = tiny_usable_bytes_for_slab(slab_idx);
uint8_t* slab_end = slab_base + usable;
if ((uint8_t*)base < slab_base || (uint8_t*)base >= slab_end) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p range=[%p,%p) stride=%zu meta_cap=%u meta_used=%u meta_carved=%u\n",
stage ? stage : "unified_refill",
class_idx,
base,
(void*)slab_base,
(void*)slab_end,
stride,
meta ? meta->capacity : 0u,
meta ? (unsigned)meta->used : 0u,
meta ? (unsigned)meta->carved : 0u);
abort();
}
ptrdiff_t offset = (uint8_t*)base - slab_base;
if (offset % (ptrdiff_t)stride != 0) {
fprintf(stderr,
"[UNIFIED_REFILL_CORRUPT] stage=%s cls=%d base=%p offset=%td stride=%zu (misaligned) meta_cap=%u meta_used=%u meta_carved=%u\n",
stage ? stage : "unified_refill",
class_idx,
base,
offset,
stride,
meta ? meta->capacity : 0u,
meta ? (unsigned)meta->used : 0u,
meta ? (unsigned)meta->carved : 0u);
abort();
}
return 1;
#endif
}
// ============================================================================
// Warm Pool Enhanced: Direct carve from warm SuperSlab (bypass superslab_refill)
// ============================================================================
// ============================================================================
// Batch refill from SuperSlab (called on cache miss)
// ============================================================================
// Returns: BASE pointer (first block, wrapped), or NULL-wrapped if failed
// Design: Direct carve from SuperSlab to array (no TLS SLL intermediate layer)
// Warm Pool Integration: PRIORITIZE warm pool, use superslab_refill as fallback
hak_base_ptr_t unified_cache_refill(int class_idx) {
// Measure refill cost if enabled
uint64_t start_cycles = 0;
int measure = unified_cache_measure_enabled();
if (measure) {
start_cycles = read_tsc();
}
// Initialize warm pool on first use (per-thread)
tiny_warm_pool_init_once();
TinyUnifiedCache* cache = &g_unified_cache[class_idx];
const TinyClassPolicy* policy = tiny_policy_get(class_idx);
int warm_enabled = policy ? policy->warm_enabled : 0;
int warm_cap = policy ? policy->warm_cap : 0;
int page_enabled = policy ? policy->page_box_enabled : 0;
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
// ✅ Phase 11+: Ensure cache is initialized (lazy init for cold path)
if (!cache->slots) {
unified_cache_init();
// Re-check after init (may fail due to alloc failure)
if (!cache->slots) {
return HAK_BASE_FROM_RAW(NULL);
}
}
// Calculate available room in unified cache
int room = (int)cache->capacity - 1; // Leave 1 slot for full detection
if (cache->head > cache->tail) {
room = cache->head - cache->tail - 1;
} else if (cache->head < cache->tail) {
room = cache->capacity - (cache->tail - cache->head) - 1;
}
if (room <= 0) return HAK_BASE_FROM_RAW(NULL);
// Batch size limit (per-class tuning)
// - Default: 128
// - C5-C6 (129B-512B): raised to 256
// - C7 (~1KB): raised to 512 to further reduce refill frequency
// - For safety, keep this consistent with the out[] array size (512) below
int max_batch;
if (class_idx == 7) {
max_batch = 512;
} else if (class_idx >= 5 && class_idx <= 6) {
max_batch = 256;
} else {
max_batch = 128;
}
if (room > max_batch) room = max_batch;
// NOTE:
// - For C7, max_batch is raised to 512, so the stack array also reserves 512 entries.
// - This keeps room <= max_batch <= 512 at all times and prevents out[] overrun.
void* out[512];
int produced = 0;
int tls_carved = 0; // Debug bookkeeping: track TLS carve experiment hits
#if HAKMEM_BUILD_RELEASE
(void)tls_carved;
#endif
// ========== PAGE BOX HOT PATH (Tiny-Plus layer): Try page box FIRST ==========
// Page-level freelist management dedicated to C7 will eventually be integrated here.
// For now the stub implementation always returns 0; only the Box-boundary wiring is connected ahead of time.
if (page_enabled && tiny_page_box_is_enabled(class_idx)) {
int page_produced = tiny_page_box_refill(class_idx, tls, out, room);
if (page_produced > 0) {
// Store blocks into cache and return first
void* first = out[0];
for (int i = 1; i < page_produced; i++) {
cache->slots[cache->tail] = out[i];
cache->tail = (cache->tail + 1) & cache->mask;
}
#if !HAKMEM_BUILD_RELEASE
g_unified_cache_miss[class_idx]++;
#endif
tiny_class_stats_on_uc_miss(class_idx);
if (measure) {
uint64_t end_cycles = read_tsc();
uint64_t delta = end_cycles - start_cycles;
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_global,
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_global,
1, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_by_class[class_idx],
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_by_class[class_idx],
1, memory_order_relaxed);
}
return HAK_BASE_FROM_RAW(first);
}
}
// ========== WARM POOL HOT PATH: Check warm pool FIRST ==========
// This is the critical optimization - avoid superslab_refill() registry scan
if (warm_enabled) {
if (class_idx == 7) {
const TinyClassPolicy* pol = tiny_policy_get(7);
static _Atomic int g_c7_policy_logged = 0;
if (atomic_exchange_explicit(&g_c7_policy_logged, 1, memory_order_acq_rel) == 0) {
fprintf(stderr,
"[C7_POLICY_AT_WARM] page=%u warm=%u cap=%u\n",
pol ? pol->page_box_enabled : 0,
pol ? pol->warm_enabled : 0,
pol ? pol->warm_cap : 0);
}
}
#if !HAKMEM_BUILD_RELEASE
atomic_fetch_add_explicit(&g_dbg_warm_pop_attempts, 1, memory_order_relaxed);
if (class_idx == 7) {
warm_pool_dbg_c7_attempt();
}
#endif
#if HAKMEM_BUILD_RELEASE
if (class_idx == 7) {
atomic_fetch_add_explicit(&g_rel_c7_warm_pop, 1, memory_order_relaxed);
}
#endif
SuperSlab* warm_ss = tiny_warm_pool_pop(class_idx);
if (warm_ss) {
int allow_tls_bind = policy && policy->tls_carve_enabled;
int allow_tls_carve = allow_tls_bind;
int warm_mode = 0;
if (class_idx == 7) {
#if !HAKMEM_BUILD_RELEASE
warm_pool_dbg_c7_hit();
#endif
warm_mode = warm_tls_bind_mode_c7();
allow_tls_bind = (warm_mode >= 1);
allow_tls_carve = (warm_mode == 2);
}
if (allow_tls_bind) {
int cap = ss_slabs_capacity(warm_ss);
int slab_idx = -1;
// Simple heuristic: first slab matching class
for (int i = 0; i < cap; i++) {
if (tiny_get_class_from_ss(warm_ss, i) == class_idx) {
slab_idx = i;
break;
}
}
if (slab_idx >= 0) {
uint32_t tid = (uint32_t)(uintptr_t)pthread_self();
if (ss_tls_bind_one(class_idx, tls, warm_ss, slab_idx, tid)) {
if (class_idx == 7) {
warm_tls_bind_log_success(warm_ss, slab_idx);
}
// Mode 2: carve a single block via TLS fast path (policy enabled classes)
if (allow_tls_carve) {
#if !HAKMEM_BUILD_RELEASE
if (class_idx == 7) {
warm_pool_dbg_c7_tls_attempt();
}
#endif
TinyTLSCarveOneResult tls_carve =
tiny_tls_carve_one_block(tls, class_idx);
if (tls_carve.block) {
if (class_idx == 7) {
warm_tls_bind_log_tls_carve(warm_ss, slab_idx, tls_carve.block);
#if !HAKMEM_BUILD_RELEASE
warm_pool_dbg_c7_tls_success();
#endif
}
out[0] = tls_carve.block;
produced = 1;
tls_carved = 1;
} else {
if (class_idx == 7) {
warm_tls_bind_log_tls_fail(warm_ss, slab_idx);
#if !HAKMEM_BUILD_RELEASE
warm_pool_dbg_c7_tls_fail();
#endif
}
}
}
}
}
}
#if !HAKMEM_BUILD_RELEASE
atomic_fetch_add_explicit(&g_dbg_warm_pop_hits, 1, memory_order_relaxed);
#endif
// HOT PATH: Warm pool hit, try to carve directly
if (produced == 0) {
#if HAKMEM_BUILD_RELEASE
if (class_idx == 7) {
warm_pool_rel_c7_carve_attempt();
}
#endif
produced = slab_carve_from_ss(class_idx, warm_ss, out, room);
#if HAKMEM_BUILD_RELEASE
if (class_idx == 7) {
if (produced > 0) {
warm_pool_rel_c7_carve_success();
} else {
warm_pool_rel_c7_carve_zero();
}
}
#endif
if (produced > 0) {
// Update active counter for carved blocks
ss_active_add(warm_ss, (uint32_t)produced);
}
}
if (produced > 0) {
#if !HAKMEM_BUILD_RELEASE
if (class_idx == 7) {
warm_pool_dbg_c7_carve();
if (tls_carved) {
warm_pool_dbg_c7_uc_miss_tls();
} else {
warm_pool_dbg_c7_uc_miss_warm();
}
}
#endif
// Success! Return SuperSlab to warm pool for next use
#if HAKMEM_BUILD_RELEASE
if (class_idx == 7) {
atomic_fetch_add_explicit(&g_rel_c7_warm_push, 1, memory_order_relaxed);
}
#endif
tiny_warm_pool_push_with_cap(class_idx, warm_ss, warm_cap);
// Track warm pool hit (always compiled, ENV-gated printing)
warm_pool_record_hit(class_idx);
tiny_class_stats_on_warm_hit(class_idx);
// Store blocks into cache and return first
void* first = out[0];
for (int i = 1; i < produced; i++) {
cache->slots[cache->tail] = out[i];
cache->tail = (cache->tail + 1) & cache->mask;
}
#if !HAKMEM_BUILD_RELEASE
g_unified_cache_miss[class_idx]++;
#endif
tiny_class_stats_on_uc_miss(class_idx);
if (measure) {
uint64_t end_cycles = read_tsc();
uint64_t delta = end_cycles - start_cycles;
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_global,
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_global,
1, memory_order_relaxed);
// Per-class accounting (makes C5-C7 refill cost visible)
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_by_class[class_idx],
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_by_class[class_idx],
1, memory_order_relaxed);
}
return HAK_BASE_FROM_RAW(first);
}
// SuperSlab carve failed (produced == 0)
#if !HAKMEM_BUILD_RELEASE
atomic_fetch_add_explicit(&g_dbg_warm_pop_carve_zero, 1, memory_order_relaxed);
#endif
// This slab is either exhausted or has no more available capacity
// The statistics counter 'prefilled' tracks how often we try to prefill
if (produced == 0 && tiny_warm_pool_count(class_idx) == 0) {
// Pool is empty and carve failed - prefill would help here
warm_pool_record_prefilled(class_idx);
}
} else {
#if !HAKMEM_BUILD_RELEASE
atomic_fetch_add_explicit(&g_dbg_warm_pop_empty, 1, memory_order_relaxed);
#endif
}
// ========== COLD PATH: Warm pool miss, use superslab_refill ==========
// Track warm pool miss (always compiled, ENV-gated printing)
warm_pool_record_miss(class_idx);
}
// Step 1: Ensure SuperSlab available via normal refill
// Enhanced: Use Warm Pool Prefill Box for secondary prefill when pool is empty
if (warm_enabled) {
if (warm_pool_do_prefill(class_idx, tls, warm_cap) < 0) {
return HAK_BASE_FROM_RAW(NULL);
}
// After prefill: tls->ss has the final slab for carving
tls = &g_tls_slabs[class_idx]; // Reload (already done in prefill box)
} else {
if (!tls->ss) {
if (!superslab_refill(class_idx)) {
return HAK_BASE_FROM_RAW(NULL);
}
tls = &g_tls_slabs[class_idx];
}
}
// Step 2: Direct carve from SuperSlab into local array (bypass TLS SLL!)
TinySlabMeta* m = tls->meta;
size_t bs = tiny_stride_for_class(class_idx);
uint8_t* base = tls->slab_base
? tls->slab_base
: tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
while (produced < room) {
if (m->freelist) {
// Freelist pop
void* p = m->freelist;
void* next_node = tiny_next_read(class_idx, p);
// ROOT CAUSE FIX: Write header BEFORE exposing block (but AFTER reading next)
// For Class 0 (offset 0), next overlaps header, so we must read next first.
#if HAKMEM_TINY_HEADER_CLASSIDX
*(uint8_t*)p = (uint8_t)(0xa0 | (class_idx & 0x0f));
// Prevent compiler from reordering header write after out[] assignment
__atomic_thread_fence(__ATOMIC_RELEASE);
#endif
m->freelist = next_node;
unified_refill_validate_base(class_idx, tls, m, p,
"unified_refill_freelist");
// PageFaultTelemetry: record page touch for this BASE
pagefault_telemetry_touch(class_idx, p);
m->used++;
out[produced++] = p;
} else if (m->carved < m->capacity) {
// Linear carve (fresh block, no freelist link)
void* p = (void*)(base + ((size_t)m->carved * bs));
unified_refill_validate_base(class_idx, tls, m, p,
"unified_refill_carve");
// PageFaultTelemetry: record page touch for this BASE
pagefault_telemetry_touch(class_idx, p);
// ✅ CRITICAL: Write header (new block)
#if HAKMEM_TINY_HEADER_CLASSIDX
*(uint8_t*)p = (uint8_t)(0xa0 | (class_idx & 0x0f));
#endif
m->carved++;
m->used++;
out[produced++] = p;
} else {
// SuperSlab exhausted → refill and retry
if (!superslab_refill(class_idx)) break;
// ✅ CRITICAL: Reload TLS pointers after refill (avoid stale pointer bug)
tls = &g_tls_slabs[class_idx];
m = tls->meta;
base = tls->slab_base
? tls->slab_base
: tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
}
}
if (produced == 0) return HAK_BASE_FROM_RAW(NULL);
// Step 4: Update active counter
// Guard: tls->ss can be NULL if all SuperSlab refills failed
if (tls->ss) {
ss_active_add(tls->ss, (uint32_t)produced);
}
// Step 5: Store blocks into unified cache (skip first, return it)
void* first = out[0];
for (int i = 1; i < produced; i++) {
cache->slots[cache->tail] = out[i];
cache->tail = (cache->tail + 1) & cache->mask;
}
#if !HAKMEM_BUILD_RELEASE
if (class_idx == 7) {
warm_pool_dbg_c7_uc_miss_shared();
}
g_unified_cache_miss[class_idx]++;
#endif
tiny_class_stats_on_uc_miss(class_idx);
// Measure refill cycles
if (measure) {
uint64_t end_cycles = read_tsc();
uint64_t delta = end_cycles - start_cycles;
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_global,
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_global,
1, memory_order_relaxed);
// Per-class accounting
atomic_fetch_add_explicit(&g_unified_cache_refill_cycles_by_class[class_idx],
delta, memory_order_relaxed);
atomic_fetch_add_explicit(&g_unified_cache_misses_by_class[class_idx],
1, memory_order_relaxed);
}
return HAK_BASE_FROM_RAW(first); // Return first block (BASE pointer)
}
// ============================================================================
// Performance Measurement: Print Statistics
// ============================================================================
void unified_cache_print_measurements(void) {
if (!unified_cache_measure_enabled()) {
return; // Measurement disabled, nothing to print
}
uint64_t hits = atomic_load_explicit(&g_unified_cache_hits_global, memory_order_relaxed);
uint64_t misses = atomic_load_explicit(&g_unified_cache_misses_global, memory_order_relaxed);
uint64_t refill_cycles = atomic_load_explicit(&g_unified_cache_refill_cycles_global, memory_order_relaxed);
uint64_t total = hits + misses;
if (total == 0) {
fprintf(stderr, "\n========================================\n");
fprintf(stderr, "Unified Cache Statistics\n");
fprintf(stderr, "========================================\n");
fprintf(stderr, "No operations recorded (measurement may be disabled)\n");
fprintf(stderr, "========================================\n\n");
return;
}
double hit_rate = (100.0 * hits) / total;
double avg_refill_cycles = misses > 0 ? (double)refill_cycles / misses : 0.0;
// Estimate time at 1GHz (conservative, most modern CPUs are 2-4GHz)
double avg_refill_us = avg_refill_cycles / 1000.0;
fprintf(stderr, "\n========================================\n");
fprintf(stderr, "Unified Cache Statistics\n");
fprintf(stderr, "========================================\n");
fprintf(stderr, "Hits: %llu\n", (unsigned long long)hits);
fprintf(stderr, "Misses: %llu\n", (unsigned long long)misses);
fprintf(stderr, "Hit Rate: %.1f%%\n", hit_rate);
fprintf(stderr, "Avg Refill Cycles: %.0f (est. %.2fus @ 1GHz)\n",
avg_refill_cycles, avg_refill_us);
// Per-class breakdown (Tiny classes 0-7, especially C5-C7)
fprintf(stderr, "\nPer-class Unified Cache (Tiny classes):\n");
for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
uint64_t ch = atomic_load_explicit(&g_unified_cache_hits_by_class[cls],
memory_order_relaxed);
uint64_t cm = atomic_load_explicit(&g_unified_cache_misses_by_class[cls],
memory_order_relaxed);
uint64_t cc = atomic_load_explicit(&g_unified_cache_refill_cycles_by_class[cls],
memory_order_relaxed);
uint64_t ct = ch + cm;
if (ct == 0 && cc == 0) {
continue; // Skip unused classes
}
double cls_hit_rate = ct > 0 ? (100.0 * (double)ch / (double)ct) : 0.0;
double cls_avg_refill = cm > 0 ? (double)cc / (double)cm : 0.0;
double cls_avg_us = cls_avg_refill / 1000.0;
fprintf(stderr,
" C%d: hits=%llu miss=%llu hit=%.1f%% avg_refill=%.0f cyc (%.2fus @1GHz)\n",
cls,
(unsigned long long)ch,
(unsigned long long)cm,
cls_hit_rate,
cls_avg_refill,
cls_avg_us);
}
fprintf(stderr, "========================================\n\n");
}