Fix C7 warm/TLS Release path and unify debug instrumentation

This commit is contained in:
Moe Charm (CI)
2025-12-05 23:41:01 +09:00
parent 96c2988381
commit d17ec46628
29 changed files with 1314 additions and 123 deletions

View File

@ -12,10 +12,19 @@
#include "../box/ss_slab_meta_box.h" // For ss_active_add() and slab metadata operations
#include "../box/warm_pool_stats_box.h" // Box: Warm Pool Statistics Recording (inline)
#include "../box/slab_carve_box.h" // Box: Slab Carving (inline O(slabs) scan)
#define WARM_POOL_REL_DEFINE
#include "../box/warm_pool_rel_counters_box.h" // Box: Release-side C7 counters
#undef WARM_POOL_REL_DEFINE
#include "../box/c7_meta_used_counter_box.h" // Box: C7 meta->used increment counters
#include "../box/warm_pool_prefill_box.h" // Box: Warm Pool Prefill (secondary optimization)
#include "../hakmem_env_cache.h" // Priority-2: ENV cache (eliminate syscalls)
#include "../box/tiny_page_box.h" // Tiny-Plus Page Box (C5-C7 initial hook)
#include "../box/ss_tls_bind_box.h" // Box: TLS Bind (SuperSlab -> TLS binding)
#include "../box/tiny_tls_carve_one_block_box.h" // Box: TLS carve helper (shared)
#include "../box/warm_tls_bind_logger_box.h" // Box: Warm TLS Bind logging (throttled)
#define WARM_POOL_DBG_DEFINE
#include "../box/warm_pool_dbg_box.h" // Box: Warm Pool C7 debug counters
#undef WARM_POOL_DBG_DEFINE
#include <stdlib.h>
#include <string.h>
#include <stdatomic.h>
@ -84,6 +93,12 @@ __thread uint64_t g_unified_cache_push[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_unified_cache_full[TINY_NUM_CLASSES] = {0};
#endif
// Release-side lightweight telemetry (C7 Warm path only)
// Both counters exist only in Release builds and are updated with relaxed
// atomic adds from the unified-cache refill path when class_idx == 7.
#if HAKMEM_BUILD_RELEASE
// Bumped immediately before tiny_warm_pool_pop() is called, so it counts pop
// attempts (the increment happens whether or not a SuperSlab is returned).
_Atomic uint64_t g_rel_c7_warm_pop = 0;
// Bumped immediately before tiny_warm_pool_push() hands the SuperSlab back
// after a refill that produced at least one block.
_Atomic uint64_t g_rel_c7_warm_push = 0;
#endif
// Warm Pool metrics (definition - declared in tiny_warm_pool.h as extern)
// Note: These are kept outside !HAKMEM_BUILD_RELEASE for profiling in release builds
__thread TinyWarmPoolStats g_warm_pool_stats[TINY_NUM_CLASSES] = {0};
@ -98,46 +113,36 @@ _Atomic uint64_t g_dbg_warm_pop_attempts = 0;
_Atomic uint64_t g_dbg_warm_pop_hits = 0;
_Atomic uint64_t g_dbg_warm_pop_empty = 0;
_Atomic uint64_t g_dbg_warm_pop_carve_zero = 0;
#endif
// Debug-only: cached ENV for Warm TLS Bind (C7)
static int g_warm_tls_bind_mode_c7 = -1;
// Warm TLS Bind (C7) mode selector
// mode 0: Legacy warm pathデバッグ専用・C7では非推奨
// mode 1: Bind-only 本番経路C7 標準)
// mode 2: Bind + TLS carve 実験経路Debug 専用)
// Release ビルドでは常に mode=1 に固定し、ENV は無視する。
static inline int warm_tls_bind_mode_c7(void) {
#if HAKMEM_BUILD_RELEASE
static int g_warm_tls_bind_mode_c7 = -1;
if (__builtin_expect(g_warm_tls_bind_mode_c7 == -1, 0)) {
const char* e = getenv("HAKMEM_WARM_TLS_BIND_C7");
// 0/empty: disabled, 1: bind only, 2: bind + TLS carve one block
g_warm_tls_bind_mode_c7 = (e && *e) ? atoi(e) : 0;
int mode = (e && *e) ? atoi(e) : 1; // default = Bind-only
if (mode < 0) mode = 0;
if (mode > 2) mode = 2;
g_warm_tls_bind_mode_c7 = mode;
}
return g_warm_tls_bind_mode_c7;
}
static inline void* warm_tls_carve_one_block(int class_idx) {
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
TinySlabMeta* meta = tls->meta;
if (!meta || !tls->ss || tls->slab_base == NULL) return NULL;
if (meta->class_idx != (uint8_t)class_idx) return NULL;
if (tls->slab_idx < 0 || tls->slab_idx >= ss_slabs_capacity(tls->ss)) return NULL;
if (meta->freelist) {
void* block = meta->freelist;
meta->freelist = tiny_next_read(class_idx, block);
meta->used++;
ss_active_add(tls->ss, 1);
return block;
#else
static int g_warm_tls_bind_mode_c7 = -1;
if (__builtin_expect(g_warm_tls_bind_mode_c7 == -1, 0)) {
const char* e = getenv("HAKMEM_WARM_TLS_BIND_C7");
int mode = (e && *e) ? atoi(e) : 1; // default = Bind-only
if (mode < 0) mode = 0;
if (mode > 2) mode = 2;
g_warm_tls_bind_mode_c7 = mode;
}
if (meta->used < meta->capacity) {
size_t block_size = tiny_stride_for_class(meta->class_idx);
void* block = tiny_block_at_index(tls->slab_base, meta->used, block_size);
meta->used++;
ss_active_add(tls->ss, 1);
return block;
}
return NULL;
}
return g_warm_tls_bind_mode_c7;
#endif
}
// Forward declaration for Warm Pool stats printer (defined later in this file)
static inline void tiny_warm_pool_print_stats(void);
@ -157,6 +162,15 @@ int unified_cache_enabled(void) {
fprintf(stderr, "[Unified-INIT] unified_cache_enabled() = %d\n", g_enable);
fflush(stderr);
}
#else
if (g_enable) {
static int printed = 0;
if (!printed) {
fprintf(stderr, "[Rel-Unified] unified_cache_enabled() = %d\n", g_enable);
fflush(stderr);
printed = 1;
}
}
#endif
}
return g_enable;
@ -311,6 +325,32 @@ static inline void tiny_warm_pool_print_stats(void) {
(unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_hits, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_empty, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_dbg_warm_pop_carve_zero, memory_order_relaxed));
uint64_t c7_attempts = warm_pool_dbg_c7_attempts();
uint64_t c7_hits = warm_pool_dbg_c7_hits();
uint64_t c7_carve = warm_pool_dbg_c7_carves();
uint64_t c7_tls_attempts = warm_pool_dbg_c7_tls_attempts();
uint64_t c7_tls_success = warm_pool_dbg_c7_tls_successes();
uint64_t c7_tls_fail = warm_pool_dbg_c7_tls_failures();
uint64_t c7_uc_warm = warm_pool_dbg_c7_uc_miss_warm_refills();
uint64_t c7_uc_tls = warm_pool_dbg_c7_uc_miss_tls_refills();
uint64_t c7_uc_shared = warm_pool_dbg_c7_uc_miss_shared_refills();
if (c7_attempts || c7_hits || c7_carve ||
c7_tls_attempts || c7_tls_success || c7_tls_fail ||
c7_uc_warm || c7_uc_tls || c7_uc_shared) {
fprintf(stderr,
" [DBG_C7] warm_pop_attempts=%llu warm_pop_hits=%llu warm_pop_carve=%llu "
"tls_carve_attempts=%llu tls_carve_success=%llu tls_carve_fail=%llu "
"uc_miss_warm=%llu uc_miss_tls=%llu uc_miss_shared=%llu\n",
(unsigned long long)c7_attempts,
(unsigned long long)c7_hits,
(unsigned long long)c7_carve,
(unsigned long long)c7_tls_attempts,
(unsigned long long)c7_tls_success,
(unsigned long long)c7_tls_fail,
(unsigned long long)c7_uc_warm,
(unsigned long long)c7_uc_tls,
(unsigned long long)c7_uc_shared);
}
#endif
fflush(stderr);
}
@ -515,6 +555,7 @@ hak_base_ptr_t unified_cache_refill(int class_idx) {
// - これにより、room <= max_batch <= 512 が常に成り立ち、out[] オーバーランを防止する。
void* out[512];
int produced = 0;
int tls_carved = 0; // Debug bookkeeping: track TLS carve experiment hits
// ========== PAGE BOX HOT PATH (Tiny-Plus layer): Try page box FIRST ==========
// In the future, C7-specific page-level freelist management will be integrated here.
@ -554,10 +595,21 @@ hak_base_ptr_t unified_cache_refill(int class_idx) {
// This is the critical optimization - avoid superslab_refill() registry scan
#if !HAKMEM_BUILD_RELEASE
atomic_fetch_add_explicit(&g_dbg_warm_pop_attempts, 1, memory_order_relaxed);
if (class_idx == 7) {
warm_pool_dbg_c7_attempt();
}
#endif
#if HAKMEM_BUILD_RELEASE
if (class_idx == 7) {
atomic_fetch_add_explicit(&g_rel_c7_warm_pop, 1, memory_order_relaxed);
}
#endif
SuperSlab* warm_ss = tiny_warm_pool_pop(class_idx);
if (warm_ss) {
#if !HAKMEM_BUILD_RELEASE
if (class_idx == 7) {
warm_pool_dbg_c7_hit();
}
// Debug-only: Warm TLS Bind experiment (C7 only)
if (class_idx == 7) {
int warm_mode = warm_tls_bind_mode_c7();
@ -577,25 +629,22 @@ hak_base_ptr_t unified_cache_refill(int class_idx) {
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
uint32_t tid = (uint32_t)(uintptr_t)pthread_self();
if (ss_tls_bind_one(class_idx, tls, warm_ss, slab_idx, tid)) {
static int logged = 0;
if (!logged) {
fprintf(stderr, "[WARM_TLS_BIND] C7 bind success: ss=%p slab=%d\n",
(void*)warm_ss, slab_idx);
logged = 1;
}
warm_tls_bind_log_success(warm_ss, slab_idx);
// Mode 2: carve a single block via TLS fast path
if (warm_mode == 2) {
void* tls_block = warm_tls_carve_one_block(class_idx);
if (tls_block) {
fprintf(stderr,
"[WARM_TLS_BIND] C7 TLS carve success: ss=%p slab=%d block=%p\n",
(void*)warm_ss, slab_idx, tls_block);
out[0] = tls_block;
warm_pool_dbg_c7_tls_attempt();
TinyTLSCarveOneResult tls_carve =
tiny_tls_carve_one_block(tls, class_idx);
if (tls_carve.block) {
warm_tls_bind_log_tls_carve(warm_ss, slab_idx, tls_carve.block);
warm_pool_dbg_c7_tls_success();
out[0] = tls_carve.block;
produced = 1;
tls_carved = 1;
} else {
fprintf(stderr,
"[WARM_TLS_BIND] C7 TLS carve failed, fallback\n");
warm_tls_bind_log_tls_fail(warm_ss, slab_idx);
warm_pool_dbg_c7_tls_fail();
}
}
}
@ -607,7 +656,21 @@ hak_base_ptr_t unified_cache_refill(int class_idx) {
#endif
// HOT PATH: Warm pool hit, try to carve directly
if (produced == 0) {
#if HAKMEM_BUILD_RELEASE
if (class_idx == 7) {
warm_pool_rel_c7_carve_attempt();
}
#endif
produced = slab_carve_from_ss(class_idx, warm_ss, out, room);
#if HAKMEM_BUILD_RELEASE
if (class_idx == 7) {
if (produced > 0) {
warm_pool_rel_c7_carve_success();
} else {
warm_pool_rel_c7_carve_zero();
}
}
#endif
if (produced > 0) {
// Update active counter for carved blocks
ss_active_add(warm_ss, (uint32_t)produced);
@ -615,7 +678,22 @@ hak_base_ptr_t unified_cache_refill(int class_idx) {
}
if (produced > 0) {
#if !HAKMEM_BUILD_RELEASE
if (class_idx == 7) {
warm_pool_dbg_c7_carve();
if (tls_carved) {
warm_pool_dbg_c7_uc_miss_tls();
} else {
warm_pool_dbg_c7_uc_miss_warm();
}
}
#endif
// Success! Return SuperSlab to warm pool for next use
#if HAKMEM_BUILD_RELEASE
if (class_idx == 7) {
atomic_fetch_add_explicit(&g_rel_c7_warm_push, 1, memory_order_relaxed);
}
#endif
tiny_warm_pool_push(class_idx, warm_ss);
// Track warm pool hit (always compiled, ENV-gated printing)
@ -761,6 +839,9 @@ hak_base_ptr_t unified_cache_refill(int class_idx) {
}
#if !HAKMEM_BUILD_RELEASE
if (class_idx == 7) {
warm_pool_dbg_c7_uc_miss_shared();
}
g_unified_cache_miss[class_idx]++;
#endif