From 69e6df4cbc545783bd20d88153437205226499fc Mon Sep 17 00:00:00 2001 From: "Moe Charm (CI)" Date: Sat, 29 Nov 2025 17:35:51 +0900 Subject: [PATCH] Phase 7-Step7: Replace g_tls_sll_enable with TINY_FRONT_TLS_SLL_ENABLED macro MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit **Goal**: Enable dead code elimination for TLS SLL checks in PGO mode **Changes**: 1. core/box/tiny_front_config_box.h: - Add TINY_FRONT_TLS_SLL_ENABLED macro (PGO: 1, Normal: tiny_tls_sll_enabled()) - Add tiny_tls_sll_enabled() wrapper function (static inline) 2. core/tiny_alloc_fast.inc.h (5 hot path locations): - Line 220: tiny_heap_v2_refill_mag() - early return check - Line 388: SLIM mode - SLL freelist check - Line 459: tiny_alloc_fast_pop() - Layer 1 SLL check - Line 774: Main alloc path - cached sll_enabled check (most critical!) - Line 815: Generic front - SLL toggle respect 3. core/hakmem_tiny_refill.inc.h (2 locations): - Line 186: bulk_mag_refill_fc() - refill from SLL - Line 213: bulk_mag_to_sll_if_room() - push to SLL **Performance**: 79.9M ops/s (maintained, +0.1M vs Step 6) - Normal mode: Same performance (runtime checks preserved) - PGO mode: Dead code elimination ready (if (!1) → removed by compiler) **Expected PGO benefit**: - Eliminate 7 TLS SLL checks across hot paths - Reduce instruction count in main alloc loop - Better branch prediction (no runtime checks) **Design**: Config Box as single entry point - All TLS SLL checks now use TINY_FRONT_TLS_SLL_ENABLED - Consistent pattern with FASTCACHE/SFC/HEAP_V2 macros - Include order independent (wrapper in config box header) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude --- core/box/tiny_front_config_box.h | 7 +++++++ core/hakmem_tiny_refill.inc.h | 8 +++++--- core/tiny_alloc_fast.inc.h | 19 ++++++++++--------- 3 files changed, 22 insertions(+), 12 deletions(-) diff --git a/core/box/tiny_front_config_box.h b/core/box/tiny_front_config_box.h 
index c3b363ce..1950ae17 100644 --- a/core/box/tiny_front_config_box.h +++ b/core/box/tiny_front_config_box.h @@ -52,6 +52,7 @@ #define TINY_FRONT_HEAP_V2_ENABLED 0 // Disabled (use Unified Cache) #define TINY_FRONT_SFC_ENABLED 1 // Enabled (SFC cascade) #define TINY_FRONT_FASTCACHE_ENABLED 0 // Disabled (use Unified Cache) +#define TINY_FRONT_TLS_SLL_ENABLED 1 // Enabled (TLS SLL freelist) #define TINY_FRONT_UNIFIED_GATE_ENABLED 1 // Enabled (Front Gate Unification) #define TINY_FRONT_METRICS_ENABLED 0 // Disabled (no runtime overhead) #define TINY_FRONT_DIAG_ENABLED 0 // Disabled (no diagnostics) @@ -94,12 +95,18 @@ static inline int sfc_cascade_enabled(void) { return g_sfc_enabled; } +static inline int tiny_tls_sll_enabled(void) { + extern int g_tls_sll_enable; + return g_tls_sll_enable; +} + // Config macros (runtime function calls) // These expand to actual function calls in normal mode #define TINY_FRONT_ULTRA_SLIM_ENABLED ultra_slim_mode_enabled() #define TINY_FRONT_HEAP_V2_ENABLED tiny_heap_v2_enabled() #define TINY_FRONT_SFC_ENABLED sfc_cascade_enabled() #define TINY_FRONT_FASTCACHE_ENABLED tiny_fastcache_enabled() +#define TINY_FRONT_TLS_SLL_ENABLED tiny_tls_sll_enabled() #define TINY_FRONT_UNIFIED_GATE_ENABLED front_gate_unified_enabled() #define TINY_FRONT_METRICS_ENABLED tiny_metrics_enabled() #define TINY_FRONT_DIAG_ENABLED tiny_diag_enabled() diff --git a/core/hakmem_tiny_refill.inc.h b/core/hakmem_tiny_refill.inc.h index c4e38487..78f52127 100644 --- a/core/hakmem_tiny_refill.inc.h +++ b/core/hakmem_tiny_refill.inc.h @@ -35,7 +35,7 @@ extern int g_fastcache_enable; extern uint16_t g_fast_cap[TINY_NUM_CLASSES]; extern __thread TinyFastCache g_fast_cache[TINY_NUM_CLASSES]; -extern int g_tls_sll_enable; +// Phase 7-Step7: g_tls_sll_enable now accessed via TINY_FRONT_TLS_SLL_ENABLED macro extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES]; extern _Atomic uint32_t g_frontend_fill_target[TINY_NUM_CLASSES]; @@ -182,7 +182,8 @@ static inline void* 
tiny_fast_refill_and_take(int class_idx, TinyTLSList* tls) { // 3) TLS SLL から詰め替え int filled = 0; - while (room > 0 && g_tls_sll_enable) { + // Phase 7-Step7: Use config macro for dead code elimination in PGO mode + while (room > 0 && TINY_FRONT_TLS_SLL_ENABLED) { void* h = NULL; if (!tls_sll_pop(class_idx, &h)) break; tiny_debug_validate_node_base(class_idx, h, "tiny_fast_refill_and_take"); @@ -208,7 +209,8 @@ static inline void* tiny_fast_refill_and_take(int class_idx, TinyTLSList* tls) { // tiny_free_magazine.inc.h から参照される。 static inline int bulk_mag_to_sll_if_room(int class_idx, TinyTLSMag* mag, int n) { - if (!g_tls_sll_enable || n <= 0) return 0; + // Phase 7-Step7: Use config macro for dead code elimination in PGO mode + if (!TINY_FRONT_TLS_SLL_ENABLED || n <= 0) return 0; uint32_t cap = sll_cap_for_class(class_idx, (uint32_t)mag->cap); uint32_t have = g_tls_sll[class_idx].count; diff --git a/core/tiny_alloc_fast.inc.h b/core/tiny_alloc_fast.inc.h index 78b53e7f..1200addc 100644 --- a/core/tiny_alloc_fast.inc.h +++ b/core/tiny_alloc_fast.inc.h @@ -216,8 +216,8 @@ static inline int tiny_heap_v2_refill_mag(int class_idx) { if (class_idx < 0 || class_idx > 3) return 0; if (!tiny_heap_v2_class_enabled(class_idx)) return 0; - extern int g_tls_sll_enable; - if (!g_tls_sll_enable) return 0; + // Phase 7-Step7: Use config macro for dead code elimination in PGO mode + if (!TINY_FRONT_TLS_SLL_ENABLED) return 0; TinyHeapV2Mag* mag = &g_tiny_heap_v2_mag[class_idx]; const int cap = TINY_HEAP_V2_MAG_CAP; @@ -384,8 +384,8 @@ static inline void* tiny_alloc_fast_pop(int class_idx) { // SLIM MODE: Skip FastCache + SFC, go straight to SLL if (__builtin_expect(g_front_slim_enabled, 0)) { // Box Boundary: TLS SLL freelist pop (only layer in SLIM mode) - extern int g_tls_sll_enable; - if (__builtin_expect(g_tls_sll_enable, 1)) { + // Phase 7-Step7: Use config macro for dead code elimination in PGO mode + if (__builtin_expect(TINY_FRONT_TLS_SLL_ENABLED, 
1)) { void* base = NULL; if (tls_sll_pop(class_idx, &base)) { // Front Gate: SLL hit (SLIM fast path - 3 instructions) @@ -455,8 +455,8 @@ static inline void* tiny_alloc_fast_pop(int class_idx) { // Box Boundary: Layer 1 - TLS SLL freelist ใฎๅ…ˆ้ ญใ‚’ pop๏ผˆenvใง็„กๅŠนๅŒ–ๅฏ๏ผ‰ // Note: This is in tiny_alloc_fast_pop(), not tiny_alloc_fast(), so use global variable - extern int g_tls_sll_enable; - if (__builtin_expect(g_tls_sll_enable, 1)) { + // Phase 7-Step7: Use config macro for dead code elimination in PGO mode + if (__builtin_expect(TINY_FRONT_TLS_SLL_ENABLED, 1)) { // Use Box TLS-SLL API (C7-safe pop) // CRITICAL: Pop FIRST, do NOT read g_tls_sll_head directly (race condition!) // Reading head before pop causes stale read โ†’ rbp=0xa0 SEGV @@ -770,8 +770,8 @@ static inline void* tiny_alloc_fast(size_t size) { // P0.1: Cache g_tls_sll_enable once (Phase 3-4 instruction reduction) // Eliminates redundant global variable reads (2-3 instructions saved) - extern int g_tls_sll_enable; - const int sll_enabled = g_tls_sll_enable; + // Phase 7-Step7: Use config macro for dead code elimination in PGO mode + const int sll_enabled = TINY_FRONT_TLS_SLL_ENABLED; #if !HAKMEM_BUILD_RELEASE // Phase 3: Debug checks eliminated in release builds @@ -811,7 +811,8 @@ static inline void* tiny_alloc_fast(size_t size) { // Generic front (FastCache/SFC/SLL) // Respect SLL global toggle - if (__builtin_expect(g_tls_sll_enable, 1)) { + // Phase 7-Step7: Use config macro for dead code elimination in PGO mode + if (__builtin_expect(TINY_FRONT_TLS_SLL_ENABLED, 1)) { // For classes 0..3 keep ultra-inline POP; for >=4 use safe Box POP to avoid UB on bad heads. if (class_idx <= 3) { #if HAKMEM_TINY_INLINE_SLL