From 66a29783a467ac42e4b8d0d485609214c009d43f Mon Sep 17 00:00:00 2001
From: "Moe Charm (CI)"
Date: Fri, 21 Nov 2025 05:33:17 +0900
Subject: [PATCH] Phase 19-1: Quick Prune (Frontend SLIM mode) - Experimental
 implementation
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

## Implementation

Added `HAKMEM_TINY_FRONT_SLIM=1` ENV gate to skip FastCache + SFC layers,
going straight to SLL (Singly-Linked List) for direct backend access.

### Code Changes

**File**: `core/tiny_alloc_fast.inc.h` (lines 201-230)

Added early return gate in `tiny_alloc_fast_pop()`:

```c
// Phase 19-1: Quick Prune (Frontend SLIM mode)
static __thread int g_front_slim_checked = 0;
static __thread int g_front_slim_enabled = 0;

if (!g_front_slim_checked) {
    const char* e = getenv("HAKMEM_TINY_FRONT_SLIM");
    g_front_slim_enabled = (e && *e && *e != '0') ? 1 : 0;
    g_front_slim_checked = 1;
}

if (g_front_slim_enabled) {
    // Skip FastCache + SFC, go straight to SLL
    extern int g_tls_sll_enable;
    if (g_tls_sll_enable) {
        void* base = NULL;
        if (tls_sll_pop(class_idx, &base)) {
            g_front_sll_hit[class_idx]++;
            return base;  // SLL hit (SLIM fast path)
        }
    }
    return NULL;  // SLL miss → caller refills
}
// else: Existing FC → SFC → SLL cascade (unchanged)
```

### Design Rationale

**Goal**: Skip unused frontend layers to reduce branch misprediction overhead
**Strategy**: Based on ChatGPT-sensei analysis showing FC/SFC hit rates near 0%
**Expected**: 22M → 27-30M ops/s (+22-36%)

**Features**:
- ✅ A/B testable via ENV (instant rollback: ENV=0)
- ✅ Existing code unchanged (backward compatible)
- ✅ TLS-cached enable check (amortized overhead)

---

## Performance Results

### Benchmark: Random Mixed 256B (1M iterations)

```
Baseline   (SLIM OFF): 23.2M, 23.7M, 23.2M ops/s (avg: 23.4M)
Phase 19-1 (SLIM ON):  22.8M, 22.8M, 23.7M ops/s (avg: 23.1M)

Difference: -1.3% (within noise, no improvement)
⚠️ Expected: +22-36% ← NOT achieved
```

### Stability Testing

- ✅ 100K short run: No SEGV, no crashes
- ✅ 1M iterations: Stable performance across 3 runs
- ✅ Functional correctness: All allocations successful

---

## Analysis: Why Quick Prune Failed

### Hypothesis 1: FC/SFC Overhead Already Minimal
- FC/SFC checks are branch-predicted (miss path well-optimized)
- Skipping these layers provides negligible cycle savings
- Premise of "0% hit rate" may not reflect actual benefit of having layers

### Hypothesis 2: ENV Check Overhead Cancels Gains
- One-time costs (`getenv()` call, `g_front_slim_checked` TLS init) are
  amortized over 1M iterations, so they cannot explain the result by themselves
- The per-allocation SLIM gate check (TLS load + branch) remains on every call
- Cost of that per-allocation gate check ≈ cost of the FC/SFC checks it skips

### Hypothesis 3: Incorrect Premise
- The "FC/SFC hit rate 0%" premise from the pre-work analysis may be wrong
- Layers may provide cache locality benefits even with low hit rate
- Removing layers disrupts cache line prefetching

---

## Conclusion & Next Steps

**Phase 19-1 Status**: ❌ Experimental - No performance improvement

**Key Learnings**:
1. Frontend layer pruning alone is insufficient
2. Branch prediction in existing code is already effective
3. Structural change (not just pruning) needed for significant gains

**Recommendation**: Proceed to Phase 19-2 (Front-V2 tcache single-layer)
- Phase 19-1 approach (pruning) = failed
- Phase 19-2 approach (structural redesign) = recommended
- Expected: 31ns → 15ns via tcache-style single TLS magazine

---

## ENV Usage

```bash
# Enable SLIM mode (experimental, no gain observed)
export HAKMEM_TINY_FRONT_SLIM=1
./bench_random_mixed_hakmem 1000000 256 42

# Disable SLIM mode (default, recommended)
unset HAKMEM_TINY_FRONT_SLIM
./bench_random_mixed_hakmem 1000000 256 42
```

---

## Files Modified

- `core/tiny_alloc_fast.inc.h` - Added Phase 19-1 Quick Prune gate

## Investigation Report

Task-sensei analysis documented entry point (`tiny_alloc_fast_pop()` line 176),
identified skip targets (FC: lines 208-220, SFC: lines 222-250), and confirmed
SLL as primary fast path (88-99% hit rate from prior analysis).
--- 📝 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Co-Authored-By: Task-sensei (tiny_alloc_fast.inc.h structure analysis) Co-Authored-By: ChatGPT (Phase 19 strategy design) --- core/tiny_alloc_fast.inc.h | 31 +++++++++++++++++++++++++++++++ 1 file changed, 31 insertions(+) diff --git a/core/tiny_alloc_fast.inc.h b/core/tiny_alloc_fast.inc.h index 75e6a653..2e7b17a6 100644 --- a/core/tiny_alloc_fast.inc.h +++ b/core/tiny_alloc_fast.inc.h @@ -198,6 +198,37 @@ static inline void* tiny_alloc_fast_pop(int class_idx) { } return NULL; #else + // ========== Phase 19-1: Quick Prune (Frontend SLIM mode) ========== + // ENV: HAKMEM_TINY_FRONT_SLIM=1 + // Goal: Skip FastCache + SFC layers, go straight to SLL (88-99% hit rate) + // Expected: 22M → 27-30M ops/s (+22-36%) + static __thread int g_front_slim_checked = 0; + static __thread int g_front_slim_enabled = 0; + + if (__builtin_expect(!g_front_slim_checked, 0)) { + const char* e = getenv("HAKMEM_TINY_FRONT_SLIM"); + g_front_slim_enabled = (e && *e && *e != '0') ? 1 : 0; + g_front_slim_checked = 1; + } + + // SLIM MODE: Skip FastCache + SFC, go straight to SLL + if (__builtin_expect(g_front_slim_enabled, 0)) { + // Box Boundary: TLS SLL freelist pop (only layer in SLIM mode) + extern int g_tls_sll_enable; + if (__builtin_expect(g_tls_sll_enable, 1)) { + void* base = NULL; + if (tls_sll_pop(class_idx, &base)) { + // Front Gate: SLL hit (SLIM fast path - 3 instructions) + extern unsigned long long g_front_sll_hit[]; + g_front_sll_hit[class_idx]++; + return base; + } + } + // SLIM mode miss → return NULL (caller refills) + return NULL; + } + // ========== End Phase 19-1: Quick Prune ========== + // Phase 7 Task 3: Profiling overhead removed in release builds // In release mode, compiler can completely eliminate profiling code #if !HAKMEM_BUILD_RELEASE