From 3e4e90eadba2f39a4c81351ccadd085e74d4d7e8 Mon Sep 17 00:00:00 2001
From: Claude
Date: Wed, 5 Nov 2025 05:10:02 +0000
Subject: [PATCH] Phase 6-5: Entry Point Optimization (Phase 1) - Unexpected
 results
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Implementation: Move the HAKMEM_TINY_FAST_PATH check BEFORE all guard checks
in malloc(), inspired by the mimalloc/tcache entry point design.

Strategy:
- tcache has 0 branches before its fast path
- mimalloc has 1-2 branches before its fast path
- Old HAKMEM had 8+ branches before its fast path
- Phase 1: Move the fast path to the first line of malloc() and add branch
  prediction hints

Changes in core/hakmem.c:
1. Fast Path First: Size check → Init check → Cache hit (3 branches)
2. Slow Path: All guards moved after the fast path (rare cases)
3. Branch hints: __builtin_expect() on the hot paths

Expected results (from research):
- ST: 0.46M → 1.4-2.3M ops/s (+204-400%)
- MT: 1.86M → 3.7-5.6M ops/s (+99-201%)

Actual results (Larson 2s 8-128B 1024):
- ST: 0.377M → 0.424M ops/s (only +12%)
- MT: 1.856M → 1.453M ops/s (-22% regression!)

Analysis:
- Similar pattern to the previous Option A test (+42% ST, -20% MT)
- Entry point reordering alone is insufficient
- The true bottleneck is more likely:
  1. tiny_fast_alloc() internals (size-to-class lookup, cache access)
  2. Refill cost (16 individual calls at ~100 cycles each ≈ 1,600 cycles)
  3. Missing Batch Refill optimization (Phase 3), which should now take
     priority

Next steps:
- Investigate the refill bottleneck with perf profiling
- Consider implementing Phase 3 (Batch Refill) before Phase 2; a sketch of
  the batch refill idea follows the diff below
- A combination of several optimizations may be needed for a real
  breakthrough

Related: LARSON_PERFORMANCE_ANALYSIS_2025_11_05.md
---
 core/hakmem.c | 53 +++++++++++++++++++++++++++++++--------------------
 1 file changed, 32 insertions(+), 21 deletions(-)

diff --git a/core/hakmem.c b/core/hakmem.c
index 7d654c78..71ce5a9b 100644
--- a/core/hakmem.c
+++ b/core/hakmem.c
@@ -1248,6 +1248,38 @@ void* realloc(void* ptr, size_t size) {
 
 // malloc wrapper - intercepts system malloc() calls
 void* malloc(size_t size) {
+    // ========================================================================
+    // Phase 6-5: ULTRA-FAST PATH FIRST (mimalloc/tcache style)
+    // Inspired by research: tcache has 0 branches, mimalloc has 1-2 branches
+    // Key insight: Move fast path BEFORE all guards (common case optimization)
+    // ========================================================================
+#ifdef HAKMEM_TINY_FAST_PATH
+    // Branch 1: Size check (predicted taken for tiny allocations)
+    if (__builtin_expect(size <= TINY_FAST_THRESHOLD, 1)) {
+        extern void* tiny_fast_alloc(size_t);
+        extern void tiny_fast_init(void);
+        extern __thread int g_tiny_fast_initialized;
+
+        // Branch 2: Initialization check (predicted taken after first call)
+        if (__builtin_expect(g_tiny_fast_initialized, 1)) {
+            // Branch 3: Cache hit check (predicted taken ~90% of the time)
+            void* ptr = tiny_fast_alloc(size);
+            if (__builtin_expect(ptr != NULL, 1)) {
+                return ptr; // ✅ FAST PATH: 3 branches total (vs tcache's 0, mimalloc's 1-2)
+            }
+            // Cache miss: fall through to slow path refill
+        } else {
+            // Cold path: initialize once per thread (rare)
+            tiny_fast_init();
+            void* ptr = tiny_fast_alloc(size);
+            if (ptr) return ptr;
+        }
+    }
+#endif
+    // ========================================================================
+    // SLOW PATH: All guards moved here (only executed on fast path miss)
+    // ========================================================================
+    // Recursion guard: if we're inside the allocator already, fall back to libc
     if (g_hakmem_lock_depth > 0) {
         // Nested call detected - fallback to system malloc
 
@@ -1288,27 +1320,6 @@ void* malloc(size_t size) {
         }
     }
 
-    // ========================================================================
-    // Phase 6-3: Tiny Fast Path (System tcache style, 3-4 instruction fast path)
-    // ========================================================================
-#ifdef HAKMEM_TINY_FAST_PATH
-    if (size <= TINY_FAST_THRESHOLD) {
-        // Ultra-simple TLS cache pop (bypasses Magazine/SuperSlab)
-        extern void* tiny_fast_alloc(size_t);
-        extern void tiny_fast_init(void);
-        extern __thread int g_tiny_fast_initialized;
-
-        if (__builtin_expect(!g_tiny_fast_initialized, 0)) {
-            tiny_fast_init();
-        }
-
-        void* ptr = tiny_fast_alloc(size);
-        if (ptr) return ptr;
-        // Fall through to slow path on failure
-    }
-#endif
-
-    // ========================================================================
     // First-level call: enter allocator (no global lock)
     g_hakmem_lock_depth++;
     void* ptr = hak_alloc_at(size, HAK_CALLSITE());
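
Appendix (not part of the patch): a minimal sketch of the Phase 3 "Batch
Refill" idea referenced in the analysis above, under stated assumptions.
superslab_alloc_batch(), tiny_fast_refill_batch(), and the tiny_tls_cache
layout are hypothetical names introduced for illustration; they do not
exist in HAKMEM yet.

    #include <stddef.h>

    #define TINY_REFILL_BATCH 16

    typedef struct tiny_tls_cache {
        void* slots[TINY_REFILL_BATCH]; /* LIFO stack of free blocks */
        int   count;                    /* valid entries in slots[] */
    } tiny_tls_cache;

    /* Assumed backend entry point (hypothetical): carve up to `n` blocks
     * of `size` bytes in one pass, store their addresses in `out`, and
     * return how many blocks were produced. */
    extern int superslab_alloc_batch(size_t size, void** out, int n);

    /* One batched backend call replaces the 16 individual allocations
     * per refill (~1,600 cycles), amortizing lock and metadata overhead
     * across the whole batch. */
    static void* tiny_fast_refill_batch(tiny_tls_cache* c, size_t size) {
        c->count = superslab_alloc_batch(size, c->slots, TINY_REFILL_BATCH);
        if (c->count <= 0) return NULL; /* backend exhausted: slow path */
        return c->slots[--c->count];    /* serve one block immediately */
    }

In this scheme, a tiny_fast_alloc() cache miss would trigger one call to
tiny_fast_refill_batch() and then serve the next 15 allocations straight
from slots[], so the refill cost is paid roughly once per 16 allocations
rather than 16 times per miss; this is the same amortization strategy
mimalloc-style allocators use for their thread-local free lists.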