From b64cfc055ea6500d54c1a2ce56b2252f3156813c Mon Sep 17 00:00:00 2001
From: Claude <noreply@anthropic.com>
Date: Wed, 5 Nov 2025 04:44:50 +0000
Subject: [PATCH] Implement Option A: Fast Path priority optimization (Phase
 6-4)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Changes:
- Reorder malloc() to prioritize Fast Path (initialized + tiny size check first)
- Move Fast Path check before all guard checks (recursion, LD_PRELOAD, etc.)
- Optimize free() with same strategy (initialized check first)
- Add branch prediction hints (__builtin_expect)

Implementation:
- malloc(): Fast Path now executes with at most 4 branches on a cache hit
  - Branch 1+2: g_initialized && size <= TINY_FAST_THRESHOLD
  - Branch 3: g_tiny_fast_initialized check (calls tiny_fast_init() on first use)
  - Branch 4: tiny_fast_alloc() cache hit check
  - Slow Path: All guard checks moved after Fast Path miss

- free(): Fast Path with 1-2 branches
  - Branch 1: g_initialized check
  - Direct to hak_free_at() on normal case

Performance Results (Larson benchmark, size=8-128B):

Single-thread (threads=1):
- Before: 0.46M ops/s (10.7% of system malloc)
- After:  0.65M ops/s (15.4% of system malloc)
- Change: +42% improvement ✓

Multi-thread (threads=4):
- Before: 1.81M ops/s (25.0% of system malloc)
- After:  1.44M ops/s (19.9% of system malloc)
- Change: -20% regression ✗

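Analysis:
- ST improvement shows the Fast Path reordering helps single-threaded runs
- MT regression is not yet explained; contention or cache effects are suspected
- Did not meet target (+200-400%), further optimization needed
- Note: the Fast Path now runs before the recursion guard
  (g_hakmem_lock_depth > 0) and the other guard checks, so those checks no
  longer apply to tiny malloc() calls once g_initialized is set, nor to any
  free() call once g_initialized is set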

Next Steps:
- Investigate MT regression (cache coherency?)
- Consider more aggressive inlining
- Explore Option B (Refill optimization)
---
 core/hakmem.c | 110 ++++++++++++++++++++++++++++++--------------------
 1 file changed, 66 insertions(+), 44 deletions(-)

diff --git a/core/hakmem.c b/core/hakmem.c
index 7d654c78..222e96b7 100644
--- a/core/hakmem.c
+++ b/core/hakmem.c
@@ -1248,6 +1248,37 @@ void* realloc(void* ptr, size_t size) {
 
 // malloc wrapper - intercepts system malloc() calls
 void* malloc(size_t size) {
+    // ========================================================================
+    // Phase 6-4: ULTRA-FAST PATH (Option A optimization)
+    // Priority: initialized + tiny size → direct to fast cache (2-3 branches)
+    // Expected hit rate: 95%+ for tiny allocations
+    // ========================================================================
+#ifdef HAKMEM_TINY_FAST_PATH
+    // Branch 1+2: initialized check + size check (combined for branch prediction)
+    if (__builtin_expect(g_initialized && size <= TINY_FAST_THRESHOLD, 1)) {
+        extern void* tiny_fast_alloc(size_t);
+        extern void tiny_fast_init(void);
+        extern __thread int g_tiny_fast_initialized;
+
+        // Branch 3: init check (rarely taken)
+        if (__builtin_expect(!g_tiny_fast_initialized, 0)) {
+            tiny_fast_init();
+        }
+
+        // Fast path: TLS cache pop (3-4 instructions inside tiny_fast_alloc)
+        void* ptr = tiny_fast_alloc(size);
+        if (__builtin_expect(ptr != NULL, 1)) {
+            return ptr;  // 🚀 FAST PATH HIT: 3 branches total!
+        }
+        // Fall through to slow path on cache miss
+    }
+#endif
+    // ========================================================================
+
+    // ========================================================================
+    // SLOW PATH: All guard checks (for non-tiny, uninitialized, or special cases)
+    // ========================================================================
+
     // Recursion guard: if we're inside the allocator already, fall back to libc
     if (g_hakmem_lock_depth > 0) {
         // Nested call detected - fallback to system malloc
@@ -1288,27 +1319,6 @@ void* malloc(size_t size) {
         }
     }
 
-    // ========================================================================
-    // Phase 6-3: Tiny Fast Path (System tcache style, 3-4 instruction fast path)
-    // ========================================================================
-#ifdef HAKMEM_TINY_FAST_PATH
-    if (size <= TINY_FAST_THRESHOLD) {
-        // Ultra-simple TLS cache pop (bypasses Magazine/SuperSlab)
-        extern void* tiny_fast_alloc(size_t);
-        extern void tiny_fast_init(void);
-        extern __thread int g_tiny_fast_initialized;
-
-        if (__builtin_expect(!g_tiny_fast_initialized, 0)) {
-            tiny_fast_init();
-        }
-
-        void* ptr = tiny_fast_alloc(size);
-        if (ptr) return ptr;
-        // Fall through to slow path on failure
-    }
-#endif
-    // ========================================================================
-
     // First-level call: enter allocator (no global lock)
     g_hakmem_lock_depth++;
     void* ptr = hak_alloc_at(size, HAK_CALLSITE());
@@ -1320,6 +1330,40 @@ void* malloc(size_t size) {
 void free(void* ptr) {
     if (!ptr) return;  // NULL check
 
+    // ========================================================================
+    // Phase 6-4: ULTRA-FAST PATH (Option A optimization)
+    // Priority: initialized → direct to fast free path (1-2 branches)
+    // Expected hit rate: 95%+ for tiny allocations
+    // ========================================================================
+
+    // Branch 1: initialized check (fast path for common case)
+    if (__builtin_expect(g_initialized, 1)) {
+        // Fast path: normal operation, no special handling needed
+
+        // Phase 6 Fast Path variants (when enabled)
+#ifdef HAKMEM_TINY_PHASE6_ULTRA_SIMPLE
+        g_hakmem_lock_depth++;
+        hak_tiny_free_ultra_simple(ptr);
+        g_hakmem_lock_depth--;
+        return;
+#elif defined(HAKMEM_TINY_PHASE6_METADATA)
+        g_hakmem_lock_depth++;
+        hak_tiny_free_metadata(ptr);
+        g_hakmem_lock_depth--;
+        return;
+#else
+        // Default fast path
+        g_hakmem_lock_depth++;
+        hak_free_at(ptr, 0, HAK_CALLSITE());
+        g_hakmem_lock_depth--;
+        return;
+#endif
+    }
+
+    // ========================================================================
+    // SLOW PATH: All guard checks (reached only while g_initialized is false)
+    // ========================================================================
+
     // Recursion guard: if we're inside the allocator already, fall back to libc
     if (g_hakmem_lock_depth > 0) {
         // Nested call detected - fallback to system free
@@ -1356,29 +1400,7 @@ void free(void* ptr) {
         }
     }
 
-    // ========================================================================
-    // Phase 6 Fast Path: Ultra-Simple Free (when enabled)
-    // ========================================================================
-    // This bypasses free.part.0 complexity (38.43% overhead in perf analysis)
-    // - free.part.0: 15.83% → eliminated!
-    // - mid_lookup: 9.55% → eliminated for tiny!
-    // - pthread locks: 8.81% → eliminated!
-    // Two variants:
-    //   Phase 6-1.5: Alignment guessing (3-4 instructions, 235 M ops/sec)
-    //   Phase 6-1.6: Metadata header (1-2 instructions, ~480 M ops/sec expected)
-#ifdef HAKMEM_TINY_PHASE6_ULTRA_SIMPLE
-    g_hakmem_lock_depth++;
-    hak_tiny_free_ultra_simple(ptr);
-    g_hakmem_lock_depth--;
-    return;
-#elif defined(HAKMEM_TINY_PHASE6_METADATA)
-    g_hakmem_lock_depth++;
-    hak_tiny_free_metadata(ptr);
-    g_hakmem_lock_depth--;
-    return;
-#endif
-    // ========================================================================
-
+    // Fallback: reached only when the g_initialized fast path above was not taken
     g_hakmem_lock_depth++;
     hak_free_at(ptr, 0, HAK_CALLSITE());
     g_hakmem_lock_depth--;