diff --git a/core/hakmem.c b/core/hakmem.c
index 7d654c78..71ce5a9b 100644
--- a/core/hakmem.c
+++ b/core/hakmem.c
@@ -1248,6 +1248,38 @@ void* realloc(void* ptr, size_t size) {
 
 // malloc wrapper - intercepts system malloc() calls
 void* malloc(size_t size) {
+    // ========================================================================
+    // Phase 6-5: ULTRA-FAST PATH FIRST (mimalloc/tcache style)
+    // Inspired by research: tcache has 0 branches, mimalloc has 1-2 branches
+    // Key insight: Move fast path BEFORE all guards (common case optimization)
+    // ========================================================================
+#ifdef HAKMEM_TINY_FAST_PATH
+    // Branch 1: Size check (predicted taken for tiny allocations)
+    if (__builtin_expect(size <= TINY_FAST_THRESHOLD, 1)) {
+        extern void* tiny_fast_alloc(size_t);
+        extern void tiny_fast_init(void);
+        extern __thread int g_tiny_fast_initialized;
+
+        // Branch 2: Initialization check (predicted taken after first call)
+        if (__builtin_expect(g_tiny_fast_initialized, 1)) {
+            // Branch 3: Cache hit check (predicted taken ~90% of time)
+            void* ptr = tiny_fast_alloc(size);
+            if (__builtin_expect(ptr != NULL, 1)) {
+                return ptr;  // ✅ FAST PATH: 3 branches total (vs tcache's 0, mimalloc's 1-2)
+            }
+            // Cache miss: fall through to slow path refill
+        } else {
+            // Cold path: initialize once per thread (rare)
+            tiny_fast_init();
+            void* ptr = tiny_fast_alloc(size);
+            if (ptr) return ptr;
+        }
+    }
+#endif
+    // ========================================================================
+    // SLOW PATH: All guards moved here (only executed on fast path miss)
+    // ========================================================================
+
     // Recursion guard: if we're inside the allocator already, fall back to libc
     if (g_hakmem_lock_depth > 0) {
         // Nested call detected - fallback to system malloc
@@ -1288,27 +1320,6 @@ void* malloc(size_t size) {
         }
     }
 
-    // ========================================================================
-    // Phase 6-3: Tiny Fast Path (System tcache style, 3-4 instruction fast path)
-    // ========================================================================
-#ifdef HAKMEM_TINY_FAST_PATH
-    if (size <= TINY_FAST_THRESHOLD) {
-        // Ultra-simple TLS cache pop (bypasses Magazine/SuperSlab)
-        extern void* tiny_fast_alloc(size_t);
-        extern void tiny_fast_init(void);
-        extern __thread int g_tiny_fast_initialized;
-
-        if (__builtin_expect(!g_tiny_fast_initialized, 0)) {
-            tiny_fast_init();
-        }
-
-        void* ptr = tiny_fast_alloc(size);
-        if (ptr) return ptr;
-        // Fall through to slow path on failure
-    }
-#endif
-
-    // ========================================================================
     // First-level call: enter allocator (no global lock)
     g_hakmem_lock_depth++;
     void* ptr = hak_alloc_at(size, HAK_CALLSITE());