diff --git a/core/hakmem.c b/core/hakmem.c index 7d654c78..222e96b7 100644 --- a/core/hakmem.c +++ b/core/hakmem.c @@ -1248,6 +1248,37 @@ void* realloc(void* ptr, size_t size) { // malloc wrapper - intercepts system malloc() calls void* malloc(size_t size) { + // ======================================================================== + // Phase 6-4: ULTRA-FAST PATH (Option A optimization) + // Priority: initialized + tiny size → direct to fast cache (2-3 branches) + // Expected hit rate: 95%+ for tiny allocations + // ======================================================================== +#ifdef HAKMEM_TINY_FAST_PATH + // Branch 1+2: initialized check + size check (combined for branch prediction) + if (__builtin_expect(g_initialized && size <= TINY_FAST_THRESHOLD, 1)) { + extern void* tiny_fast_alloc(size_t); + extern void tiny_fast_init(void); + extern __thread int g_tiny_fast_initialized; + + // Branch 3: init check (rarely taken) + if (__builtin_expect(!g_tiny_fast_initialized, 0)) { + tiny_fast_init(); + } + + // Fast path: TLS cache pop (3-4 instructions inside tiny_fast_alloc) + void* ptr = tiny_fast_alloc(size); + if (__builtin_expect(ptr != NULL, 1)) { + return ptr; // 🚀 FAST PATH HIT: 3 branches total! + } + // Fall through to slow path on cache miss + } +#endif + // ======================================================================== + + // ======================================================================== + // SLOW PATH: All guard checks (for non-tiny, uninitialized, or special cases) + // ======================================================================== + // Recursion guard: if we're inside the allocator already, fall back to libc if (g_hakmem_lock_depth > 0) { // Nested call detected - fallback to system malloc @@ -1288,27 +1319,6 @@ void* malloc(size_t size) { } } - // ======================================================================== - // Phase 6-3: Tiny Fast Path (System tcache style, 3-4 instruction fast path) - // ======================================================================== -#ifdef HAKMEM_TINY_FAST_PATH - if (size <= TINY_FAST_THRESHOLD) { - // Ultra-simple TLS cache pop (bypasses Magazine/SuperSlab) - extern void* tiny_fast_alloc(size_t); - extern void tiny_fast_init(void); - extern __thread int g_tiny_fast_initialized; - - if (__builtin_expect(!g_tiny_fast_initialized, 0)) { - tiny_fast_init(); - } - - void* ptr = tiny_fast_alloc(size); - if (ptr) return ptr; - // Fall through to slow path on failure - } -#endif - // ======================================================================== - // First-level call: enter allocator (no global lock) g_hakmem_lock_depth++; void* ptr = hak_alloc_at(size, HAK_CALLSITE()); @@ -1320,6 +1330,40 @@ void* malloc(size_t size) { void free(void* ptr) { if (!ptr) return; // NULL check + // ======================================================================== + // Phase 6-4: ULTRA-FAST PATH (Option A optimization) + // Priority: initialized → direct to fast free path (1-2 branches) + // Expected hit rate: 95%+ for tiny allocations + // ======================================================================== + + // Branch 1: initialized check (fast path for common case) + if (__builtin_expect(g_initialized, 1)) { + // Fast path: normal operation, no special handling needed + + // Phase 6 Fast Path variants (when enabled) +#ifdef HAKMEM_TINY_PHASE6_ULTRA_SIMPLE + g_hakmem_lock_depth++; + hak_tiny_free_ultra_simple(ptr); + g_hakmem_lock_depth--; + return; +#elif defined(HAKMEM_TINY_PHASE6_METADATA) + g_hakmem_lock_depth++; + hak_tiny_free_metadata(ptr); + g_hakmem_lock_depth--; + return; +#else + // Default fast path + g_hakmem_lock_depth++; + hak_free_at(ptr, 0, HAK_CALLSITE()); + g_hakmem_lock_depth--; + return; +#endif + } + + // ======================================================================== + // SLOW PATH: All guard checks (for uninitialized or special cases) + // ======================================================================== + // Recursion guard: if we're inside the allocator already, fall back to libc if (g_hakmem_lock_depth > 0) { // Nested call detected - fallback to system free @@ -1356,29 +1400,7 @@ void free(void* ptr) { } } - // ======================================================================== - // Phase 6 Fast Path: Ultra-Simple Free (when enabled) - // ======================================================================== - // This bypasses free.part.0 complexity (38.43% overhead in perf analysis) - // - free.part.0: 15.83% → eliminated! - // - mid_lookup: 9.55% → eliminated for tiny! - // - pthread locks: 8.81% → eliminated! - // Two variants: - // Phase 6-1.5: Alignment guessing (3-4 instructions, 235 M ops/sec) - // Phase 6-1.6: Metadata header (1-2 instructions, ~480 M ops/sec expected) -#ifdef HAKMEM_TINY_PHASE6_ULTRA_SIMPLE - g_hakmem_lock_depth++; - hak_tiny_free_ultra_simple(ptr); - g_hakmem_lock_depth--; - return; -#elif defined(HAKMEM_TINY_PHASE6_METADATA) - g_hakmem_lock_depth++; - hak_tiny_free_metadata(ptr); - g_hakmem_lock_depth--; - return; -#endif - // ======================================================================== - + // Fallback (should not reach here in normal case) g_hakmem_lock_depth++; hak_free_at(ptr, 0, HAK_CALLSITE()); g_hakmem_lock_depth--;