Phase 6-5: Entry Point Optimization (Phase 1) - Unexpected results
Implementation: Move the HAKMEM_TINY_FAST_PATH check BEFORE all guard checks in malloc(), inspired by the mimalloc/tcache entry-point design.

Strategy:
- tcache has 0 branches before its fast path
- mimalloc has 1-2 branches before its fast path
- Old HAKMEM had 8+ branches before its fast path
- Phase 1: move the fast path to line 1 and add branch-prediction hints

Changes in core/hakmem.c:
1. Fast path first: size check → init check → cache hit (3 branches)
2. Slow path: all guards moved after the fast path (rare cases)
3. Branch hints: __builtin_expect() on hot paths

Expected results (from research):
- ST: 0.46M → 1.4-2.3M ops/s (+204-400%)
- MT: 1.86M → 3.7-5.6M ops/s (+99-201%)

Actual results (Larson 2s, 8-128B, 1024 chunks):
- ST: 0.377M → 0.424M ops/s (+12% only)
- MT: 1.856M → 1.453M ops/s (-22% regression!)

Analysis:
- Similar pattern to the previous Option A test (+42% ST, -20% MT)
- Entry-point reordering alone is insufficient
- The true bottleneck may be:
  1. tiny_fast_alloc() internals (size-to-class mapping, cache access)
  2. Refill cost (~1,600 cycles for 16 individual calls)
  3. Batch-refill optimization (Phase 3) is needed as the priority

Next steps:
- Investigate the refill bottleneck with perf profiling
- Consider implementing Phase 3 (Batch Refill) before Phase 2
- A combination of multiple optimizations may be needed for a breakthrough

Related: LARSON_PERFORMANCE_ANALYSIS_2025_11_05.md
This commit is contained in:
@@ -1248,6 +1248,38 @@ void* realloc(void* ptr, size_t size) {
|
||||
|
||||
// malloc wrapper - intercepts system malloc() calls
|
||||
void* malloc(size_t size) {
|
||||
// ========================================================================
|
||||
// Phase 6-5: ULTRA-FAST PATH FIRST (mimalloc/tcache style)
|
||||
// Inspired by research: tcache has 0 branches, mimalloc has 1-2 branches
|
||||
// Key insight: Move fast path BEFORE all guards (common case optimization)
|
||||
// ========================================================================
|
||||
#ifdef HAKMEM_TINY_FAST_PATH
|
||||
// Branch 1: Size check (predicted taken for tiny allocations)
|
||||
if (__builtin_expect(size <= TINY_FAST_THRESHOLD, 1)) {
|
||||
extern void* tiny_fast_alloc(size_t);
|
||||
extern void tiny_fast_init(void);
|
||||
extern __thread int g_tiny_fast_initialized;
|
||||
|
||||
// Branch 2: Initialization check (predicted taken after first call)
|
||||
if (__builtin_expect(g_tiny_fast_initialized, 1)) {
|
||||
// Branch 3: Cache hit check (predicted taken ~90% of time)
|
||||
void* ptr = tiny_fast_alloc(size);
|
||||
if (__builtin_expect(ptr != NULL, 1)) {
|
||||
return ptr; // ✅ FAST PATH: 3 branches total (vs tcache's 0, mimalloc's 1-2)
|
||||
}
|
||||
// Cache miss: fall through to slow path refill
|
||||
} else {
|
||||
// Cold path: initialize once per thread (rare)
|
||||
tiny_fast_init();
|
||||
void* ptr = tiny_fast_alloc(size);
|
||||
if (ptr) return ptr;
|
||||
}
|
||||
}
|
||||
#endif
|
||||
// ========================================================================
|
||||
// SLOW PATH: All guards moved here (only executed on fast path miss)
|
||||
// ========================================================================
|
||||
|
||||
// Recursion guard: if we're inside the allocator already, fall back to libc
|
||||
if (g_hakmem_lock_depth > 0) {
|
||||
// Nested call detected - fallback to system malloc
|
||||
@@ -1288,27 +1320,6 @@ void* malloc(size_t size) {
|
||||
}
|
||||
}
|
||||
|
||||
// ========================================================================
|
||||
// Phase 6-3: Tiny Fast Path (System tcache style, 3-4 instruction fast path)
|
||||
// ========================================================================
|
||||
#ifdef HAKMEM_TINY_FAST_PATH
|
||||
if (size <= TINY_FAST_THRESHOLD) {
|
||||
// Ultra-simple TLS cache pop (bypasses Magazine/SuperSlab)
|
||||
extern void* tiny_fast_alloc(size_t);
|
||||
extern void tiny_fast_init(void);
|
||||
extern __thread int g_tiny_fast_initialized;
|
||||
|
||||
if (__builtin_expect(!g_tiny_fast_initialized, 0)) {
|
||||
tiny_fast_init();
|
||||
}
|
||||
|
||||
void* ptr = tiny_fast_alloc(size);
|
||||
if (ptr) return ptr;
|
||||
// Fall through to slow path on failure
|
||||
}
|
||||
#endif
|
||||
// ========================================================================
|
||||
|
||||
// First-level call: enter allocator (no global lock)
|
||||
g_hakmem_lock_depth++;
|
||||
void* ptr = hak_alloc_at(size, HAK_CALLSITE());
|
||||
|
||||
Reference in New Issue
Block a user