Option A (Full): Inline TLS cache access in malloc()
Implementation: 1. Added g_initialized check to fast path (skip bootstrap overhead) 2. Inlined hak_tiny_size_to_class() - LUT lookup (~1 load) 3. Inlined TLS cache pop - direct g_tls_sll_head access (3-4 instructions) 4. Eliminated function call overhead on fast path hit Result: +11.5% improvement (1.31M → 1.46M ops/s avg, threads=4) - Before: Function call + internal processing (~15-20 instructions) - After: LUT + TLS load + pop + return (~5-6 instructions) Still below target (1.81M ops/s). Next: RDTSC profiling to identify remaining bottleneck.
This commit is contained in:
@ -1239,23 +1239,37 @@ __thread uint64_t g_malloc_fast_path_tried = 0;
|
||||
__thread uint64_t g_malloc_fast_path_null = 0;
|
||||
__thread uint64_t g_malloc_slow_path = 0;
|
||||
|
||||
// Option A (Full): Inline TLS cache access (zero function call overhead)
|
||||
extern __thread void* g_tls_sll_head[TINY_NUM_CLASSES];
|
||||
|
||||
void* malloc(size_t size) {
|
||||
// ========================================================================
|
||||
// Phase 6-5: ULTRA-FAST PATH FIRST (mimalloc/tcache style)
|
||||
// Phase 6-1.7: Box Theory Integration - Zero overhead path
|
||||
// Option A (Full): Inline TLS cache access (LARSON_PERFORMANCE_ANALYSIS.md)
|
||||
// ========================================================================
|
||||
// CRITICAL: This MUST be before all guard checks to achieve 3-4 instruction fast path!
|
||||
// Removed all counter overhead for maximum performance
|
||||
// Eliminates function call overhead by inlining TLS cache pop directly!
|
||||
// Expected: +200-400% (system tcache equivalent design)
|
||||
// ========================================================================
|
||||
#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
|
||||
if (__builtin_expect(size <= TINY_FAST_THRESHOLD, 1)) {
|
||||
// Box 5: Ultra-fast TLS freelist pop (3-4 instructions)
|
||||
// LTO (-flto) should inline this wrapper automatically
|
||||
if (__builtin_expect(g_initialized && size <= TINY_FAST_THRESHOLD, 1)) {
|
||||
// Inline size-to-class mapping (LUT: 1 load)
|
||||
int cls = hak_tiny_size_to_class(size);
|
||||
if (__builtin_expect(cls >= 0, 1)) {
|
||||
// Inline TLS cache pop (3-4 instructions, zero function call!)
|
||||
void* head = g_tls_sll_head[cls];
|
||||
if (__builtin_expect(head != NULL, 1)) {
|
||||
g_tls_sll_head[cls] = *(void**)head; // Pop: next = *head
|
||||
return head; // 🚀 TRUE FAST PATH: No function calls!
|
||||
}
|
||||
}
|
||||
// Cache miss or invalid class → call wrapper for refill
|
||||
void* ptr = hak_tiny_alloc_fast_wrapper(size);
|
||||
if (__builtin_expect(ptr != NULL, 1)) {
|
||||
return ptr; // ✅ FAST PATH SUCCESS: Zero overhead!
|
||||
return ptr;
|
||||
}
|
||||
// Miss: fall through to slow path with full initialization
|
||||
// Refill failed: fall through to slow path
|
||||
}
|
||||
#endif
|
||||
// ========================================================================
|
||||
|
||||
Reference in New Issue
Block a user