Option A (Full): Inline TLS cache access in malloc()

Implementation:
1. Added a g_initialized check to the fast path (skips bootstrap overhead)
2. Inlined hak_tiny_size_to_class() - LUT lookup (~1 load; see the sketch after this list)
3. Inlined the TLS cache pop - direct g_tls_sll_head access (3-4 instructions)
4. Eliminated function call overhead on a fast-path hit
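
For illustration, a LUT-based size-to-class mapping like the one named in step 2 could look like the minimal sketch below. The actual hak_tiny_size_to_class() table is not part of this diff, so the class granularity (8 bytes), the TINY_FAST_THRESHOLD value (128), and the init function name are assumptions, not the real implementation.

// Hypothetical sketch of a size-to-class LUT (assumed 8-byte classes,
// assumed TINY_FAST_THRESHOLD of 128; not the actual hakmem table).
#include <stddef.h>

#define TINY_FAST_THRESHOLD 128                      // assumed value
#define TINY_NUM_CLASSES    (TINY_FAST_THRESHOLD / 8)

// g_size_to_class[s] maps a request size s (0..128) to a class index.
// Built once at startup; the fast path then costs a single indexed load.
static unsigned char g_size_to_class[TINY_FAST_THRESHOLD + 1];

static void tiny_lut_init(void) {
    for (size_t s = 0; s <= TINY_FAST_THRESHOLD; s++) {
        size_t rounded = (s + 7) & ~(size_t)7;       // round up to 8-byte step
        g_size_to_class[s] = (unsigned char)(rounded ? (rounded / 8) - 1 : 0);
    }
}

// ~1 load on the fast path: one bounds check plus one table lookup.
static inline int hak_tiny_size_to_class(size_t size) {
    if (size > TINY_FAST_THRESHOLD) return -1;       // not a tiny allocation
    return (int)g_size_to_class[size];
}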

Result: +11.5% improvement (1.31M → 1.46M ops/s avg, threads=4)
- Before: Function call + internal processing (~15-20 instructions)
- After: LUT + TLS load + pop + return (~5-6 instructions)

Still below the target (1.81M ops/s). Next: RDTSC profiling to identify the remaining bottleneck.
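
As a rough starting point for that profiling, a minimal RDTSC harness could look like the sketch below; the allocation size, iteration count, and single-threaded loop are illustrative assumptions, not the actual multi-threaded benchmark driver.

// Hypothetical RDTSC micro-timing harness (x86-64, GCC/Clang).
// Measures raw cycles around malloc/free to locate the remaining cost.
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <x86intrin.h>   // __rdtsc()

int main(void) {
    enum { N = 1000000 };
    uint64_t start = __rdtsc();
    for (int i = 0; i < N; i++) {
        void* p = malloc(64);             // tiny class: should hit the fast path
        ((volatile char*)p)[0] = (char)i; // touch so the pair isn't optimized out
        free(p);
    }
    uint64_t end = __rdtsc();
    printf("avg cycles per malloc+free: %.1f\n", (double)(end - start) / N);
    return 0;
}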
Author: Claude
Date:   2025-11-05 07:07:47 +00:00
parent  d099719141
commit  5ec9d1746f

@@ -1239,23 +1239,37 @@ __thread uint64_t g_malloc_fast_path_tried = 0;
 __thread uint64_t g_malloc_fast_path_null = 0;
 __thread uint64_t g_malloc_slow_path = 0;
+// Option A (Full): Inline TLS cache access (zero function call overhead)
+extern __thread void* g_tls_sll_head[TINY_NUM_CLASSES];
 void* malloc(size_t size) {
     // ========================================================================
     // Phase 6-5: ULTRA-FAST PATH FIRST (mimalloc/tcache style)
     // Phase 6-1.7: Box Theory Integration - Zero overhead path
+    // Option A (Full): Inline TLS cache access (LARSON_PERFORMANCE_ANALYSIS.md)
     // ========================================================================
     // CRITICAL: This MUST be before all guard checks to achieve 3-4 instruction fast path!
-    // Removed all counter overhead for maximum performance
-    // Expected: +200-400% (system tcache equivalent design)
+    // Eliminates function call overhead by inlining TLS cache pop directly!
     // ========================================================================
 #ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
-    if (__builtin_expect(size <= TINY_FAST_THRESHOLD, 1)) {
-        // Box 5: Ultra-fast TLS freelist pop (3-4 instructions)
-        // LTO (-flto) should inline this wrapper automatically
+    if (__builtin_expect(g_initialized && size <= TINY_FAST_THRESHOLD, 1)) {
+        // Inline size-to-class mapping (LUT: 1 load)
+        int cls = hak_tiny_size_to_class(size);
+        if (__builtin_expect(cls >= 0, 1)) {
+            // Inline TLS cache pop (3-4 instructions, zero function call!)
+            void* head = g_tls_sll_head[cls];
+            if (__builtin_expect(head != NULL, 1)) {
+                g_tls_sll_head[cls] = *(void**)head;  // Pop: next = *head
+                return head;  // 🚀 TRUE FAST PATH: No function calls!
+            }
+        }
+        // Cache miss or invalid class → call wrapper for refill
         void* ptr = hak_tiny_alloc_fast_wrapper(size);
         if (__builtin_expect(ptr != NULL, 1)) {
-            return ptr;  // ✅ FAST PATH SUCCESS: Zero overhead!
+            return ptr;
         }
-        // Miss: fall through to slow path with full initialization
+        // Refill failed: fall through to slow path
     }
 #endif
     // ========================================================================
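
The *(void**)head pop in the diff works because the TLS cache is an intrusive singly linked list: each free block's first word stores the pointer to the next free block, so no separate node allocation or metadata lookup is needed. A standalone sketch of that invariant follows; the function and variable names are hypothetical, only the pop logic mirrors the diff.

// Hypothetical sketch of the intrusive TLS freelist behind the fast path.
// Each free block stores the next-block pointer in its own first word,
// so push and pop are a handful of instructions with no metadata access.
#include <stddef.h>

static __thread void* tls_head = NULL;

static inline void freelist_push(void* block) {
    *(void**)block = tls_head;     // block->next = head
    tls_head = block;              // head = block
}

static inline void* freelist_pop(void) {
    void* head = tls_head;
    if (head != NULL)
        tls_head = *(void**)head;  // head = head->next (same pop as the diff)
    return head;                   // NULL means the cache needs a refill
}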