// tiny_alloc_fast_inline.h - Phase 7 Task 2: Aggressive inline TLS cache access
// Purpose: Eliminate function call overhead (5-10 cycles) in hot path
// Design: Macro-based inline expansion of TLS freelist operations
// Performance: Expected +10-15% (22M → 24-25M ops/s)

#ifndef TINY_ALLOC_FAST_INLINE_H
#define TINY_ALLOC_FAST_INLINE_H

#include <stdint.h>              // uint8_t, uint32_t, uintptr_t
#include <stddef.h>              // size_t, NULL

#include "hakmem_build_flags.h"
#include "tiny_remote.h"         // for TINY_REMOTE_SENTINEL (defense-in-depth)

#ifndef TINY_NUM_CLASSES
#define TINY_NUM_CLASSES 8
#endif

// External TLS variables (defined in hakmem_tiny.c)
extern __thread void*    g_tls_sll_head[TINY_NUM_CLASSES];
extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];

// Phase 7: header-aware offset of the next-pointer within a free block.
// Classes C0-C6 store it at base+1 (one byte past the class-index header);
// class C7 stores it at base. Hoisted out of the macros below because
// preprocessor conditionals cannot appear inside a macro replacement list.
#if HAKMEM_TINY_HEADER_CLASSIDX
#define TINY_NEXT_OFF(class_idx) (((class_idx) == 7) ? (size_t)0 : (size_t)1)
#else
#define TINY_NEXT_OFF(class_idx) ((size_t)0)
#endif

// ========== Inline Macro: TLS Freelist Pop ==========
//
// Aggressive inline expansion of tiny_alloc_fast_pop()
// Saves: 5-10 cycles (function call overhead + register spilling)
//
// Assembly comparison (x86-64):
//   Function call:
//     push %rbx                       ; Save registers
//     mov  %edi, %ebx                 ; class_idx to %ebx
//     call tiny_alloc_fast_pop        ; Call (5-10 cycles overhead)
//     pop  %rbx                       ; Restore registers
//     test %rax, %rax                 ; Check result
//
//   Inline macro:
//     mov  g_tls_sll_head(%rdi), %rax ; Direct access (3-4 cycles)
//     test %rax, %rax
//     je   .miss
//     mov  (%rax), %rdx
//     mov  %rdx, g_tls_sll_head(%rdi)
//
// Result: 5-10 fewer instructions, better register allocation

#define TINY_ALLOC_FAST_POP_INLINE(class_idx, ptr_out) do { \
    void* _head = g_tls_sll_head[(class_idx)]; \
    if (__builtin_expect(_head != NULL, 1)) { \
        if (__builtin_expect((uintptr_t)_head == TINY_REMOTE_SENTINEL, 0)) { \
            /* Break the chain defensively if sentinel leaked into TLS SLL */ \
            g_tls_sll_head[(class_idx)] = NULL; \
            if (g_tls_sll_count[(class_idx)] > 0) g_tls_sll_count[(class_idx)]--; \
            (ptr_out) = NULL; \
        } else { \
            /* Phase 7: header-aware next (C0-C6: base+1, C7: base) */ \
            void* _next = *(void**)((uint8_t*)_head + TINY_NEXT_OFF(class_idx)); \
            g_tls_sll_head[(class_idx)] = _next; \
            if (g_tls_sll_count[(class_idx)] > 0) { \
                g_tls_sll_count[(class_idx)]--; \
            } \
            (ptr_out) = _head; \
            if (__builtin_expect((class_idx) == 7, 0)) { \
                /* C7 keeps next at base: clear it before handing out the block */ \
                *(void**)(ptr_out) = NULL; \
            } \
        } \
    } else { \
        (ptr_out) = NULL; \
    } \
} while (0)

// ========== Inline Macro: TLS Freelist Push ==========
//
// Aggressive inline expansion of tiny_alloc_fast_push()
// Saves: 5-10 cycles (function call overhead)
//
// Assembly comparison:
//   Function call:
//     mov  %rdi, %rsi                 ; ptr to %rsi
//     mov  %ebx, %edi                 ; class_idx to %edi
//     call tiny_alloc_fast_push       ; Call (5-10 cycles)
//
//   Inline macro:
//     mov  g_tls_sll_head(%rdi), %rax ; Direct inline (2-3 cycles)
//     mov  %rax, (%rsi)
//     mov  %rsi, g_tls_sll_head(%rdi)

#define TINY_ALLOC_FAST_PUSH_INLINE(class_idx, ptr) do { \
    /* Phase 7: header-aware next (C0-C6: base+1, C7: base) */ \
    *(void**)((uint8_t*)(ptr) + TINY_NEXT_OFF(class_idx)) = g_tls_sll_head[(class_idx)]; \
    g_tls_sll_head[(class_idx)] = (ptr); \
    g_tls_sll_count[(class_idx)]++; \
} while (0)
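// ========== Usage Sketch: Allocation Hot Path (illustrative only) ==========
//
// A minimal sketch of a call site for the pop macro, assuming a slow-path
// refill function. tiny_alloc_slow() and tiny_alloc_hot() are hypothetical
// names for illustration, not part of this header's real API. Define
// TINY_ALLOC_FAST_INLINE_EXAMPLES to compile the sketch.
#ifdef TINY_ALLOC_FAST_INLINE_EXAMPLES
extern void* tiny_alloc_slow(int class_idx);  // hypothetical refill/slow path

static inline void* tiny_alloc_hot(int class_idx) {
    void* p;
    TINY_ALLOC_FAST_POP_INLINE(class_idx, p);
    if (__builtin_expect(p == NULL, 0)) {
        // TLS SLL miss (empty list or defensive sentinel break): take slow path
        p = tiny_alloc_slow(class_idx);
    }
    return p;
}
#endif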
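// ========== Usage Sketch: Free Hot Path (illustrative only) ==========
//
// A matching sketch for the push macro. A real free path would also bound
// g_tls_sll_count and spill excess blocks to a shared pool; the cap
// TINY_SLL_CAP_EXAMPLE and tiny_free_slow() below are hypothetical
// stand-ins for that logic, not the allocator's actual names or policy.
#ifdef TINY_ALLOC_FAST_INLINE_EXAMPLES
extern void tiny_free_slow(int class_idx, void* ptr);  // hypothetical spill path
#define TINY_SLL_CAP_EXAMPLE 64                        // hypothetical per-class cap

static inline void tiny_free_hot(int class_idx, void* ptr) {
    if (__builtin_expect(g_tls_sll_count[(class_idx)] < TINY_SLL_CAP_EXAMPLE, 1)) {
        TINY_ALLOC_FAST_PUSH_INLINE(class_idx, ptr);   // common case: cache locally
    } else {
        tiny_free_slow(class_idx, ptr);                // cache full: spill to pool
    }
}
#endif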
// ========== Performance Notes ==========
//
// Benchmark results (expected):
//   - Random Mixed 128B: 21M → 23M ops/s (+10%)
//   - Random Mixed 256B: 19M → 22M ops/s (+15%)
//   - Larson 1T:         2.7M → 3.0M ops/s (+11%)
//
// Key optimizations:
//   1. No function call overhead (saves 5-10 cycles)
//   2. Better register allocation (the compiler sees the full calling context)
//   3. No stack frame setup/teardown
//   4. Compiler can optimize across macro boundaries
//
// Trade-offs:
//   1. Code size: +100-200 bytes (each call site is expanded)
//   2. Debug visibility: macros are harder to step through
//   3. Maintenance: changes must be kept in sync with the function versions
//
// Recommendation: use these inline macros for CRITICAL hot paths only
// (alloc/free fast path); keep the function versions for diagnostics and
// debugging.

#endif // TINY_ALLOC_FAST_INLINE_H