// tiny_alloc_fast_inline.h - Phase 7 Task 2: Aggressive inline TLS cache access
// Purpose: Eliminate function call overhead (5-10 cycles) in the hot path
// Design: Macro-based inline expansion of TLS freelist operations
// Performance: Expected +10-15% (22M → 24-25M ops/s)

#ifndef TINY_ALLOC_FAST_INLINE_H
#define TINY_ALLOC_FAST_INLINE_H

#include <stdint.h>                 // uint8_t, uint32_t, uintptr_t
#include <stddef.h>                 // NULL

#include "hakmem_build_flags.h"
#include "tiny_remote.h"            // for TINY_REMOTE_SENTINEL (defense-in-depth)
#include "box/tiny_next_ptr_box.h"  // Phase E1-CORRECT: unified next pointer API
#include "tiny_region_id.h"         // for HEADER_MAGIC, HEADER_CLASS_MASK (Fix #7)

// Fallback must precede the extern declarations that use it.
#ifndef TINY_NUM_CLASSES
#define TINY_NUM_CLASSES 8
#endif

// External TLS variables (defined in hakmem_tiny.c)
extern __thread void*    g_tls_sll_head[TINY_NUM_CLASSES];
extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];

// ========== Inline Macro: TLS Freelist Pop ==========
//
// Aggressive inline expansion of tiny_alloc_fast_pop().
// Saves: 5-10 cycles (function call overhead + register spilling)
//
// Assembly comparison (x86-64):
//   Function call:
//     push %rbx                        ; Save registers
//     mov  %edi, %ebx                  ; class_idx to %ebx
//     call tiny_alloc_fast_pop         ; Call (5-10 cycles overhead)
//     pop  %rbx                        ; Restore registers
//     test %rax, %rax                  ; Check result
//
//   Inline macro:
//     mov  g_tls_sll_head(%rdi), %rax  ; Direct access (3-4 cycles)
//     test %rax, %rax
//     je   .miss
//     mov  (%rax), %rdx
//     mov  %rdx, g_tls_sll_head(%rdi)
//
// Result: 5-10 fewer instructions, better register allocation
//
#define TINY_ALLOC_FAST_POP_INLINE(class_idx, ptr_out) do {                       \
    void* _head = g_tls_sll_head[(class_idx)];                                    \
    if (__builtin_expect(_head != NULL, 1)) {                                     \
        if (__builtin_expect((uintptr_t)_head == TINY_REMOTE_SENTINEL, 0)) {      \
            /* Break the chain defensively if the sentinel leaked into TLS SLL */ \
            g_tls_sll_head[(class_idx)] = NULL;                                   \
            if (g_tls_sll_count[(class_idx)] > 0) g_tls_sll_count[(class_idx)]--; \
            (ptr_out) = NULL;                                                     \
        } else {                                                                  \
            /* Phase E1-CORRECT: use Box API for the next pointer read */         \
            void* _next = tiny_next_read((class_idx), _head);                     \
            g_tls_sll_head[(class_idx)] = _next;                                  \
            if (g_tls_sll_count[(class_idx)] > 0) {                               \
                g_tls_sll_count[(class_idx)]--;                                   \
            }                                                                     \
            /* Phase E1-CORRECT: all classes return the user pointer (base+1) */  \
            (ptr_out) = (void*)((uint8_t*)_head + 1);                             \
        }                                                                         \
    } else {                                                                      \
        (ptr_out) = NULL;                                                         \
    }                                                                             \
} while (0)
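
// Usage sketch (illustrative, kept as a comment so this header stays
// self-contained): a caller pops from the TLS freelist and falls back to the
// slow path on a miss. tiny_class_for_size() and tiny_alloc_slow() are
// hypothetical helper names, not part of this header.
//
//   int   cls = tiny_class_for_size(size);   // hypothetical size -> class map
//   void* p;
//   TINY_ALLOC_FAST_POP_INLINE(cls, p);
//   if (__builtin_expect(p == NULL, 0))
//       p = tiny_alloc_slow(cls);            // refill the freelist, then retry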
// ========== Inline Macro: TLS Freelist Push ==========
//
// Aggressive inline expansion of tiny_alloc_fast_push().
// Saves: 5-10 cycles (function call overhead)
//
// Assembly comparison:
//   Function call:
//     mov  %rdi, %rsi                  ; ptr to %rsi
//     mov  %ebx, %edi                  ; class_idx to %edi
//     call tiny_alloc_fast_push        ; Call (5-10 cycles)
//
//   Inline macro:
//     mov  g_tls_sll_head(%rdi), %rax  ; Direct inline (2-3 cycles)
//     mov  %rax, (%rsi)
//     mov  %rsi, g_tls_sll_head(%rdi)
//
#if HAKMEM_TINY_HEADER_CLASSIDX
// Phase E1-CORRECT: restore the header on FREE for ALL classes (including C7).
// ROOT CAUSE: the user may have overwritten byte 0 (the header). tls_sll_splice()
// checks byte 0 for HEADER_MAGIC; without restoration it finds 0x00, uses the
// wrong offset, and SEGVs.
// COST: 1 byte write (~1-2 cycles per free, negligible).
#define TINY_ALLOC_FAST_PUSH_INLINE(class_idx, ptr) do {                 \
    *(uint8_t*)(ptr) = HEADER_MAGIC | ((class_idx) & HEADER_CLASS_MASK); \
    tiny_next_write((class_idx), (ptr), g_tls_sll_head[(class_idx)]);    \
    g_tls_sll_head[(class_idx)] = (ptr);                                 \
    g_tls_sll_count[(class_idx)]++;                                      \
} while (0)
#else
#define TINY_ALLOC_FAST_PUSH_INLINE(class_idx, ptr) do {                 \
    tiny_next_write((class_idx), (ptr), g_tls_sll_head[(class_idx)]);    \
    g_tls_sll_head[(class_idx)] = (ptr);                                 \
    g_tls_sll_count[(class_idx)]++;                                      \
} while (0)
#endif

// ========== Performance Notes ==========
//
// Benchmark results (expected):
//   - Random Mixed 128B: 21M → 23M ops/s (+10%)
//   - Random Mixed 256B: 19M → 22M ops/s (+15%)
//   - Larson 1T:         2.7M → 3.0M ops/s (+11%)
//
// Key optimizations:
//   1. No function call overhead (saves 5-10 cycles)
//   2. Better register allocation (the inline expansion sees the full context)
//   3. No stack frame setup/teardown
//   4. The compiler can optimize across macro boundaries
//
// Trade-offs:
//   1. Code size: +100-200 bytes (each call site is expanded)
//   2. Debug visibility: macros are harder to step through
//   3. Maintenance: changes must be kept in sync with the function versions
//
// Recommendation: use the inline macros for CRITICAL hot paths only
// (alloc/free fast path); keep the functions for diagnostics/debugging.
// An illustrative free-path sketch follows the include guard below.

#endif // TINY_ALLOC_FAST_INLINE_H
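
// ========== Usage Sketch: Free Fast Path (illustrative) ==========
//
// A minimal sketch, assuming the caller already knows the class index (e.g.,
// from an owning-region lookup; the header byte itself cannot be trusted at
// free time, per the ROOT CAUSE note above). tiny_free_example() is a
// hypothetical name; kept as a comment so nothing here is compiled.
//
//   static inline void tiny_free_example(void* user_ptr, int cls) {
//       void* base = (uint8_t*)user_ptr - 1;     // undo the base+1 from pop
//       TINY_ALLOC_FAST_PUSH_INLINE(cls, base);  // restores header, pushes base
//   }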