// tiny_alloc_fast_inline.h - Phase 7 Task 2: Aggressive inline TLS cache access
// Purpose: Eliminate function call overhead (5-10 cycles) in the hot path
// Design: Macro-based inline expansion of TLS freelist operations
// Performance: Expected +10-15% (22M → 24-25M ops/s)

#ifndef TINY_ALLOC_FAST_INLINE_H
#define TINY_ALLOC_FAST_INLINE_H

#include <stdint.h>
#include <stdio.h>
#include <stdatomic.h>

#include "hakmem_build_flags.h"
#include "tiny_remote.h"           // for TINY_REMOTE_SENTINEL (defense-in-depth)
#include "box/tiny_next_ptr_box.h" // Phase E1-CORRECT: unified next pointer API
#include "tiny_region_id.h"        // For HEADER_MAGIC, HEADER_CLASS_MASK (Fix #7)
#include "box/tls_sll_box.h"

#ifndef TINY_NUM_CLASSES
#define TINY_NUM_CLASSES 8
#endif

// External TLS variables (defined in hakmem_tiny.c)
// Phase 3d-B: TLS Cache Merge - Unified TLS SLL structure
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
extern __thread const char* g_tls_sll_last_writer[TINY_NUM_CLASSES];

// ========== Inline Macro: TLS Freelist Pop ==========
//
// Aggressive inline expansion of tiny_alloc_fast_pop()
// Saves: 5-10 cycles (function call overhead + register spilling)
//
// Assembly comparison (x86-64):
//   Function call:
//     push %rbx                       ; Save registers
//     mov  %edi, %ebx                 ; class_idx to %ebx
//     call tiny_alloc_fast_pop        ; Call (5-10 cycles overhead)
//     pop  %rbx                       ; Restore registers
//     test %rax, %rax                 ; Check result
//
//   Inline macro:
//     mov  g_tls_sll_head(%rdi), %rax ; Direct access (3-4 cycles)
//     test %rax, %rax
//     je   .miss
//     mov  (%rax), %rdx
//     mov  %rdx, g_tls_sll_head(%rdi)
//
// Result: 5-10 fewer instructions, better register allocation
//
#define TINY_ALLOC_FAST_POP_INLINE(class_idx, ptr_out) do { \
    extern int g_tls_sll_class_mask; \
    if (__builtin_expect(((g_tls_sll_class_mask & (1u << (class_idx))) == 0), 0)) { \
        (ptr_out) = NULL; \
        break; \
    } \
    void* _head = g_tls_sll[(class_idx)].head; \
    if (__builtin_expect(_head != NULL, 1)) { \
        if (__builtin_expect((uintptr_t)_head == TINY_REMOTE_SENTINEL, 0)) { \
            /* Break the chain defensively if the sentinel leaked into the TLS SLL */ \
            g_tls_sll[(class_idx)].head = NULL; \
            g_tls_sll_last_writer[(class_idx)] = "fast_pop_sentinel"; \
            if (g_tls_sll[(class_idx)].count > 0) g_tls_sll[(class_idx)].count--; \
            (ptr_out) = NULL; \
        } else { \
            /* Phase E1-CORRECT: Use Box API for next pointer read */ \
            void* _next = tiny_next_read((class_idx), _head); \
            if (__builtin_expect((class_idx) == 4 || (class_idx) == 6, 0)) { \
                tls_sll_diag_next((class_idx), _head, _next, "fast_pop_next"); \
            } \
            g_tls_sll[(class_idx)].head = _next; \
            g_tls_sll_last_writer[(class_idx)] = "fast_pop"; \
            if (((class_idx) == 4 || (class_idx) == 6) && _next && \
                ((uintptr_t)_next < 4096 || (uintptr_t)_next > 0x00007fffffffffffULL)) { \
                static __thread uint8_t s_fast_pop_invalid_log[8] = {0}; \
                if (s_fast_pop_invalid_log[(class_idx)] < 4) { \
                    fprintf(stderr, "[TLS_SLL_FAST_POP_INVALID] cls=%d head=%p next=%p\n", \
                            (class_idx), _head, _next); \
                    s_fast_pop_invalid_log[(class_idx)]++; \
                } \
                g_tls_sll[(class_idx)].head = NULL; \
                /* keep count unchanged to flag the drop */ \
                g_tls_sll_last_writer[(class_idx)] = "fast_pop_post_invalid"; \
                (ptr_out) = NULL; \
            } else { \
                if (g_tls_sll[(class_idx)].count > 0) { \
                    g_tls_sll[(class_idx)].count--; \
                } \
                /* Phase 7: Fast path returns the BASE pointer; HAK_RET_ALLOC does BASE→USER */ \
                (ptr_out) = _head; \
            } \
        } \
    } else { \
        (ptr_out) = NULL; \
    } \
} while (0)
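
// Usage sketch (illustrative addition, not part of the original API surface):
// a minimal caller showing the intended pop pattern. The wrapper name
// tiny_alloc_fast_try_pop_example is hypothetical; real hot paths expand the
// macro directly and fall back to the slow-path allocator when it yields NULL.
static inline void* tiny_alloc_fast_try_pop_example(int class_idx) {
    void* base = NULL;                           /* receives BASE pointer or NULL */
    TINY_ALLOC_FAST_POP_INLINE(class_idx, base); /* hit: popped from the per-class TLS SLL */
    return base;                                 /* NULL: caller takes the slow path */
}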
// ========== Inline Macro: TLS Freelist Push ==========
//
// Aggressive inline expansion of tiny_alloc_fast_push()
// Saves: 5-10 cycles (function call overhead)
//
// Assembly comparison:
//   Function call:
//     mov  %rdi, %rsi                 ; ptr to %rsi
//     mov  %ebx, %edi                 ; class_idx to %edi
//     call tiny_alloc_fast_push       ; Call (5-10 cycles)
//
//   Inline macro:
//     mov  g_tls_sll_head(%rdi), %rax ; Direct inline (2-3 cycles)
//     mov  %rax, (%rsi)
//     mov  %rsi, g_tls_sll_head(%rdi)
//
#if HAKMEM_TINY_HEADER_CLASSIDX
// Phase E1-CORRECT: Restore header on FREE for ALL classes (including C7)
// ROOT CAUSE: The user may have overwritten byte 0 (the header). tls_sll_splice() checks
// byte 0 for HEADER_MAGIC. Without restoration, it finds 0x00 → uses the wrong offset → SEGV.
// COST: 1 byte write (~1-2 cycles per free, negligible).
#define TINY_ALLOC_FAST_PUSH_INLINE(class_idx, ptr) do { \
    extern int g_tls_sll_class_mask; \
    if (__builtin_expect(((g_tls_sll_class_mask & (1u << (class_idx))) == 0), 0)) { \
        break; \
    } \
    if (!(ptr)) break; \
    /* Phase E1-CORRECT: API ptr is the USER pointer (= base+1). Convert back to BASE. */ \
    uint8_t* _base = (uint8_t*)(ptr) - 1; \
    /* Light header diag: alert if the header is already mismatched before we overwrite it */ \
    do { \
        static _Atomic uint32_t g_fast_hdr_diag = 0; \
        uint8_t _expect = HEADER_MAGIC | ((class_idx) & HEADER_CLASS_MASK); \
        uint8_t _got = *_base; \
        if (_got != _expect) { \
            uint32_t _n = atomic_fetch_add_explicit(&g_fast_hdr_diag, 1, memory_order_relaxed); \
            if (_n < 16) { \
                fprintf(stderr, "[FAST_PUSH_HDR_MISMATCH] cls=%d base=%p got=0x%02x expect=0x%02x\n", \
                        (class_idx), _base, _got, _expect); \
            } \
        } \
    } while (0); \
    /* Restore the header at BASE (not at user). */ \
    *_base = HEADER_MAGIC | ((class_idx) & HEADER_CLASS_MASK); \
    /* Link the node using BASE as the canonical SLL node address. */ \
    tiny_next_write((class_idx), _base, g_tls_sll[(class_idx)].head); \
    g_tls_sll[(class_idx)].head = _base; \
    g_tls_sll_last_writer[(class_idx)] = "fast_push"; \
    g_tls_sll[(class_idx)].count++; \
} while (0)
#else
#define TINY_ALLOC_FAST_PUSH_INLINE(class_idx, ptr) do { \
    tiny_next_write((class_idx), (ptr), g_tls_sll[(class_idx)].head); \
    g_tls_sll[(class_idx)].head = (ptr); \
    g_tls_sll_last_writer[(class_idx)] = "fast_push"; \
    g_tls_sll[(class_idx)].count++; \
} while (0)
#endif

// ========== Performance Notes ==========
//
// Benchmark results (expected):
//   - Random Mixed 128B: 21M → 23M ops/s (+10%)
//   - Random Mixed 256B: 19M → 22M ops/s (+15%)
//   - Larson 1T: 2.7M → 3.0M ops/s (+11%)
//
// Key optimizations:
//   1. No function call overhead (saves 5-10 cycles)
//   2. Better register allocation (the inline body is visible in the caller's full context)
//   3. No stack frame setup/teardown
//   4. The compiler can optimize across macro boundaries
//
// Trade-offs:
//   1. Code size: +100-200 bytes (each call site is expanded)
//   2. Debug visibility: macros are harder to step through
//   3. Maintenance: changes must be kept in sync with the function versions
//
// Recommendation: Use the inline macros for CRITICAL hot paths only
// (alloc/free fast path); keep the functions for diagnostics/debugging.

#endif // TINY_ALLOC_FAST_INLINE_H
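
// Usage sketch (illustrative only, kept as a comment since it falls outside the
// include guard): a hypothetical free fast path. In HAKMEM_TINY_HEADER_CLASSIDX
// builds the macro expects the USER pointer (base+1) and converts back to BASE
// itself before linking the node onto the per-class TLS SLL; in other builds it
// links the pointer as given. The caller name below is an assumption.
//
//   void hak_tiny_free_fast_example(void* user_ptr, int class_idx) {
//       TINY_ALLOC_FAST_PUSH_INLINE(class_idx, user_ptr);
//   }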