// tiny_front_hot_box.h - Phase 4-Step2: Tiny Front Hot Path Box
// Purpose: Ultra-fast allocation path (5-7 branches max)
// Contract: TLS cache hit path only, falls back to cold path on miss
// Performance: Target +10-15% (60.6M → 68-75M ops/s)
//
// Design Principles (Box Pattern):
//   1. Single Responsibility: Hot path ONLY (cache hit)
//   2. Clear Contract: Assumes cache initialized, returns NULL on miss
//   3. Observable: Debug metrics (zero overhead in Release)
//   4. Safe: Pointer safety via branch hints, type-safe operations
//   5. Testable: Isolated from cold path, easy to benchmark
//
// Branch Count Analysis:
//   Hot Path (cache hit):
//     1. class_idx range check (UNLIKELY)
//     2. cache empty check (LIKELY hit)
//     3. (header write - no branch)
//     Total: 2 branches (down from 4-5)
//
//   Cold Path (cache miss):
//     Return NULL → caller handles via tiny_cold_refill_and_alloc()

#ifndef TINY_FRONT_HOT_BOX_H
#define TINY_FRONT_HOT_BOX_H

#include <stdint.h>   // uint16_t, uint64_t
#include <stddef.h>   // NULL

#include "../hakmem_build_flags.h"
#include "../hakmem_tiny_config.h"
#include "../tiny_region_id.h"
#include "../front/tiny_unified_cache.h"                 // For TinyUnifiedCache
#include "tiny_header_box.h"                             // Phase 5 E5-2: For tiny_header_finalize_alloc
#include "tiny_unified_lifo_box.h"                       // Phase 15 v1: UnifiedCache FIFO→LIFO
#include "tiny_c6_inline_slots_env_box.h"                // Phase 75-1: C6 inline slots ENV gate
#include "../front/tiny_c6_inline_slots.h"               // Phase 75-1: C6 inline slots API
#include "tiny_c5_inline_slots_env_box.h"                // Phase 75-2: C5 inline slots ENV gate
#include "../front/tiny_c5_inline_slots.h"               // Phase 75-2: C5 inline slots API
#include "tiny_c4_inline_slots_env_box.h"                // Phase 76-1: C4 inline slots ENV gate
#include "../front/tiny_c4_inline_slots.h"               // Phase 76-1: C4 inline slots API
#include "tiny_c2_local_cache_env_box.h"                 // Phase 79-1: C2 local cache ENV gate
#include "../front/tiny_c2_local_cache.h"                // Phase 79-1: C2 local cache API
#include "tiny_c3_inline_slots_env_box.h"                // Phase 77-1: C3 inline slots ENV gate
#include "../front/tiny_c3_inline_slots.h"               // Phase 77-1: C3 inline slots API
#include "tiny_inline_slots_fixed_mode_box.h"            // Phase 78-1: Optional fixed-mode gating
#include "tiny_inline_slots_switch_dispatch_box.h"       // Phase 80-1: Switch dispatch for C4/C5/C6
#include "tiny_inline_slots_switch_dispatch_fixed_box.h" // Phase 83-1: Switch dispatch fixed mode
#include "tiny_c6_inline_slots_ifl_env_box.h"            // Phase 91: C6 intrusive LIFO inline slots ENV gate
#include "tiny_c6_inline_slots_ifl_tls_box.h"            // Phase 91: C6 intrusive LIFO inline slots TLS state
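// Reading aid: the hot paths below touch only four fields of the TLS
// TinyUnifiedCache (defined in ../front/tiny_unified_cache.h). Field roles are
// inferred from how this file uses them, not an authoritative layout:
//
//   cache->head    // uint16 pop index  (alloc side)
//   cache->tail    // uint16 push index (free side)
//   cache->mask    // capacity - 1; capacity is a power of two, so "index & mask"
//                  // is a cheap wrap (e.g. with capacity 256: mask = 255, (255+1) & 255 = 0)
//   cache->slots   // ring of cached BASE pointers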
// ============================================================================
// Branch Prediction Macros (Pointer Safety - Prediction Hints)
// ============================================================================

// TINY_HOT_LIKELY: Hint compiler that condition is VERY likely true
// Usage:  if (TINY_HOT_LIKELY(ptr != NULL)) { ... }
// Result: CPU pipeline optimized for hot path, cold path predicted as unlikely
#define TINY_HOT_LIKELY(x)   __builtin_expect(!!(x), 1)

// TINY_HOT_UNLIKELY: Hint compiler that condition is VERY unlikely
// Usage:  if (TINY_HOT_UNLIKELY(error)) { ... }
// Result: CPU pipeline avoids speculative execution of error path
#define TINY_HOT_UNLIKELY(x) __builtin_expect(!!(x), 0)

// ============================================================================
// Debug Metrics (Zero Overhead in Release)
// ============================================================================
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED

// Increment cache hit counter (debug/observe only; zero overhead when compiled-out)
#define TINY_HOT_METRICS_HIT(class_idx) \
    do { extern __thread uint64_t g_unified_cache_hit[]; \
         g_unified_cache_hit[class_idx]++; } while(0)

// Increment cache miss counter (debug/observe only; zero overhead when compiled-out)
#define TINY_HOT_METRICS_MISS(class_idx) \
    do { extern __thread uint64_t g_unified_cache_miss[]; \
         g_unified_cache_miss[class_idx]++; } while(0)

#else

// Release builds: macros expand to nothing (zero overhead)
#define TINY_HOT_METRICS_HIT(class_idx)  ((void)0)
#define TINY_HOT_METRICS_MISS(class_idx) ((void)0)

#endif
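// Observing the counters: when the metrics above are compiled in, the per-thread
// hit/miss arrays can be dumped from a debug hook. A minimal sketch, kept as a
// comment; the dump function name is illustrative, the 8-class loop bound follows
// the class_idx 0-7 contract below, and it would need <stdio.h>:
//
//   static void tiny_hot_metrics_dump(void) {
//       extern __thread uint64_t g_unified_cache_hit[];
//       extern __thread uint64_t g_unified_cache_miss[];
//       for (int c = 0; c < 8; c++) {
//           fprintf(stderr, "class %d: hit=%llu miss=%llu\n", c,
//                   (unsigned long long)g_unified_cache_hit[c],
//                   (unsigned long long)g_unified_cache_miss[c]);
//       }
//   }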
// ============================================================================
// Box 2: Tiny Hot Alloc (Ultra-Fast Path)
// ============================================================================
// Ultra-fast allocation from TLS unified cache
//
// CONTRACT:
//   Input:         class_idx (0-7, caller must validate)
//   Output:        USER pointer (base+1) on success, NULL on miss
//   Precondition:  Cache initialized (caller ensures via lazy init or prewarm)
//   Postcondition: Cache head advanced, object header written
//
// PERFORMANCE:
//   Hot path (cache hit):   2 branches, 2-3 cache misses
//   Cold path (cache miss): Returns NULL (caller handles)
//
// BRANCH ANALYSIS:
//   1. class_idx range check (UNLIKELY, safety)
//   2. cache empty check (LIKELY hit)
//   3. (no branch for header write, direct store)
//
// ASSEMBLY (expected, x86-64):
//   mov    g_unified_cache@TPOFF(%rax,%rdi,8), %rcx  ; TLS cache access
//   movzwl (%rcx), %edx                              ; head
//   movzwl 2(%rcx), %esi                             ; tail
//   cmp    %dx, %si                                  ; head != tail ?
//   je     .Lcache_miss
//   mov    8(%rcx), %rax                             ; slots
//   mov    (%rax,%rdx,8), %rax                       ; base = slots[head]
//   inc    %dx                                       ; head++
//   and    6(%rcx), %dx                              ; head & mask
//   mov    %dx, (%rcx)                               ; store head
//   movb   $0xA0, (%rax)                             ; header magic
//   or     %dil, (%rax)                              ; header |= class_idx
//   lea    1(%rax), %rax                             ; base+1 → USER
//   ret
// .Lcache_miss:
//   xor    %eax, %eax                                ; return NULL
//   ret
//
__attribute__((always_inline))
static inline void* tiny_hot_alloc_fast(int class_idx) {
    extern __thread TinyUnifiedCache g_unified_cache[];

    // Phase 80-1: Switch dispatch for C4/C5/C6 (branch reduction optimization)
    // Phase 83-1: Per-op branch removed via fixed-mode caching
    // C2/C3 excluded (NO-GO from Phase 77-1/79-1)
    if (tiny_inline_slots_switch_dispatch_enabled_fast()) {
        // Switch mode: Direct jump to case (zero comparison overhead for C4/C5/C6)
        switch (class_idx) {
        case 4:
            if (tiny_c4_inline_slots_enabled_fast()) {
                void* base = c4_inline_pop(c4_inline_tls());
                if (TINY_HOT_LIKELY(base != NULL)) {
                    TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
                    return tiny_header_finalize_alloc(base, class_idx);
#else
                    return base;
#endif
                }
            }
            break;
        case 5:
            if (tiny_c5_inline_slots_enabled_fast()) {
                void* base = c5_inline_pop(c5_inline_tls());
                if (TINY_HOT_LIKELY(base != NULL)) {
                    TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
                    return tiny_header_finalize_alloc(base, class_idx);
#else
                    return base;
#endif
                }
            }
            break;
        case 6:
            // Phase 91: C6 Intrusive LIFO Inline Slots (check BEFORE FIFO)
            if (tiny_c6_inline_slots_ifl_enabled_fast()) {
                void* base = tiny_c6_inline_slots_ifl_pop_fast();
                if (TINY_HOT_LIKELY(base != NULL)) {
                    TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
                    return tiny_header_finalize_alloc(base, class_idx);
#else
                    return base;
#endif
                }
            }
            // Phase 75-1: C6 Inline Slots (FIFO - fallback)
            if (tiny_c6_inline_slots_enabled_fast()) {
                void* base = c6_inline_pop(c6_inline_tls());
                if (TINY_HOT_LIKELY(base != NULL)) {
                    TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
                    return tiny_header_finalize_alloc(base, class_idx);
#else
                    return base;
#endif
                }
            }
            break;
        default:
            // C0-C3, C7: fall through to unified_cache
            break;
        }
        // Switch mode: fall through to unified_cache after miss
    } else {
        // If-chain mode (Phase 80-1 baseline): C3/C4/C5/C6 sequential checks
        // NOTE: C2 local cache (Phase 79-1 NO-GO) removed from hot path

        // Phase 77-1: C3 Inline Slots early-exit (ENV gated)
        // Try C3 inline slots before the unified cache for class 3
        if (class_idx == 3 && tiny_c3_inline_slots_enabled_fast()) {
            void* base = c3_inline_pop(c3_inline_tls());
            if (TINY_HOT_LIKELY(base != NULL)) {
                TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
                return tiny_header_finalize_alloc(base, class_idx);
#else
                return base;
#endif
            }
            // C3 inline miss → fall through to unified cache
        }

        // Phase 76-1: C4 Inline Slots early-exit (ENV gated)
        // Try C4 inline slots before the unified cache for class 4
        if (class_idx == 4 && tiny_c4_inline_slots_enabled_fast()) {
            void* base = c4_inline_pop(c4_inline_tls());
            if (TINY_HOT_LIKELY(base != NULL)) {
                TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
                return tiny_header_finalize_alloc(base, class_idx);
#else
                return base;
#endif
            }
            // C4 inline miss → fall through to unified cache
        }

        // Phase 75-2: C5 Inline Slots early-exit (ENV gated)
        // Try C5 inline slots before the unified cache for class 5
        if (class_idx == 5 && tiny_c5_inline_slots_enabled_fast()) {
            void* base = c5_inline_pop(c5_inline_tls());
            if (TINY_HOT_LIKELY(base != NULL)) {
                TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
                return tiny_header_finalize_alloc(base, class_idx);
#else
                return base;
#endif
            }
            // C5 inline miss → fall through to unified cache
        }

        // Phase 91: C6 Intrusive LIFO Inline Slots early-exit (ENV gated)
        // Try C6 IFL before C6 FIFO and the unified cache for class 6
        if (class_idx == 6 && tiny_c6_inline_slots_ifl_enabled_fast()) {
            void* base = tiny_c6_inline_slots_ifl_pop_fast();
            if (TINY_HOT_LIKELY(base != NULL)) {
                TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
                return tiny_header_finalize_alloc(base, class_idx);
#else
                return base;
#endif
            }
            // C6 IFL miss → fall through to C6 FIFO
        }

        // Phase 75-1: C6 Inline Slots early-exit (ENV gated)
        // Try C6 inline slots before the unified cache for class 6
        if (class_idx == 6 && tiny_c6_inline_slots_enabled_fast()) {
            void* base = c6_inline_pop(c6_inline_tls());
            if (TINY_HOT_LIKELY(base != NULL)) {
                TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
                return tiny_header_finalize_alloc(base, class_idx);
#else
                return base;
#endif
            }
            // C6 inline miss → fall through to unified cache
        }
    } // End of if-chain mode

    // TLS cache access (1 cache miss)
    // NOTE: Range check removed - caller (hak_tiny_size_to_class) guarantees valid class_idx
    TinyUnifiedCache* cache = &g_unified_cache[class_idx];

#if HAKMEM_TINY_UNIFIED_LIFO_COMPILED
    // Phase 15 v1: Mode check at entry (once per call, not scattered in hot path)
    // Phase 22: Compile-out when disabled (default OFF)
    int lifo_mode = tiny_unified_lifo_enabled();

    // Phase 15 v1: LIFO vs FIFO mode switch
    if (lifo_mode) {
        // === LIFO MODE: Stack-based (LIFO) ===
        // Try pop from stack (tail is stack depth)
        void* base = unified_cache_try_pop_lifo(class_idx);
        if (__builtin_expect(base != NULL, 1)) {
            TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
            return tiny_header_finalize_alloc(base, class_idx);
#else
            return base;
#endif
        }
        // LIFO miss → fall through to cold path
        TINY_HOT_METRICS_MISS(class_idx);
        return NULL;
    }
#endif

    // === FIFO MODE: Ring-based (existing, default) ===
    // Branch 1: Cache empty check (LIKELY hit)
    //   Hot path:  cache has objects (head != tail)
    //   Cold path: cache empty (head == tail) → refill needed
    if (TINY_HOT_LIKELY(cache->head != cache->tail)) {
        // === HOT PATH: Cache hit (2-3 instructions) ===
        // Pop from cache (1 cache miss for array access)
        void* base = cache->slots[cache->head];
        cache->head = (cache->head + 1) & cache->mask;  // Fast modulo (power of 2)

        // Debug metrics (zero overhead in release)
        TINY_HOT_METRICS_HIT(class_idx);

        // Write header + return USER pointer (no branch)
        // E5-2: Use finalize (enables write-once optimization for C1-C6)
#if HAKMEM_TINY_HEADER_CLASSIDX
        return tiny_header_finalize_alloc(base, class_idx);
#else
        return base;  // No-header mode: return BASE directly
#endif
    }

    // === COLD PATH: Cache miss ===
    // Don't refill here - let caller handle via tiny_cold_refill_and_alloc()
    // This keeps hot path small and predictable
    TINY_HOT_METRICS_MISS(class_idx);
    return NULL;
}
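// Caller-side sketch of the hot/cold split described above. tiny_hot_alloc_fast()
// never refills; the cold-path helper tiny_cold_refill_and_alloc() is the one named
// in this file's header comment, and its exact signature is assumed here for
// illustration only:
//
//   void* p = tiny_hot_alloc_fast(class_idx);       // hot path: TLS cache hit
//   if (TINY_HOT_UNLIKELY(p == NULL)) {
//       p = tiny_cold_refill_and_alloc(class_idx);  // cold path: refill, then retry
//   }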
// ============================================================================
// Box 2b: Tiny Hot Free (Ultra-Fast Path)
// ============================================================================
// Ultra-fast free to TLS unified cache
//
// CONTRACT:
//   Input:         class_idx (0-7), base pointer (BASE, not USER)
//   Output:        1=SUCCESS (pushed to cache), 0=FULL (caller handles)
//   Precondition:  Cache initialized, base is valid BASE pointer
//   Postcondition: Cache tail advanced, object pushed to cache
//
// PERFORMANCE:
//   Hot path (cache not full): 2 branches, 2-3 cache misses
//   Cold path (cache full):    Returns 0 (caller handles)
//
// BRANCH ANALYSIS:
//   1. class_idx range check (UNLIKELY, safety)
//   2. cache full check (UNLIKELY full)
//
__attribute__((always_inline))
static inline int tiny_hot_free_fast(int class_idx, void* base) {
    extern __thread TinyUnifiedCache g_unified_cache[];

    // TLS cache access (1 cache miss)
    // NOTE: Range check removed - caller guarantees valid class_idx
    TinyUnifiedCache* cache = &g_unified_cache[class_idx];

#if HAKMEM_TINY_UNIFIED_LIFO_COMPILED
    // Phase 15 v1: Mode check at entry (once per call, not scattered in hot path)
    // Phase 22: Compile-out when disabled (default OFF)
    int lifo_mode = tiny_unified_lifo_enabled();

    // Phase 15 v1: LIFO vs FIFO mode switch
    if (lifo_mode) {
        // === LIFO MODE: Stack-based (LIFO) ===
        // Try push to stack (tail is stack depth)
        if (unified_cache_try_push_lifo(class_idx, base)) {
#if !HAKMEM_BUILD_RELEASE
            extern __thread uint64_t g_unified_cache_push[];
            g_unified_cache_push[class_idx]++;
#endif
            return 1;  // SUCCESS
        }
        // LIFO overflow → fall through to cold path
#if !HAKMEM_BUILD_RELEASE
        extern __thread uint64_t g_unified_cache_full[];
        g_unified_cache_full[class_idx]++;
#endif
        return 0;  // FULL
    }
#endif

    // === FIFO MODE: Ring-based (existing, default) ===
    // Calculate next tail (for full check)
    uint16_t next_tail = (cache->tail + 1) & cache->mask;

    // Branch 1: Cache full check (UNLIKELY full)
    //   Hot path:  cache has space (next_tail != head)
    //   Cold path: cache full (next_tail == head) → drain needed
    if (TINY_HOT_LIKELY(next_tail != cache->head)) {
        // === HOT PATH: Cache has space (2-3 instructions) ===
        // Push to cache (1 cache miss for array write)
        cache->slots[cache->tail] = base;
        cache->tail = next_tail;

        // Debug metrics (zero overhead in release)
#if !HAKMEM_BUILD_RELEASE
        extern __thread uint64_t g_unified_cache_push[];
        g_unified_cache_push[class_idx]++;
#endif
        return 1;  // SUCCESS
    }

    // === COLD PATH: Cache full ===
    // Don't drain here - let caller handle via tiny_cold_drain_and_free()
#if !HAKMEM_BUILD_RELEASE
    extern __thread uint64_t g_unified_cache_full[];
    g_unified_cache_full[class_idx]++;
#endif
    return 0;  // FULL
}
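// Caller-side sketch for the free path, mirroring the alloc contract above. The
// cold-path helper tiny_cold_drain_and_free() is the one named in the comment;
// its exact signature is assumed here for illustration only:
//
//   if (TINY_HOT_UNLIKELY(!tiny_hot_free_fast(class_idx, base))) {
//       tiny_cold_drain_and_free(class_idx, base);  // cold path: drain cache, then free
//   }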
// ============================================================================
// Performance Notes
// ============================================================================
// Expected improvements (Phase 4-Step2):
//   - Random Mixed 256: 60.6M → 68-75M ops/s (+10-15%)
//   - Tiny Hot 64B:     +10-15% over current baseline
//
// Key optimizations:
//   1. Branch reduction:    4-5 → 2 branches (hot path)
//   2. Branch hints:        LIKELY/UNLIKELY guide CPU pipeline
//   3. Hot/Cold separation: Keeps hot path small (better i-cache)
//   4. Always inline:       Eliminates function call overhead
//   5. Metrics gated:       Zero overhead in release builds
//
// Trade-offs:
//   1. Code size:            +50-100 bytes per call site (inline expansion)
//   2. Cold path complexity: Caller must handle NULL/0 returns
//   3. Cache assumption:     Assumes cache initialized (lazy init moved to caller)

#endif // TINY_FRONT_HOT_BOX_H