// tiny_front_hot_box.h - Phase 4-Step2: Tiny Front Hot Path Box
// Purpose: Ultra-fast allocation path (5-7 branches max)
// Contract: TLS cache hit path only, falls back to cold path on miss
// Performance: Target +10-15% (60.6M → 68-75M ops/s)
//
// Design Principles (Box Pattern):
// 1. Single Responsibility: Hot path ONLY (cache hit)
// 2. Clear Contract: Assumes cache initialized, returns NULL on miss
// 3. Observable: Debug metrics (zero overhead in Release)
// 4. Safe: Pointer safety via branch hints, type-safe operations
// 5. Testable: Isolated from cold path, easy to benchmark
//
// Branch Count Analysis:
// Hot Path (cache hit):
// 1. dispatch mode check (switch vs if-chain)
// 2. cache empty check (LIKELY hit)
// (class_idx range check removed - caller guarantees a valid index;
//  header write is a direct store, no branch)
// Total: 2 branches (down from 4-5)
//
// Cold Path (cache miss):
// Return NULL → caller handles via tiny_cold_refill_and_alloc()

#ifndef TINY_FRONT_HOT_BOX_H
#define TINY_FRONT_HOT_BOX_H

#include <stdint.h>
#include <stddef.h>
#include "../hakmem_build_flags.h"
#include "../hakmem_tiny_config.h"
#include "../tiny_region_id.h"
#include "../front/tiny_unified_cache.h"          // For TinyUnifiedCache
#include "tiny_header_box.h"                      // Phase 5 E5-2: For tiny_header_finalize_alloc
#include "tiny_unified_lifo_box.h"                // Phase 15 v1: UnifiedCache FIFO→LIFO
#include "tiny_c6_inline_slots_env_box.h"         // Phase 75-1: C6 inline slots ENV gate
#include "../front/tiny_c6_inline_slots.h"        // Phase 75-1: C6 inline slots API
#include "tiny_c5_inline_slots_env_box.h"         // Phase 75-2: C5 inline slots ENV gate
#include "../front/tiny_c5_inline_slots.h"        // Phase 75-2: C5 inline slots API
#include "tiny_c4_inline_slots_env_box.h"         // Phase 76-1: C4 inline slots ENV gate
#include "../front/tiny_c4_inline_slots.h"        // Phase 76-1: C4 inline slots API
#include "tiny_c2_local_cache_env_box.h"          // Phase 79-1: C2 local cache ENV gate
#include "../front/tiny_c2_local_cache.h"         // Phase 79-1: C2 local cache API
#include "tiny_c3_inline_slots_env_box.h"         // Phase 77-1: C3 inline slots ENV gate
#include "../front/tiny_c3_inline_slots.h"        // Phase 77-1: C3 inline slots API
#include "tiny_inline_slots_fixed_mode_box.h"     // Phase 78-1: Optional fixed-mode gating
#include "tiny_inline_slots_switch_dispatch_box.h"       // Phase 80-1: Switch dispatch for C4/C5/C6
#include "tiny_inline_slots_switch_dispatch_fixed_box.h" // Phase 83-1: Switch dispatch fixed mode

// ============================================================================
// Branch Prediction Macros (Pointer Safety - Prediction Hints)
// ============================================================================

// TINY_HOT_LIKELY: Hint compiler that condition is VERY likely true
// Usage: if (TINY_HOT_LIKELY(ptr != NULL)) { ... }
// Result: CPU pipeline optimized for hot path, cold path predicted as unlikely
#define TINY_HOT_LIKELY(x) __builtin_expect(!!(x), 1)

// TINY_HOT_UNLIKELY: Hint compiler that condition is VERY unlikely
// Usage: if (TINY_HOT_UNLIKELY(error)) { ... }
// Result: CPU pipeline avoids speculative execution of error path
#define TINY_HOT_UNLIKELY(x) __builtin_expect(!!(x), 0)

// ============================================================================
// Debug Metrics (Zero Overhead in Release)
// ============================================================================

#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
// Increment cache hit counter (debug/observe only; zero overhead when compiled out)
#define TINY_HOT_METRICS_HIT(class_idx) \
    do { extern __thread uint64_t g_unified_cache_hit[]; \
         g_unified_cache_hit[class_idx]++; } while(0)

// Increment cache miss counter (debug/observe only; zero overhead when compiled out)
#define TINY_HOT_METRICS_MISS(class_idx) \
    do { extern __thread uint64_t g_unified_cache_miss[]; \
         g_unified_cache_miss[class_idx]++; } while(0)
#else
// Release builds: macros expand to nothing (zero overhead)
#define TINY_HOT_METRICS_HIT(class_idx)  ((void)0)
#define TINY_HOT_METRICS_MISS(class_idx) ((void)0)
#endif

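// Example (illustrative): reading the per-class TLS hit/miss counters in a
// stats-enabled build. A minimal sketch, not part of the original box: the
// helper name and the use of stdio are assumptions; the counter arrays and
// the 0-7 class range come from the alloc contract below. The counters are
// __thread, so this reports the calling thread only.
#if !HAKMEM_BUILD_RELEASE || HAKMEM_UNIFIED_CACHE_STATS_COMPILED
#include <stdio.h>
static inline void tiny_hot_metrics_dump_example(void) {
    extern __thread uint64_t g_unified_cache_hit[];
    extern __thread uint64_t g_unified_cache_miss[];
    for (int ci = 0; ci < 8; ci++) {  // class_idx range: 0-7
        uint64_t h = g_unified_cache_hit[ci];
        uint64_t m = g_unified_cache_miss[ci];
        uint64_t t = h + m;
        fprintf(stderr, "class %d: hit=%llu miss=%llu hit-rate=%.1f%%\n",
                ci, (unsigned long long)h, (unsigned long long)m,
                t ? 100.0 * (double)h / (double)t : 0.0);
    }
}
#endif
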
// ============================================================================
// Box 2: Tiny Hot Alloc (Ultra-Fast Path)
// ============================================================================

// Ultra-fast allocation from TLS unified cache
//
// CONTRACT:
// Input: class_idx (0-7, caller must validate)
// Output: USER pointer (base+1) on success, NULL on miss
// Precondition: Cache initialized (caller ensures via lazy init or prewarm)
// Postcondition: Cache head advanced, object header written
//
// PERFORMANCE:
// Hot path (cache hit): 2 branches, 2-3 cache misses
// Cold path (cache miss): Returns NULL (caller handles)
//
// BRANCH ANALYSIS:
// 1. dispatch mode check (switch vs if-chain)
// 2. cache empty check (LIKELY hit)
// (class_idx range check removed - caller guarantees a valid index;
//  header write is a direct store, no branch)
//
// ASSEMBLY (expected, x86-64):
// mov g_unified_cache@TPOFF(%rax,%rdi,8), %rcx ; TLS cache access
// movzwl (%rcx), %edx ; head
// movzwl 2(%rcx), %esi ; tail
// cmp %dx, %si ; head != tail ?
// je .Lcache_miss
// mov 8(%rcx), %rax ; slots
// mov (%rax,%rdx,8), %rax ; base = slots[head]
// inc %dx ; head++
// and 6(%rcx), %dx ; head & mask
// mov %dx, (%rcx) ; store head
// movb $0xA0, (%rax) ; header magic
// or %dil, (%rax) ; header |= class_idx
// lea 1(%rax), %rax ; base+1 → USER
// ret
// .Lcache_miss:
// xor %eax, %eax ; return NULL
// ret
//
__attribute__((always_inline))
static inline void* tiny_hot_alloc_fast(int class_idx) {
    extern __thread TinyUnifiedCache g_unified_cache[];

    // Phase 80-1: Switch dispatch for C4/C5/C6 (branch reduction optimization)
    // Phase 83-1: Per-op branch removed via fixed-mode caching
    // C2/C3 excluded from the switch (NO-GO from Phase 77-1/79-1)
    if (tiny_inline_slots_switch_dispatch_enabled_fast()) {
        // Switch mode: Direct jump to case (zero comparison overhead for C4/C5/C6)
        switch (class_idx) {
        case 4:
            if (tiny_c4_inline_slots_enabled_fast()) {
                void* base = c4_inline_pop(c4_inline_tls());
                if (TINY_HOT_LIKELY(base != NULL)) {
                    TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
                    return tiny_header_finalize_alloc(base, class_idx);
#else
                    return base;
#endif
                }
            }
            break;
        case 5:
            if (tiny_c5_inline_slots_enabled_fast()) {
                void* base = c5_inline_pop(c5_inline_tls());
                if (TINY_HOT_LIKELY(base != NULL)) {
                    TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
                    return tiny_header_finalize_alloc(base, class_idx);
#else
                    return base;
#endif
                }
            }
            break;
        case 6:
            if (tiny_c6_inline_slots_enabled_fast()) {
                void* base = c6_inline_pop(c6_inline_tls());
                if (TINY_HOT_LIKELY(base != NULL)) {
                    TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
                    return tiny_header_finalize_alloc(base, class_idx);
#else
                    return base;
#endif
                }
            }
            break;
        default:
            // C0-C3, C7: fall through to unified_cache
            break;
        }
        // Switch mode: fall through to unified_cache after miss
    } else {
        // If-chain mode (Phase 80-1 baseline): C3/C4/C5/C6 sequential checks
        // NOTE: C2 local cache (Phase 79-1 NO-GO) removed from hot path

        // Phase 77-1: C3 Inline Slots early-exit (ENV gated)
        // Try C3 inline slots FIRST (before C4/C5/C6 and unified cache) for class 3
        if (class_idx == 3 && tiny_c3_inline_slots_enabled_fast()) {
            void* base = c3_inline_pop(c3_inline_tls());
            if (TINY_HOT_LIKELY(base != NULL)) {
                TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
                return tiny_header_finalize_alloc(base, class_idx);
#else
                return base;
#endif
            }
            // C3 inline miss → fall through to C4/C5/C6/unified cache
        }

        // Phase 76-1: C4 Inline Slots early-exit (ENV gated)
        // Try C4 inline slots SECOND (before C5/C6/unified cache) for class 4
        if (class_idx == 4 && tiny_c4_inline_slots_enabled_fast()) {
            void* base = c4_inline_pop(c4_inline_tls());
            if (TINY_HOT_LIKELY(base != NULL)) {
                TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
                return tiny_header_finalize_alloc(base, class_idx);
#else
                return base;
#endif
            }
            // C4 inline miss → fall through to C5/C6/unified cache
        }

        // Phase 75-2: C5 Inline Slots early-exit (ENV gated)
        // Try C5 inline slots THIRD (before C6 and unified cache) for class 5
        if (class_idx == 5 && tiny_c5_inline_slots_enabled_fast()) {
            void* base = c5_inline_pop(c5_inline_tls());
            if (TINY_HOT_LIKELY(base != NULL)) {
                TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
                return tiny_header_finalize_alloc(base, class_idx);
#else
                return base;
#endif
            }
            // C5 inline miss → fall through to C6/unified cache
        }

        // Phase 75-1: C6 Inline Slots early-exit (ENV gated)
        // Try C6 inline slots FOURTH (before unified cache) for class 6
        if (class_idx == 6 && tiny_c6_inline_slots_enabled_fast()) {
            void* base = c6_inline_pop(c6_inline_tls());
            if (TINY_HOT_LIKELY(base != NULL)) {
                TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
                return tiny_header_finalize_alloc(base, class_idx);
#else
                return base;
#endif
            }
            // C6 inline miss → fall through to unified cache
        }
    } // End of if-chain mode

    // TLS cache access (1 cache miss)
    // NOTE: Range check removed - caller (hak_tiny_size_to_class) guarantees valid class_idx
    TinyUnifiedCache* cache = &g_unified_cache[class_idx];

#if HAKMEM_TINY_UNIFIED_LIFO_COMPILED
    // Phase 15 v1: Mode check at entry (once per call, not scattered in hot path)
    // Phase 22: Compile-out when disabled (default OFF)
    int lifo_mode = tiny_unified_lifo_enabled();

    // Phase 15 v1: LIFO vs FIFO mode switch
    if (lifo_mode) {
        // === LIFO MODE: Stack-based (LIFO) ===
        // Try pop from stack (tail is stack depth)
        void* base = unified_cache_try_pop_lifo(class_idx);
        if (__builtin_expect(base != NULL, 1)) {
            TINY_HOT_METRICS_HIT(class_idx);
#if HAKMEM_TINY_HEADER_CLASSIDX
            return tiny_header_finalize_alloc(base, class_idx);
#else
            return base;
#endif
        }
        // LIFO miss → fall through to cold path
        TINY_HOT_METRICS_MISS(class_idx);
        return NULL;
    }
#endif

    // === FIFO MODE: Ring-based (existing, default) ===
    // Branch 1: Cache empty check (LIKELY hit)
    // Hot path: cache has objects (head != tail)
    // Cold path: cache empty (head == tail) → refill needed
    if (TINY_HOT_LIKELY(cache->head != cache->tail)) {
        // === HOT PATH: Cache hit (2-3 instructions) ===

        // Pop from cache (1 cache miss for array access)
        void* base = cache->slots[cache->head];
        cache->head = (cache->head + 1) & cache->mask; // Fast modulo (power of 2)

        // Debug metrics (zero overhead in release)
        TINY_HOT_METRICS_HIT(class_idx);

        // Write header + return USER pointer (no branch)
        // E5-2: Use finalize (enables write-once optimization for C1-C6)
#if HAKMEM_TINY_HEADER_CLASSIDX
        return tiny_header_finalize_alloc(base, class_idx);
#else
        return base; // No-header mode: return BASE directly
#endif
    }

    // === COLD PATH: Cache miss ===
    // Don't refill here - let caller handle via tiny_cold_refill_and_alloc()
    // This keeps hot path small and predictable
    TINY_HOT_METRICS_MISS(class_idx);
    return NULL;
}

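// Usage sketch (illustrative): how a caller composes the hot box with the
// cold path. tiny_cold_refill_and_alloc() is only named in the comments
// above, so its signature below is an assumption; the sketch is kept as a
// comment so the header stays build-neutral.
//
//   extern void* tiny_cold_refill_and_alloc(int class_idx);   // assumed shape
//
//   static inline void* tiny_alloc_example(int class_idx) {
//       void* p = tiny_hot_alloc_fast(class_idx);             // hot: 2 branches
//       if (TINY_HOT_UNLIKELY(p == NULL))
//           p = tiny_cold_refill_and_alloc(class_idx);        // cold: refill + retry
//       return p;                                             // USER pointer
//   }
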
// ============================================================================
// Box 2b: Tiny Hot Free (Ultra-Fast Path)
// ============================================================================

// Ultra-fast free to TLS unified cache
//
// CONTRACT:
// Input: class_idx (0-7), base pointer (BASE, not USER)
// Output: 1=SUCCESS (pushed to cache), 0=FULL (caller handles)
// Precondition: Cache initialized, base is valid BASE pointer
// Postcondition: Cache tail advanced, object pushed to cache
//
// PERFORMANCE:
// Hot path (cache not full): 2 branches, 2-3 cache misses
// Cold path (cache full): Returns 0 (caller handles)
//
// BRANCH ANALYSIS:
// 1. LIFO/FIFO mode check (compile-time gated)
// 2. cache full check (UNLIKELY full)
// (class_idx range check removed - caller guarantees a valid index)
//
__attribute__((always_inline))
static inline int tiny_hot_free_fast(int class_idx, void* base) {
    extern __thread TinyUnifiedCache g_unified_cache[];

    // TLS cache access (1 cache miss)
    // NOTE: Range check removed - caller guarantees valid class_idx
    TinyUnifiedCache* cache = &g_unified_cache[class_idx];

#if HAKMEM_TINY_UNIFIED_LIFO_COMPILED
    // Phase 15 v1: Mode check at entry (once per call, not scattered in hot path)
    // Phase 22: Compile-out when disabled (default OFF)
    int lifo_mode = tiny_unified_lifo_enabled();

    // Phase 15 v1: LIFO vs FIFO mode switch
    if (lifo_mode) {
        // === LIFO MODE: Stack-based (LIFO) ===
        // Try push to stack (tail is stack depth)
        if (unified_cache_try_push_lifo(class_idx, base)) {
#if !HAKMEM_BUILD_RELEASE
            extern __thread uint64_t g_unified_cache_push[];
            g_unified_cache_push[class_idx]++;
#endif
            return 1; // SUCCESS
        }
        // LIFO overflow → fall through to cold path
#if !HAKMEM_BUILD_RELEASE
        extern __thread uint64_t g_unified_cache_full[];
        g_unified_cache_full[class_idx]++;
#endif
        return 0; // FULL
    }
#endif

    // === FIFO MODE: Ring-based (existing, default) ===
    // Calculate next tail (for full check)
    uint16_t next_tail = (cache->tail + 1) & cache->mask;

    // Branch 1: Cache full check (UNLIKELY full)
    // Hot path: cache has space (next_tail != head)
    // Cold path: cache full (next_tail == head) → drain needed
    // (one ring slot is deliberately left unused so full and empty are distinguishable)
    if (TINY_HOT_LIKELY(next_tail != cache->head)) {
        // === HOT PATH: Cache has space (2-3 instructions) ===

        // Push to cache (1 cache miss for array write)
        cache->slots[cache->tail] = base;
        cache->tail = next_tail;

        // Debug metrics (zero overhead in release)
#if !HAKMEM_BUILD_RELEASE
        extern __thread uint64_t g_unified_cache_push[];
        g_unified_cache_push[class_idx]++;
#endif

        return 1; // SUCCESS
    }

    // === COLD PATH: Cache full ===
    // Don't drain here - let caller handle via tiny_cold_drain_and_free()
#if !HAKMEM_BUILD_RELEASE
    extern __thread uint64_t g_unified_cache_full[];
    g_unified_cache_full[class_idx]++;
#endif

    return 0; // FULL
}

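// Usage sketch (illustrative): composing the hot free with the cold drain
// path. tiny_cold_drain_and_free() is only named in the comments above, so
// its signature below is an assumption; note the box takes the BASE pointer,
// not the USER pointer handed out by the alloc path.
//
//   extern void tiny_cold_drain_and_free(int class_idx, void* base);  // assumed shape
//
//   static inline void tiny_free_example(int class_idx, void* base) {
//       if (TINY_HOT_UNLIKELY(!tiny_hot_free_fast(class_idx, base)))
//           tiny_cold_drain_and_free(class_idx, base);  // cold: cache full → drain
//   }
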
// ============================================================================
// Performance Notes
// ============================================================================

// Expected improvements (Phase 4-Step2):
// - Random Mixed 256: 60.6M → 68-75M ops/s (+10-15%)
// - Tiny Hot 64B: +10-15% over current baseline
//
// Key optimizations:
// 1. Branch reduction: 4-5 → 2 branches (hot path)
// 2. Branch hints: LIKELY/UNLIKELY guide CPU pipeline
// 3. Hot/Cold separation: Keeps hot path small (better i-cache)
// 4. Always inline: Eliminates function call overhead
// 5. Metrics gated: Zero overhead in release builds
//
// Trade-offs:
// 1. Code size: +50-100 bytes per call site (inline expansion)
// 2. Cold path complexity: Caller must handle NULL/0 returns (see usage sketches above)
// 3. Cache assumption: Assumes cache initialized (lazy init moved to caller)

#endif // TINY_FRONT_HOT_BOX_H