// hakmem/core/tiny_alloc_fast.inc.h

// tiny_alloc_fast.inc.h - Box 5: Allocation Fast Path (3-4 instructions)
// Purpose: Ultra-fast TLS freelist pop (inspired by System tcache & Mid-Large HAKX +171%)
// Invariant: Hit rate > 95% → 3-4 instructions, Miss → refill from backend
// Design: "Simple Front + Smart Back" - Front is dumb & fast, Back is smart
//
// Box 5-NEW: SFC (Super Front Cache) Integration
// Architecture: SFC (Layer 0, 128-256 slots) → SLL (Layer 1, unlimited) → SuperSlab (Layer 2+)
// Cascade Refill: SFC ← SLL (one-way, safe)
// Goal: +200% performance (4.19M → 12M+ ops/s)
//
// Phase 2b: Adaptive TLS Cache Sizing
// Hot classes grow to 2048 slots, cold classes shrink to 16 slots
// Expected: +3-10% performance, -30-50% TLS cache memory overhead
#pragma once
#include "tiny_atomic.h"
#include "hakmem_tiny.h"
#include "tiny_route.h"
#include "tiny_alloc_fast_sfc.inc.h"  // Box 5-NEW: SFC Layer
#include "hakmem_tiny_fastcache.inc.h"  // Array stack (FastCache) for C0–C3
#include "hakmem_tiny_tls_list.h"      // TLS List (for tiny_fast_refill_and_take)
#include "tiny_region_id.h"            // Phase 7: Header-based class_idx lookup
#include "tiny_adaptive_sizing.h"      // Phase 2b: Adaptive sizing
#include "box/tls_sll_box.h"           // Box TLS-SLL: C7-safe push/pop/splice
#include "box/tiny_next_ptr_box.h"     // Box API: Next pointer read/write
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
#include "box/front_gate_box.h"
#endif
#include "hakmem_tiny_integrity.h"     // PRIORITY 1-4: Corruption detection
#ifdef HAKMEM_TINY_HEADER_CLASSIDX
#include "front/tiny_front_c23.h"      // Phase B: Ultra-simple C2/C3 front
#include "front/tiny_heap_v2.h"        // Phase 13-A: TinyHeapV2 magazine front
#include "front/tiny_ultra_hot.h"      // Phase 14: TinyUltraHot C1/C2 ultra-fast path
#endif
#include <stdio.h>

// Phase 7 Task 2: Aggressive inline TLS cache access
// Enable with: make HEADER_CLASSIDX=1 AGGRESSIVE_INLINE=1
#ifndef HAKMEM_TINY_AGGRESSIVE_INLINE
#define HAKMEM_TINY_AGGRESSIVE_INLINE 0
#endif

#if HAKMEM_TINY_AGGRESSIVE_INLINE
#include "tiny_alloc_fast_inline.h"
#endif

// ========== Debug Counters (compile-time gated) ==========
#if HAKMEM_DEBUG_COUNTERS
// Refill-stage counters (defined in hakmem_tiny.c)
extern unsigned long long g_rf_total_calls[];
extern unsigned long long g_rf_hit_bench[];
extern unsigned long long g_rf_hit_hot[];
extern unsigned long long g_rf_hit_mail[];
extern unsigned long long g_rf_hit_slab[];
extern unsigned long long g_rf_hit_ss[];
extern unsigned long long g_rf_hit_reg[];
extern unsigned long long g_rf_mmap_calls[];

// Publish hits (defined in hakmem_tiny.c)
extern unsigned long long g_pub_mail_hits[];
extern unsigned long long g_pub_bench_hits[];
extern unsigned long long g_pub_hot_hits[];

// Free pipeline (defined in hakmem_tiny.c)
extern unsigned long long g_free_via_tls_sll[];
#endif

// ========== Box 5: Allocation Fast Path ==========
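// Fast Allocation layer of Box Theory. Pops directly from the TLS freelist (3-4 instructions).
// Invariants:
// - If the TLS freelist is non-empty, return immediately (no lock, no sync)
// - On a miss, delegate to the Backend (Box 3: SuperSlab)
// - Cross-thread allocation is not considered here (the Backend handles it)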

// External TLS variables (defined in hakmem_tiny.c)
extern __thread void* g_tls_sll_head[TINY_NUM_CLASSES];
extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];

// External backend functions
// P0 Fix: Use appropriate refill function based on P0 status
#if HAKMEM_TINY_P0_BATCH_REFILL
extern int sll_refill_batch_from_ss(int class_idx, int max_take);
#else
extern int sll_refill_small_from_ss(int class_idx, int max_take);
#endif
// NEW: Direct SS→FC refill (bypasses SLL)
extern int ss_refill_fc_fill(int class_idx, int want);
extern void* hak_tiny_alloc_slow(size_t size, int class_idx);
extern int hak_tiny_size_to_class(size_t size);
extern int tiny_refill_failfast_level(void);
extern const size_t g_tiny_class_sizes[];
// Hot-class toggle: class5 (256B) dedicated TLS fast path
extern int g_tiny_hotpath_class5;

// Class-5 hot-path helper: take one block from the class-5 TLS list, refilling on a miss.
// Preconditions (caller's responsibility): class_idx == 5 and g_tiny_hotpath_class5 == 1.
// Returns: BASE pointer (not USER) — the caller applies HAK_RET_ALLOC, which performs the
//          BASE → USER conversion. On a list miss the result is whatever
//          tiny_fast_refill_and_take() yields.
static inline void* tiny_class5_minirefill_take(void) {
    extern __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
    TinyTLSList* const c5_list = &g_tls_lists[5];

    // Fast pop first; only fall back to the generic refill helper when the list is empty.
    void* blk = tls_list_pop(c5_list, 5);
    if (blk == NULL) {
        blk = tiny_fast_refill_and_take(5, c5_list);
    }
    return blk;
}

// Global Front refill config (parsed at init; defined in hakmem_tiny.c)
extern int g_refill_count_global;
extern int g_refill_count_hot;
extern int g_refill_count_mid;
extern int g_refill_count_class[TINY_NUM_CLASSES];

// HAK_RET_ALLOC macro is now defined in core/hakmem_tiny.c
// See lines 116-152 for single definition point based on HAKMEM_TINY_HEADER_CLASSIDX

// ========== RDTSC Profiling (lightweight) ==========
#if defined(__x86_64__)
// Read the x86-64 time-stamp counter: RDTSC yields the low half in EAX and the high
// half in EDX; the two are combined into one 64-bit tick value.
static inline uint64_t tiny_fast_rdtsc(void) {
    unsigned int tsc_lo, tsc_hi;
    __asm__ __volatile__ ("rdtsc" : "=a" (tsc_lo), "=d" (tsc_hi));
    uint64_t ticks = (uint64_t)tsc_hi << 32;
    ticks |= tsc_lo;
    return ticks;
}
#else
// Non-x86-64 builds: no cycle counter available here, always report 0.
static inline uint64_t tiny_fast_rdtsc(void) {
    return 0;
}
#endif

// Per-thread profiling counters (enable with HAKMEM_TINY_PROFILE=1)
static __thread uint64_t g_tiny_alloc_hits = 0;     // fast-path hits recorded while profiling
static __thread uint64_t g_tiny_alloc_cycles = 0;   // rdtsc ticks accumulated over those hits
static __thread uint64_t g_tiny_refill_calls = 0;   // refill calls recorded while profiling
static __thread uint64_t g_tiny_refill_cycles = 0;  // rdtsc ticks accumulated over those refills
static int g_tiny_profile_enabled = -1;  // -1: not yet read from the environment, else 0/1

// Report whether profiling is switched on.
// The HAKMEM_TINY_PROFILE environment variable is consulted on the first call only;
// the result (1 when the variable is set, non-empty and does not start with '0',
// otherwise 0) is cached in g_tiny_profile_enabled.
static inline int tiny_profile_enabled(void) {
    int state = g_tiny_profile_enabled;
    if (__builtin_expect(state != -1, 1)) {
        return state;
    }

    const char* val = getenv("HAKMEM_TINY_PROFILE");
    state = 0;
    if (val != NULL && val[0] != '\0' && val[0] != '0') {
        state = 1;
    }
    g_tiny_profile_enabled = state;
    return state;
}

// Exit-time report of the fast-path profiling counters (written to stderr).
// Registered as a destructor. Prints nothing when profiling is disabled or when neither
// a hit nor a refill was recorded. The counters are thread-local, so the figures are
// those of the thread that runs this destructor.
static void tiny_fast_print_profile(void) __attribute__((destructor));
static void tiny_fast_print_profile(void) {
    if (!tiny_profile_enabled()) {
        return;
    }

    const uint64_t hit_count = g_tiny_alloc_hits;
    const uint64_t refill_count = g_tiny_refill_calls;
    if (hit_count == 0 && refill_count == 0) {
        return;
    }

    fprintf(stderr, "\n========== Box Theory Fast Path Profile ==========\n");
    if (hit_count > 0) {
        // Average is integer division of accumulated ticks by the number of hits.
        const unsigned long avg_hit = (unsigned long)(g_tiny_alloc_cycles / hit_count);
        fprintf(stderr, "[ALLOC HIT] count=%lu, avg_cycles=%lu\n",
                (unsigned long)hit_count, avg_hit);
    }
    if (refill_count > 0) {
        const unsigned long avg_refill = (unsigned long)(g_tiny_refill_cycles / refill_count);
        fprintf(stderr, "[REFILL]    count=%lu, avg_cycles=%lu\n",
                (unsigned long)refill_count, avg_refill);
    }
    fprintf(stderr, "===================================================\n\n");
}

// ========== Fast Path: TLS Freelist Pop (3-4 instructions) ==========

// External SFC control (defined in hakmem_tiny_sfc.c)
extern int g_sfc_enabled;

// Allocation fast path (inline for zero-cost)
// Returns: pointer on success, NULL on miss (caller should try refill/slow)
//
// Box 5-NEW Architecture:
//   Layer 0: SFC (128-256 slots, high hit rate) [if enabled]
//   Layer 1: SLL (unlimited, existing)
//   Cascade: SFC miss → try SLL → refill
//
// Assembly (x86-64, optimized):
//   mov    rax, QWORD PTR g_sfc_head[class_idx]      ; SFC: Load head
//   test   rax, rax                                   ; Check NULL
//   jne    .sfc_hit                                   ; If not empty, SFC hit!
//   mov    rax, QWORD PTR g_tls_sll_head[class_idx]  ; SLL: Load head
//   test   rax, rax                                   ; Check NULL
//   je     .miss                                      ; If empty, miss
//   mov    rdx, QWORD PTR [rax]                       ; Load next
//   mov    QWORD PTR g_tls_sll_head[class_idx], rdx  ; Update head
//   ret                                               ; Return ptr
// .sfc_hit:
//   mov    rdx, QWORD PTR [rax]                       ; Load next
//   mov    QWORD PTR g_sfc_head[class_idx], rdx      ; Update head
//   ret
// .miss:
//   ; Fall through to refill
//
// Expected: 3-4 instructions on SFC hit, 6-8 on SLL hit
// Pop one block for `class_idx` from the thread-local front caches.
//
// Lookup order (build without HAKMEM_TINY_FRONT_GATE_BOX):
//   1. FastCache array stack — only for class_idx <= 3 and when g_fastcache_enable is set
//   2. SFC                   — when g_sfc_enabled was non-zero on this thread's first call
//   3. TLS SLL               — when g_tls_sll_enable is set
// With HAKMEM_TINY_FRONT_GATE_BOX the whole lookup is delegated to front_gate_try_pop().
//
// Returns: BASE pointer on a hit (the caller converts BASE → USER via HAK_RET_ALLOC),
//          NULL on a miss (the caller should refill or take the slow path).
//
// NOTE(review): the g_front_*_hit/miss counters are declared here as plain (non-TLS,
// non-atomic) globals; concurrent increments may lose updates — confirm that is
// acceptable for statistics.
static inline void* tiny_alloc_fast_pop(int class_idx) {
    // PRIORITY 1: Bounds check before any TLS array access
    HAK_CHECK_CLASS_IDX(class_idx, "tiny_alloc_fast_pop");
#if !HAKMEM_BUILD_RELEASE
    // Phase 3: Debug counters eliminated in release builds
    atomic_fetch_add(&g_integrity_check_class_bounds, 1);

    // DEBUG: Log class 2 pops (DISABLED for performance)
    static _Atomic uint64_t g_fast_pop_count = 0;
    uint64_t pop_call = atomic_fetch_add(&g_fast_pop_count, 1);
    if (0 && class_idx == 2 && pop_call > 5840 && pop_call < 5900) {
        // uint64_t is not guaranteed to be `unsigned long`; cast explicitly so the
        // argument type always matches the conversion specifier.
        fprintf(stderr, "[FAST_POP_C2] call=%llu cls=%d head=%p count=%u\n",
                (unsigned long long)pop_call, class_idx,
                g_tls_sll_head[class_idx], g_tls_sll_count[class_idx]);
        fflush(stderr);
    }
#endif

    // Phase E1-CORRECT: C7 now has headers, can use fast path
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
    void* out = NULL;
    if (front_gate_try_pop(class_idx, &out)) {
        return out;
    }
    return NULL;
#else
    // Phase 7 Task 3: Profiling overhead removed in release builds
    // In release mode, compiler can completely eliminate profiling code
#if !HAKMEM_BUILD_RELEASE
    // start == 0 doubles as "profiling off": the cycle accounting below is skipped.
    uint64_t start = tiny_profile_enabled() ? tiny_fast_rdtsc() : 0;
#endif

    // Phase 1: Try array stack (FastCache) first for hottest tiny classes (C0–C3)
    if (__builtin_expect(g_fastcache_enable && class_idx <= 3, 1)) {
        void* fc = fastcache_pop(class_idx);
        if (__builtin_expect(fc != NULL, 1)) {
            // Frontend FastCache hit
            extern unsigned long long g_front_fc_hit[];
            g_front_fc_hit[class_idx]++;
            return fc;
        } else {
            extern unsigned long long g_front_fc_miss[];
            g_front_fc_miss[class_idx]++;
        }
    }

    // Box 5-NEW: Layer 0 - Try SFC first (if enabled)
    // Cache g_sfc_enabled in TLS to avoid global load on every allocation.
    // The value is sampled once per thread; later changes to g_sfc_enabled are not seen.
    static __thread int sfc_check_done = 0;
    static __thread int sfc_is_enabled = 0;
    if (__builtin_expect(!sfc_check_done, 0)) {
        sfc_is_enabled = g_sfc_enabled;
        sfc_check_done = 1;
    }

    if (__builtin_expect(sfc_is_enabled, 1)) {
        void* base = sfc_alloc(class_idx);
        if (__builtin_expect(base != NULL, 1)) {
            // Front Gate: SFC hit
            extern unsigned long long g_front_sfc_hit[];
            g_front_sfc_hit[class_idx]++;
            // SFC HIT! (Layer 0)
#if !HAKMEM_BUILD_RELEASE
            if (start) {
                g_tiny_alloc_cycles += (tiny_fast_rdtsc() - start);
                g_tiny_alloc_hits++;
            }
#endif
            // FIX #16: Return BASE pointer (not USER)
            // Caller (tiny_alloc_fast) will call HAK_RET_ALLOC → tiny_region_id_write_header
            // which does the BASE → USER conversion. Double conversion was causing corruption!
            return base;
        }
        // SFC miss → try SLL (Layer 1)
    }

    // Box Boundary: Layer 1 - pop the head of the TLS SLL freelist (can be disabled via env)
    extern int g_tls_sll_enable;  // set at init via HAKMEM_TINY_TLS_SLL
    if (__builtin_expect(g_tls_sll_enable, 1)) {
        // Use Box TLS-SLL API (C7-safe pop)
        // CRITICAL: Pop FIRST, do NOT read g_tls_sll_head directly (race condition!)
        // Reading head before pop causes stale read → rbp=0xa0 SEGV
        void* base = NULL;
        if (tls_sll_pop(class_idx, &base)) {
            // Front Gate: SLL hit (fast path 3 instructions)
            extern unsigned long long g_front_sll_hit[];
            g_front_sll_hit[class_idx]++;

#if HAKMEM_DEBUG_COUNTERS
            // Track TLS freelist hits (compile-time gated, zero runtime cost when disabled)
            g_free_via_tls_sll[class_idx]++;
#endif

#if !HAKMEM_BUILD_RELEASE
            // Debug: Track profiling (release builds skip this overhead)
            if (start) {
                g_tiny_alloc_cycles += (tiny_fast_rdtsc() - start);
                g_tiny_alloc_hits++;
            }
#endif
            // FIX #16: Return BASE pointer (not USER)
            // Caller (tiny_alloc_fast) will call HAK_RET_ALLOC → tiny_region_id_write_header
            // which does the BASE → USER conversion. Double conversion was causing corruption!
            return base;
        }
    }

    // Fast path miss → NULL (caller should refill)
    return NULL;
#endif
}

// ========== Cascade Refill: SFC ← SLL (Box Theory boundary) ==========

// Cascade refill: Transfer blocks from SLL to SFC (one-way, safe)
// Returns: number of blocks transferred
//
// Contract:
// - Transfer ownership: SLL → SFC
// - No circular dependency: one-way only
// - Boundary clear: SLL pop → SFC push
// - Fallback safe: if SFC full, stop (no overflow)
// Env-driven cascade percentage (0-100), default 50%.
//
// Reads HAKMEM_SFC_CASCADE_PCT on the first call and caches the clamped result in a
// function-local static; later calls return the cached value.
//   - unset or empty            → 50
//   - text with no leading digits → 0 (strtol performs no conversion)
//   - anything outside [0, 100]  → clamped to the nearest bound
// strtol is used rather than atoi because atoi has undefined behaviour when the value
// does not fit in an int; strtol saturates at LONG_MIN/LONG_MAX, which the clamp handles.
static inline int sfc_cascade_pct(void) {
    static int pct = -1;
    if (__builtin_expect(pct == -1, 0)) {
        long v = 50;
        const char* e = getenv("HAKMEM_SFC_CASCADE_PCT");
        if (e && *e) {
            v = strtol(e, NULL, 10);
        }
        if (v < 0) {
            v = 0;
        }
        if (v > 100) {
            v = 100;
        }
        pct = (int)v;
    }
    return pct;
}

// Move blocks from the TLS SLL (Layer 1) into the SFC (Layer 0) for one size class.
//
// class_idx:    size class to operate on (validated with HAK_CHECK_CLASS_IDX).
// target_count: nominal number of blocks; the actual goal is
//               target_count * sfc_cascade_pct() / 100, falling back to
//               target_count / 2 when that value is zero or negative.
// Returns: number of blocks actually moved. The loop stops early when the SFC reaches
//          g_sfc_capacity[class_idx] or when the SLL has nothing left to pop.
static inline int sfc_refill_from_sll(int class_idx, int target_count) {
    // PRIORITY 1: Bounds check
    HAK_CHECK_CLASS_IDX(class_idx, "sfc_refill_from_sll");
#if !HAKMEM_BUILD_RELEASE
    // Debug-only integrity counter: gated exactly as in tiny_alloc_fast_pop so release
    // builds do not pay for an atomic read-modify-write on the refill path.
    atomic_fetch_add(&g_integrity_check_class_bounds, 1);
#endif

    int transferred = 0;
    uint32_t cap = g_sfc_capacity[class_idx];

    // Adjust target based on cascade percentage
    int pct = sfc_cascade_pct();
    int want = (target_count * pct) / 100;
    if (want <= 0) {
        want = target_count / 2;  // safety fallback
    }

    while (transferred < want && g_tls_sll_count[class_idx] > 0) {
        // Check SFC capacity before transfer
        if (g_sfc_count[class_idx] >= cap) {
            break;  // SFC full, stop
        }

        // Pop from SLL (Layer 1) using Box TLS-SLL API (C7-safe)
        void* ptr = NULL;
        if (!tls_sll_pop(class_idx, &ptr)) {
            break;  // SLL empty
        }

        // Push to SFC (Layer 0) — header-aware: link the block in front of the current
        // SFC head, then make it the new head.
        tiny_next_write(class_idx, ptr, g_sfc_head[class_idx]);
        g_sfc_head[class_idx] = ptr;
        g_sfc_count[class_idx]++;

        transferred++;
    }

    return transferred;
}

// ========== Refill Path: Backend Integration ==========

// Refill TLS freelist from backend (SuperSlab/ACE/Learning layer)
// Returns: number of blocks refilled
//
// Box 5-NEW Architecture:
//   SFC enabled:  SuperSlab → SLL → SFC (cascade)
//   SFC disabled: SuperSlab → SLL (direct, old path)
//
// This integrates with existing HAKMEM infrastructure:
// - SuperSlab provides memory chunks
// - ACE provides adaptive capacity learning
// - L25 provides mid-large integration
//
// Refill count is tunable via HAKMEM_TINY_REFILL_COUNT (default: 16)
// - Smaller count (8-16): better for diverse workloads, faster warmup
// - Larger count (64-128): better for homogeneous workloads, fewer refills
// Refill the thread-local front caches for `class_idx` from the backend.
//
// Steps, in order:
//   1. Return 0 immediately when get_available_capacity(class_idx) reports no room.
//   2. Resolve the per-class refill count (cached per thread after the first miss),
//      then clamp it to the available capacity.
//   3. Dispatch to ss_refill_fc_fill() when any of HAKMEM_TINY_FRONT_DIRECT,
//      HAKMEM_TINY_P0_DIRECT_FC_ALL or HAKMEM_TINY_REFILL_BATCH is set (non-empty, not
//      starting with '0'); otherwise to the SLL refill function selected at compile time.
//   4. Adapt the class-7 refill count, notify track_refill_for_adaptation(), and
//      optionally cascade part of the refill into the SFC.
//
// Returns: the block count reported by the backend refill function (0 when nothing
//          was refilled or the cache had no capacity).
static inline int tiny_alloc_fast_refill(int class_idx) {
    // Phase E1-CORRECT: C7 now has headers, can use refill

    // Phase 7 Task 3: Profiling overhead removed in release builds
    // In release mode, compiler can completely eliminate profiling code
#if !HAKMEM_BUILD_RELEASE
    // start == 0 doubles as "profiling off" for the accounting at the end.
    uint64_t start = tiny_profile_enabled() ? tiny_fast_rdtsc() : 0;
#endif

    // Phase 2b: Check available capacity before refill
    int available_capacity = get_available_capacity(class_idx);
    if (available_capacity <= 0) {
        // Cache is full, don't refill (this early return is not counted by the profiler)
        return 0;
    }

    // Phase 7 Task 3: Simplified refill count (cached per-class in TLS)
    // Previous: Complex precedence logic on every miss (5-10 cycles overhead)
    // Now: Simple TLS cache lookup (1-2 cycles)
    static __thread int s_refill_count[TINY_NUM_CLASSES] = {0};
    // Simple adaptive booster: bump per-class refill size when refills are frequent.
    // uint8_t: the counter wraps at 256; only its low two bits are inspected below.
    static __thread uint8_t s_refill_calls[TINY_NUM_CLASSES] = {0};
    int cnt = s_refill_count[class_idx];
    if (__builtin_expect(cnt == 0, 0)) {
        // First miss: Initialize from globals (parsed at init time)
        int v = HAKMEM_TINY_REFILL_DEFAULT;  // Default from hakmem_build_flags.h

        // Precedence: per-class > hot/mid > global
        if (g_refill_count_class[class_idx] > 0) {
            v = g_refill_count_class[class_idx];
        } else if (class_idx <= 3 && g_refill_count_hot > 0) {
            v = g_refill_count_hot;
        } else if (class_idx >= 4 && g_refill_count_mid > 0) {
            v = g_refill_count_mid;
        } else if (g_refill_count_global > 0) {
            v = g_refill_count_global;
        }

        // Clamp to sane range (min: 8, max: 256)
        if (v < 8) v = 8;        // Minimum: avoid thrashing
        if (v > 256) v = 256;    // Maximum: avoid excessive TLS memory

        s_refill_count[class_idx] = v;
        cnt = v;
    }

    // Phase 2b: Clamp refill count to available capacity
    // (only the local cnt is clamped; the cached s_refill_count entry is left untouched)
    if (cnt > available_capacity) {
        cnt = available_capacity;
    }

#if HAKMEM_DEBUG_COUNTERS
    // Track refill calls (compile-time gated)
    g_rf_total_calls[class_idx]++;
#endif

    // Box Boundary: Delegate to Backend (Box 3: SuperSlab)
    // Refill Dispatch: Standard (ss_refill_fc_fill) vs Legacy SLL (A/B only)
    // Standard: Enabled by FRONT_DIRECT=1, REFILL_BATCH=1, or P0_DIRECT_FC_ALL=1
    // Legacy:   Fallback for compatibility (will be deprecated)
    int refilled = 0;

    // NEW: Front-Direct refill control (A/B toggle), resolved once per thread
    static __thread int s_use_front_direct = -1;
    if (__builtin_expect(s_use_front_direct == -1, 0)) {
        // Check multiple ENV flags (any one enables Front-Direct)
        const char* e1 = getenv("HAKMEM_TINY_FRONT_DIRECT");
        const char* e2 = getenv("HAKMEM_TINY_P0_DIRECT_FC_ALL");
        const char* e3 = getenv("HAKMEM_TINY_REFILL_BATCH");
        s_use_front_direct = ((e1 && *e1 && *e1 != '0') ||
                              (e2 && *e2 && *e2 != '0') ||
                              (e3 && *e3 && *e3 != '0')) ? 1 : 0;
    }

    // Refill dispatch
    if (s_use_front_direct) {
        // NEW: Direct SS→FC (bypasses SLL)
        refilled = ss_refill_fc_fill(class_idx, cnt);
    } else {
        // Legacy: SS→SLL→FC (via batch or generic)
#if HAKMEM_TINY_P0_BATCH_REFILL
        refilled = sll_refill_batch_from_ss(class_idx, cnt);
#else
        refilled = sll_refill_small_from_ss(class_idx, cnt);
#endif
    }

    // Lightweight adaptation: if refills keep happening, increase per-class refill.
    // Focus on class 7 (1024B) to reduce mmap/refill frequency under Tiny-heavy loads.
    if (refilled > 0) {
        uint8_t c = ++s_refill_calls[class_idx];
        if (class_idx == 7) {
            // Every 4 refills, increase target by +16 up to 128.
            // A cached count that is already >= 128 is left as it is.
            if ((c & 0x03u) == 0) {
                int target = s_refill_count[class_idx];
                if (target < 128) {
                    target += 16;
                    if (target > 128) target = 128;
                    s_refill_count[class_idx] = target;
                }
            }
        }
    } else {
        // Backend supplied no blocks: slowly decay the counter.
        // (A full cache returns early above and never reaches this branch.)
        if (s_refill_calls[class_idx] > 0) s_refill_calls[class_idx]--;
    }

    // Phase 2b: Track refill and adapt cache size
    if (refilled > 0) {
        track_refill_for_adaptation(class_idx);
    }

    // Box 5-NEW: Cascade refill SFC ← SLL (opt-in via HAKMEM_TINY_SFC_CASCADE, off by default)
    // NEW: Default OFF, enable via HAKMEM_TINY_SFC_CASCADE=1
    // Skip entirely when Front-Direct is active (direct SS→FC path)
    // The decision is made once per thread, on the first call that gets this far.
    static __thread int sfc_cascade_enabled = -1;
    if (__builtin_expect(sfc_cascade_enabled == -1, 0)) {
        // Front-Direct bypasses SLL, so SFC cascade is pointless
        if (s_use_front_direct) {
            sfc_cascade_enabled = 0;
        } else {
            // Check ENV flag (default: OFF)
            const char* e = getenv("HAKMEM_TINY_SFC_CASCADE");
            sfc_cascade_enabled = (e && *e && *e != '0') ? 1 : 0;
        }
    }

    // Only cascade if explicitly enabled AND we have refilled blocks in SLL
    if (sfc_cascade_enabled && g_sfc_enabled && refilled > 0) {
        // Skip SFC cascade for class5 when dedicated hotpath is enabled
        if (g_tiny_hotpath_class5 && class_idx == 5) {
            // no-op: keep refilled blocks in TLS List/SLL
        } else {
        // Request half of the refilled blocks for the SFC (keep the rest in SLL for future);
        // sfc_refill_from_sll applies the cascade percentage on top of this target.
        int sfc_target = refilled / 2;
        if (sfc_target > 0) {
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
            front_gate_after_refill(class_idx, refilled);
#else
            int transferred = sfc_refill_from_sll(class_idx, sfc_target);
            (void)transferred;  // Unused, but could track stats
#endif
        }
        }
    }

#if !HAKMEM_BUILD_RELEASE
    // Debug: Track profiling (release builds skip this overhead)
    if (start) {
        g_tiny_refill_cycles += (tiny_fast_rdtsc() - start);
        g_tiny_refill_calls++;
    }
#endif

    return refilled;
}

// ========== Combined Fast Path (Alloc + Refill) ==========

// Complete fast path allocation (inline for zero-cost)
// Returns: pointer on success, NULL on failure (OOM or size too large)
//
// Flow:
// 1. TLS freelist pop (3-4 instructions) - Hit rate ~95%
// 2. Miss → Refill from backend (~5% cases)
// 3. Refill success → Retry pop
// 4. Refill failure → Slow path (OOM or new SuperSlab allocation)
//
// Example usage:
//   void* ptr = tiny_alloc_fast(64);
//   if (!ptr) {
//       // OOM handling
//   }
// Allocate one tiny block of at least `size` bytes through the layered fast path.
//
// size:    requested size in bytes; mapped to a class with hak_tiny_size_to_class().
// Returns: result of HAK_RET_ALLOC on success (per the call-site comments, a USER
//          pointer after the header write), or NULL when the size is not a tiny class
//          or hak_tiny_alloc_slow() fails.
//
// Order of attempts: C2/C3 simple front (when compiled in and enabled) → UltraHot
// magazine → TinyHeapV2 (classes 0-3) → class-5 dedicated path (when enabled) →
// generic front pop → refill-and-take → backend refill + retry → slow path.
//
// NOTE(review): ultra_hot_* and tiny_heap_v2_* are called unconditionally below, while
// their headers are included only under HAKMEM_TINY_HEADER_CLASSIDX at the top of this
// file — confirm that builds without that macro get the declarations from elsewhere.
static inline void* tiny_alloc_fast(size_t size) {
#if !HAKMEM_BUILD_RELEASE
    // Phase 3: Debug counters eliminated in release builds
    static _Atomic uint64_t alloc_call_count = 0;
    uint64_t call_num = atomic_fetch_add(&alloc_call_count, 1);
#endif

    // 1. Size → class index (inline, fast)
    int class_idx = hak_tiny_size_to_class(size);

    if (__builtin_expect(class_idx < 0, 0)) {
        return NULL;  // Size > 1KB, not Tiny
    }

#if !HAKMEM_BUILD_RELEASE
    // Phase 3: Debug checks eliminated in release builds
    // CRITICAL: Bounds check to catch corruption
    if (__builtin_expect(class_idx >= TINY_NUM_CLASSES, 0)) {
        // uint64_t is not guaranteed to be `unsigned long`; cast so the argument type
        // always matches the conversion specifier.
        fprintf(stderr, "[TINY_ALLOC_FAST] FATAL: class_idx=%d out of bounds! size=%zu call=%llu\n",
                class_idx, size, (unsigned long long)call_num);
        fflush(stderr);
        abort();
    }

    // Debug logging (DISABLED for performance)
    if (0 && call_num > 14250 && call_num < 14280) {
        fprintf(stderr, "[TINY_ALLOC] call=%llu size=%zu class=%d sll_head[%d]=%p count=%u\n",
                (unsigned long long)call_num, size, class_idx, class_idx,
                g_tls_sll_head[class_idx], g_tls_sll_count[class_idx]);
        fflush(stderr);
    }
#endif

    ROUTE_BEGIN(class_idx);

    void* ptr = NULL;
    const int hot_c5 = (g_tiny_hotpath_class5 && class_idx == 5);

    // Phase B: Ultra-simple front for C2/C3 (128B/256B)
    // ENV-gated: HAKMEM_TINY_FRONT_C23_SIMPLE=1
    // Target: 15-20M ops/s (vs current 8-9M ops/s)
#ifdef HAKMEM_TINY_HEADER_CLASSIDX
    if (tiny_front_c23_enabled() && (class_idx == 2 || class_idx == 3)) {
        void* c23_ptr = tiny_front_c23_alloc(size, class_idx);
        if (c23_ptr) {
            HAK_RET_ALLOC(class_idx, c23_ptr);
        }
        // Fall through to existing path if C23 path failed (NULL)
    }
#endif

    // Phase 14-C: TinyUltraHot Borrowing Design (borrows from the canonical TLS SLL)
    // ENV-gated: HAKMEM_TINY_ULTRA_HOT=1 (default: ON)
    // Targets C2-C5 (16B-128B)
    // Design: UltraHot keeps blocks borrowed from the TLS SLL in its magazine
    //   - Hit: return from the magazine (L0, fastest)
    //   - Miss: refill from the TLS SLL and retry
    if (__builtin_expect(ultra_hot_enabled(), 1)) {
        void* base = ultra_hot_alloc(size);
        if (base) {
            HAK_RET_ALLOC(class_idx, base);  // Header write + return USER pointer
        }
        // Miss → borrow from the TLS SLL to refill the magazine
        if (class_idx >= 2 && class_idx <= 5) {
            ultra_hot_try_refill(class_idx);
            // Retry after refill
            base = ultra_hot_alloc(size);
            if (base) {
                HAK_RET_ALLOC(class_idx, base);
            }
        }
    }

    // Phase 13-A: TinyHeapV2 (per-thread magazine, experimental)
    // ENV-gated: HAKMEM_TINY_HEAP_V2=1
    // Targets class 0-3 (8-64B) only, falls back to existing path if NULL
    // PERF: Pass class_idx directly to avoid redundant size→class conversion
    if (__builtin_expect(tiny_heap_v2_enabled(), 0) && class_idx <= 3) {
        void* base = tiny_heap_v2_alloc_by_class(class_idx);
        if (base) {
            HAK_RET_ALLOC(class_idx, base);  // Header write + return USER pointer
        }
    }

    // NEW: Front-Direct/SLL-OFF bypass control (TLS cached, lazy init)
    // Only HAKMEM_TINY_FRONT_DIRECT is consulted here (tiny_alloc_fast_refill also
    // honours two further variables for its own dispatch).
    static __thread int s_front_direct_alloc = -1;
    if (__builtin_expect(s_front_direct_alloc == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_FRONT_DIRECT");
        s_front_direct_alloc = (e && *e && *e != '0') ? 1 : 0;
    }

    if (__builtin_expect(hot_c5, 0)) {
        // class5: dedicated shortest path (never goes through the generic front)
        void* p = tiny_class5_minirefill_take();
        if (p) {
            HAK_RET_ALLOC(class_idx, p);
        }

        int refilled = tiny_alloc_fast_refill(class_idx);
        if (__builtin_expect(refilled > 0, 1)) {
            p = tiny_class5_minirefill_take();
            if (p) {
                HAK_RET_ALLOC(class_idx, p);
            }
        }

        // Go to the slow path (still avoiding the generic front)
        ptr = hak_tiny_alloc_slow(size, class_idx);
        if (ptr) {
            HAK_RET_ALLOC(class_idx, ptr);
        }
        return ptr;  // NULL if OOM
    }

    // Generic front (FastCache/SFC/SLL)
    // Respect SLL global toggle AND Front-Direct mode; when either disabled, skip TLS SLL entirely
    if (__builtin_expect(g_tls_sll_enable && !s_front_direct_alloc, 1)) {
        // For classes 0..3 keep ultra-inline POP; for >=4 use safe Box POP to avoid UB on bad heads.
        if (class_idx <= 3) {
#if HAKMEM_TINY_INLINE_SLL
            // Experimental: Inline SLL pop (A/B only, requires HAKMEM_TINY_INLINE_SLL=1)
            TINY_ALLOC_FAST_POP_INLINE(class_idx, ptr);
#else
            // Default: Safe Box API (Box TLS-SLL) for all standard builds
            ptr = tiny_alloc_fast_pop(class_idx);
#endif
        } else {
            void* base = NULL;
            if (tls_sll_pop(class_idx, &base)) {
                ptr = base;
            } else {
                ptr = NULL;
            }
        }
    } else {
        ptr = NULL;  // SLL disabled OR Front-Direct active → bypass SLL
    }

    if (__builtin_expect(ptr != NULL, 1)) {
        HAK_RET_ALLOC(class_idx, ptr);
    }

    // Generic: Refill and take (Front-Direct vs Legacy)
    if (s_front_direct_alloc) {
        // Front-Direct: Direct SS→FC refill (bypasses SLL/TLS List)
        int refilled_fc = tiny_alloc_fast_refill(class_idx);
        if (__builtin_expect(refilled_fc > 0, 1)) {
            void* fc_ptr = fastcache_pop(class_idx);
            if (fc_ptr) {
                HAK_RET_ALLOC(class_idx, fc_ptr);
            }
        }
    } else {
        // Legacy: Refill to TLS List/SLL
        extern __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
        void* took = tiny_fast_refill_and_take(class_idx, &g_tls_lists[class_idx]);
        if (took) {
            HAK_RET_ALLOC(class_idx, took);
        }
    }

    // Retry after a backend refill
    {
        int refilled = tiny_alloc_fast_refill(class_idx);
        if (__builtin_expect(refilled > 0, 1)) {
            // Skip SLL retry if Front-Direct OR SLL disabled
            if (__builtin_expect(g_tls_sll_enable && !s_front_direct_alloc, 1)) {
                if (class_idx <= 3) {
#if HAKMEM_TINY_INLINE_SLL
                    // Experimental: Inline SLL pop (A/B only, requires HAKMEM_TINY_INLINE_SLL=1)
                    TINY_ALLOC_FAST_POP_INLINE(class_idx, ptr);
#else
                    // Default: Safe Box API (Box TLS-SLL) for all standard builds
                    ptr = tiny_alloc_fast_pop(class_idx);
#endif
                } else {
                    void* base2 = NULL;
                    if (tls_sll_pop(class_idx, &base2)) {
                        ptr = base2;
                    } else {
                        ptr = NULL;
                    }
                }
            } else {
                ptr = NULL;  // SLL disabled OR Front-Direct active → bypass SLL
            }
            if (ptr) {
                HAK_RET_ALLOC(class_idx, ptr);
            }
        }
    }

    // 5. Refill failure or still empty → slow path (OOM or new SuperSlab)
    // Box Boundary: Delegate to Slow Path (Box 3 backend)
    ptr = hak_tiny_alloc_slow(size, class_idx);
    if (ptr) {
        HAK_RET_ALLOC(class_idx, ptr);
    }

    return ptr;  // NULL if OOM
}

// ========== Push to TLS Freelist (for free path) ==========

// Push block to TLS freelist (used by free fast path)
// This is a "helper" for Box 6 (Free Fast Path)
//
// Invariant: ptr must belong to current thread (no ownership check here)
// Caller (Box 6) is responsible for ownership verification
// Hand a freed block back to the thread-local freelist of `class_idx`.
// With HAKMEM_TINY_FRONT_GATE_BOX the push is delegated to front_gate_push_tls();
// otherwise it goes through tls_sll_push() with no capacity limit (UINT32_MAX).
// A rejected push is only reported (to stderr) in non-release builds; release builds
// do nothing further with the block.
static inline void tiny_alloc_fast_push(int class_idx, void* ptr) {
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
    front_gate_push_tls(class_idx, ptr);
#else
    // Box Boundary: Box TLS-SLL API (C7-safe); helper imposes no cap of its own.
    const uint32_t no_limit = UINT32_MAX;
    if (tls_sll_push(class_idx, ptr, no_limit)) {
        return;
    }
    // Reaching here means C7 was rejected or the SLL reported full (should not happen).
#if !HAKMEM_BUILD_RELEASE
    fprintf(stderr, "[WARN] tls_sll_push failed in tiny_alloc_fast_push cls=%d ptr=%p\n",
            class_idx, ptr);
#endif
#endif
}

// ========== Statistics & Diagnostics ==========

// Get TLS freelist stats (for debugging/profiling)
// Snapshot of one class's TLS SLL state, as filled in by tiny_alloc_fast_stats().
typedef struct {
    int class_idx;   // class index the snapshot was taken for
    void* head;      // copy of g_tls_sll_head[class_idx]
    uint32_t count;  // copy of g_tls_sll_count[class_idx]
} TinyAllocFastStats;

static inline TinyAllocFastStats tiny_alloc_fast_stats(int class_idx) {
    TinyAllocFastStats stats = {
        .class_idx = class_idx,
        .head = g_tls_sll_head[class_idx],
        .count = g_tls_sll_count[class_idx]
    };
    return stats;
}

// Reset TLS freelist (for testing/benchmarking)
// WARNING: This leaks memory! Blocks still linked on the list are simply forgotten,
// so only use this in controlled test environments.
// No bounds check is done here: class_idx must be a valid index into the TLS arrays.
static inline void tiny_alloc_fast_reset(int class_idx) {
    // Zero the bookkeeping first, then drop the list head.
    g_tls_sll_count[class_idx] = 0;
    g_tls_sll_head[class_idx] = NULL;
}

// ========== Performance Notes ==========
//
// Expected metrics (based on System tcache & HAKX +171% results):
// - Fast path hit rate: 95%+ (workload dependent)
// - Fast path latency: 3-4 instructions (1-2 cycles on modern CPUs)
// - Miss penalty: ~20-50 instructions (refill from SuperSlab)
// - Throughput improvement: +10-25% vs current multi-layer design
//
// Key optimizations:
// 1. `__builtin_expect` for branch prediction (hot path first)
// 2. `static inline` for zero-cost abstraction
// 3. TLS variables (no atomic ops, no locks)
// 4. Minimal work in fast path (defer stats/accounting to backend)
//
// Comparison with current design:
// - Current: 20+ instructions (Magazine → SuperSlab → ACE → ...)
// - New: 3-4 instructions (TLS freelist pop only)
// - Reduction: -80% instructions in hot path
//
// Inspired by:
// - System tcache (glibc malloc) - 3-4 instruction fast path
// - HAKX Mid-Large (+171%) - "Simple Front + Smart Back"
// - Box Theory - Clear boundaries, minimal coupling