hakmem/core/tiny_free_fast.inc.h

// tiny_free_fast.inc.h - Box 6: Free Fast Path (same-thread free only)
// Purpose: Ultra-fast TLS freelist push (ownership check + 2-3 instructions)
// Invariant: owner_tid == my_tid → push to TLS, else → delegate to remote path
// Design: Clear boundary between same-thread (fast) and cross-thread (remote)
#pragma once
#include "tiny_atomic.h"
#include "hakmem_tiny.h"
#include "hakmem_tiny_superslab.h"
#include "slab_handle.h"
#include "tiny_alloc_fast_sfc.inc.h"  // For sfc_free_push

// ========== Debug Counters (compile-time gated) ==========
#if HAKMEM_DEBUG_COUNTERS
// Free pipeline counters (defined in hakmem_tiny.c)
extern unsigned long long g_free_via_ss_local[];
extern unsigned long long g_free_via_ss_remote[];
#endif

// ========== Box 6: Free Fast Path ==========
// 箱理論の Fast Free 層。Same-thread free のみ処理（2-3命令 + ownership check）。
// 不変条件:
// - owner_tid == my_tid → TLS freelist に push (no lock, no sync)
// - owner_tid != my_tid → Box 2 (Remote Queue) に委譲
// - Cross-thread free は絶対に TLS freelist に入れない（A213 エラー防止）

// External functions (Backend)
extern void hak_tiny_free(void* ptr);
extern void hak_tiny_free_with_slab(void* ptr, TinySlab* slab);
// hak_free_at signature: (void* ptr, size_t hint_sz, hak_callsite_t site)
// where hak_callsite_t is const void*
extern void hak_free_at(void* ptr, size_t hint_sz, const void* site);
extern SuperSlab* hak_super_lookup(void* ptr);
extern TinySlab* hak_tiny_owner_slab(void* ptr);
extern int g_use_superslab;

// External helpers
extern uint32_t tiny_self_u32(void);
extern pthread_t tiny_self_pt(void);

// External TLS variables (from Box 5)
// Phase 3d-B: TLS Cache Merge - Unified TLS SLL structure
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
// Hot-class toggle: class5 (256B) dedicated TLS fast path
extern int g_tiny_hotpath_class5;
extern __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];

// Box 5 helper (TLS push)
extern void tiny_alloc_fast_push(int class_idx, void* ptr);

// ========== Ownership Check ==========

// Check if ptr belongs to current thread (SuperSlab path)
// Returns: 1 if same-thread, 0 if cross-thread
//
// Box Boundary: This is the critical check that prevents TOCTOU races
// - owner_tid == my_tid → Safe to push to TLS freelist
// - owner_tid != my_tid → MUST delegate to remote path
//
// Invariant: This check MUST be atomic (no TOCTOU between check and push)
static inline int tiny_free_is_same_thread_ss(SuperSlab* ss, int slab_idx, uint32_t my_tid) {
    TinySlabMeta* meta = &ss->slabs[slab_idx];
    uint8_t my_tid_low = (uint8_t)my_tid;
    uint8_t owner = tiny_atomic_load_u8_relaxed(&meta->owner_tid_low);
    return (owner == my_tid_low && owner != 0);
}

// Check if ptr belongs to current thread (Legacy TinySlab path)
// Returns: 1 if same-thread, 0 if cross-thread
static inline int tiny_free_is_same_thread_legacy(TinySlab* slab) {
    pthread_t my_tid = tiny_self_pt();
    return pthread_equal(slab->owner_tid, my_tid);
}

// ========== Fast Path: Same-Thread Free (2-3 instructions) ==========

// Free fast path for SuperSlab-backed allocation
// Returns: 1 on success (same-thread, pushed to TLS), 0 on failure (cross-thread)
//
// Assembly (x86-64, optimized):
//   mov    eax, DWORD PTR [meta->owner_tid]   ; Load owner_tid
//   cmp    eax, my_tid                        ; Compare with my_tid
//   jne    .cross_thread                      ; If not equal, cross-thread
//   mov    rax, QWORD PTR g_tls_sll_head[cls] ; Load head
//   mov    QWORD PTR [ptr], rax               ; ptr->next = head
//   mov    QWORD PTR g_tls_sll_head[cls], ptr ; head = ptr
//   ret                                       ; Done
// .cross_thread:
//   ; Delegate to remote path
//
// Expected: 2-3 instructions on same-thread path (1 cmp, 1 load, 1 store)
//
// ⚠️  CRITICAL: ptr parameter must be BASE pointer (already converted from USER via ptr-1)
static inline int tiny_free_fast_ss(SuperSlab* ss, int slab_idx, void* base, uint32_t my_tid) {
    // BUGFIX: Validate slab_idx before array access (prevents buffer overflow at ss->slabs[-1])
    int cap = ss_slabs_capacity(ss);
    if (__builtin_expect(slab_idx < 0 || slab_idx >= cap, 0)) {
        return 0;  // Invalid index, reject
    }
    TinySlabMeta* meta = &ss->slabs[slab_idx];

    // Debug: Track tiny_free_fast_ss calls
#if !HAKMEM_BUILD_RELEASE
    static __thread int free_ss_debug_enabled = -1;
    static __thread int free_ss_debug_count = 0;
    if (__builtin_expect(free_ss_debug_enabled == -1, 0)) {
        free_ss_debug_enabled = getenv("HAKMEM_SFC_DEBUG") ? 1 : 0;
    }
    if (free_ss_debug_enabled && free_ss_debug_count < 20) {
        free_ss_debug_count++;
        int is_same = tiny_free_is_same_thread_ss(ss, slab_idx, my_tid);
        extern int g_sfc_enabled;
        fprintf(stderr, "[FREE_SS] base=%p, cls=%u, same_thread=%d, sfc_enabled=%d\n",
                base,
                meta->class_idx,
                is_same,
                g_sfc_enabled);
    }
#endif

    // Box 6 Boundary: Ownership check (TOCTOU-safe)
    if (__builtin_expect(!tiny_free_is_same_thread_ss(ss, slab_idx, my_tid), 0)) {
#if HAKMEM_DEBUG_COUNTERS
        // Track cross-thread frees (compile-time gated)
        g_free_via_ss_remote[meta->class_idx]++;
#endif
        return 0;  // Cross-thread → caller should delegate to remote path
    }

    // Fast path: Same-thread free (2-3 instructions)
    int class_idx = meta->class_idx;
    // Phase E1-CORRECT: base pointer already converted by caller (no double conversion!)

#if HAKMEM_DEBUG_COUNTERS
    // Track same-thread frees (compile-time gated)
    g_free_via_ss_local[class_idx]++;
#endif

    // Box 5 integration: class5 can use dedicated TLS List hotpath
    extern int g_sfc_enabled;
    if (__builtin_expect(g_tiny_hotpath_class5 && class_idx == 5, 0)) {
        TinyTLSList* tls5 = &g_tls_lists[5];
        // Use guarded push for class5 to avoid sentinel/next poisoning during triage
        tls_list_push(tls5, base, 5);
    } else if (g_sfc_enabled) {
        // Box 5-NEW: Try SFC (128-256 slots)
        if (!sfc_free_push(class_idx, base)) {
            // SFC full → skip caching, use slow path (return 0)
            // Do NOT fall back to SLL - it has no capacity check and would grow unbounded!
            return 0;
        }
    } else {
        // Box 5-OLD: Use SLL (16 slots)
        tiny_alloc_fast_push(class_idx, base);
    }

    // Active accounting (Box 3: SuperSlab)
    // This is relatively cheap (atomic decrement) and necessary for memory management
    ss_active_dec_one(ss);

    return 1;  // Success
}

// Free fast path for Legacy TinySlab-backed allocation
// Returns: 1 on success (same-thread), 0 on failure (cross-thread)
//
// ⚠️  CRITICAL: ptr parameter must be BASE pointer (already converted from USER via ptr-1)
static inline int tiny_free_fast_legacy(TinySlab* slab, void* base) {
    // Box 6 Boundary: Ownership check
    if (__builtin_expect(!tiny_free_is_same_thread_legacy(slab), 0)) {
        return 0;  // Cross-thread → caller should delegate to precise path
    }

    // Fast path: Same-thread free
    int class_idx = slab->class_idx;
    // Phase E1-CORRECT: base pointer already converted by caller (no double conversion!)

    // Box 5-NEW/5-OLD integration: Push to TLS freelist (SFC or SLL)
    extern int g_sfc_enabled;
    if (g_sfc_enabled) {
        // Box 5-NEW: Try SFC (128 slots)
        if (!sfc_free_push(class_idx, base)) {
            // SFC full → skip caching, use slow path (return 0)
            // Do NOT fall back to SLL - it has no capacity check and would grow unbounded!
            return 0;
        }
    } else {
        // Box 5-OLD: Use SLL (16 slots)
        tiny_alloc_fast_push(class_idx, base);
    }

    return 1;  // Success
}

// ========== Combined Fast Free (Lookup + Ownership + Push) ==========

// Complete fast free path (inline for zero-cost)
// Returns: none (delegates to backend on cross-thread or non-tiny)
//
// Flow:
// 1. Lookup ptr → SuperSlab or TinySlab
// 2. Ownership check (owner_tid == my_tid)
// 3. Same-thread → TLS freelist push (2-3 instructions)
// 4. Cross-thread → Delegate to Box 2 (Remote Queue)
// 5. Not Tiny → Delegate to backend (Mid/Large)
//
// Example usage:
//   tiny_free_fast(ptr);  // Always succeeds (delegates on failure)
static inline void tiny_free_fast(void* ptr) {
    // Optional runtime gate to disable fast free and route to slow path
    // Env: HAKMEM_TINY_FREE_FAST (default: 1). Additionally, if
    // HAKMEM_TINY_FREE_TO_SS=1 is set, prefer SS path by disabling fast free.
    static int s_free_fast_en = -1;
    if (__builtin_expect(s_free_fast_en == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_FREE_FAST");
        int v = (e && *e && *e != '0') ? 1 : 1; // default ON
        const char* to_ss = getenv("HAKMEM_TINY_FREE_TO_SS");
        if (to_ss && *to_ss && *to_ss != '0') v = 0; // FREE_TO_SS implies slow path
        s_free_fast_en = v;
    }
    if (!s_free_fast_en) {
        // Delegate to precise slow path (handles same/remote + publish)
        hak_tiny_free(ptr);
        return;
    }
    // 1. SuperSlab-backed tiny pointer?
    if (__builtin_expect(g_use_superslab != 0, 1)) {
        SuperSlab* ss = hak_super_lookup(ptr);
        if (__builtin_expect(ss != NULL && ss->magic == SUPERSLAB_MAGIC, 0)) {
            // ✅ FIX: Phase E1-CORRECT - Convert USER → BASE before slab index calculation
            void* base = (void*)((uint8_t*)ptr - 1);
            int slab_idx = slab_index_for(ss, base);
            uint32_t self_tid = tiny_self_u32();

            // Box 6 Boundary: Try same-thread fast path
            // CRITICAL: Pass BASE pointer (already converted above)
            if (tiny_free_fast_ss(ss, slab_idx, base, self_tid)) {
                return;  // Success: same-thread, pushed to TLS
            }

            // Cross-thread free → Box 2 (Remote Queue)
            // Delegate to full tiny free (handles remote push)
            hak_tiny_free(ptr);
            return;
        }
    }

    // 2. Legacy TinySlab-backed pointer?
    TinySlab* slab = hak_tiny_owner_slab(ptr);
    if (__builtin_expect(slab != NULL, 0)) {
        // Convert USER → BASE (for Legacy path)
        void* base_legacy = (void*)((uint8_t*)ptr - 1);

        // Box 6 Boundary: Try same-thread fast path
        // CRITICAL: Pass BASE pointer (already converted above)
        if (tiny_free_fast_legacy(slab, base_legacy)) {
            return;  // Success: same-thread, pushed to TLS
        }

        // Cross-thread free → precise path with known slab
        hak_tiny_free_with_slab(ptr, slab);
        return;
    }

    // 3. Not a tiny allocation → Delegate to backend (Mid/Large/Mmap)
    hak_free_at(ptr, 0, 0);
}

// ========== Guard/Debug Variants ==========

// Free with additional safety checks (for debugging/testing)
// This variant includes:
// - Sentinel checks (0xBADA55)
// - Double-free detection
// - Ownership validation
//
// Usage: Enable with HAKMEM_SAFE_FREE=1 environment variable
static inline void tiny_free_fast_guarded(void* ptr) {
    // TODO: Implement guard checks if needed
    // For now, delegate to standard fast path
    tiny_free_fast(ptr);
}

// ========== Statistics & Diagnostics ==========

// Free fast path stats (for profiling)
typedef struct {
    uint64_t same_thread_count;   // Same-thread frees (TLS push)
    uint64_t cross_thread_count;  // Cross-thread frees (remote queue)
    uint64_t non_tiny_count;      // Non-tiny frees (backend)
} TinyFreeFastStats;

// Get free fast path stats (TLS-local)
static __thread TinyFreeFastStats g_tiny_free_fast_stats = {0};

static inline TinyFreeFastStats tiny_free_fast_stats_get(void) {
    return g_tiny_free_fast_stats;
}

// Reset free fast path stats (for testing/benchmarking)
static inline void tiny_free_fast_stats_reset(void) {
    g_tiny_free_fast_stats.same_thread_count = 0;
    g_tiny_free_fast_stats.cross_thread_count = 0;
    g_tiny_free_fast_stats.non_tiny_count = 0;
}

// ========== Performance Notes ==========
//
// Expected metrics:
// - Same-thread hit rate: 80-90% (workload dependent)
// - Same-thread latency: 2-3 instructions (ownership check + push)
// - Cross-thread penalty: ~50-100 instructions (remote queue push)
// - Throughput improvement: +10-20% vs current multi-layer design
//
// Key optimizations:
// 1. Ownership check first (fail-fast on cross-thread)
// 2. `__builtin_expect` for branch prediction (same-thread is common)
// 3. `static inline` for zero-cost abstraction
// 4. TLS variables (no atomic ops in same-thread path)
//
// TOCTOU Race Prevention (Box 4 Boundary):
// - Ownership check is atomic (tiny_atomic_load_u32_relaxed)
// - No time window between check and push (single function)
// - Cross-thread frees are immediately delegated (no TLS touch)
//
// Comparison with current design:
// - Current: 38.43% free overhead (free.part.0 + mid_lookup + locks)
// - New: 2-3 instructions (ownership check + TLS push)
// - Reduction: -90% instructions in same-thread path
//
// Inspired by:
// - System tcache (glibc malloc) - fast same-thread free
// - Box Theory - Clear ownership boundaries
// - TOCTOU fix (Box 4) - Atomic ownership check