// hakmem/core/front/malloc_tiny_fast.h

// malloc_tiny_fast.h - Phase 26: Front Gate Unification (Tiny Fast Path)
//
// Goal: Eliminate 3-layer overhead (malloc → hak_alloc_at → wrapper → tiny_alloc_fast)
// Target: +10-15% performance (11.35M → 12.5-13.5M ops/s)
//
// Design (ChatGPT analysis):
//   - Replace: malloc → hak_alloc_at (236 lines) → wrapper (diagnostics) → tiny_alloc_fast
//   - With: malloc → malloc_tiny_fast (single-layer, direct to Unified Cache)
//   - Preserves: Safety checks (lock depth, initializing, LD_SAFE, jemalloc block)
//   - Leverages: Phase 23 Unified Cache (tcache-style, 2-3 cache misses)
//
// Performance:
//   - Current overhead: malloc(8.97%) + routing + wrapper(3.63%) + tiny(5.37%) = 17.97%
//   - BenchFast ceiling: 8-10 instructions (~1-2% overhead)
//   - Gap: ~16%
//   - Target: Close half the gap (+10-15% improvement)
//
// ENV Variables:
//   HAKMEM_FRONT_GATE_UNIFIED=0  # Disable Front Gate Unification (default: ON; only a value starting with '0' turns it off)

#ifndef HAK_FRONT_MALLOC_TINY_FAST_H
#define HAK_FRONT_MALLOC_TINY_FAST_H

#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include <pthread.h>  // For pthread_self() in cross-thread check
#include "../hakmem_build_flags.h"
#include "../hakmem_tiny_config.h"  // For TINY_NUM_CLASSES
#include "../hakmem_super_registry.h"  // For cross-thread owner check
#include "../superslab/superslab_inline.h"  // For ss_fast_lookup, slab_index_for (Phase 12)
#include "../box/ss_slab_meta_box.h"   // For ss_slab_meta_owner_tid_low_get
#include "../box/free_remote_box.h"    // For tiny_free_remote_box
#include "tiny_unified_cache.h"     // For unified_cache_pop_or_refill
#include "../tiny_region_id.h"      // For tiny_region_id_write_header
#include "../hakmem_tiny.h"         // For hak_tiny_size_to_class
#include "../box/tiny_front_hot_box.h"  // Phase 4-Step2: Hot Path Box
#include "../box/tiny_front_cold_box.h" // Phase 4-Step2: Cold Path Box

// Helper: low 32 bits of the calling thread's pthread_self() value.
// Used by the cross-thread owner check in free_tiny_fast().
#ifndef TINY_SELF_U32_LOCAL_DEFINED
#define TINY_SELF_U32_LOCAL_DEFINED
static inline uint32_t tiny_self_u32_local(void) {
    // Round-trip through uintptr_t first, then truncate to 32 bits.
    const uintptr_t self_bits = (uintptr_t)pthread_self();
    return (uint32_t)self_bits;
}
#endif

// ============================================================================
// ENV Control (cached, lazy init)
// ============================================================================

// Enable flag for Front Gate Unification (default: ON).
//
// Reads HAKMEM_FRONT_GATE_UNIFIED on the first call and caches the answer in a
// function-local static. The gate is disabled only when the variable is set
// and its first character is '0'; unset, empty, or any other value enables it.
//
// Returns: 1 when enabled, 0 when disabled.
static inline int front_gate_unified_enabled(void) {
    static int g_enable = -1;  // -1 = environment not consulted yet

    // Common case: already resolved, return the cached value.
    if (__builtin_expect(g_enable != -1, 1)) {
        return g_enable;
    }

    const char* env_value = getenv("HAKMEM_FRONT_GATE_UNIFIED");
    if (env_value != NULL && env_value[0] == '0') {
        g_enable = 0;
    } else {
        g_enable = 1;
    }

#if !HAKMEM_BUILD_RELEASE
    // Non-release builds announce the decision once, and only when enabled.
    if (g_enable) {
        fprintf(stderr, "[FrontGate-INIT] front_gate_unified_enabled() = %d\n", g_enable);
        fflush(stderr);
    }
#endif
    return g_enable;
}

// ============================================================================
// Phase 4-Step2: malloc_tiny_fast() - Hot/Cold Path Box (ACTIVE)
// ============================================================================

// Ultra-thin Tiny allocation using Hot/Cold Path Box (Phase 4-Step2)
//
// IMPROVEMENTS over Phase 26-A:
//   - Branch reduction: Hot path has only 1 branch (cache empty check)
//   - Branch hints: TINY_HOT_LIKELY/UNLIKELY for better CPU prediction
//   - Hot/Cold separation: Keeps hot path small (better i-cache locality)
//   - Explicit fallback: Clear hot→cold transition
//
// PERFORMANCE:
//   - Baseline (Phase 26-A, no PGO): 53.3 M ops/s
//   - Hot/Cold Box (no PGO): 57.2 M ops/s (+7.3%)
//
// DESIGN:
//   1. size → class_idx (same as Phase 26-A)
//   2. Hot path: tiny_hot_alloc_fast() - cache hit (1 branch)
//   3. Cold path: tiny_cold_refill_and_alloc() - cache miss (noinline, cold)
//
// Preconditions:
//   - Called AFTER malloc() safety checks (lock depth, initializing, LD_SAFE)
//   - size <= tiny_get_max_size() (caller verified)
// Returns:
//   - USER pointer on success
//   - NULL on failure (caller falls back to normal path)
//
__attribute__((always_inline))
static inline void* malloc_tiny_fast(size_t size) {
    // Step 1: translate the requested size into its Tiny class index.
    const int cls = hak_tiny_size_to_class(size);

    // Step 2: ask the hot-path box for a block of that class.
    void* user_ptr = tiny_hot_alloc_fast(cls);

    // Step 3: a non-NULL result is returned as-is (expected case); otherwise
    // the cold-path box is asked to refill and allocate, and its result
    // (possibly NULL) is returned to the caller.
    return TINY_HOT_LIKELY(user_ptr != NULL) ? user_ptr
                                             : tiny_cold_refill_and_alloc(cls);
}

// ============================================================================
// Phase 26-B: free_tiny_fast() - Ultra-thin Tiny deallocation
// ============================================================================

// Single-layer Tiny deallocation (bypasses hak_free_at + wrapper + diagnostics).
//
// With HAKMEM_TINY_HEADER_CLASSIDX defined, the function reads the one-byte
// header stored at ptr - 1, validates it, optionally routes cross-thread frees
// to the remote queue (HAKMEM_TINY_LARSON_FIX), and otherwise pushes the block's
// BASE address (ptr - 1) onto the Unified Cache of the decoded class.
// Without HAKMEM_TINY_HEADER_CLASSIDX it always declines (returns 0).
//
// Preconditions:
//   - ptr is from malloc_tiny_fast() (has valid header)
//   - Front Gate Unified is enabled
// Returns:
//   - 1 when the block was consumed here: pushed to the Unified Cache, or
//     accepted by tiny_free_remote_box() on the cross-thread path
//   - 0 when the block was NOT consumed (NULL ptr, ptr on a 4 KiB boundary,
//     header magic/class mismatch, remote push failure, Unified Cache push
//     failure, or header mode disabled); caller falls back to normal free path
__attribute__((always_inline))
static inline int free_tiny_fast(void* ptr) {
    // NULL is not handled on the fast path.
    if (__builtin_expect(!ptr, 0)) return 0;

    #ifdef HAKMEM_TINY_HEADER_CLASSIDX
    // 1. Page-boundary guard:
    //    When ptr sits at the start of a page (offset == 0), ptr - 1 may lie in
    //    a different page or in unmapped memory. In that case skip the header
    //    read entirely and fall back to the normal free path.
    //    The 0xFFF mask tests the low 12 bits, i.e. a 4 KiB boundary.
    uintptr_t off = (uintptr_t)ptr & 0xFFFu;
    if (__builtin_expect(off == 0, 0)) {
        return 0;
    }

    // 2. Fast header magic validation (mandatory).
    //    In release builds tiny_region_id_read_header() omits the magic check,
    //    so the Tiny-specific header (0xA0) is verified here directly.
    uint8_t* header_ptr = (uint8_t*)ptr - 1;
    uint8_t header = *header_ptr;
    uint8_t magic = header & 0xF0u;  // high nibble carries the magic
    if (__builtin_expect(magic != HEADER_MAGIC, 0)) {
        // Not a Tiny header -> Mid/Large/foreign pointer, use the normal free path.
        return 0;
    }

    // 3. Extract class_idx (low 4 bits) and reject out-of-range values.
    int class_idx = (int)(header & HEADER_CLASS_MASK);
    if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) {
        return 0;
    }

    // 4. Compute BASE (one byte before the USER pointer); this is the address
    //    later pushed to the Unified Cache.
    void* base = (void*)((char*)ptr - 1);

    // Cross-thread free detection (Larson MT crash fix, ENV gated).
    // Active only when HAKMEM_TINY_LARSON_FIX is set to a non-empty value that
    // does not start with '0'; the decision is cached per thread.
    {
        static __thread int g_larson_fix = -1;  // -1 = not yet read on this thread
        if (__builtin_expect(g_larson_fix == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_LARSON_FIX");
            g_larson_fix = (e && *e && *e != '0') ? 1 : 0;
#if !HAKMEM_BUILD_RELEASE
            fprintf(stderr, "[LARSON_FIX_INIT] g_larson_fix=%d (env=%s)\n", g_larson_fix, e ? e : "NULL");
            fflush(stderr);
#endif
        }

        if (__builtin_expect(g_larson_fix, 0)) {
            // Phase 12 optimization: Use fast mask-based lookup (~5-10 cycles vs 50-100)
            SuperSlab* ss = ss_fast_lookup(base);
            if (ss) {
                int slab_idx = slab_index_for(ss, base);
                // Only consult owner metadata for a slab index inside the SuperSlab's capacity.
                if (__builtin_expect(slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss), 1)) {
                    uint32_t self_tid = tiny_self_u32_local();
                    uint8_t owner_tid_low = ss_slab_meta_owner_tid_low_get(ss, slab_idx);
                    // LARSON FIX: Use bits 8-15 for comparison (pthread TIDs aligned to 256 bytes)
                    uint8_t self_tid_cmp = (uint8_t)((self_tid >> 8) & 0xFFu);
#if !HAKMEM_BUILD_RELEASE
                    // Debug builds: trace the first 10 owner checks.
                    // NOTE(review): atomic_fetch_add needs <stdatomic.h>, which this header
                    // does not include directly - presumably pulled in transitively; confirm.
                    static _Atomic uint64_t g_owner_check_count = 0;
                    uint64_t oc = atomic_fetch_add(&g_owner_check_count, 1);
                    if (oc < 10) {
                        fprintf(stderr, "[LARSON_FIX] Owner check: ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x match=%d\n",
                                ptr, owner_tid_low, self_tid_cmp, self_tid, (owner_tid_low == self_tid_cmp));
                        fflush(stderr);
                    }
#endif

                    if (__builtin_expect(owner_tid_low != self_tid_cmp, 0)) {
                        // Cross-thread free → route to remote queue instead of poisoning TLS cache
#if !HAKMEM_BUILD_RELEASE
                        // Debug builds: trace the first 20 cross-thread frees.
                        static _Atomic uint64_t g_cross_thread_count = 0;
                        uint64_t ct = atomic_fetch_add(&g_cross_thread_count, 1);
                        if (ct < 20) {
                            fprintf(stderr, "[LARSON_FIX] Cross-thread free detected! ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x\n",
                                    ptr, owner_tid_low, self_tid_cmp, self_tid);
                            fflush(stderr);
                        }
#endif
                        TinySlabMeta* meta = &ss->slabs[slab_idx];
                        // NOTE(review): the USER pointer (ptr) is passed here, whereas the
                        // Unified Cache push below uses base - confirm this matches the
                        // contract of tiny_free_remote_box().
                        if (tiny_free_remote_box(ss, slab_idx, meta, ptr, self_tid)) {
                            return 1;  // handled via remote queue
                        }
                        return 0;  // remote push failed; fall back to normal path
                    }
                }
            }
        }
    }

    // Debug: Log free operations (first 5000, all classes)
#if !HAKMEM_BUILD_RELEASE
    {
        extern _Atomic uint64_t g_debug_op_count;
        extern __thread TinyTLSSLL g_tls_sll[];
        uint64_t op = atomic_fetch_add(&g_debug_op_count, 1);
        // Note: Shares g_debug_op_count with alloc logging, so bump the window.
        if (op < 5000) {
            fprintf(stderr, "[OP#%04lu FREE] cls=%d ptr=%p base=%p from=free_tiny_fast tls_count_before=%u\n",
                    (unsigned long)op, class_idx, ptr, base,
                    g_tls_sll[class_idx].count);
            fflush(stderr);
        }
    }
#endif

    // Same-thread (or unchecked) free: hand BASE to the Unified Cache.
    int pushed = unified_cache_push(class_idx, base);
    if (__builtin_expect(pushed, 1)) {
        return 1;  // Success
    }

    // Unified Cache full -> fall back to the normal free path.
    return 0;
    #else
    // No header mode - fall back to normal free
    return 0;
    #endif
}

#endif // HAK_FRONT_MALLOC_TINY_FAST_H