// tiny_free_fast_v2.inc.h - Phase 7: Ultra-Fast Free Path (Header-based) // Purpose: Eliminate SuperSlab lookup bottleneck (52.63% CPU → <5%) // Design: Read class_idx from inline header (O(1), 2-3 cycles) // Performance: 1.2M → 40-60M ops/s (30-50x improvement) // // Key Innovation: Smart Headers // - 1-byte header before each block stores class_idx // - Slab[0]: 0% overhead (reuses 960B wasted padding) // - Other slabs: ~1.5% overhead (1 byte per block) // - Total: <2% memory overhead for 30-50x speed gain // // Flow (3-5 instructions, 5-10 cycles): // 1. Read class_idx from header (ptr-1) [1 instruction, 2-3 cycles] // 2. Push to TLS freelist [2-3 instructions, 3-5 cycles] // 3. Done! (No lookup, no validation, no atomic) #pragma once #include // For getenv() in cross-thread check ENV gate #include // For pthread_self() in cross-thread check #include "tiny_region_id.h" #include "hakmem_build_flags.h" #include "hakmem_tiny_config.h" // For TINY_TLS_MAG_CAP, TINY_NUM_CLASSES #include "box/tls_sll_box.h" // Box TLS-SLL API #include "box/tls_sll_drain_box.h" // Box TLS-SLL Drain (Option B) #include "hakmem_tiny_integrity.h" // PRIORITY 1-4: Corruption detection #include "hakmem_env_cache.h" // Priority-2: ENV cache (eliminate syscalls) // Ring Cache and Unified Cache removed (A/B test: OFF is faster) #include "hakmem_super_registry.h" // For hak_super_lookup (cross-thread check) #include "superslab/superslab_inline.h" // For slab_index_for (cross-thread check) #include "box/ss_slab_meta_box.h" // Phase 3d-A: SlabMeta Box boundary #include "box/free_remote_box.h" // For tiny_free_remote_box (cross-thread routing) #include "box/ptr_conversion_box.h" // Phase 10: Correct pointer arithmetic // Phase 7: Header-based ultra-fast free #if HAKMEM_TINY_HEADER_CLASSIDX // External TLS variables (defined in hakmem_tiny.c) extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES]; extern int g_tls_sll_enable; // Honored for fast free: when 0, fall back to slow path // External functions extern void hak_tiny_free(void* ptr); // Fallback for non-header allocations // Inline helper: Get current thread ID (lower 32 bits) #ifndef TINY_SELF_U32_LOCAL_DEFINED #define TINY_SELF_U32_LOCAL_DEFINED static inline uint32_t tiny_self_u32_local(void) { return (uint32_t)(uintptr_t)pthread_self(); } #endif // ========== Ultra-Fast Free (Header-based) ========== // Ultra-fast free for header-based allocations // Returns: 1 if handled, 0 if needs slow path // // Performance: 3-5 instructions, 5-10 cycles // vs Current: 330+ lines, 500+ cycles (100x faster!) // // Assembly (x86-64, release build): // movzbl -0x1(%rdi),%eax // Read header (class_idx) // mov g_tls_sll_head(,%rax,8),%rdx // Load head // mov %rdx,(%rdi) // ptr->next = head // mov %rdi,g_tls_sll_head(,%rax,8) // head = ptr // addl $0x1,g_tls_sll_count(,%rax,4) // count++ // ret // // Expected: 3-5 instructions, 5-10 cycles (L1 hit) static inline int hak_tiny_free_fast_v2(void* ptr) { if (__builtin_expect(!ptr, 0)) return 0; // Respect global SLL toggle: when disabled, do not use TLS SLL fast path. if (__builtin_expect(!g_tls_sll_enable, 0)) { return 0; // Force slow path } // Phase E3-1: Remove registry lookup (50-100 cycles overhead) // Reason: Phase E1 added headers to C7, making this check redundant // Header magic validation (2-3 cycles) is now sufficient for all classes // Expected: 9M → 30-50M ops/s recovery (+226-443%) // CRITICAL: Check if header is accessible before reading // FIX: Use ptr directly, not ptr-1, for validation if possible, or trust lookup // void* header_addr = (char*)ptr - 1; // <-- Dangerous for C0 #if !HAKMEM_BUILD_RELEASE // Debug: Validate header accessibility (metadata-based check) // Phase 9: mincore() REMOVED - no syscall overhead (0 cycles) // Strategy: Trust internal metadata (registry ensures memory is valid) // Benefit: Catch invalid pointers via header magic validation below extern int hak_is_memory_readable(void* addr); if (!hak_is_memory_readable(ptr)) { // Check ptr, not header_addr return 0; // Header not accessible - not a Tiny allocation } #else // Release: Phase 9 optimization - mincore() completely removed // OLD: Page boundary check + mincore() syscall (~634 cycles) // NEW: No check needed - trust internal metadata (0 cycles) // Safety: Header magic validation below catches invalid pointers // Performance: 841 syscalls → 0 (100% elimination) // (Page boundary check removed - adds 1-2 cycles without benefit) #endif // 1. Read class_idx from header (2-3 cycles, L1 hit) // Note: In release mode, tiny_region_id_read_header() skips magic validation (saves 2-3 cycles) #if HAKMEM_DEBUG_VERBOSE static _Atomic int debug_calls = 0; if (atomic_fetch_add(&debug_calls, 1) < 5) { fprintf(stderr, "[TINY_FREE_V2] Before read_header, ptr=%p\n", ptr); } #endif // P2.1: Use class_map instead of Header to avoid Header/Next contention // ENV: HAKMEM_TINY_NO_CLASS_MAP=1 to disable (default: ON - class_map is preferred) // Priority-2: Use cached ENV (eliminate lazy-init TLS overhead) int class_idx = -1; { // P2.1: Default is ON (use class_map), HAK_ENV returns inverted logic int g_use_class_map = !HAK_ENV_TINY_NO_CLASS_MAP(); if (__builtin_expect(g_use_class_map, 1)) { // P1.2: class_map path - avoid Header read // FIX: Use ptr (USER) for lookup, NOT ptr-1 SuperSlab* ss = ss_fast_lookup(ptr); if (ss && ss->magic == SUPERSLAB_MAGIC) { // FIX: Use ptr (USER) for slab index int slab_idx = slab_index_for(ss, ptr); if (slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss)) { int map_class = tiny_get_class_from_ss(ss, slab_idx); if (map_class < TINY_NUM_CLASSES) { class_idx = map_class; #if HAKMEM_DEBUG_VERBOSE if (atomic_load(&debug_calls) <= 5) { fprintf(stderr, "[TINY_FREE_V2] class_map lookup: class_idx=%d\n", class_idx); } #endif } } } // Fallback to Header if class_map lookup failed if (class_idx < 0) { class_idx = tiny_region_id_read_header(ptr); #if HAKMEM_DEBUG_VERBOSE if (atomic_load(&debug_calls) <= 5) { fprintf(stderr, "[TINY_FREE_V2] class_map failed, Header fallback: class_idx=%d\n", class_idx); } #endif } } else { // P2.1: Fallback to Header read (disabled class_map mode) class_idx = tiny_region_id_read_header(ptr); #if HAKMEM_DEBUG_VERBOSE if (atomic_load(&debug_calls) <= 5) { fprintf(stderr, "[TINY_FREE_V2] Header read: class_idx=%d\n", class_idx); } #endif } } #if HAKMEM_DEBUG_VERBOSE if (atomic_load(&debug_calls) <= 5) { fprintf(stderr, "[TINY_FREE_V2] After read_header, class_idx=%d\n", class_idx); } #endif // TWO-SPEED: Header/meta cross-check is DEBUG-ONLY to keep HOT PATH fast. // In Release builds, we trust the header-based classification. #if !HAKMEM_BUILD_RELEASE // Cross-check header class vs meta class (if available from fast lookup) do { // Try fast owner slab lookup to get meta->class_idx for comparison // FIX: Use ptr (USER) SuperSlab* ss = hak_super_lookup(ptr); if (ss && ss->magic == SUPERSLAB_MAGIC) { // FIX: Use ptr (USER) int sidx = slab_index_for(ss, ptr); if (sidx >= 0 && sidx < ss_slabs_capacity(ss)) { TinySlabMeta* m = &ss->slabs[sidx]; uint8_t meta_cls = m->class_idx; if (meta_cls < TINY_NUM_CLASSES && meta_cls != (uint8_t)class_idx) { static _Atomic uint32_t g_hdr_meta_fast = 0; uint32_t n = atomic_fetch_add_explicit(&g_hdr_meta_fast, 1, memory_order_relaxed); if (n < 16) { fprintf(stderr, "[FREE_FAST_HDR_META_MISMATCH] hdr_cls=%d meta_cls=%u ptr=%p slab_idx=%d ss=%p\n", class_idx, (unsigned)meta_cls, ptr, sidx, (void*)ss); if (n < 4) { void* bt[8]; int frames = backtrace(bt, 8); backtrace_symbols_fd(bt, frames, fileno(stderr)); } fflush(stderr); } } } } } while (0); #endif // !HAKMEM_BUILD_RELEASE // Check if header read failed (invalid magic in debug, or out-of-bounds class_idx) if (__builtin_expect(class_idx < 0, 0)) { // Invalid header - route to slow path (non-header allocation or corrupted header) return 0; } // PRIORITY 1: Bounds check on class_idx from header if (__builtin_expect(class_idx >= TINY_NUM_CLASSES, 0)) { fprintf(stderr, "[TINY_FREE_V2] FATAL: class_idx=%d out of bounds (from header at %p)\n", class_idx, ptr); fflush(stderr); assert(0 && "class_idx from header out of bounds"); return 0; } #if !HAKMEM_BUILD_RELEASE atomic_fetch_add(&g_integrity_check_class_bounds, 1); #endif // 2. Check TLS freelist capacity (defense in depth - ALWAYS ENABLED) // CRITICAL: Enable in both debug and release to prevent corruption accumulation // Reason: If C7 slips through magic validation, capacity limit prevents unbounded growth // Cost: 1 comparison (~1 cycle, predict-not-taken) // Benefit: Fail-safe against TLS SLL pollution from false positives uint32_t cap = (uint32_t)TINY_TLS_MAG_CAP; if (__builtin_expect(g_tls_sll[class_idx].count >= cap, 0)) { return 0; // Route to slow path for spill (Front Gate will catch corruption) } // 3. Push base to TLS freelist (4 instructions, 5-7 cycles) // Must push base (block start) not user pointer! // Phase E1: ALL classes (C0-C7) have 1-byte header → base = ptr-1 // FIX: Use ptr_user_to_base(ptr, class_idx) logic void* base = HAK_BASE_TO_RAW(ptr_user_to_base(HAK_USER_FROM_RAW(ptr), class_idx)); // Phase 14-C: UltraHot は free 時に横取りしない(Borrowing 設計) // → 正史(TLS SLL)の在庫を正しく保つ // → UltraHot refill は alloc 側で TLS SLL から借りる // LARSON FIX (2025-11-16): Cross-thread free detection - ENV GATED // Problem: Larson MT crash - TLS SLL poison (0xbada55...) from cross-thread free // Root cause: Block allocated by Thread A, freed by Thread B → pushed to B's TLS SLL // → B allocates the block → metadata still points to A's SuperSlab → corruption // Solution: Check owner_tid_low, route cross-thread free to remote queue // Status: ENV-gated for performance (HAKMEM_TINY_LARSON_FIX=1 to enable) // Performance: OFF=5-10 cycles/free, ON=110-520 cycles/free (registry lookup overhead) { // Priority-2: Use cached ENV (eliminate lazy-init TLS syscall overhead) if (__builtin_expect(HAK_ENV_TINY_LARSON_FIX(), 0)) { // Cross-thread check enabled - MT safe mode // Phase 12 optimization: Use fast mask-based lookup (~5-10 cycles vs 50-100) SuperSlab* ss = ss_fast_lookup(base); if (__builtin_expect(ss != NULL, 1)) { // FIX: slab_index_for on BASE (since base is correct now) int slab_idx = slab_index_for(ss, base); if (__builtin_expect(slab_idx >= 0, 1)) { uint32_t self_tid = tiny_self_u32_local(); uint8_t owner_tid_low = ss_slab_meta_owner_tid_low_get(ss, slab_idx); // Check if this is a cross-thread free (compare bits 8-15; low 8 bits are 0 on glibc) uint8_t self_tid_cmp = (uint8_t)((self_tid >> 8) & 0xFFu); if (__builtin_expect(owner_tid_low != self_tid_cmp, 0)) { // Cross-thread free → remote queue routing TinySlabMeta* meta = &ss->slabs[slab_idx]; if (tiny_free_remote_box(ss, slab_idx, meta, ptr, self_tid)) { // Successfully queued to remote, done return 1; } // Remote push failed → fall through to slow path return 0; } // Same-thread free → continue to TLS SLL fast path below } } // SuperSlab lookup failed → fall through to TLS SLL (may be headerless C7) } } // REVERT E3-2: Use Box TLS-SLL for all builds (testing hypothesis) // Hypothesis: Box TLS-SLL acts as verification layer, masking underlying bugs #if !HAKMEM_BUILD_RELEASE // Address watcher: Check if this is the watched address being freed { extern uintptr_t get_watch_addr(void); uintptr_t watch = get_watch_addr(); if (watch != 0 && (uintptr_t)base == watch) { extern _Atomic uint64_t g_debug_op_count; extern __thread TinyTLSSLL g_tls_sll[]; uint64_t op = atomic_load(&g_debug_op_count); fprintf(stderr, "\n"); fprintf(stderr, "========================================\n"); fprintf(stderr, "[WATCH_FREE_HIT] Address %p freed!\n", base); fprintf(stderr, "========================================\n"); fprintf(stderr, " Operation: #%lu\n", (unsigned long)op); fprintf(stderr, " Class: %d\n", class_idx); fprintf(stderr, " User ptr: %p\n", ptr); fprintf(stderr, " Base ptr: %p\n", base); fprintf(stderr, " TLS count: %u (before free)\n", g_tls_sll[class_idx].count); fprintf(stderr, " TLS head: %p\n", g_tls_sll[class_idx].head); fprintf(stderr, "========================================\n"); fprintf(stderr, "\n"); fflush(stderr); // Print backtrace void* bt[16]; int frames = backtrace(bt, 16); fprintf(stderr, "[WATCH_FREE_BACKTRACE] %d frames:\n", frames); backtrace_symbols_fd(bt, frames, fileno(stderr)); fprintf(stderr, "\n"); fflush(stderr); // Abort to preserve state fprintf(stderr, "[WATCH_ABORT] Aborting on watched free...\n"); fflush(stderr); abort(); } } // Debug: Log free operations (first 2000, ALL classes) { extern _Atomic uint64_t g_debug_op_count; extern __thread TinyTLSSLL g_tls_sll[]; uint64_t op = atomic_fetch_add(&g_debug_op_count, 1); if (op < 2000) { // ALL classes, not just class 1 fprintf(stderr, "[OP#%04lu FREE] cls=%d ptr=%p base=%p tls_count_before=%u\n", (unsigned long)op, class_idx, ptr, base, g_tls_sll[class_idx].count); fflush(stderr); } } #endif if (!tls_sll_push(class_idx, base, UINT32_MAX)) { // C7 rejected or capacity exceeded - route to slow path return 0; } // P1.3/P2.2: Track active/tls_cached when block is freed (user gives it back) // ENV gate: HAKMEM_TINY_ACTIVE_TRACK=1 to enable (default: 0 for performance) // Flow: User → TLS SLL means active--, tls_cached++ // Priority-2: Use cached ENV (eliminate lazy-init TLS syscall overhead) { if (__builtin_expect(HAK_ENV_TINY_ACTIVE_TRACK(), 0)) { // Lookup the actual slab meta for this block SuperSlab* ss = ss_fast_lookup(base); if (ss && ss->magic == SUPERSLAB_MAGIC) { int slab_idx = slab_index_for(ss, base); if (slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss)) { TinySlabMeta* meta = &ss->slabs[slab_idx]; atomic_fetch_sub_explicit(&meta->active, 1, memory_order_relaxed); atomic_fetch_add_explicit(&meta->tls_cached, 1, memory_order_relaxed); // P2.2 } } } } // Option B: Periodic TLS SLL Drain (restore slab accounting consistency) // Purpose: Every N frees (default: 1024), drain TLS SLL → slab freelist // Impact: Enables empty detection → SuperSlabs freed → LRU cache functional // Cost: 2-3 cycles (counter increment + comparison, predict-not-taken) // Benefit: +1,300-1,700% throughput (563K → 8-10M ops/s expected) tiny_tls_sll_try_drain(class_idx); return 1; // Success - handled in fast path } // ========== Free Entry Point ========== // Entry point for free() - tries fast path first, falls back to slow path // // Flow: // 1. Try ultra-fast free (header-based) → 95-99% hit rate // 2. Miss → Fallback to slow path → 1-5% (non-header, cache full) // // Performance: // - Fast path: 5-10 cycles (header read + TLS push) // - Slow path: 500+ cycles (SuperSlab lookup + validation) // - Weighted average: ~10-30 cycles (vs 500+ current) static inline void hak_free_fast_v2_entry(void* ptr) { // Try ultra-fast free (header-based) if (__builtin_expect(hak_tiny_free_fast_v2(ptr), 1)) { return; // Success - done in 5-10 cycles! } // Slow path: Non-header allocation or TLS cache full hak_tiny_free(ptr); } // ========== Performance Counters (Debug) ========== #if !HAKMEM_BUILD_RELEASE // Performance counters (TLS, lightweight) static __thread uint64_t g_free_v2_fast_hits = 0; static __thread uint64_t g_free_v2_slow_hits = 0; // Track fast path hit rate static inline void hak_free_v2_track_fast(void) { g_free_v2_fast_hits++; } static inline void hak_free_v2_track_slow(void) { g_free_v2_slow_hits++; } // Print stats at exit static void hak_free_v2_print_stats(void) __attribute__((destructor)); static void hak_free_v2_print_stats(void) { uint64_t total = g_free_v2_fast_hits + g_free_v2_slow_hits; if (total == 0) return; double hit_rate = (double)g_free_v2_fast_hits / total * 100.0; fprintf(stderr, "[FREE_V2] Fast hits: %lu, Slow hits: %lu, Hit rate: %.2f%%\n", g_free_v2_fast_hits, g_free_v2_slow_hits, hit_rate); } #else // Release: No tracking overhead static inline void hak_free_v2_track_fast(void) {} static inline void hak_free_v2_track_slow(void) {} #endif // ========== Benchmark Comparison ========== // // Current (hak_tiny_free_superslab): // - 2x SuperSlab lookup: 200+ cycles // - Safety checks (O(n) duplicate scan): 100+ cycles // - Validation, atomics, diagnostics: 200+ cycles // - Total: 500+ cycles // - Throughput: 1.2M ops/s // // Phase 7 (hak_tiny_free_fast_v2): // - Header read: 2-3 cycles // - TLS push: 3-5 cycles // - Total: 5-10 cycles (100x faster!) // - Throughput: 40-60M ops/s (30-50x improvement) // // vs System malloc tcache: // - System: 10-15 cycles (3-4 instructions) // - HAKMEM: 5-10 cycles (3-5 instructions) // - Result: 70-110% of System speed (互角〜勝ち!) #endif // HAKMEM_TINY_HEADER_CLASSIDX