// malloc_tiny_fast.h - Phase 26: Front Gate Unification (Tiny Fast Path) // // Goal: Eliminate 3-layer overhead (malloc → hak_alloc_at → wrapper → tiny_alloc_fast) // Target: +10-15% performance (11.35M → 12.5-13.5M ops/s) // // Design (ChatGPT analysis): // - Replace: malloc → hak_alloc_at (236 lines) → wrapper (diagnostics) → tiny_alloc_fast // - With: malloc → malloc_tiny_fast (single-layer, direct to Unified Cache) // - Preserves: Safety checks (lock depth, initializing, LD_SAFE, jemalloc block) // - Leverages: Phase 23 Unified Cache (tcache-style, 2-3 cache misses) // // Performance: // - Current overhead: malloc(8.97%) + routing + wrapper(3.63%) + tiny(5.37%) = 17.97% // - BenchFast ceiling: 8-10 instructions (~1-2% overhead) // - Gap: ~16% // - Target: Close half the gap (+10-15% improvement) // // ENV Variables: // HAKMEM_FRONT_GATE_UNIFIED=1 # Enable Front Gate Unification (default: 0, OFF) #ifndef HAK_FRONT_MALLOC_TINY_FAST_H #define HAK_FRONT_MALLOC_TINY_FAST_H #include #include #include #include #include // For pthread_self() in cross-thread check #include "../hakmem_build_flags.h" #include "../hakmem_tiny_config.h" // For TINY_NUM_CLASSES #include "../hakmem_super_registry.h" // For cross-thread owner check #include "../superslab/superslab_inline.h" // For ss_fast_lookup, slab_index_for (Phase 12) #include "../box/ss_slab_meta_box.h" // For ss_slab_meta_owner_tid_low_get #include "../box/free_remote_box.h" // For tiny_free_remote_box #include "tiny_unified_cache.h" // For unified_cache_pop_or_refill #include "../tiny_region_id.h" // For tiny_region_id_write_header #include "../hakmem_tiny.h" // For hak_tiny_size_to_class #include "../box/tiny_env_box.h" // For tiny_env_cfg() (ENV variables) #include "../box/tiny_front_hot_box.h" // Phase 4-Step2: Hot Path Box #include "../box/tiny_front_cold_box.h" // Phase 4-Step2: Cold Path Box #include "../box/tiny_c7_hotbox.h" // Optional: C7 専用ホットボックス #include "../box/tiny_heap_box.h" // TinyHeap 汎用 Box #include 
"../box/tiny_hotheap_v2_box.h" // TinyHotHeap v2 (Phase31 A/B) #include "../box/smallobject_hotbox_v3_box.h" // SmallObject HotHeap v3 skeleton #include "../box/smallobject_hotbox_v4_box.h" // SmallObject HotHeap v4 (C7 stub) #include "../box/smallobject_hotbox_v5_box.h" // SmallObject HotHeap v5 (C6-only route stub, Phase v5-1) #include "../box/smallobject_core_v6_box.h" // SmallObject Core v6 (Phase V6-HDR-2) #include "../box/smallobject_v6_env_box.h" // SmallObject v6 ENV control (Phase V6-HDR-2) #include "../box/smallobject_hotbox_v7_box.h" // SmallObject HotBox v7 stub (Phase v7-1) #include "../box/smallobject_policy_v7_box.h" // Phase v7-4: Policy Box #include "../box/tiny_static_route_box.h" // Phase 3 C3: Static routing (policy snapshot bypass) #include "../box/smallobject_mid_v35_box.h" // Phase v11a-3: MID v3.5 HotBox #include "../box/tiny_c7_ultra_box.h" // C7 ULTRA stub (UF-1, delegates to v3) #include "../box/tiny_c6_ultra_free_box.h" // Phase 4-2: C6 ULTRA-free (free-only, C6-only) #include "../box/tiny_c5_ultra_free_box.h" // Phase 5-1/5-2: C5 ULTRA-free + alloc integration #include "../box/tiny_c4_ultra_free_box.h" // Phase 6: C4 ULTRA-free + alloc integration (cap=64) #include "../box/tiny_ultra_tls_box.h" // Phase TLS-UNIFY-1: Unified ULTRA TLS API #include "../box/tiny_ultra_classes_box.h" // Phase REFACTOR-1: Named constants for C4-C7 #include "../box/tiny_legacy_fallback_box.h" // Phase REFACTOR-2: Legacy fallback logic unification #include "../box/tiny_ptr_convert_box.h" // Phase REFACTOR-3: Inline pointer macro centralization #include "../box/tiny_front_v3_env_box.h" // Tiny front v3 snapshot gate #include "../box/tiny_heap_env_box.h" // ENV gate for TinyHeap front (A/B) #include "../box/tiny_route_env_box.h" // Route snapshot (Heap vs Legacy) #include "../box/tiny_front_stats_box.h" // Front class distribution counters #include "../box/free_path_stats_box.h" // Phase FREE-LEGACY-BREAKDOWN-1: Free path stats #include 
"../box/alloc_gate_stats_box.h" // Phase ALLOC-GATE-OPT-1: Alloc gate stats #include "../box/free_policy_fast_v2_box.h" // Phase POLICY-FAST-PATH-V2: Policy snapshot bypass #include "../box/free_tiny_fast_hotcold_env_box.h" // Phase FREE-TINY-FAST-HOTCOLD-OPT-1: ENV control #include "../box/free_tiny_fast_hotcold_stats_box.h" // Phase FREE-TINY-FAST-HOTCOLD-OPT-1: Stats #include "../box/tiny_metadata_cache_hot_box.h" // Phase 3 C2: Policy hot cache (metadata cache optimization) #include "../box/tiny_free_route_cache_env_box.h" // Phase 3 D1: Free path route cache // Helper: current thread id (low 32 bits) for owner check #ifndef TINY_SELF_U32_LOCAL_DEFINED #define TINY_SELF_U32_LOCAL_DEFINED static inline uint32_t tiny_self_u32_local(void) { return (uint32_t)(uintptr_t)pthread_self(); } #endif // ============================================================================ // ENV Control (cached, lazy init) // ============================================================================ // Enable flag (default: 0, OFF) static inline int front_gate_unified_enabled(void) { static int g_enable = -1; if (__builtin_expect(g_enable == -1, 0)) { const char* e = getenv("HAKMEM_FRONT_GATE_UNIFIED"); g_enable = (e && *e && *e == '0') ? 
0 : 1; // default ON #if !HAKMEM_BUILD_RELEASE if (g_enable) { fprintf(stderr, "[FrontGate-INIT] front_gate_unified_enabled() = %d\n", g_enable); fflush(stderr); } #endif } return g_enable; } // ============================================================================ // Phase REFACTOR-2: Legacy free helper (unified in tiny_legacy_fallback_box.h) // ============================================================================ // Legacy free handling is encapsulated in tiny_legacy_fallback_box.h // (Removed inline implementation to avoid duplication) // ============================================================================ // Phase 4-Step2: malloc_tiny_fast() - Hot/Cold Path Box (ACTIVE) // ============================================================================ // Ultra-thin Tiny allocation using Hot/Cold Path Box (Phase 4-Step2) // // IMPROVEMENTS over Phase 26-A: // - Branch reduction: Hot path has only 1 branch (cache empty check) // - Branch hints: TINY_HOT_LIKELY/UNLIKELY for better CPU prediction // - Hot/Cold separation: Keeps hot path small (better i-cache locality) // - Explicit fallback: Clear hot→cold transition // // PERFORMANCE: // - Baseline (Phase 26-A, no PGO): 53.3 M ops/s // - Hot/Cold Box (no PGO): 57.2 M ops/s (+7.3%) // // DESIGN: // 1. size → class_idx (same as Phase 26-A) // 2. Hot path: tiny_hot_alloc_fast() - cache hit (1 branch) // 3. 
Cold path: tiny_cold_refill_and_alloc() - cache miss (noinline, cold) // // Preconditions: // - Called AFTER malloc() safety checks (lock depth, initializing, LD_SAFE) // - size <= tiny_get_max_size() (caller verified) // Returns: // - USER pointer on success // - NULL on failure (caller falls back to normal path) // // Phase ALLOC-TINY-FAST-DUALHOT-2: Probe window ENV gate (safe from early putenv) static inline int alloc_dualhot_enabled(void) { static int g = -1; static int g_probe_left = 64; // Probe window: tolerate early putenv before gate init if (__builtin_expect(g == -1, 0)) { const char* e = getenv("HAKMEM_TINY_ALLOC_DUALHOT"); if (e && *e && *e != '0') { g = 1; } else if (g_probe_left > 0) { g_probe_left--; // Still probing: return "not yet set" without committing 0 if (e == NULL) { return 0; // Env not set (yet), but keep probing } } else { g = 0; // Probe window exhausted, commit to 0 } } return g; } // Phase 2 B3: tiny_alloc_route_cold() - Handle rare routes (V7, MID, ULTRA) // NOTE: noinline to avoid code bloat in hot path, placed in cold section __attribute__((noinline, cold)) static void* tiny_alloc_route_cold(SmallRouteKind route_kind, int class_idx, size_t size) { switch (route_kind) { case SMALL_ROUTE_ULTRA: { // Phase TLS-UNIFY-1: Unified ULTRA TLS pop for C4-C6 (C7 handled above) void* base = tiny_ultra_tls_pop((uint8_t)class_idx); if (TINY_HOT_LIKELY(base != NULL)) { if (class_idx == 6) FREE_PATH_STAT_INC(c6_ultra_alloc_hit); else if (class_idx == 5) FREE_PATH_STAT_INC(c5_ultra_alloc_hit); else if (class_idx == 4) FREE_PATH_STAT_INC(c4_ultra_alloc_hit); return tiny_base_to_user_inline(base); } // ULTRA miss → fallback to LEGACY break; } case SMALL_ROUTE_MID_V35: { // Phase v11a-3: MID v3.5 allocation void* v35p = small_mid_v35_alloc(class_idx, size); if (TINY_HOT_LIKELY(v35p != NULL)) { return v35p; } // MID v3.5 miss → fallback to LEGACY break; } case SMALL_ROUTE_V7: { // Phase v7: SmallObject v7 allocation (research box) void* v7p = 
// ============================================================================
// Phase ALLOC-GATE-SSOT-1: malloc_tiny_fast_for_class() - body (class_idx known)
// ============================================================================
// Allocation front for a pre-validated Tiny class index. Dispatch order:
//   1. C7 ULTRA early-exit (skips policy snapshot for the most common class)
//   2. C0-C3 dual-hot direct path (ENV gated, skips policy/route lookup)
//   3. Route selection: static route table or policy hot cache
//   4. Route dispatch: B3 shape (LEGACY-likely + cold helper) or original switch
//   5. LEGACY unified-cache hot/cold fallback
// Returns a USER pointer, or NULL (caller falls back to the normal path).
__attribute__((always_inline)) static inline void* malloc_tiny_fast_for_class(size_t size, int class_idx) {
    // Stats (class_idx already validated by gate)
    tiny_front_alloc_stat_inc(class_idx);
    ALLOC_GATE_STAT_INC_CLASS(class_idx);

    // Phase v11a-5b: C7 ULTRA early-exit (skip policy snapshot for common case)
    // This is the most common hot path - avoids TLS policy overhead.
    if (class_idx == 7 && tiny_c7_ultra_enabled_env()) {
        void* ultra_p = tiny_c7_ultra_alloc(size);
        if (TINY_HOT_LIKELY(ultra_p != NULL)) {
            return ultra_p;
        }
        // C7 ULTRA miss → fall through to policy-based routing
    }

    // Phase ALLOC-TINY-FAST-DUALHOT-2: C0-C3 direct path (second hot path)
    // Skip expensive policy snapshot and route determination for C0-C3.
    // NOTE: branch only taken if class_idx <= 3 (rare when OFF, frequent when ON).
    if ((unsigned)class_idx <= 3u) {
        if (alloc_dualhot_enabled()) {
            // Direct to LEGACY unified cache (no policy snapshot)
            void* ptr = tiny_hot_alloc_fast(class_idx);
            if (TINY_HOT_LIKELY(ptr != NULL)) {
                return ptr;
            }
            return tiny_cold_refill_and_alloc(class_idx);
        }
    }

    // 2. Route selection: static route table (Phase 3 C3), or policy hot cache
    //    (Phase 3 C2) when the static table is not ready.
    SmallRouteKind route_kind;
    if (tiny_static_route_ready_fast()) {
        route_kind = tiny_static_route_get_kind_fast(class_idx);
    } else {
        // Phase 3 C2: policy hot cache eliminates the policy_snapshot() call
        route_kind = tiny_policy_hot_get_route(class_idx);
    }

    // Phase 2 B3: Routing dispatch (ENV gate HAKMEM_TINY_ALLOC_ROUTE_SHAPE)
    // Optimized shape: LIKELY on LEGACY (common case), cold helper for rare routes.
    const tiny_env_cfg_t* env_cfg = tiny_env_cfg();
    if (TINY_HOT_LIKELY(env_cfg->alloc_route_shape)) {
        if (TINY_HOT_LIKELY(route_kind == SMALL_ROUTE_LEGACY)) {
            // Phase 3 C1: TLS cache prefetch (pull g_unified_cache[class_idx] into L1)
            if (__builtin_expect(env_cfg->tiny_prefetch, 0)) {
                __builtin_prefetch(&g_unified_cache[class_idx], 0, 3);
            }
            // LEGACY fast path: Unified Cache hot/cold
            void* ptr = tiny_hot_alloc_fast(class_idx);
            if (TINY_HOT_LIKELY(ptr != NULL)) {
                return ptr;
            }
            return tiny_cold_refill_and_alloc(class_idx);
        }
        // Rare routes: delegate to cold helper
        return tiny_alloc_route_cold(route_kind, class_idx, size);
    }

    // Original dispatch (backward compatible, default).
    // 3. Single switch on route_kind (all ENV checks moved to Policy init).
    //    NOTE(review): this switch intentionally mirrors tiny_alloc_route_cold();
    //    it is kept inline here so the default shape has no noinline/cold call.
    switch (route_kind) {
    case SMALL_ROUTE_ULTRA: {
        // Phase TLS-UNIFY-1: Unified ULTRA TLS pop for C4-C6 (C7 handled above)
        void* base = tiny_ultra_tls_pop((uint8_t)class_idx);
        if (TINY_HOT_LIKELY(base != NULL)) {
            if (class_idx == 6) FREE_PATH_STAT_INC(c6_ultra_alloc_hit);
            else if (class_idx == 5) FREE_PATH_STAT_INC(c5_ultra_alloc_hit);
            else if (class_idx == 4) FREE_PATH_STAT_INC(c4_ultra_alloc_hit);
            return tiny_base_to_user_inline(base);
        }
        // ULTRA miss → fallback to LEGACY
        break;
    }
    case SMALL_ROUTE_MID_V35: {
        // Phase v11a-3: MID v3.5 allocation
        void* v35p = small_mid_v35_alloc(class_idx, size);
        if (TINY_HOT_LIKELY(v35p != NULL)) {
            return v35p;
        }
        // MID v3.5 miss → fallback to LEGACY
        break;
    }
    case SMALL_ROUTE_V7: {
        // Phase v7: SmallObject v7 allocation (research box)
        void* v7p = small_heap_alloc_fast_v7_stub(size, (uint8_t)class_idx);
        if (TINY_HOT_LIKELY(v7p != NULL)) {
            return v7p;
        }
        // V7 miss → fallback to LEGACY
        break;
    }
    case SMALL_ROUTE_MID_V3: {
        // Phase MID-V3: MID v3 allocation (257-768B, C5-C6)
        // MID v3 uses the same segment infrastructure as MID v3.5;
        // for now, delegate to MID v3.5 which handles both.
        void* v3p = small_mid_v35_alloc(class_idx, size);
        if (TINY_HOT_LIKELY(v3p != NULL)) {
            return v3p;
        }
        break;
    }
    case SMALL_ROUTE_LEGACY:
    default:
        break;
    }

    // Phase 3 C1: TLS cache prefetch (pull g_unified_cache[class_idx] into L1)
    if (__builtin_expect(env_cfg->tiny_prefetch, 0)) {
        __builtin_prefetch(&g_unified_cache[class_idx], 0, 3);
    }
    // LEGACY fallback: Unified Cache hot/cold path
    void* ptr = tiny_hot_alloc_fast(class_idx);
    if (TINY_HOT_LIKELY(ptr != NULL)) {
        return ptr;
    }
    return tiny_cold_refill_and_alloc(class_idx);
}

// Wrapper: size → class_idx conversion (SSOT).
// Single place where a byte size is mapped to a Tiny class; everything else
// works on class_idx. Returns NULL if size does not map to a valid Tiny class.
__attribute__((always_inline)) static inline void* malloc_tiny_fast(size_t size) {
    // Phase ALLOC-GATE-OPT-1: stat scatter point (1. function entry)
    ALLOC_GATE_STAT_INC(total_calls);
    // Phase ALLOC-GATE-SSOT-1: Single size→class conversion (SSOT)
    ALLOC_GATE_STAT_INC(size_to_class_calls);
    int class_idx = hak_tiny_size_to_class(size);
    if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) {
        return NULL;
    }
    // Delegate to *_for_class (stats tracked inside)
    return malloc_tiny_fast_for_class(size, class_idx);
}
// ============================================================================
// Phase FREE-TINY-FAST-HOTCOLD-OPT-1: Hot/Cold split helpers
// ============================================================================

// Cold path: Cross-thread free, TinyHeap routes, and legacy fallback.
// (noinline,cold keeps the hot path small and the I-cache clean.)
// Returns 1 when the free was handled here, 0 to fall back to the normal
// (non-fast) free path.
__attribute__((noinline,cold)) static int free_tiny_fast_cold(void* ptr, void* base, int class_idx) {
    FREE_TINY_FAST_HOTCOLD_STAT_INC(cold_hit);

    // Phase 3 D1: Free path route cache (eliminates tiny_route_for_class overhead)
    tiny_route_kind_t route;
    if (__builtin_expect(tiny_free_static_route_enabled(), 0)) {
        // Use cached route (bypasses tiny_route_for_class())
        route = g_tiny_route_class[(unsigned)class_idx & 7u];
        if (__builtin_expect(route == TINY_ROUTE_LEGACY && !g_tiny_route_snapshot_done, 0)) {
            // Fallback if the snapshot is not initialized yet
            route = tiny_route_for_class((uint8_t)class_idx);
        }
    } else {
        // Standard path
        route = tiny_route_for_class((uint8_t)class_idx);
    }
    const int use_tiny_heap = tiny_route_is_heap_kind(route);
    // NOTE(review): front_snap is never read below; the call may exist only for
    // its lazy-init side effect — confirm before removing.
    const TinyFrontV3Snapshot* front_snap = __builtin_expect(tiny_front_v3_enabled(), 0)
        ? tiny_front_v3_snapshot_get() : NULL;

    // TWO-SPEED: SuperSlab registration check is DEBUG-ONLY to keep HOT PATH fast.
    // In Release builds, we trust header magic (0xA0) as sufficient validation.
#if !HAKMEM_BUILD_RELEASE
    // Verify SuperSlab registration (guards against misclassifying foreign pointers)
    SuperSlab* ss_guard = hak_super_lookup(ptr);
    if (__builtin_expect(!(ss_guard && ss_guard->magic == SUPERSLAB_MAGIC), 0)) {
        return 0; // not managed by hakmem → normal free path
    }
#endif // !HAKMEM_BUILD_RELEASE

    // Cross-thread free detection (Larson MT crash fix, ENV gated) + TinyHeap free path
    {
        static __thread int g_larson_fix = -1;
        if (__builtin_expect(g_larson_fix == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_LARSON_FIX");
            g_larson_fix = (e && *e && *e != '0') ? 1 : 0;
#if !HAKMEM_BUILD_RELEASE
            fprintf(stderr, "[LARSON_FIX_INIT] g_larson_fix=%d (env=%s)\n", g_larson_fix, e ? e : "NULL");
            fflush(stderr);
#endif
        }
        if (__builtin_expect(g_larson_fix || use_tiny_heap, 0)) {
            // Phase 12 optimization: fast mask-based lookup (~5-10 cycles vs 50-100)
            SuperSlab* ss = ss_fast_lookup(base);
            // Phase FREE-LEGACY-BREAKDOWN-1: stat scatter point (5. super_lookup called)
            FREE_PATH_STAT_INC(super_lookup_called);
            if (ss) {
                int slab_idx = slab_index_for(ss, base);
                if (__builtin_expect(slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss), 1)) {
                    uint32_t self_tid = tiny_self_u32_local();
                    uint8_t owner_tid_low = ss_slab_meta_owner_tid_low_get(ss, slab_idx);
                    TinySlabMeta* meta = &ss->slabs[slab_idx];
                    // LARSON FIX: Use bits 8-15 for comparison (pthread TIDs aligned to 256 bytes)
                    uint8_t self_tid_cmp = (uint8_t)((self_tid >> 8) & 0xFFu);
#if !HAKMEM_BUILD_RELEASE
                    static _Atomic uint64_t g_owner_check_count = 0;
                    uint64_t oc = atomic_fetch_add(&g_owner_check_count, 1);
                    if (oc < 10) {
                        fprintf(stderr, "[LARSON_FIX] Owner check: ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x match=%d\n",
                                ptr, owner_tid_low, self_tid_cmp, self_tid, (owner_tid_low == self_tid_cmp));
                        fflush(stderr);
                    }
#endif
                    if (__builtin_expect(owner_tid_low != self_tid_cmp, 0)) {
                        // Cross-thread free → route to remote queue instead of poisoning TLS cache
                        FREE_TINY_FAST_HOTCOLD_STAT_INC(cold_cross_thread);
#if !HAKMEM_BUILD_RELEASE
                        static _Atomic uint64_t g_cross_thread_count = 0;
                        uint64_t ct = atomic_fetch_add(&g_cross_thread_count, 1);
                        if (ct < 20) {
                            fprintf(stderr, "[LARSON_FIX] Cross-thread free detected! ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x\n",
                                    ptr, owner_tid_low, self_tid_cmp, self_tid);
                            fflush(stderr);
                        }
#endif
                        if (tiny_free_remote_box(ss, slab_idx, meta, ptr, self_tid)) {
                            // Phase FREE-LEGACY-BREAKDOWN-1: stat scatter point (6. cross-thread free)
                            FREE_PATH_STAT_INC(remote_free);
                            return 1; // handled via remote queue
                        }
                        return 0; // remote push failed; fall back to normal path
                    }
                    // Same-thread + TinyHeap route → route-based free
                    if (__builtin_expect(use_tiny_heap, 0)) {
                        FREE_TINY_FAST_HOTCOLD_STAT_INC(cold_tinyheap);
                        switch (route) {
                        case TINY_ROUTE_SMALL_HEAP_V7: {
                            // Phase v7-1: C6-only v7 stub (MID v3 fallback)
                            if (small_heap_free_fast_v7_stub(ptr, (uint8_t)class_idx)) {
                                return 1;
                            }
                            break; // fallthrough to legacy
                        }
                        case TINY_ROUTE_SMALL_HEAP_V6: {
                            // Phase V6-HDR-2: Headerless free (ENV gated)
                            if (small_v6_headerless_route_enabled((uint8_t)class_idx)) {
                                SmallHeapCtxV6* ctx_v6 = small_heap_ctx_v6();
                                if (small_v6_headerless_free(ctx_v6, ptr, (uint8_t)class_idx)) {
                                    return 1; // Handled by v6
                                }
                                // v6 returned false -> fallback to legacy
                            }
                            break; // fallthrough to legacy
                        }
                        // Phase v10: v3/v4/v5 removed - routes now handled as LEGACY
                        case TINY_ROUTE_HOTHEAP_V2:
                            tiny_hotheap_v2_free((uint8_t)class_idx, base, meta);
                            // Phase FREE-LEGACY-BREAKDOWN-1: v2 is counted under tiny_heap_v1
                            FREE_PATH_STAT_INC(tiny_heap_v1_fast);
                            return 1;
                        case TINY_ROUTE_HEAP: {
                            tiny_heap_ctx_t* ctx = tiny_heap_ctx_for_thread();
                            if (class_idx == 7) {
                                tiny_c7_free_fast_with_meta(ss, slab_idx, base);
                            } else {
                                tiny_heap_free_class_fast_with_meta(ctx, class_idx, ss, slab_idx, base);
                            }
                            // Phase FREE-LEGACY-BREAKDOWN-1: stat scatter point (9. TinyHeap v1 route)
                            FREE_PATH_STAT_INC(tiny_heap_v1_fast);
                            return 1;
                        }
                        default:
                            break;
                        }
                    }
                }
            }
            if (use_tiny_heap) {
                // Fallback: lookup failed but TinyHeap front is ON → generic TinyHeap free
                if (route == TINY_ROUTE_HOTHEAP_V2) {
                    tiny_hotheap_v2_record_free_fallback((uint8_t)class_idx);
                }
                // Phase v10: v3/v4 removed - no special fallback
                tiny_heap_free_class_fast(tiny_heap_ctx_for_thread(), class_idx, ptr);
                return 1;
            }
        }
    }

    // Debug: Log free operations (first 5000, all classes)
#if !HAKMEM_BUILD_RELEASE
    {
        extern _Atomic uint64_t g_debug_op_count;
        extern __thread TinyTLSSLL g_tls_sll[];
        uint64_t op = atomic_fetch_add(&g_debug_op_count, 1);
        // Note: Shares g_debug_op_count with alloc logging, so bump the window.
        if (op < 5000) {
            fprintf(stderr, "[OP#%04lu FREE] cls=%d ptr=%p base=%p from=free_tiny_fast_cold tls_count_before=%u\n",
                    (unsigned long)op, class_idx, ptr, base, g_tls_sll[class_idx].count);
            fflush(stderr);
        }
    }
#endif

    // Phase REFACTOR-2: Legacy fallback (use unified helper)
    FREE_TINY_FAST_HOTCOLD_STAT_INC(cold_legacy_fallback);
    tiny_legacy_fallback_free_base(base, class_idx);
    return 1;
}
// Hot path: Fast-path validation + ULTRA/MID/V7 routes.
// (always_inline to minimize overhead on the critical path.)
// Returns 1 when the free was handled on a fast route, 0 to fall back to the
// normal free path. Anything requiring cross-thread or TinyHeap handling is
// delegated to free_tiny_fast_cold().
__attribute__((always_inline)) static inline int free_tiny_fast_hot(void* ptr) {
    if (__builtin_expect(!ptr, 0)) {
        FREE_TINY_FAST_HOTCOLD_STAT_INC(ret0_null_ptr);
        return 0;
    }
#if HAKMEM_TINY_HEADER_CLASSIDX
    // 1. Page-boundary guard:
    //    If ptr sits at the start of a page (offset == 0), ptr-1 may land in a
    //    different — possibly unmapped — page. Skip the header read and fall
    //    back to the normal free path in that case.
    uintptr_t off = (uintptr_t)ptr & 0xFFFu;
    if (__builtin_expect(off == 0, 0)) {
        FREE_TINY_FAST_HOTCOLD_STAT_INC(ret0_page_boundary);
        return 0;
    }
    // 2. Fast header magic validation (mandatory).
    //    Release builds omit the magic check in tiny_region_id_read_header(),
    //    so validate the Tiny-specific header (0xA0) ourselves here.
    uint8_t* header_ptr = (uint8_t*)ptr - 1;
    uint8_t header = *header_ptr;
    uint8_t magic = header & 0xF0u;
    if (__builtin_expect(magic != HEADER_MAGIC, 0)) {
        // Not a Tiny header → Mid/Large/foreign pointer, use the normal free path
        FREE_TINY_FAST_HOTCOLD_STAT_INC(ret0_bad_magic);
        return 0;
    }
    // 3. Extract class_idx (low 4 bits)
    int class_idx = (int)(header & HEADER_CLASS_MASK);
    if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) {
        FREE_TINY_FAST_HOTCOLD_STAT_INC(ret0_bad_class);
        return 0;
    }
    // 4. Compute BASE and push to the Unified Cache
    void* base = tiny_user_to_base_inline(ptr);
    tiny_front_free_stat_inc(class_idx);
    // Phase FREE-LEGACY-BREAKDOWN-1: stat scatter point (1. function entry)
    FREE_PATH_STAT_INC(total_calls);

    // Phase v11b-1: C7 ULTRA early-exit (skip policy snapshot for most common case)
    if (class_idx == 7 && tiny_c7_ultra_enabled_env()) {
        FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_c7_ultra);
        tiny_c7_ultra_free(ptr);
        FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_hit);
        return 1;
    }

    // Phase FREE-TINY-FAST-DUALHOT-1: C0-C3 direct path (48% of calls)
    // Skip expensive policy snapshot and route determination, direct to legacy fallback.
    // Safety: Check Larson mode (cross-thread free handling requires the full validation path).
    {
        static __thread int g_larson_fix = -1;
        if (__builtin_expect(g_larson_fix == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_LARSON_FIX");
            g_larson_fix = (e && *e && *e != '0') ? 1 : 0;
        }
        if (__builtin_expect(class_idx <= 3 && !g_larson_fix, 1)) {
            // C0-C3 + Larson mode OFF → direct to legacy (no policy snapshot overhead)
            tiny_legacy_fallback_free_base(base, class_idx);
            FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_hit);
            return 1;
        }
    }

    // Phase POLICY-FAST-PATH-V2: Skip policy snapshot for known-legacy classes
    if (free_policy_fast_v2_can_skip((uint8_t)class_idx)) {
        FREE_PATH_STAT_INC(policy_fast_v2_skip);
        FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_policy_fast_skip);
        goto cold_path; // Delegate to cold path for legacy handling
    }

    // Phase v11b-1: Policy-based single switch (replaces serial ULTRA checks)
    const SmallPolicyV7* policy_free = small_policy_v7_snapshot();
    SmallRouteKind route_kind_free = policy_free->route_kind[class_idx];
    switch (route_kind_free) {
    case SMALL_ROUTE_ULTRA: {
        // Phase TLS-UNIFY-1: Unified ULTRA TLS push for C4-C6 (C7 handled above)
        if (class_idx >= 4 && class_idx <= 6) {
            FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_ultra_tls);
            tiny_ultra_tls_push((uint8_t)class_idx, base);
            FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_hit);
            return 1;
        }
        // ULTRA for other classes → fallback to cold path
        break;
    }
    case SMALL_ROUTE_MID_V35: {
        // Phase v11a-3: MID v3.5 free
        FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_mid_v35);
        small_mid_v35_free(ptr, class_idx);
        FREE_PATH_STAT_INC(smallheap_v7_fast);
        FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_hit);
        return 1;
    }
    case SMALL_ROUTE_V7: {
        // Phase v7: SmallObject v7 free (research box)
        if (small_heap_free_fast_v7_stub(ptr, (uint8_t)class_idx)) {
            FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_v7);
            FREE_PATH_STAT_INC(smallheap_v7_fast);
            FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_hit);
            return 1;
        }
        // V7 miss → fallback to cold path
        break;
    }
    case SMALL_ROUTE_MID_V3: {
        // Phase MID-V3: delegate to MID v3.5
        FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_mid_v35);
        small_mid_v35_free(ptr, class_idx);
        FREE_PATH_STAT_INC(smallheap_v7_fast);
        FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_hit);
        return 1;
    }
    case SMALL_ROUTE_LEGACY:
    default:
        break;
    }

cold_path:
    // Delegate to cold path for cross-thread, TinyHeap, and legacy handling
    return free_tiny_fast_cold(ptr, base, class_idx);
#else
    // No header mode - fall back to normal free
    return 0;
#endif
}
// ============================================================================
// Phase 26-B: free_tiny_fast() - Ultra-thin Tiny deallocation
// ============================================================================
// Single-layer Tiny deallocation (bypasses hak_free_at + wrapper + diagnostics).
// NOTE(review): this is the pre-hot/cold-split combined path; it intentionally
// duplicates free_tiny_fast_hot()/free_tiny_fast_cold() minus the HOTCOLD stat
// counters — keep the three in sync when changing routing logic.
// Preconditions:
// - ptr is from malloc_tiny_fast() (has valid header)
// - Front Gate Unified is enabled
// Returns:
// - 1 on success (pushed to Unified Cache)
// - 0 on failure (caller falls back to normal free path)
__attribute__((always_inline)) static inline int free_tiny_fast(void* ptr) {
    if (__builtin_expect(!ptr, 0)) return 0;
#if HAKMEM_TINY_HEADER_CLASSIDX
    // 1. Page-boundary guard:
    //    If ptr sits at the start of a page (offset == 0), ptr-1 may land in a
    //    different — possibly unmapped — page. Skip the header read and fall
    //    back to the normal free path in that case.
    uintptr_t off = (uintptr_t)ptr & 0xFFFu;
    if (__builtin_expect(off == 0, 0)) {
        return 0;
    }
    // 2. Fast header magic validation (mandatory).
    //    Release builds omit the magic check in tiny_region_id_read_header(),
    //    so validate the Tiny-specific header (0xA0) ourselves here.
    uint8_t* header_ptr = (uint8_t*)ptr - 1;
    uint8_t header = *header_ptr;
    uint8_t magic = header & 0xF0u;
    if (__builtin_expect(magic != HEADER_MAGIC, 0)) {
        // Not a Tiny header → Mid/Large/foreign pointer, use the normal free path
        return 0;
    }
    // 3. Extract class_idx (low 4 bits)
    int class_idx = (int)(header & HEADER_CLASS_MASK);
    if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) {
        return 0;
    }
    // 4. Compute BASE and push to the Unified Cache
    void* base = tiny_user_to_base_inline(ptr);
    tiny_front_free_stat_inc(class_idx);
    // Phase FREE-LEGACY-BREAKDOWN-1: stat scatter point (1. function entry)
    FREE_PATH_STAT_INC(total_calls);

    // Phase v11b-1: C7 ULTRA early-exit (skip policy snapshot for most common case)
    if (class_idx == 7 && tiny_c7_ultra_enabled_env()) {
        tiny_c7_ultra_free(ptr);
        return 1;
    }

    // Phase POLICY-FAST-PATH-V2: Skip policy snapshot for known-legacy classes
    if (free_policy_fast_v2_can_skip((uint8_t)class_idx)) {
        FREE_PATH_STAT_INC(policy_fast_v2_skip);
        goto legacy_fallback;
    }

    // Phase v11b-1: Policy-based single switch (replaces serial ULTRA checks)
    const SmallPolicyV7* policy_free = small_policy_v7_snapshot();
    SmallRouteKind route_kind_free = policy_free->route_kind[class_idx];
    switch (route_kind_free) {
    case SMALL_ROUTE_ULTRA: {
        // Phase TLS-UNIFY-1: Unified ULTRA TLS push for C4-C6 (C7 handled above)
        if (class_idx >= 4 && class_idx <= 6) {
            tiny_ultra_tls_push((uint8_t)class_idx, base);
            return 1;
        }
        // ULTRA for other classes → fallback to LEGACY
        break;
    }
    case SMALL_ROUTE_MID_V35: {
        // Phase v11a-3: MID v3.5 free
        small_mid_v35_free(ptr, class_idx);
        FREE_PATH_STAT_INC(smallheap_v7_fast);
        return 1;
    }
    case SMALL_ROUTE_V7: {
        // Phase v7: SmallObject v7 free (research box)
        if (small_heap_free_fast_v7_stub(ptr, (uint8_t)class_idx)) {
            FREE_PATH_STAT_INC(smallheap_v7_fast);
            return 1;
        }
        // V7 miss → fallback to LEGACY
        break;
    }
    case SMALL_ROUTE_MID_V3: {
        // Phase MID-V3: delegate to MID v3.5
        small_mid_v35_free(ptr, class_idx);
        FREE_PATH_STAT_INC(smallheap_v7_fast);
        return 1;
    }
    case SMALL_ROUTE_LEGACY:
    default:
        break;
    }

legacy_fallback:
    // LEGACY fallback path.
    // NOTE(review): a declaration directly after a label is only valid from
    // C23 (GNU modes may reject it) — confirm the project's -std setting.
    // Phase 3 D1: Free path route cache (eliminates tiny_route_for_class overhead)
    tiny_route_kind_t route;
    if (__builtin_expect(tiny_free_static_route_enabled(), 0)) {
        // Use cached route (bypasses tiny_route_for_class())
        route = g_tiny_route_class[(unsigned)class_idx & 7u];
        if (__builtin_expect(route == TINY_ROUTE_LEGACY && !g_tiny_route_snapshot_done, 0)) {
            // Fallback if the snapshot is not initialized yet
            route = tiny_route_for_class((uint8_t)class_idx);
        }
    } else {
        // Standard path
        route = tiny_route_for_class((uint8_t)class_idx);
    }
    const int use_tiny_heap = tiny_route_is_heap_kind(route);
    // NOTE(review): front_snap is never read below; the call may exist only for
    // its lazy-init side effect — confirm before removing.
    const TinyFrontV3Snapshot* front_snap = __builtin_expect(tiny_front_v3_enabled(), 0)
        ? tiny_front_v3_snapshot_get() : NULL;

    // TWO-SPEED: SuperSlab registration check is DEBUG-ONLY to keep HOT PATH fast.
    // In Release builds, we trust header magic (0xA0) as sufficient validation.
#if !HAKMEM_BUILD_RELEASE
    // 5. Verify SuperSlab registration (guards against misclassifying foreign pointers)
    SuperSlab* ss_guard = hak_super_lookup(ptr);
    if (__builtin_expect(!(ss_guard && ss_guard->magic == SUPERSLAB_MAGIC), 0)) {
        return 0; // not managed by hakmem → normal free path
    }
#endif // !HAKMEM_BUILD_RELEASE

    // Cross-thread free detection (Larson MT crash fix, ENV gated) + TinyHeap free path
    {
        static __thread int g_larson_fix = -1;
        if (__builtin_expect(g_larson_fix == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_LARSON_FIX");
            g_larson_fix = (e && *e && *e != '0') ? 1 : 0;
#if !HAKMEM_BUILD_RELEASE
            fprintf(stderr, "[LARSON_FIX_INIT] g_larson_fix=%d (env=%s)\n", g_larson_fix, e ? e : "NULL");
            fflush(stderr);
#endif
        }
        if (__builtin_expect(g_larson_fix || use_tiny_heap, 0)) {
            // Phase 12 optimization: fast mask-based lookup (~5-10 cycles vs 50-100)
            SuperSlab* ss = ss_fast_lookup(base);
            // Phase FREE-LEGACY-BREAKDOWN-1: stat scatter point (5. super_lookup called)
            FREE_PATH_STAT_INC(super_lookup_called);
            if (ss) {
                int slab_idx = slab_index_for(ss, base);
                if (__builtin_expect(slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss), 1)) {
                    uint32_t self_tid = tiny_self_u32_local();
                    uint8_t owner_tid_low = ss_slab_meta_owner_tid_low_get(ss, slab_idx);
                    TinySlabMeta* meta = &ss->slabs[slab_idx];
                    // LARSON FIX: Use bits 8-15 for comparison (pthread TIDs aligned to 256 bytes)
                    uint8_t self_tid_cmp = (uint8_t)((self_tid >> 8) & 0xFFu);
#if !HAKMEM_BUILD_RELEASE
                    static _Atomic uint64_t g_owner_check_count = 0;
                    uint64_t oc = atomic_fetch_add(&g_owner_check_count, 1);
                    if (oc < 10) {
                        fprintf(stderr, "[LARSON_FIX] Owner check: ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x match=%d\n",
                                ptr, owner_tid_low, self_tid_cmp, self_tid, (owner_tid_low == self_tid_cmp));
                        fflush(stderr);
                    }
#endif
                    if (__builtin_expect(owner_tid_low != self_tid_cmp, 0)) {
                        // Cross-thread free → route to remote queue instead of poisoning TLS cache
#if !HAKMEM_BUILD_RELEASE
                        static _Atomic uint64_t g_cross_thread_count = 0;
                        uint64_t ct = atomic_fetch_add(&g_cross_thread_count, 1);
                        if (ct < 20) {
                            fprintf(stderr, "[LARSON_FIX] Cross-thread free detected! ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x\n",
                                    ptr, owner_tid_low, self_tid_cmp, self_tid);
                            fflush(stderr);
                        }
#endif
                        if (tiny_free_remote_box(ss, slab_idx, meta, ptr, self_tid)) {
                            // Phase FREE-LEGACY-BREAKDOWN-1: stat scatter point (6. cross-thread free)
                            FREE_PATH_STAT_INC(remote_free);
                            return 1; // handled via remote queue
                        }
                        return 0; // remote push failed; fall back to normal path
                    }
                    // Same-thread + TinyHeap route → route-based free
                    if (__builtin_expect(use_tiny_heap, 0)) {
                        switch (route) {
                        case TINY_ROUTE_SMALL_HEAP_V7: {
                            // Phase v7-1: C6-only v7 stub (MID v3 fallback)
                            if (small_heap_free_fast_v7_stub(ptr, (uint8_t)class_idx)) {
                                return 1;
                            }
                            break; // fallthrough to legacy
                        }
                        case TINY_ROUTE_SMALL_HEAP_V6: {
                            // Phase V6-HDR-2: Headerless free (ENV gated)
                            if (small_v6_headerless_route_enabled((uint8_t)class_idx)) {
                                SmallHeapCtxV6* ctx_v6 = small_heap_ctx_v6();
                                if (small_v6_headerless_free(ctx_v6, ptr, (uint8_t)class_idx)) {
                                    return 1; // Handled by v6
                                }
                                // v6 returned false -> fallback to legacy
                            }
                            break; // fallthrough to legacy
                        }
                        // Phase v10: v3/v4/v5 removed - routes now handled as LEGACY
                        case TINY_ROUTE_HOTHEAP_V2:
                            tiny_hotheap_v2_free((uint8_t)class_idx, base, meta);
                            // Phase FREE-LEGACY-BREAKDOWN-1: v2 is counted under tiny_heap_v1
                            FREE_PATH_STAT_INC(tiny_heap_v1_fast);
                            return 1;
                        case TINY_ROUTE_HEAP: {
                            tiny_heap_ctx_t* ctx = tiny_heap_ctx_for_thread();
                            if (class_idx == 7) {
                                tiny_c7_free_fast_with_meta(ss, slab_idx, base);
                            } else {
                                tiny_heap_free_class_fast_with_meta(ctx, class_idx, ss, slab_idx, base);
                            }
                            // Phase FREE-LEGACY-BREAKDOWN-1: stat scatter point (9. TinyHeap v1 route)
                            FREE_PATH_STAT_INC(tiny_heap_v1_fast);
                            return 1;
                        }
                        default:
                            break;
                        }
                    }
                }
            }
            if (use_tiny_heap) {
                // Fallback: lookup failed but TinyHeap front is ON → generic TinyHeap free
                if (route == TINY_ROUTE_HOTHEAP_V2) {
                    tiny_hotheap_v2_record_free_fallback((uint8_t)class_idx);
                }
                // Phase v10: v3/v4 removed - no special fallback
                tiny_heap_free_class_fast(tiny_heap_ctx_for_thread(), class_idx, ptr);
                return 1;
            }
        }
    }

    // Debug: Log free operations (first 5000, all classes)
#if !HAKMEM_BUILD_RELEASE
    {
        extern _Atomic uint64_t g_debug_op_count;
        extern __thread TinyTLSSLL g_tls_sll[];
        uint64_t op = atomic_fetch_add(&g_debug_op_count, 1);
        // Note: Shares g_debug_op_count with alloc logging, so bump the window.
        if (op < 5000) {
            fprintf(stderr, "[OP#%04lu FREE] cls=%d ptr=%p base=%p from=free_tiny_fast tls_count_before=%u\n",
                    (unsigned long)op, class_idx, ptr, base, g_tls_sll[class_idx].count);
            fflush(stderr);
        }
    }
#endif

    // Phase REFACTOR-2: Legacy fallback (use unified helper)
    tiny_legacy_fallback_free_base(base, class_idx);
    return 1;
#else
    // No header mode - fall back to normal free
    return 0;
#endif
}

#endif // HAK_FRONT_MALLOC_TINY_FAST_H