# Hakmem free-path review packet (compact)

Goal: understand the remaining fixed costs vs mimalloc/tcmalloc while staying within Box Theory (single boundary, reversible ENV gates).

SSOT bench conditions (current practice):

- `HAKMEM_PROFILE=MIXED_TINYV3_C7_SAFE`
- `ITERS=20000000 WS=400 RUNS=10`
- run via `scripts/run_mixed_10_cleanenv.sh`

Request:

1) Where is the dominant fixed cost on the free path now?
2) What structural change would give +5–10% without breaking Box Theory?
3) What NOT to do (layout-tax pitfalls)?

## Code excerpts (clipped)

### `core/box/tiny_free_gate_box.h`

```c
static inline int tiny_free_gate_try_fast(void* user_ptr) {
#if !HAKMEM_TINY_HEADER_CLASSIDX
    (void)user_ptr;
    // With the header disabled, the Tiny fast path itself is not used.
    return 0;
#else
    if (__builtin_expect(!user_ptr, 0)) {
        return 0;
    }

    // Layer 3a: lightweight fail-fast (always ON).
    // Obviously invalid addresses (extremely small values) are not handled on the
    // fast path; leave them to the slow path (hak_free_at + registry/header).
    {
        uintptr_t addr = (uintptr_t)user_ptr;
        if (__builtin_expect(addr < 4096, 0)) {
#if !HAKMEM_BUILD_RELEASE
            static _Atomic uint32_t g_free_gate_range_invalid = 0;
            uint32_t n = atomic_fetch_add_explicit(&g_free_gate_range_invalid, 1, memory_order_relaxed);
            if (n < 8) {
                fprintf(stderr, "[TINY_FREE_GATE_RANGE_INVALID] ptr=%p\n", user_ptr);
                fflush(stderr);
            }
#endif
            return 0;
        }
    }

    // Future extension point:
    // - Run Bridge + Guard only when DIAG is ON, and skip the fast path
    //   when the pointer is judged to be outside Tiny management.
#if !HAKMEM_BUILD_RELEASE
    if (__builtin_expect(tiny_free_gate_diag_enabled(), 0)) {
        TinyFreeGateContext ctx;
        if (!tiny_free_gate_classify(user_ptr, &ctx)) {
            // Outside Tiny management or Bridge failed → do not use the fast path.
            return 0;
        }
        (void)ctx; // Logging only for now; a Guard will be inserted here later.
    }
#endif

    // Delegate the real work to the existing ultra-fast free (behavior unchanged).
    return hak_tiny_free_fast_v2(user_ptr);
#endif
}
```

### `core/front/malloc_tiny_fast.h`

```c
static inline int free_tiny_fast(void* ptr) {
    if (__builtin_expect(!ptr, 0)) return 0;

#if HAKMEM_TINY_HEADER_CLASSIDX
    // 1. Page-boundary guard:
    //    If ptr sits at a page start (offset == 0), ptr-1 may be in a different page
    //    or an unmapped region. In that case skip the header read and fall back to
    //    the normal free path.
    uintptr_t off = (uintptr_t)ptr & 0xFFFu;
    if (__builtin_expect(off == 0, 0)) {
        return 0;
    }

    // 2. Fast header magic validation (mandatory).
    //    Release builds omit the magic check in tiny_region_id_read_header(),
    //    so validate the Tiny-specific header (0xA0) here ourselves.
    uint8_t* header_ptr = (uint8_t*)ptr - 1;
    uint8_t header = *header_ptr;
    uint8_t magic = header & 0xF0u;
    if (__builtin_expect(magic != HEADER_MAGIC, 0)) {
        // Not a Tiny header → Mid/Large/external pointer, so take the normal free path.
        return 0;
    }

    // 3. Extract class_idx (low 4 bits).
    int class_idx = (int)(header & HEADER_CLASS_MASK);
    if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) {
        return 0;
    }

    // 4. Compute BASE and push to the Unified Cache.
    void* base = tiny_user_to_base_inline(ptr);
    tiny_front_free_stat_inc(class_idx);

    // Phase FREE-LEGACY-BREAKDOWN-1: counter instrumentation (1. function entry)
    FREE_PATH_STAT_INC(total_calls);

    // Phase 19-3b: Consolidate ENV snapshot reads (capture once per free_tiny_fast call).
    const HakmemEnvSnapshot* env = hakmem_env_snapshot_enabled() ?
        hakmem_env_snapshot() : NULL;

    // Phase 9: MONO DUALHOT early-exit for C0-C3 (skip policy snapshot, direct to legacy)
    // Conditions:
    // - ENV: HAKMEM_FREE_TINY_FAST_MONO_DUALHOT=1
    // - class_idx <= 3 (C0-C3)
    // - !HAKMEM_TINY_LARSON_FIX (cross-thread handling requires full validation)
    // - g_tiny_route_snapshot_done == 1 && route == TINY_ROUTE_LEGACY
    //   (use the existing path when this cannot be determined)
    if ((unsigned)class_idx <= 3u) {
        if (free_tiny_fast_mono_dualhot_enabled()) {
            static __thread int g_larson_fix = -1;
            if (__builtin_expect(g_larson_fix == -1, 0)) {
                const char* e = getenv("HAKMEM_TINY_LARSON_FIX");
                g_larson_fix = (e && *e && *e != '0') ? 1 : 0;
            }
            if (!g_larson_fix && g_tiny_route_snapshot_done == 1 &&
                g_tiny_route_class[class_idx] == TINY_ROUTE_LEGACY) {
                // Direct path: Skip policy snapshot, go straight to legacy fallback
                FREE_PATH_STAT_INC(mono_dualhot_hit);
                tiny_legacy_fallback_free_base_with_env(base, (uint32_t)class_idx, env);
                return 1;
            }
        }
    }

    // Phase 10: MONO LEGACY DIRECT early-exit for C4-C7 (skip policy snapshot, direct to legacy)
    // Conditions:
    // - ENV: HAKMEM_FREE_TINY_FAST_MONO_LEGACY_DIRECT=1
    // - cached nonlegacy_mask: class is NOT in non-legacy mask (= ULTRA/MID/V7 not active)
    // - g_tiny_route_snapshot_done == 1 && route == TINY_ROUTE_LEGACY
    //   (use the existing path when this cannot be determined)
    // - !HAKMEM_TINY_LARSON_FIX (cross-thread handling requires full validation)
    if (free_tiny_fast_mono_legacy_direct_enabled()) {
        // 1. Check nonlegacy mask (computed once at init)
        uint8_t nonlegacy_mask = free_tiny_fast_mono_legacy_direct_nonlegacy_mask();
        if ((nonlegacy_mask & (1u << class_idx)) == 0) {
            // 2. Check route snapshot
            if (g_tiny_route_snapshot_done == 1 &&
                g_tiny_route_class[class_idx] == TINY_ROUTE_LEGACY) {
                // 3. Check Larson fix
                static __thread int g_larson_fix = -1;
                if (__builtin_expect(g_larson_fix == -1, 0)) {
                    const char* e = getenv("HAKMEM_TINY_LARSON_FIX");
                    g_larson_fix = (e && *e && *e != '0') ? 1 : 0;
                }
                if (!g_larson_fix) {
                    // Direct path: Skip policy snapshot, go straight to legacy fallback
                    FREE_PATH_STAT_INC(mono_legacy_direct_hit);
                    tiny_legacy_fallback_free_base_with_env(base, (uint32_t)class_idx, env);
                    return 1;
                }
            }
        }
    }

    // Phase v11b-1: C7 ULTRA early-exit (skip policy snapshot for most common case)
    // Phase 4 E1: Use ENV snapshot when enabled (consolidates 3 TLS reads → 1)
    // Phase 19-3a: Remove UNLIKELY hint (snapshot is ON by default in presets, hint is backwards)
    const bool c7_ultra_free = env ?
        env->tiny_c7_ultra_enabled : tiny_c7_ultra_enabled_env();
    if (class_idx == 7 && c7_ultra_free) {
        tiny_c7_ultra_free(ptr);
        return 1;
    }

    // Phase POLICY-FAST-PATH-V2: Skip policy snapshot for known-legacy classes
    if (free_policy_fast_v2_can_skip((uint8_t)class_idx)) {
        FREE_PATH_STAT_INC(policy_fast_v2_skip);
        goto legacy_fallback;
    }

    // Phase v11b-1: Policy-based single switch (replaces serial ULTRA checks)
    const SmallPolicyV7* policy_free = small_policy_v7_snapshot();
    SmallRouteKind route_kind_free = policy_free->route_kind[class_idx];
    switch (route_kind_free) {
        case SMALL_ROUTE_ULTRA: {
            // Phase TLS-UNIFY-1: Unified ULTRA TLS push for C4-C6 (C7 handled above)
            if (class_idx >= 4 && class_idx <= 6) {
                tiny_ultra_tls_push((uint8_t)class_idx, base);
                return 1;
            }
            // ULTRA for other classes → fallback to LEGACY
            break;
        }
        case SMALL_ROUTE_MID_V35: {
            // Phase v11a-3: MID v3.5 free
            small_mid_v35_free(ptr, class_idx);
            FREE_PATH_STAT_INC(smallheap_v7_fast);
            return 1;
        }
        case SMALL_ROUTE_V7: {
            // Phase v7: SmallObject v7 free (research box)
            if (small_heap_free_fast_v7_stub(ptr, (uint8_t)class_idx)) {
                FREE_PATH_STAT_INC(smallheap_v7_fast);
                return 1;
            }
            // V7 miss → fallback to LEGACY
            break;
        }
        case SMALL_ROUTE_MID_V3: {
            // Phase MID-V3: delegate to MID v3.5
            small_mid_v35_free(ptr, class_idx);
            FREE_PATH_STAT_INC(smallheap_v7_fast);
            return 1;
        }
        case SMALL_ROUTE_LEGACY:
        default:
            break;
    }

legacy_fallback:
    // LEGACY fallback path
    // Phase 19-6C: Compute route once using helper (avoid redundant tiny_route_for_class)
    tiny_route_kind_t route;
    int use_tiny_heap;
    free_tiny_fast_compute_route_and_heap(class_idx, &route, &use_tiny_heap);

    // TWO-SPEED: SuperSlab registration check is DEBUG-ONLY to keep HOT PATH fast.
    // In Release builds, we trust header magic (0xA0) as sufficient validation.
#if !HAKMEM_BUILD_RELEASE
    // 5. SuperSlab registration check (prevents misclassification)
    SuperSlab* ss_guard = hak_super_lookup(ptr);
    if (__builtin_expect(!(ss_guard && ss_guard->magic == SUPERSLAB_MAGIC), 0)) {
        return 0; // not managed by hakmem → normal free path
    }
#endif // !HAKMEM_BUILD_RELEASE

    // Cross-thread free detection (Larson MT crash fix, ENV gated) + TinyHeap free path
    {
        static __thread int g_larson_fix = -1;
        if (__builtin_expect(g_larson_fix == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_LARSON_FIX");
            g_larson_fix = (e && *e && *e != '0') ? 1 : 0;
#if !HAKMEM_BUILD_RELEASE
            fprintf(stderr, "[LARSON_FIX_INIT] g_larson_fix=%d (env=%s)\n", g_larson_fix, e ? e : "NULL");
            fflush(stderr);
#endif
        }
        if (__builtin_expect(g_larson_fix || use_tiny_heap, 0)) {
            // Phase 12 optimization: Use fast mask-based lookup (~5-10 cycles vs 50-100)
            SuperSlab* ss = ss_fast_lookup(base);
            // Phase FREE-LEGACY-BREAKDOWN-1: counter instrumentation (5. super_lookup call)
            FREE_PATH_STAT_INC(super_lookup_called);
            if (ss) {
                int slab_idx = slab_index_for(ss, base);
                if (__builtin_expect(slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss), 1)) {
                    uint32_t self_tid = tiny_self_u32_local();
                    uint8_t owner_tid_low = ss_slab_meta_owner_tid_low_get(ss, slab_idx);
                    TinySlabMeta* meta = &ss->slabs[slab_idx];
                    // LARSON FIX: Use bits 8-15 for comparison (pthread TIDs aligned to 256 bytes)
                    uint8_t self_tid_cmp = (uint8_t)((self_tid >> 8) & 0xFFu);
#if !HAKMEM_BUILD_RELEASE
                    static _Atomic uint64_t g_owner_check_count = 0;
                    uint64_t oc = atomic_fetch_add(&g_owner_check_count, 1);
                    if (oc < 10) {
                        fprintf(stderr, "[LARSON_FIX] Owner check: ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x match=%d\n",
                                ptr, owner_tid_low, self_tid_cmp, self_tid, (owner_tid_low == self_tid_cmp));
                        fflush(stderr);
                    }
#endif
                    if (__builtin_expect(owner_tid_low != self_tid_cmp, 0)) {
                        // Cross-thread free → route to remote queue instead of poisoning TLS cache
#if !HAKMEM_BUILD_RELEASE
                        static _Atomic uint64_t g_cross_thread_count = 0;
                        uint64_t ct = atomic_fetch_add(&g_cross_thread_count, 1);
                        if (ct < 20) {
                            fprintf(stderr, "[LARSON_FIX] Cross-thread free detected! ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x\n",
                                    ptr, owner_tid_low, self_tid_cmp, self_tid);
                            fflush(stderr);
                        }
#endif
                        if (tiny_free_remote_box(ss, slab_idx, meta, ptr, self_tid)) {
                            // Phase FREE-LEGACY-BREAKDOWN-1: counter instrumentation (6. cross-thread free)
                            FREE_PATH_STAT_INC(remote_free);
                            return 1; // handled via remote queue
```

### `core/box/tiny_front_hot_box.h`

```c
static inline int tiny_hot_free_fast(int class_idx, void* base) {
    extern __thread TinyUnifiedCache g_unified_cache[];

    // TLS cache access (1 cache miss)
    // NOTE: Range check removed - caller guarantees valid class_idx
    TinyUnifiedCache* cache = &g_unified_cache[class_idx];

#if HAKMEM_TINY_UNIFIED_LIFO_COMPILED
    // Phase 15 v1: Mode check at entry (once per call, not scattered in hot path)
    // Phase 22: Compile-out when disabled (default OFF)
    int lifo_mode = tiny_unified_lifo_enabled();

    // Phase 15 v1: LIFO vs FIFO mode switch
    if (lifo_mode) {
        // === LIFO MODE: Stack-based (LIFO) ===
        // Try push to stack (tail is stack depth)
        if (unified_cache_try_push_lifo(class_idx, base)) {
#if !HAKMEM_BUILD_RELEASE
            extern __thread uint64_t g_unified_cache_push[];
            g_unified_cache_push[class_idx]++;
#endif
            return 1; // SUCCESS
        }
        // LIFO overflow → fall through to cold path
#if !HAKMEM_BUILD_RELEASE
        extern __thread uint64_t g_unified_cache_full[];
        g_unified_cache_full[class_idx]++;
#endif
        return 0; // FULL
    }
#endif

    // === FIFO MODE: Ring-based (existing, default) ===
    // Calculate next tail (for full check)
    uint16_t next_tail = (cache->tail + 1) & cache->mask;

    // Branch 1: Cache full check (UNLIKELY full)
    // Hot path: cache has space (next_tail != head)
    // Cold path: cache full (next_tail == head) → drain needed
    if (TINY_HOT_LIKELY(next_tail != cache->head)) {
        // === HOT PATH: Cache has space (2-3 instructions) ===
        // Push to cache (1 cache miss for array write)
        cache->slots[cache->tail] = base;
        cache->tail = next_tail;

        // Debug metrics (zero overhead in release)
#if !HAKMEM_BUILD_RELEASE
        extern __thread uint64_t g_unified_cache_push[];
        g_unified_cache_push[class_idx]++;
#endif
        return 1; // SUCCESS
    }

    // === COLD PATH: Cache full ===
    // Don't drain here - let caller handle via tiny_cold_drain_and_free()
#if !HAKMEM_BUILD_RELEASE
    extern __thread uint64_t g_unified_cache_full[];
    g_unified_cache_full[class_idx]++;
#endif
    return 0; // FULL
}
```
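As a reference point for question 1 below, here is a stripped-down, self-contained sketch of what the header-classidx free hot path reduces to once every ENV gate, lazy-init `getenv` check, and stat counter above is removed. It is illustrative only: `SketchCache`, the `SKETCH_*` constants, `sketch_free_hot`, and the `base = ptr - 1` relation are assumptions invented for the sketch (the real definitions are `TinyUnifiedCache`, `HEADER_MAGIC` / `HEADER_CLASS_MASK`, and `tiny_user_to_base_inline()`); it is not a drop-in replacement for the excerpts.

```c
/* Illustrative sketch, NOT hakmem source: models the minimal dependency chain
 * header load -> magic compare -> class extract -> base compute -> TLS ring push.
 * All SKETCH_* names and the "user = base + 1" relation are assumptions. */
#include <stdint.h>

#define SKETCH_HEADER_MAGIC      0xA0u  /* high nibble, as in the excerpt */
#define SKETCH_HEADER_CLASS_MASK 0x0Fu  /* low nibble = class_idx */
#define SKETCH_NUM_CLASSES       8
#define SKETCH_CACHE_CAP         256    /* power of two (hypothetical capacity) */

typedef struct {                        /* hypothetical stand-in for TinyUnifiedCache */
    void*    slots[SKETCH_CACHE_CAP];
    uint16_t head, tail;
    uint16_t mask;                      /* must be SKETCH_CACHE_CAP - 1 after TLS init */
} SketchCache;

static __thread SketchCache g_sketch_cache[SKETCH_NUM_CLASSES];

static inline int sketch_free_hot(void* ptr)
{
    if (!ptr || (((uintptr_t)ptr & 0xFFFu) == 0)) return 0; /* NULL / page-start guard */
    uint8_t hdr = *((uint8_t*)ptr - 1);                     /* one dependent load */
    if ((hdr & 0xF0u) != SKETCH_HEADER_MAGIC) return 0;     /* not Tiny -> slow path */
    unsigned cls = hdr & SKETCH_HEADER_CLASS_MASK;          /* class_idx in low nibble */
    if (cls >= SKETCH_NUM_CLASSES) return 0;                /* mirrors TINY_NUM_CLASSES bound */
    void* base = (uint8_t*)ptr - 1;                         /* ASSUMPTION: user = base + 1 */
    SketchCache* c = &g_sketch_cache[cls];                  /* TLS base + indexed access */
    uint16_t next = (uint16_t)((c->tail + 1) & c->mask);
    if (next == c->head) return 0;                          /* full -> caller drains */
    c->slots[c->tail] = base;
    c->tail = next;
    return 1;
}
```

Everything the real excerpts layer on top of this (the repeated `getenv`-backed lazy inits, route/policy snapshots, per-class inline-slot probes, and per-op stat macros) is the candidate pool of fixed taxes that question 1 asks about.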
### `core/box/tiny_legacy_fallback_box.h`

```c
static inline void tiny_legacy_fallback_free_base_with_env(void* base, uint32_t class_idx,
                                                           const HakmemEnvSnapshot* env) {
    // Phase 80-1: Switch dispatch for C4/C5/C6 (branch reduction optimization)
    // Phase 83-1: Per-op branch removed via fixed-mode caching
    // C2/C3 excluded (NO-GO from Phase 77-1/79-1)
    if (tiny_inline_slots_switch_dispatch_enabled_fast()) {
        // Switch mode: Direct jump to case (zero comparison overhead for C4/C5/C6)
        switch (class_idx) {
            case 4:
                if (tiny_c4_inline_slots_enabled_fast()) {
                    if (c4_inline_push(c4_inline_tls(), base)) {
                        FREE_PATH_STAT_INC(legacy_fallback);
                        if (__builtin_expect(free_path_stats_enabled(), 0)) {
                            g_free_path_stats.legacy_by_class[class_idx]++;
                        }
                        return;
                    }
                }
                break;
            case 5:
                if (tiny_c5_inline_slots_enabled_fast()) {
                    if (c5_inline_push(c5_inline_tls(), base)) {
                        FREE_PATH_STAT_INC(legacy_fallback);
                        if (__builtin_expect(free_path_stats_enabled(), 0)) {
                            g_free_path_stats.legacy_by_class[class_idx]++;
                        }
                        return;
                    }
                }
                break;
            case 6:
                if (tiny_c6_inline_slots_enabled_fast()) {
                    if (c6_inline_push(c6_inline_tls(), base)) {
                        FREE_PATH_STAT_INC(legacy_fallback);
                        if (__builtin_expect(free_path_stats_enabled(), 0)) {
                            g_free_path_stats.legacy_by_class[class_idx]++;
                        }
                        return;
                    }
                }
                break;
            default:
                // C0-C3, C7: fall through to unified_cache push
                break;
        }
        // Switch mode: fall through to unified_cache push after miss
    } else {
        // If-chain mode (Phase 80-1 baseline): C3/C4/C5/C6 sequential checks
        // NOTE: C2 local cache (Phase 79-1 NO-GO) removed from hot path

        // Phase 77-1: C3 Inline Slots early-exit (ENV gated)
        // Try C3 inline slots SECOND (before C4/C5/C6/unified cache) for class 3
        if (class_idx == 3 && tiny_c3_inline_slots_enabled_fast()) {
            if (c3_inline_push(c3_inline_tls(), base)) {
                // Success: pushed to C3 inline slots
                FREE_PATH_STAT_INC(legacy_fallback);
                if (__builtin_expect(free_path_stats_enabled(), 0)) {
                    g_free_path_stats.legacy_by_class[class_idx]++;
                }
                return;
            }
            // FULL → fall through to C4/C5/C6/unified cache
        }

        // Phase 76-1: C4 Inline Slots early-exit (ENV gated)
        // Try C4 inline slots SECOND (before C5/C6/unified cache) for class 4
        if (class_idx == 4 && tiny_c4_inline_slots_enabled_fast()) {
            if (c4_inline_push(c4_inline_tls(), base)) {
                // Success: pushed to C4 inline slots
                FREE_PATH_STAT_INC(legacy_fallback);
                if (__builtin_expect(free_path_stats_enabled(), 0)) {
                    g_free_path_stats.legacy_by_class[class_idx]++;
                }
                return;
            }
            // FULL → fall through to C5/C6/unified cache
        }

        // Phase 75-2: C5 Inline Slots early-exit (ENV gated)
        // Try C5 inline slots SECOND (before C6 and unified cache) for class 5
        if (class_idx == 5 && tiny_c5_inline_slots_enabled_fast()) {
            if (c5_inline_push(c5_inline_tls(), base)) {
                // Success: pushed to C5 inline slots
                FREE_PATH_STAT_INC(legacy_fallback);
                if (__builtin_expect(free_path_stats_enabled(), 0)) {
                    g_free_path_stats.legacy_by_class[class_idx]++;
                }
                return;
            }
            // FULL → fall through to C6/unified cache
        }

        // Phase 75-1: C6 Inline Slots early-exit (ENV gated)
        // Try C6 inline slots THIRD (before unified cache) for class 6
        if (class_idx == 6 && tiny_c6_inline_slots_enabled_fast()) {
            if (c6_inline_push(c6_inline_tls(), base)) {
                // Success: pushed to C6 inline slots
                FREE_PATH_STAT_INC(legacy_fallback);
                if (__builtin_expect(free_path_stats_enabled(), 0)) {
                    g_free_path_stats.legacy_by_class[class_idx]++;
                }
                return;
            }
            // FULL → fall through to unified cache
        }
    } // End of if-chain mode

    const TinyFrontV3Snapshot* front_snap = env ?
        (env->tiny_front_v3_enabled ?
            tiny_front_v3_snapshot_get() : NULL) :
        (__builtin_expect(tiny_front_v3_enabled(), 0) ? tiny_front_v3_snapshot_get() : NULL);
    const bool metadata_cache_on = env ? env->tiny_metadata_cache_eff : tiny_metadata_cache_enabled();

    // Phase 3 C2 Patch 2: First page cache hint (optional fast-path)
    // Check if pointer is in cached page (avoids metadata lookup in future optimizations)
    if (__builtin_expect(metadata_cache_on, 0)) {
        // Note: This is a hint-only check. Even if it hits, we still use the standard path.
        // The cache will be populated during refill operations for future use.
        // Currently this just validates the cache state; actual optimization TBD.
        if (tiny_first_page_cache_hit(class_idx, base, 4096)) {
            // Future: could optimize metadata access here
        }
    }

    // Legacy fallback - Unified Cache push
    if (!front_snap || front_snap->unified_cache_on) {
        // Phase 74-3 (P0): FASTAPI path (ENV-gated)
        if (tiny_uc_fastapi_enabled()) {
            // Preconditions guaranteed:
            // - unified_cache_on == true (checked above)
            // - TLS init guaranteed by front_gate_unified_enabled() in malloc_tiny_fast.h
            // - Stats compiled-out in FAST builds
            if (unified_cache_push_fast(class_idx, HAK_BASE_FROM_RAW(base))) {
                FREE_PATH_STAT_INC(legacy_fallback);
                // Per-class breakdown (Phase 4-1)
                if (__builtin_expect(free_path_stats_enabled(), 0)) {
                    if (class_idx < 8) {
                        g_free_path_stats.legacy_by_class[class_idx]++;
                    }
                }
                return;
            }
            // FULL → fallback to slow path (rare)
        }

        // Original path (FASTAPI=0 or fallback)
        if (unified_cache_push(class_idx, HAK_BASE_FROM_RAW(base))) {
            FREE_PATH_STAT_INC(legacy_fallback);
            // Per-class breakdown (Phase 4-1)
            if (__builtin_expect(free_path_stats_enabled(), 0)) {
                if (class_idx < 8) {
                    g_free_path_stats.legacy_by_class[class_idx]++;
                }
            }
            return;
        }
    }

    // Final fallback
    tiny_hot_free_fast(class_idx, base);
}
```

## Questions to answer (please be concrete)

1) In these snippets, which checks/branches are still "per-op fixed taxes" on the hot free path?
   - Please point to specific lines/conditions and estimate the cost (branches/instructions or dependency chain).
2) Is `tiny_hot_free_fast()` already close to optimal, with the real bottleneck upstream (user->base / classify / route)?
   - If yes, what is the smallest structural refactor that removes that upstream fixed tax?
3) Should we introduce a "commit once" plan (freeze the chosen free path), or is branch prediction already making the lazy-init checks ~free here? (One possible shape is sketched after this packet.)
   - If "commit once", where should it live to avoid runtime gate overhead (bench_profile refresh boundary vs per-op)?
4) We have had many layout-tax regressions from code removal/reordering.
   - Which patterns here are most likely to trigger layout tax if changed?
   - How would you stage a safe A/B (same binary, ENV toggle) for your proposal?
5) If you could change just ONE of:
   - pointer classification to base/class_idx,
   - route determination,
   - unified cache push/pop structure,
   which is the highest ROI for +5–10% on WS=400?

[packet] done
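For questions 3 and 4, here is a minimal sketch of one way a "commit once" freeze could sit at the bench_profile / ENV-snapshot refresh boundary while remaining a same-binary, reversible A/B. Every identifier below is hypothetical, including the `HAKMEM_FREE_ROUTE_COMMIT` gate name; the real refresh hook, route resolution, and class count are whatever hakmem already uses.

```c
/* Hypothetical sketch for questions 3/4, not hakmem code: commit to a per-class
 * free route once at the refresh boundary, behind one reversible ENV gate
 * (HAKMEM_FREE_ROUTE_COMMIT is an invented name), so the A/B stays same-binary. */
#include <stdlib.h>
#include <stdatomic.h>

typedef enum { ROUTE_UNDECIDED = 0, ROUTE_LEGACY, ROUTE_ULTRA, ROUTE_MID } FrozenRoute;

static _Atomic FrozenRoute g_frozen_route[8];  /* written once, read per-op */
static _Atomic int         g_frozen_commit = 0;

/* Call from the bench_profile / ENV-snapshot refresh boundary, never per-op. */
static void frozen_route_commit(void)
{
    const char* e = getenv("HAKMEM_FREE_ROUTE_COMMIT");   /* hypothetical gate */
    if (!(e && *e && *e != '0')) return;                  /* A side: behavior unchanged */
    for (int c = 0; c < 8; c++) {
        /* Resolve here, once, whatever the per-op gates decide today
         * (route snapshot, nonlegacy mask, C7 ULTRA, Larson fix, ...). */
        FrozenRoute r = ROUTE_LEGACY;                      /* placeholder resolution */
        atomic_store_explicit(&g_frozen_route[c], r, memory_order_release);
    }
    atomic_store_explicit(&g_frozen_commit, 1, memory_order_release);
}

/* Per-op: one well-predicted load replaces the repeated lazy-init/ENV branches. */
static inline FrozenRoute frozen_route_for(int class_idx)
{
    if (!atomic_load_explicit(&g_frozen_commit, memory_order_acquire))
        return ROUTE_UNDECIDED;                            /* gate off: keep existing path */
    return atomic_load_explicit(&g_frozen_route[class_idx], memory_order_acquire);
}
```

Because the gate is read only when the commit runs, flipping the ENV back to 0 and re-running restores today's behavior without a rebuild, which keeps the reversibility contract of Box Theory and gives a clean same-binary A/B.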