diff --git a/CURRENT_TASK.md b/CURRENT_TASK.md
index ae371e11..df0f6919 100644
--- a/CURRENT_TASK.md
+++ b/CURRENT_TASK.md
@@ -96,36 +96,38 @@
 ---

-## Next target: Phase 2 B4 (Wrapper Layer Hot/Cold Split)
-
-**The target is in sight**: with B1 NO-GO / B3 ADOPT decided, the hotspots are clear.
-- Winning boxes (FREE DUALHOT + B3) are already promoted to the mainline preset
-- The losing box (B1) is frozen behind an ENV guard, so it does not pollute the mainline
-
-**Remaining main cost**: wrapper layer (malloc/free) + safety checks + policy snapshot
-- Reaching mimalloc parity is unlikely, but another few percent up to low double digits can still be squeezed out
-
-### Phase 2 B4 (recommended): WRAPPER-SHAPE-1 (hot-path shaping of malloc/free)
+## Phase 2 B4: Wrapper Layer Hot/Cold Split ✅ ADOPT

 **Design notes**: `docs/analysis/PHASE2_B4_WRAPPER_SHAPE_1_DESIGN.md`

-**Goal**:
-- Push the wrapper entry's "rare checks" (LD mode, jemalloc, diagnostics) out into `noinline,cold` helpers
-- Keep the hot side to NULL check → Tiny fast → immediate return (shortest path)
-- Smaller I-cache footprint + better branch prediction
+**Goal**: push the wrapper entry's "rare checks" (LD mode, jemalloc, diagnostics) out into `noinline,cold` helpers

-**Implementation**:
-- ENV gate: `HAKMEM_WRAP_SHAPE=0/1` (default OFF)
-- malloc hot/cold split (core/box/hak_wrappers.inc.h)
-- free hot/cold split (core/box/hak_wrappers.inc.h)
+### Implementation complete ✅

-**A/B test plan**:
-- Mixed: 10-run (median)
-- C6-heavy: 5-run (mean)
-- GO condition: Mixed +1% or more → promote to preset
-- NO-GO condition: -1% or worse → freeze, keep as ENV opt-in
+**✅ Fully implemented**:
+- ENV gate: `HAKMEM_WRAP_SHAPE=0/1` (wrapper_env_box.h/c)
+- malloc_cold(): noinline,cold helper implemented (lines 93-142)
+- malloc hot/cold split: implemented (ENV gate check at lines 169-200)
+- free_cold(): noinline,cold helper implemented (lines 321-520)
+- **free hot/cold split**: implemented (wrap_shape dispatch at lines 550-574)

-**Expected gain**: Mixed +2-5%, C6-heavy +1-3%
+### A/B test results ✅ GO
+
+**Mixed Benchmark (10-run)**:
+- WRAP_SHAPE=0 (default): 34,750,578 ops/s
+- WRAP_SHAPE=1 (optimized): 35,262,596 ops/s
+- **Average gain: +1.47%** ✓ (Median: +1.39%)
+- **Decision: GO** ✓ (exceeds +1.0% threshold)
+
+**Sanity check results**:
+- WRAP_SHAPE=0 (default): 34,366,782 ops/s (3-run)
+- WRAP_SHAPE=1 (optimized): 34,999,056 ops/s (3-run)
+- **Delta: +1.84%** ✅ (malloc + free fully implemented)
+
+**C6-heavy**: Deferred (pre-existing linker issue in bench_allocators_hakmem, not B4-related)
+
+**Decision**: ✅ **ADOPT as default** (Mixed +1.47% >= +1.0% threshold)
+- Next: enable HAKMEM_WRAP_SHAPE=1 in the MIXED_TINYV3_C7_SAFE profile

 ### Phase 1: Quick Wins (complete)
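The shape B4 aims for, reduced to a minimal standalone sketch for reference. The `demo_*` names are illustrative assumptions only; the actual entry points and Tiny fast path live in `core/box/hak_wrappers.inc.h` in the hunks below.

```c
#include <stddef.h>
#include <stdlib.h>

// Rare-path work is forced out of line and into a cold text section, so the
// wrapper entry stays small in I-cache and its branches stay well predicted.
__attribute__((noinline, cold))
static void* demo_malloc_cold(size_t size) {
    // Diagnostics, LD-mode / jemalloc guards, init waits, libc fallback, routing...
    return malloc(size);  // stand-in for the slow routing path
}

// Stand-in for the Tiny front-gate fast path; NULL means "not handled here".
static inline void* demo_tiny_fast(size_t size) {
    (void)size;
    return NULL;
}

// Hot side: one well-predicted branch, then straight into the fast path;
// everything rare is delegated to the cold helper.
void* demo_malloc(size_t size) {
    void* p = demo_tiny_fast(size);
    if (__builtin_expect(p != NULL, 1)) {
        return p;
    }
    return demo_malloc_cold(size);
}
```

Because the cold helper is `noinline`, none of its code is pulled into the wrapper body, which is what keeps the hot path close to the design's "NULL check → Tiny fast → immediate return" shape.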
__libc_malloc(size); + } + if (!g_initialized) { hak_init(); } + int ld_init_wait = hak_init_wait_for_ready(); + if (__builtin_expect(ld_init_wait <= 0, 0)) { + wrapper_record_fallback(FB_INIT_LD_WAIT_FAIL, "[wrap] libc malloc: ld init_wait\n"); + g_hakmem_lock_depth--; + extern void* __libc_malloc(size_t); + return __libc_malloc(size); + } + if (wcfg->ld_safe_mode >= 2) { + wrapper_record_fallback(FB_LD_SAFE, "[wrap] libc malloc: ld_safe\n"); + g_hakmem_lock_depth--; + extern void* __libc_malloc(size_t); + return __libc_malloc(size); + } + } + + // Mid/Large routing via hak_alloc_at + void* ptr = hak_alloc_at(size, HAK_CALLSITE()); + g_hakmem_lock_depth--; + return ptr; +} + void* malloc(size_t size) { #ifndef NDEBUG uint64_t count = atomic_fetch_add(&malloc_count, 1); @@ -115,6 +166,40 @@ void* malloc(size_t size) { // Fallback to normal path for large allocations } + // Phase 2 B4: Hot/Cold dispatch (HAKMEM_WRAP_SHAPE) + const wrapper_env_cfg_t* wcfg = wrapper_env_cfg(); + if (__builtin_expect(wcfg->wrap_shape, 0)) { + // B4 Optimized: Hot/Cold split + // CRITICAL FIX (BUG #7): Increment lock depth FIRST, before ANY libc calls + g_hakmem_lock_depth++; + + // Guard against recursion during initialization + int init_wait = hak_init_wait_for_ready(); + if (__builtin_expect(init_wait <= 0, 0)) { + wrapper_record_fallback(FB_INIT_WAIT_FAIL, "[wrap] libc malloc: init_wait\n"); + g_hakmem_lock_depth--; + extern void* __libc_malloc(size_t); + return __libc_malloc(size); + } + + // Phase 26: CRITICAL - Ensure initialization before fast path + if (!g_initialized) hak_init(); + + // Phase 26: Front Gate Unification (Tiny fast path) + if (__builtin_expect(TINY_FRONT_UNIFIED_GATE_ENABLED, 1)) { + if (size <= tiny_get_max_size()) { + void* ptr = tiny_alloc_gate_fast(size); + if (__builtin_expect(ptr != NULL, 1)) { + g_hakmem_lock_depth--; + return ptr; + } + } + } + + // Hot path exhausted → delegate to cold + return malloc_cold(size, wcfg); + } + // DEBUG BAILOUT DISABLED - Testing full path // if (__builtin_expect(count >= 14270 && count <= 14285, 0)) { // extern void* __libc_malloc(size_t); @@ -127,7 +212,6 @@ void* malloc(size_t size) { // This prevents infinite recursion when getenv/fprintf/dlopen call malloc g_hakmem_lock_depth++; // Debug step trace for 33KB: gated by env HAKMEM_STEP_TRACE (default: OFF) - const wrapper_env_cfg_t* wcfg = wrapper_env_cfg(); if (wcfg->step_trace && size == 33000) wrapper_trace_write("STEP:1 Lock++\n", 14); // Guard against recursion during initialization @@ -234,6 +318,207 @@ void* malloc(size_t size) { return ptr; } +// Phase 2 B4: free_cold() - Cold path for free (noinline,cold) +// Handles: classify_ptr, ownership checks, header checks, hak_free_at routing +// Note: This function contains all the expensive classification and fallback logic +__attribute__((noinline, cold)) +static void free_cold(void* ptr, const wrapper_env_cfg_t* wcfg) { + // Trace + do { static int on=-1; if (on==-1){ const char* e=getenv("HAKMEM_FREE_WRAP_TRACE"); on=(e&&*e&&*e!='0')?1:0;} if(on){ fprintf(stderr,"[WRAP_FREE_COLD] ptr=%p depth=%d\n", ptr, g_hakmem_lock_depth); } } while(0); +#if !HAKMEM_BUILD_RELEASE + // Debug safety: guard obviously invalid tiny integers to avoid libc crash and collect trace + if ((uintptr_t)ptr < 4096) { + ptr_trace_dump_now("wrap_small_ptr"); + fprintf(stderr, "[FREE_SMALL_PTR] ignore ptr=%p (likely header-corruption sentinel)\n", ptr); + return; + } +#endif + + // Classify pointer BEFORE early libc fallbacks to avoid misrouting Tiny pointers + // This 
is safe: classifier uses header probe and registry; does not allocate. + int is_hakmem_owned = 0; + { + ptr_classification_t c = classify_ptr(ptr); + switch (c.kind) { + case PTR_KIND_TINY_HEADER: + case PTR_KIND_TINY_HEADERLESS: + case PTR_KIND_POOL_TLS: + case PTR_KIND_MID_LARGE: // FIX: Include Mid-Large (mmap/ACE) pointers + is_hakmem_owned = 1; break; + default: break; + } + } + if (!is_hakmem_owned) { + // Failsafe: Mid registry lookup catches headerless/corrupted Mid allocations + if (hak_pool_mid_lookup(ptr, NULL)) { + is_hakmem_owned = 1; + } + } + + if (is_hakmem_owned) { + // Route to hak_free_at even if lock_depth>0(ログ抑制のためptr_traceのみ使用) + g_hakmem_lock_depth++; + hak_free_at(ptr, 0, HAK_CALLSITE()); + g_hakmem_lock_depth--; + return; + } + // Front Gate libc bypass detection (quiet in release) + static _Atomic uint64_t fg_libc_bypass_count = 0; + + if (g_hakmem_lock_depth > 0) { +#if !HAKMEM_BUILD_RELEASE + uint64_t count = atomic_fetch_add_explicit(&fg_libc_bypass_count, 1, memory_order_relaxed); + if (count < 10) { + fprintf(stderr, "[FG_LIBC_BYPASS] lockdepth=%d count=%llu ptr=%p\n", g_hakmem_lock_depth, (unsigned long long)count, ptr); + } +#else + (void)fg_libc_bypass_count; +#endif + // Safety: If this is a HAKMEM-owned header allocation, free raw correctly + do { + void* raw = (char*)ptr - HEADER_SIZE; + int safe_same_page = (((uintptr_t)ptr & 0xFFFu) >= HEADER_SIZE); + if (!safe_same_page) { + if (!hak_is_memory_readable(raw)) break; + } + AllocHeader* hdr = (AllocHeader*)raw; + if (hdr->magic == HAKMEM_MAGIC) { + // Dispatch based on allocation method + if (hdr->method == ALLOC_METHOD_MALLOC) { + extern void __libc_free(void*); + ptr_trace_dump_now("wrap_libc_lockdepth_hak_hdr_malloc"); + __libc_free(raw); + return; + } else if (hdr->method == ALLOC_METHOD_MMAP) { + ptr_trace_dump_now("wrap_libc_lockdepth_hak_hdr_mmap"); + hkm_sys_munmap(raw, hdr->size); + return; + } + } + } while (0); + // Unknown pointer or non-HAKMEM: fall back to libc free(ptr) + extern void __libc_free(void*); + ptr_trace_dump_now("wrap_libc_lockdepth"); + wrapper_record_fallback(FB_LOCKDEPTH, "[wrap] libc free: lockdepth\n"); + __libc_free(ptr); + return; + } + int free_init_wait = hak_init_wait_for_ready(); + if (__builtin_expect(free_init_wait <= 0, 0)) { + wrapper_record_fallback(FB_INIT_WAIT_FAIL, "[wrap] libc free: init_wait\n"); +#if !HAKMEM_BUILD_RELEASE + uint64_t count = atomic_fetch_add_explicit(&fg_libc_bypass_count, 1, memory_order_relaxed); + if (count < 10) { + fprintf(stderr, "[FG_LIBC_BYPASS] init=%d count=%llu ptr=%p\n", g_initializing, (unsigned long long)count, ptr); + } +#endif + extern void __libc_free(void*); + ptr_trace_dump_now("wrap_libc_init"); + __libc_free(ptr); + return; + } + if (__builtin_expect(hak_force_libc_alloc(), 0)) { extern void __libc_free(void*); ptr_trace_dump_now("wrap_libc_force"); __libc_free(ptr); return; } + if (hak_ld_env_mode()) { + // BUG FIX: g_jemalloc_loaded == -1 (unknown) should not trigger fallback + if (hak_ld_block_jemalloc() && g_jemalloc_loaded > 0) { extern void __libc_free(void*); ptr_trace_dump_now("wrap_libc_ld_jemalloc"); __libc_free(ptr); return; } + if (!g_initialized) { hak_init(); } + int free_ld_wait = hak_init_wait_for_ready(); + if (__builtin_expect(free_ld_wait <= 0, 0)) { wrapper_record_fallback(FB_INIT_LD_WAIT_FAIL, "[wrap] libc free: ld init_wait\n"); extern void __libc_free(void*); ptr_trace_dump_now("wrap_libc_ld_init"); __libc_free(ptr); return; } + } + + // Phase 15: Box Separation - Domain check to distinguish 
+    // Phase 15: Box Separation - Domain check to distinguish hakmem vs external pointers
+    // CRITICAL: Prevent BenchMeta (slots[]) from entering CoreAlloc (hak_free_at)
+    // Strategy: Check 1-byte header at ptr-1 for HEADER_MAGIC (0xa0/0xb0)
+    // - If hakmem Tiny allocation → route to hak_free_at()
+    // - Otherwise → delegate to __libc_free() (external/BenchMeta)
+    //
+    // Safety: Only check header if ptr is NOT page-aligned (ptr-1 is safe to read)
+    uintptr_t offset_in_page = (uintptr_t)ptr & 0xFFF;
+    if (offset_in_page > 0) {
+        // Not page-aligned, safe to check ptr-1
+        uint8_t header = *((uint8_t*)ptr - 1);
+        if ((header & 0xF0) == 0xA0) {
+            // Tiny header byte → require a Superslab hit to avoid misclassification
+            SuperSlab* ss = hak_super_lookup(ptr);
+            if (ss && ss->magic == SUPERSLAB_MAGIC) {
+                g_hakmem_lock_depth++;
+                hak_free_at(ptr, 0, HAK_CALLSITE());
+                g_hakmem_lock_depth--;
+                return;
+            }
+            // Not registered in a Superslab → not managed by hakmem. Do not pass it to libc free either; ignore it (guards against working-set garbage).
+            return;
+        } else if ((header & 0xF0) == 0xB0) {
+            // Pool TLS header (if enabled) — no registry check needed
+#ifdef HAKMEM_POOL_TLS_PHASE1
+            g_hakmem_lock_depth++;
+            hak_free_at(ptr, 0, HAK_CALLSITE());
+            g_hakmem_lock_depth--;
+            return;
+#endif
+        }
+        // No valid hakmem header → external pointer (BenchMeta, libc allocation, etc.)
+        if (__builtin_expect(wcfg->wrap_diag, 0)) {
+            SuperSlab* ss = hak_super_lookup(ptr);
+            int slab_idx = -1;
+            int meta_cls = -1;
+            int alloc_method = -1;
+            if (__builtin_expect(ss && ss->magic == SUPERSLAB_MAGIC, 0)) {
+                slab_idx = slab_index_for(ss, (void*)((uint8_t*)ptr - 1));
+                if (slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss)) {
+                    meta_cls = ss->slabs[slab_idx].class_idx;
+                }
+            } else if (offset_in_page >= HEADER_SIZE) {
+                AllocHeader* ah = hak_header_from_user(ptr);
+                if (hak_header_validate(ah)) {
+                    alloc_method = ah->method;
+                }
+            }
+            fprintf(stderr,
+                    "[WRAP_FREE_NOT_OWNED] ptr=%p hdr=0x%02x off=0x%lx lockdepth=%d init=%d ss=%p slab=%d meta_cls=%d alloc_method=%d\n",
+                    ptr,
+                    header,
+                    (unsigned long)offset_in_page,
+                    g_hakmem_lock_depth,
+                    g_initializing,
+                    (void*)ss,
+                    slab_idx,
+                    meta_cls,
+                    alloc_method);
+        }
+
+        // Self-heal: if this looks like a SuperSlab (magic matches) but registry lookup failed,
+        // re-register on the fly and route to hakmem free to avoid libc abort.
+        {
+            SuperSlab* ss_guess = (SuperSlab*)((uintptr_t)ptr & ~((uintptr_t)SUPERSLAB_SIZE_MIN - 1u));
+            long page_sz = sysconf(_SC_PAGESIZE);
+            unsigned char mincore_vec = 0;
+            int mapped = (page_sz > 0) &&
+                         (mincore((void*)((uintptr_t)ss_guess & ~(uintptr_t)(page_sz - 1)),
+                                  (size_t)page_sz,
+                                  &mincore_vec) == 0);
+            if (mapped && ss_guess->magic == SUPERSLAB_MAGIC) {
+                hak_super_register((uintptr_t)ss_guess, ss_guess);  // idempotent if already registered
+                g_hakmem_lock_depth++;
+                hak_free_at(ptr, 0, HAK_CALLSITE());
+                g_hakmem_lock_depth--;
+                return;
+            }
+        }
+        extern void __libc_free(void*);
+        ptr_trace_dump_now("wrap_libc_external_nomag");
+        wrapper_record_fallback(FB_NOT_OWNED, "[wrap] libc free: not_owned\n");
+        __libc_free(ptr);
+        return;
+    }
+
+    // Page-aligned pointer → cannot safely check header, use full classification
+    // (This includes Pool/Mid/L25 allocations which may be page-aligned)
+    g_hakmem_lock_depth++;
+    hak_free_at(ptr, 0, HAK_CALLSITE());
+    g_hakmem_lock_depth--;
+}
+
 void free(void* ptr) {
 #if !HAKMEM_BUILD_RELEASE
     // Debug-only trace counters; disabled in release to keep free() hot path
@@ -262,6 +547,34 @@ void free(void* ptr) {

     const wrapper_env_cfg_t* wcfg = wrapper_env_cfg();

+    // Phase 2 B4: HAKMEM_WRAP_SHAPE dispatch (hot/cold split for free)
+    if (__builtin_expect(wcfg->wrap_shape, 0)) {
+        // B4 Optimized: Hot path handles simple cases, delegates to free_cold()
+        // Phase 26: Front Gate Unification (Tiny free fast path)
+        // Placed AFTER BenchFast check, BEFORE expensive classify_ptr()
+        // Bypasses: hak_free_at routing + wrapper overhead + classification
+        // Target: +10-15% performance (pairs with malloc_tiny_fast)
+        // ENV: HAKMEM_FRONT_GATE_UNIFIED=1 to enable (default: OFF)
+        // Phase 4-Step3: Use config macro for compile-time optimization
+        // Phase 7-Step1: Changed expect hint from 0→1 (unified path is now LIKELY)
+        if (__builtin_expect(TINY_FRONT_UNIFIED_GATE_ENABLED, 1)) {
+            // Phase FREE-TINY-FAST-HOTCOLD-OPT-1: Hot/Cold split dispatch
+            int freed;
+            if (__builtin_expect(hak_free_tiny_fast_hotcold_enabled(), 0)) {
+                freed = free_tiny_fast_hot(ptr);  // NEW: Hot/Cold split version
+            } else {
+                freed = free_tiny_fast(ptr);      // OLD: Legacy monolithic version
+            }
+            if (__builtin_expect(freed, 1)) {
+                return;  // Success (pushed to Unified Cache)
+            }
+            // Unified Cache full OR invalid header → fallback to cold path
+        }
+        // All hot cases exhausted → delegate to free_cold() for classification and fallback
+        return free_cold(ptr, wcfg);
+    }
+
+    // Phase 2 B4: Legacy path (HAKMEM_WRAP_SHAPE=0, default)
     // Phase 26: Front Gate Unification (Tiny free fast path)
     // Placed AFTER BenchFast check, BEFORE expensive classify_ptr()
     // Bypasses: hak_free_at routing + wrapper overhead + classification
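For reference, the `HAKMEM_WRAP_SHAPE` gate consumed via `wrapper_env_cfg()` above amounts to a read-once, cached environment flag. A minimal sketch, assuming a lazily initialized config struct in the spirit of `wrapper_env_box.h/c`; the `demo_*` names and parsing details are assumptions, not the actual box:

```c
#include <stdlib.h>

typedef struct {
    int wrap_shape;  // HAKMEM_WRAP_SHAPE=1 enables the B4 hot/cold split
} demo_wrapper_env_cfg_t;

// Parse the environment once and cache the result, so the malloc/free hot
// paths only pay for one load plus one predictable branch.
// Note: the real wrappers bump g_hakmem_lock_depth before any libc call
// (including getenv) to guard against re-entrancy; omitted here for brevity.
static const demo_wrapper_env_cfg_t* demo_wrapper_env_cfg(void) {
    static demo_wrapper_env_cfg_t cfg;
    static int initialized = 0;
    if (!initialized) {
        const char* e = getenv("HAKMEM_WRAP_SHAPE");
        cfg.wrap_shape = (e && *e && *e != '0') ? 1 : 0;  // default OFF
        initialized = 1;
    }
    return &cfg;
}
```

The wrappers then test the cached flag with `__builtin_expect(wcfg->wrap_shape, 0)`, so a build with the flag unset (or `0`) falls straight through to the legacy path; the ADOPT decision above amounts to setting `HAKMEM_WRAP_SHAPE=1` in the MIXED_TINYV3_C7_SAFE profile.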