diff --git a/CURRENT_TASK.md b/CURRENT_TASK.md index 1897cdda..27be33e9 100644 --- a/CURRENT_TASK.md +++ b/CURRENT_TASK.md @@ -142,10 +142,34 @@ - ✅ **B4(WRAPPER-SHAPE-1)**: `HAKMEM_WRAP_SHAPE=1` は Mixed +1.47% → ADOPT(`docs/analysis/PHASE2_B4_WRAPPER_SHAPE_1_DESIGN.md`) - (保留)**B2**: C0–C3 専用 alloc fast path(入口短絡は回帰リスク高。B4 の後に判断) -### Phase 3: Cache Locality (Weeks 4-5) - Target: +12-22% (57-68M ops/s) +### Phase 3: Cache Locality - Target: +12-22% (57-68M ops/s) **指示書**: `docs/analysis/PHASE3_CACHE_LOCALITY_NEXT_INSTRUCTIONS.md` +#### Phase 3 C3: Static Routing ✅ ADOPT + +**設計メモ**: `docs/analysis/PHASE3_C3_STATIC_ROUTING_1_DESIGN.md` + +**狙い**: policy_snapshot + learner evaluation をバイパスするために、初期化時に静的ルーティングテーブルを構築 + +**実装完了** ✅: +- `core/box/tiny_static_route_box.h` (API header + hot path functions) +- `core/box/tiny_static_route_box.c` (initialization + ENV gate + learner interlock) +- `core/front/malloc_tiny_fast.h` (lines 249-256) - 統合: `tiny_static_route_ready_fast()` で分岐 +- `core/bench_profile.h` (line 77) - MIXED_TINYV3_C7_SAFE プリセットで `HAKMEM_TINY_STATIC_ROUTE=1` を default 化 + +**A/B テスト結果** ✅ GO: +- Mixed (10-run): 38,910,792 → 39,768,006 ops/s (**+2.20% average gain**, median +1.98%) +- Decision: ✅ **ADOPT** (exceeds +1.0% GO threshold) +- Rationale: policy_snapshot is light (L1 cache resident), but atomic+branch overhead makes +2.2% realistic +- Learner Interlock: Static route auto-disables when HAKMEM_SMALL_LEARNER_V7_ENABLED=1 (safe) + +**Current Cumulative Gain** (Phase 2-3): +- B3 (Routing shape): +2.89% +- B4 (Wrapper split): +1.47% +- C3 (Static routing): +2.20% +- **Total: ~6.8%** (baseline 35.2M → ~39.8M ops/s) + **優先度 C1** - TLS cache prefetch: - `__builtin_prefetch(g_small_policy_v7, 0, 3)` on malloc entry - Improve L1 hit rate on cold start @@ -157,11 +181,6 @@ - Inline first slab descriptor - Expected: +5-10% -**優先度 C3** - Static routing (if no learner): -- Detect static routes at init -- Bypass policy snapshot entirely -- Expected: +5-8% - ### 
Architectural Insight (Long-term) **Reality check**: hakmem 4-5 layer design (wrapper → gate → policy → route → handler) adds 50-100x instruction overhead vs mimalloc's 1-layer TLS buckets. diff --git a/core/box/alloc_gate_stats_box.h b/core/box/alloc_gate_stats_box.h index 4d2593d3..53d7792b 100644 --- a/core/box/alloc_gate_stats_box.h +++ b/core/box/alloc_gate_stats_box.h @@ -4,6 +4,7 @@ #include #include #include +#include "../hakmem_build_flags.h" typedef struct AllocGateStats { uint64_t total_calls; // malloc_tiny_fast 入口 @@ -18,18 +19,23 @@ typedef struct AllocGateStats { // ENV gate static inline bool alloc_gate_stats_enabled(void) { +#if !HAKMEM_DEBUG_COUNTERS + return false; +#else static int g_enabled = -1; if (__builtin_expect(g_enabled == -1, 0)) { const char* e = getenv("HAKMEM_ALLOC_GATE_STATS"); g_enabled = (e && *e && *e != '0') ? 1 : 0; } return g_enabled; +#endif } // Global stats instance extern AllocGateStats g_alloc_gate_stats; // Increment macros (with unlikely guard) +#if HAKMEM_DEBUG_COUNTERS #define ALLOC_GATE_STAT_INC(field) \ do { if (__builtin_expect(alloc_gate_stats_enabled(), 0)) { \ g_alloc_gate_stats.field++; \ @@ -39,5 +45,9 @@ extern AllocGateStats g_alloc_gate_stats; do { if (__builtin_expect(alloc_gate_stats_enabled(), 0)) { \ if ((class_idx) >= 0 && (class_idx) < 8) g_alloc_gate_stats.class_calls[class_idx]++; \ } } while(0) +#else +#define ALLOC_GATE_STAT_INC(field) ((void)0) +#define ALLOC_GATE_STAT_INC_CLASS(class_idx) ((void)0) +#endif #endif // HAKMEM_ALLOC_GATE_STATS_BOX_H diff --git a/core/box/free_path_stats_box.h b/core/box/free_path_stats_box.h index defdf0af..c08735ad 100644 --- a/core/box/free_path_stats_box.h +++ b/core/box/free_path_stats_box.h @@ -4,6 +4,7 @@ #include #include #include +#include "../hakmem_build_flags.h" typedef struct FreePathStats { uint64_t total_calls; @@ -36,6 +37,9 @@ typedef struct FreePathStats { // ENV gate static inline bool free_path_stats_enabled(void) { +#if !HAKMEM_DEBUG_COUNTERS + 
return false; +#else static int g_enabled = -1; // -1: unknown, 0: off, 1: on static int g_probe_left = 64; // tolerate early getenv() instability (bench_profile putenv) @@ -54,15 +58,20 @@ static inline bool free_path_stats_enabled(void) { g_enabled = 0; return false; +#endif } // Global stats instance extern FreePathStats g_free_path_stats; // Increment macros (with unlikely guard) +#if HAKMEM_DEBUG_COUNTERS #define FREE_PATH_STAT_INC(field) \ do { if (__builtin_expect(free_path_stats_enabled(), 0)) { \ g_free_path_stats.field++; \ } } while(0) +#else +#define FREE_PATH_STAT_INC(field) ((void)0) +#endif #endif // HAKMEM_FREE_PATH_STATS_BOX_H diff --git a/core/box/free_tiny_fast_hotcold_env_box.h b/core/box/free_tiny_fast_hotcold_env_box.h index c83cd1cf..22a7eb6e 100644 --- a/core/box/free_tiny_fast_hotcold_env_box.h +++ b/core/box/free_tiny_fast_hotcold_env_box.h @@ -15,17 +15,29 @@ static inline int hak_free_tiny_fast_hotcold_enabled(void) { static int g = -1; - if (__builtin_expect(g == -1, 0)) { - const char* e = getenv("HAKMEM_FREE_TINY_FAST_HOTCOLD"); - g = (e && *e == '1') ? 1 : 0; + static int g_probe_left = 64; // tolerate early getenv() instability (bench_profile putenv) + + if (__builtin_expect(g == 1, 1)) return 1; + if (__builtin_expect(g == 0, 1)) return 0; + + const char* e = getenv("HAKMEM_FREE_TINY_FAST_HOTCOLD"); + if (e && *e) { + g = (*e == '1') ? 
1 : 0; #if !HAKMEM_BUILD_RELEASE if (g) { fprintf(stderr, "[FREE_TINY_FAST_HOTCOLD] Enabled (hot/cold split)\n"); fflush(stderr); } #endif + return g; } - return g; + + if (g_probe_left-- > 0) { + return 0; // keep g==-1, retry later + } + + g = 0; + return 0; } #endif // HAK_FREE_TINY_FAST_HOTCOLD_ENV_BOX_H diff --git a/core/box/free_tiny_fast_hotcold_stats_box.h b/core/box/free_tiny_fast_hotcold_stats_box.h index 061420ea..326d843b 100644 --- a/core/box/free_tiny_fast_hotcold_stats_box.h +++ b/core/box/free_tiny_fast_hotcold_stats_box.h @@ -28,6 +28,7 @@ #include #include "../hakmem_build_flags.h" +#if HAKMEM_DEBUG_COUNTERS // ============================================================================ // ENV Control // ============================================================================ @@ -127,4 +128,10 @@ static void free_tiny_fast_hotcold_stats_dump(void) { fflush(stderr); } +#else // !HAKMEM_DEBUG_COUNTERS + +#define FREE_TINY_FAST_HOTCOLD_STAT_INC(field) ((void)0) + +#endif // HAKMEM_DEBUG_COUNTERS + #endif // HAK_FREE_TINY_FAST_HOTCOLD_STATS_BOX_H diff --git a/core/box/hak_wrappers.inc.h b/core/box/hak_wrappers.inc.h index 761b1a66..ae137b83 100644 --- a/core/box/hak_wrappers.inc.h +++ b/core/box/hak_wrappers.inc.h @@ -98,7 +98,9 @@ static void* malloc_cold(size_t size, const wrapper_env_cfg_t* wcfg) { // BenchFast mode (structural ceiling measurement) if (__builtin_expect(!atomic_load(&g_bench_fast_init_in_progress) && bench_fast_enabled(), 0)) { if (size <= 1024) { - return bench_fast_alloc(size); + void* p = bench_fast_alloc(size); + g_hakmem_lock_depth--; + return p; } } diff --git a/core/box/ptr_conversion_box.h b/core/box/ptr_conversion_box.h index e0527f90..0989f80e 100644 --- a/core/box/ptr_conversion_box.h +++ b/core/box/ptr_conversion_box.h @@ -6,9 +6,9 @@ * * DESIGN: * - BASE pointer: Points to start of block in storage (0-byte aligned) - * - USER pointer: Points to usable memory (+1 byte for classes 0-6, +0 for class 7) - * - Class 7 
(2KB) is headerless (no +1 offset) - * - Classes 0-6 have 1-byte header (need +1 offset) + * - USER pointer: Points to usable memory (+1 byte for all Tiny classes in header mode) + * - Freed blocks may overwrite the header for next-pointer storage (C0/C7), but + * allocation path rewrites the header before returning USER. * * BOX BOUNDARIES: * - Box 1 (Front Gate) → Box 3 → Box 4 (User) [BASE to USER] @@ -95,11 +95,11 @@ static inline hak_base_ptr_t ptr_user_to_base(hak_user_ptr_t user, int class_idx static inline void* ptr_user_to_base_blind(void* user_ptr) { if (user_ptr == NULL) return NULL; -#if HAKMEM_TINY_HEADERLESS +#if HAKMEM_TINY_HEADERLESS || !HAKMEM_TINY_HEADER_CLASSIDX // Headerless: base = user return user_ptr; #else - // Phase 1: All classes 0-6 have 1 byte header, class 7 is headerless + // Phase E1-CORRECT: All Tiny classes use 1 byte header. // For blind conversion (no class info), assume standard header offset // This works because class is determined AFTER base pointer calculation size_t offset = TINY_HEADER_SIZE; // From tiny_layout_box.h diff --git a/core/box/smallobject_policy_v7_box.h b/core/box/smallobject_policy_v7_box.h index 4f6483db..d1aee715 100644 --- a/core/box/smallobject_policy_v7_box.h +++ b/core/box/smallobject_policy_v7_box.h @@ -41,6 +41,10 @@ typedef struct SmallPolicyV7 { /// Frontend calls this to determine route_kind[class_idx] const SmallPolicyV7* small_policy_v7_snapshot(void); +/// Bench/helper: invalidate all TLS snapshots by bumping global version. +/// Next call to small_policy_v7_snapshot() will re-read ENV (and apply learner, if enabled). 
+void small_policy_v7_bump_version(void); + /// Initialize policy from ENV variables (called once at startup) /// Priority: ULTRA > v7 > MID_v3 > LEGACY /// @param policy: Policy structure to initialize diff --git a/core/box/tiny_env_box.c b/core/box/tiny_env_box.c index 6bc976e5..8d89b1f0 100644 --- a/core/box/tiny_env_box.c +++ b/core/box/tiny_env_box.c @@ -65,6 +65,7 @@ void tiny_env_init_once(void) { g_tiny_env.ss_acquire_debug = env_flag("HAKMEM_SS_ACQUIRE_DEBUG", 0); g_tiny_env.tension_drain_enable = env_flag("HAKMEM_TINY_TENSION_DRAIN_ENABLE", 1); g_tiny_env.tension_drain_threshold = env_int("HAKMEM_TINY_TENSION_DRAIN_THRESHOLD", 1024); + g_tiny_env.alloc_route_shape = env_flag("HAKMEM_TINY_ALLOC_ROUTE_SHAPE", 0); g_tiny_env.inited = 1; } diff --git a/core/box/tiny_env_box.h b/core/box/tiny_env_box.h index 37cf4af2..b149aac7 100644 --- a/core/box/tiny_env_box.h +++ b/core/box/tiny_env_box.h @@ -41,6 +41,7 @@ typedef struct { int ss_acquire_debug; // HAKMEM_SS_ACQUIRE_DEBUG (default: 0) int tension_drain_enable; // HAKMEM_TINY_TENSION_DRAIN_ENABLE (default: 1) int tension_drain_threshold; // HAKMEM_TINY_TENSION_DRAIN_THRESHOLD (default: 1024) + int alloc_route_shape; // HAKMEM_TINY_ALLOC_ROUTE_SHAPE (default: 0) } tiny_env_cfg_t; extern tiny_env_cfg_t g_tiny_env; diff --git a/core/box/tiny_front_hot_box.h b/core/box/tiny_front_hot_box.h index 693c244b..94bf9901 100644 --- a/core/box/tiny_front_hot_box.h +++ b/core/box/tiny_front_hot_box.h @@ -29,7 +29,6 @@ #include "../hakmem_tiny_config.h" #include "../tiny_region_id.h" #include "../front/tiny_unified_cache.h" // For TinyUnifiedCache -#include "tiny_layout_box.h" // For tiny_user_offset() // ============================================================================ // Branch Prediction Macros (Pointer Safety - Prediction Hints) @@ -128,10 +127,7 @@ static inline void* tiny_hot_alloc_fast(int class_idx) { // Write header + return USER pointer (no branch) #if HAKMEM_TINY_HEADER_CLASSIDX - 
tiny_region_id_write_header(base, class_idx); // 1-byte header at BASE - // Use centralized layout API for offset calculation - size_t user_offset = tiny_user_offset(class_idx); - return (void*)((char*)base + user_offset); // Return USER pointer + return tiny_region_id_write_header(base, class_idx); #else return base; // No-header mode: return BASE directly #endif diff --git a/core/box/tiny_front_stats_box.h b/core/box/tiny_front_stats_box.h index 93c00ced..20337319 100644 --- a/core/box/tiny_front_stats_box.h +++ b/core/box/tiny_front_stats_box.h @@ -4,21 +4,29 @@ #include #include #include +#include "../hakmem_build_flags.h" #include "../hakmem_tiny_config.h" extern _Atomic uint64_t g_tiny_front_alloc_class[TINY_NUM_CLASSES]; extern _Atomic uint64_t g_tiny_front_free_class[TINY_NUM_CLASSES]; static inline int tiny_front_class_stats_enabled(void) { +#if !HAKMEM_DEBUG_COUNTERS + return 0; +#else static int g = -1; if (__builtin_expect(g == -1, 0)) { const char* e = getenv("HAKMEM_TINY_FRONT_CLASS_STATS"); g = (e && *e && *e != '0') ? 1 : 0; } return g; +#endif } static inline int tiny_front_class_stats_dump_enabled(void) { +#if !HAKMEM_DEBUG_COUNTERS + return 0; +#else static int g = -1; if (__builtin_expect(g == -1, 0)) { const char* e = getenv("HAKMEM_TINY_FRONT_CLASS_STATS_DUMP"); @@ -26,17 +34,25 @@ static inline int tiny_front_class_stats_dump_enabled(void) { g = ((e && *e && *e != '0') || (e2 && *e2 && *e2 != '0')) ? 
1 : 0; } return g; +#endif } static inline void tiny_front_alloc_stat_inc(int class_idx) { +#if HAKMEM_DEBUG_COUNTERS if (__builtin_expect(tiny_front_class_stats_enabled(), 0)) { atomic_fetch_add_explicit(&g_tiny_front_alloc_class[class_idx], 1, memory_order_relaxed); } +#else + (void)class_idx; +#endif } static inline void tiny_front_free_stat_inc(int class_idx) { +#if HAKMEM_DEBUG_COUNTERS if (__builtin_expect(tiny_front_class_stats_enabled(), 0)) { atomic_fetch_add_explicit(&g_tiny_front_free_class[class_idx], 1, memory_order_relaxed); } +#else + (void)class_idx; +#endif } - diff --git a/core/box/tiny_header_box.h b/core/box/tiny_header_box.h index a95c3cab..5737425a 100644 --- a/core/box/tiny_header_box.h +++ b/core/box/tiny_header_box.h @@ -35,7 +35,7 @@ // All code must use this instead of hardcoded class_idx checks. // // Implementation: -// - Delegates to tiny_user_offset() from tiny_layout_box.h +// - Delegates to tiny_nextptr_offset() from tiny_layout_box.h // - offset=0 → header overwritten by next pointer → false // - offset!=0 → header preserved → true // @@ -46,9 +46,9 @@ static inline bool tiny_class_preserves_header(int class_idx) { #if HAKMEM_TINY_HEADER_CLASSIDX // Delegate to tiny_layout_box.h specification (Single Source of Truth) - // user_offset=0 → header overwritten (C0, C7) - // user_offset=1 → header preserved (C1-C6) - return tiny_user_offset(class_idx) != 0; + // next_off=0 → header overwritten (C0, C7) + // next_off=1 → header preserved (C1-C6) + return tiny_nextptr_offset(class_idx) != 0; #else // Headers disabled globally (void)class_idx; diff --git a/core/box/tiny_layout_box.h b/core/box/tiny_layout_box.h index 2b741515..5885759d 100644 --- a/core/box/tiny_layout_box.h +++ b/core/box/tiny_layout_box.h @@ -6,9 +6,11 @@ * * Current Design (Phase E1-CORRECT): * - All classes (0-7) have 1-byte header - * - User pointer = base + 1 for classes 0-6, base + 0 for class 7 - * (Note: Class 7 is headerless in practice but marked for 
consistency) - * - No external code should hardcode offsets + * - User pointer = base + 1 for ALL classes (0-7) + * - Freed blocks store next pointers intrusively: + * - C0/C7: next at base+0 (header overwritten while free) + * - C1-C6: next at base+1 (header preserved while free) + * - No external code should hardcode offsets; use this box API */ #ifndef TINY_LAYOUT_BOX_H @@ -24,6 +26,8 @@ #define HAKMEM_TINY_HEADERLESS 0 #endif +#include "../hakmem_build_flags.h" + // Define all class-specific layout parameters // Current: Defined in g_tiny_class_sizes[8] in hakmem_tiny.c // This file makes them accessible via a unified Box API @@ -33,12 +37,12 @@ static inline size_t tiny_header_size(int class_idx) { #if HAKMEM_TINY_HEADERLESS (void)class_idx; return 0; +#elif HAKMEM_TINY_HEADER_CLASSIDX + (void)class_idx; + return 1; #else - // Phase 1: 1 byte header if enabled - // C0 (8B): offset 0 (8B stride too small for header + 8B pointer - would overflow) - // C7 (2048B): offset 0 (overwrites header in freelist - largest class can tolerate) - // C1-C6: offset 1 (header preserved - user data is not disturbed) - return (0x7Eu >> class_idx) & 1u; + (void)class_idx; + return 0; #endif } @@ -63,12 +67,26 @@ static inline size_t tiny_user_offset(int class_idx) { (void)class_idx; return 0; // Headerless: user = base #elif HAKMEM_TINY_HEADER_CLASSIDX - // C0 (8B): offset 0 (8B stride too small for header + 8B pointer - would overflow) - // C7 (2048B): offset 0 (overwrites header in freelist - largest class can tolerate) - // C1-C6: offset 1 (header preserved - user data is not disturbed) - // Optimized: Use bitmask lookup instead of branching + (void)class_idx; + // Phase E1-CORRECT: All classes have 1-byte header → user = base + 1 + return 1u; +#else + (void)class_idx; + return 0u; +#endif +} + +// Offset for storing the freelist next pointer inside a freed block. +// This is distinct from tiny_user_offset(): +// - User offset is always +1 in header mode. 
+// - Next offset is 0 for C0/C7 (cannot preserve header while free), else 1. +static inline size_t tiny_nextptr_offset(int class_idx) { +#if HAKMEM_TINY_HEADERLESS + (void)class_idx; + return 0; +#elif HAKMEM_TINY_HEADER_CLASSIDX // Bit pattern: C0=0, C1-C6=1, C7=0 → 0b01111110 = 0x7E - return (0x7Eu >> class_idx) & 1u; + return (0x7Eu >> ((unsigned)class_idx & 7u)) & 1u; #else (void)class_idx; return 0u; diff --git a/core/box/tiny_static_route_box.c b/core/box/tiny_static_route_box.c index ea781b6e..58f96998 100644 --- a/core/box/tiny_static_route_box.c +++ b/core/box/tiny_static_route_box.c @@ -4,9 +4,9 @@ #include #include -TinyStaticRoute g_tiny_static_route = {.inited = 0}; +TinyStaticRoute g_tiny_static_route = {.inited = ATOMIC_VAR_INIT(0)}; -static int static_route_cached_enabled = -1; // -1 = uninitialized +static _Atomic int static_route_cached_enabled = -1; // -1 = uninitialized // Check if static routing should be enabled // (Returns 0 if learner is active, or if ENV is explicitly off) @@ -16,11 +16,18 @@ static inline int static_route_should_be_enabled(void) { // Explicit disable if (e && *e == '0') return 0; - // If learner is enabled, force disable (static route doesn't track learner updates) - // Check: if HAKMEM_TINY_LEARNER_ENABLED is set to 1, disable static route - const char* learner_e = getenv("HAKMEM_TINY_LEARNER_ENABLED"); - if (learner_e && *learner_e != '0') { - return 0; // Learner ON → static route OFF + // If v7 learner is enabled, force disable (static route doesn't track learner updates). 
+ // Mirrors smallobject_policy_v7.c::learner_v7_enabled(): + // - If HAKMEM_SMALL_LEARNER_V7_ENABLED=0 → learner disabled + // - Else learner enabled iff HAKMEM_SMALL_HEAP_V7_ENABLED is enabled + { + const char* learner_e = getenv("HAKMEM_SMALL_LEARNER_V7_ENABLED"); + if (!(learner_e && *learner_e && *learner_e == '0')) { + const char* v7_e = getenv("HAKMEM_SMALL_HEAP_V7_ENABLED"); + if (v7_e && *v7_e && *v7_e != '0') { + return 0; // Learner ON → static route OFF + } + } } // Explicit enable or default OFF @@ -30,22 +37,31 @@ static inline int static_route_should_be_enabled(void) { int tiny_static_route_init_once(void) { // Check if static routing is enabled if (!static_route_should_be_enabled()) { - g_tiny_static_route.inited = 0; + atomic_store_explicit(&g_tiny_static_route.inited, 0, memory_order_release); return 0; } - // Atomic initialization - static _Atomic int init_started = 0; - int expected = 0; - - if (!atomic_compare_exchange_strong_explicit(&init_started, &expected, 1, - memory_order_acq_rel, - memory_order_relaxed)) { - // Someone else is initializing or already initialized - while (!__builtin_expect(g_tiny_static_route.inited, 1)) { + int state = atomic_load_explicit(&g_tiny_static_route.inited, memory_order_acquire); + if (__builtin_expect(state == 1, 1)) { + return 1; + } + if (__builtin_expect(state == -1, 0)) { + while (atomic_load_explicit(&g_tiny_static_route.inited, memory_order_acquire) == -1) { __builtin_ia32_pause(); } - return g_tiny_static_route.inited; + return atomic_load_explicit(&g_tiny_static_route.inited, memory_order_acquire) == 1; + } + + // Try to become the initializer: 0 → -1 + int expected = 0; + if (!atomic_compare_exchange_strong_explicit(&g_tiny_static_route.inited, &expected, -1, + memory_order_acq_rel, + memory_order_relaxed)) { + // Someone else is initializing or already initialized + while (atomic_load_explicit(&g_tiny_static_route.inited, memory_order_acquire) == -1) { + __builtin_ia32_pause(); + } + return 
atomic_load_explicit(&g_tiny_static_route.inited, memory_order_acquire) == 1; } // We own the initialization @@ -63,19 +79,24 @@ int tiny_static_route_init_once(void) { } SmallRouteKind tiny_static_route_get_kind(int class_idx) { - if (!g_tiny_static_route.inited) { - return 0; // Not available + if ((unsigned)class_idx >= 8u) { + return SMALL_ROUTE_LEGACY; } - if (class_idx < 0 || class_idx >= 8) { - return 0; // Invalid class + if (atomic_load_explicit(&g_tiny_static_route.inited, memory_order_acquire) != 1) { + if (!tiny_static_route_init_once()) { + return SMALL_ROUTE_LEGACY; + } } return g_tiny_static_route.route_kind[class_idx]; } void tiny_static_route_refresh_from_env(void) { - // Re-initialize if needed (for bench_apply_profile sync) - if (!static_route_should_be_enabled()) { - g_tiny_static_route.inited = 0; + // Refresh cached enable flag first (bench_apply_profile sync). + int enabled = static_route_should_be_enabled(); + atomic_store_explicit(&static_route_cached_enabled, enabled, memory_order_release); + + if (!enabled) { + atomic_store_explicit(&g_tiny_static_route.inited, 0, memory_order_release); return; } @@ -91,18 +112,22 @@ void tiny_static_route_refresh_from_env(void) { } int tiny_static_route_enabled(void) { - // Check cached enable flag - if (static_route_cached_enabled < 0) { - static_route_cached_enabled = static_route_should_be_enabled(); + int cached = atomic_load_explicit(&static_route_cached_enabled, memory_order_acquire); + if (__builtin_expect(cached >= 0, 1)) { + return cached; } - return static_route_cached_enabled; + + int enabled = static_route_should_be_enabled(); + atomic_store_explicit(&static_route_cached_enabled, enabled, memory_order_release); + return enabled; } __attribute__((constructor(102))) static void tiny_static_route_ctor(void) { // Constructor priority 102 runs after wrapper_env_ctor (101) // Initialize static route table if enabled - if (!g_tiny_static_route.inited && tiny_static_route_enabled()) { + if 
(atomic_load_explicit(&g_tiny_static_route.inited, memory_order_acquire) != 1 && + tiny_static_route_enabled()) { tiny_static_route_init_once(); } } diff --git a/core/box/tiny_static_route_box.h b/core/box/tiny_static_route_box.h index 1d709cb5..d272a443 100644 --- a/core/box/tiny_static_route_box.h +++ b/core/box/tiny_static_route_box.h @@ -4,21 +4,33 @@ #pragma once +#include #include "smallobject_policy_v7_box.h" typedef struct { - int inited; + // 0: uninit/disabled, -1: initializing, 1: ready + atomic_int inited; SmallRouteKind route_kind[8]; // C0-C7 static route (determined at init, no learner update) } TinyStaticRoute; extern TinyStaticRoute g_tiny_static_route; +// Hot-path helper: true when static route table is ready (enabled + initialized). +static inline int tiny_static_route_ready_fast(void) { + return atomic_load_explicit(&g_tiny_static_route.inited, memory_order_acquire) == 1; +} + +// Hot-path helper: fetch route kind when tiny_static_route_ready_fast()==true. +static inline SmallRouteKind tiny_static_route_get_kind_fast(int class_idx) { + return g_tiny_static_route.route_kind[(unsigned)class_idx & 7u]; +} + // Initialize static route table (called once, at library load time) // Returns 1 if static routing is enabled and initialized, 0 otherwise int tiny_static_route_init_once(void); -// Get static route for class_idx, or 0 if not enabled -// (Returns route_kind, or 0 if disabled/uninitialized) +// Get static route for class_idx. +// Returns SMALL_ROUTE_LEGACY when disabled/uninitialized/invalid (safe fallback). 
SmallRouteKind tiny_static_route_get_kind(int class_idx); // Refresh from ENV (for bench_apply_profile() sync) diff --git a/core/box/wrapper_env_box.c b/core/box/wrapper_env_box.c index 5a55f0f8..0f9c7a50 100644 --- a/core/box/wrapper_env_box.c +++ b/core/box/wrapper_env_box.c @@ -46,6 +46,23 @@ void wrapper_env_init_once(void) { atomic_store_explicit(&g_wrapper_env.inited, 1, memory_order_release); } +void wrapper_env_refresh_from_env(void) { + // Ensure base init happened (constructor may have already done this) + if (!g_wrapper_env.inited) { + wrapper_env_init_once(); + return; + } + + // Re-read ENV (bench_profile may have injected defaults via putenv) + g_wrapper_env.step_trace = env_flag("HAKMEM_STEP_TRACE", 0); + g_wrapper_env.ld_safe_mode = env_int("HAKMEM_LD_SAFE", 1); + g_wrapper_env.free_wrap_trace = env_flag("HAKMEM_FREE_WRAP_TRACE", 0); + g_wrapper_env.wrap_diag = env_flag("HAKMEM_WRAP_DIAG", 0); + g_wrapper_env.wrap_shape = env_flag("HAKMEM_WRAP_SHAPE", 0); + + atomic_store_explicit(&g_wrapper_env.inited, 1, memory_order_release); +} + __attribute__((constructor(101))) static void wrapper_env_ctor(void) { // Constructor priority 101 runs early (libc uses 100+) diff --git a/core/box/wrapper_env_box.h b/core/box/wrapper_env_box.h index f641f4a2..bf234059 100644 --- a/core/box/wrapper_env_box.h +++ b/core/box/wrapper_env_box.h @@ -16,6 +16,9 @@ typedef struct { extern wrapper_env_cfg_t g_wrapper_env; void wrapper_env_init_once(void); +// Bench helper: re-read ENV after bench_profile putenv defaults. +// This keeps wrapper hot paths getenv-free while still allowing HAKMEM_PROFILE presets. 
+void wrapper_env_refresh_from_env(void); static inline const wrapper_env_cfg_t* wrapper_env_cfg(void) { // Constructor ensures init at library load time diff --git a/core/front/malloc_tiny_fast.h b/core/front/malloc_tiny_fast.h index e4549cdf..68a25192 100644 --- a/core/front/malloc_tiny_fast.h +++ b/core/front/malloc_tiny_fast.h @@ -35,6 +35,7 @@ #include "tiny_unified_cache.h" // For unified_cache_pop_or_refill #include "../tiny_region_id.h" // For tiny_region_id_write_header #include "../hakmem_tiny.h" // For hak_tiny_size_to_class +#include "../box/tiny_env_box.h" // For tiny_env_cfg() (ENV variables) #include "../box/tiny_front_hot_box.h" // Phase 4-Step2: Hot Path Box #include "../box/tiny_front_cold_box.h" // Phase 4-Step2: Cold Path Box #include "../box/tiny_c7_hotbox.h" // Optional: C7 専用ホットボックス @@ -47,6 +48,7 @@ #include "../box/smallobject_v6_env_box.h" // SmallObject v6 ENV control (Phase V6-HDR-2) #include "../box/smallobject_hotbox_v7_box.h" // SmallObject HotBox v7 stub (Phase v7-1) #include "../box/smallobject_policy_v7_box.h" // Phase v7-4: Policy Box +#include "../box/tiny_static_route_box.h" // Phase 3 C3: Static routing (policy snapshot bypass) #include "../box/smallobject_mid_v35_box.h" // Phase v11a-3: MID v3.5 HotBox #include "../box/tiny_c7_ultra_box.h" // C7 ULTRA stub (UF-1, delegates to v3) #include "../box/tiny_c6_ultra_free_box.h" // Phase 4-2: C6 ULTRA-free (free-only, C6-only) @@ -151,6 +153,68 @@ static inline int alloc_dualhot_enabled(void) { return g; } +// Phase 2 B3: tiny_alloc_route_cold() - Handle rare routes (V7, MID, ULTRA) +// NOTE: noinline to avoid code bloat in hot path, placed in cold section +__attribute__((noinline, cold)) +static void* tiny_alloc_route_cold(SmallRouteKind route_kind, int class_idx, size_t size) { + switch (route_kind) { + case SMALL_ROUTE_ULTRA: { + // Phase TLS-UNIFY-1: Unified ULTRA TLS pop for C4-C6 (C7 handled above) + void* base = tiny_ultra_tls_pop((uint8_t)class_idx); + if 
(TINY_HOT_LIKELY(base != NULL)) { + if (class_idx == 6) FREE_PATH_STAT_INC(c6_ultra_alloc_hit); + else if (class_idx == 5) FREE_PATH_STAT_INC(c5_ultra_alloc_hit); + else if (class_idx == 4) FREE_PATH_STAT_INC(c4_ultra_alloc_hit); + return tiny_base_to_user_inline(base); + } + // ULTRA miss → fallback to LEGACY + break; + } + + case SMALL_ROUTE_MID_V35: { + // Phase v11a-3: MID v3.5 allocation + void* v35p = small_mid_v35_alloc(class_idx, size); + if (TINY_HOT_LIKELY(v35p != NULL)) { + return v35p; + } + // MID v3.5 miss → fallback to LEGACY + break; + } + + case SMALL_ROUTE_V7: { + // Phase v7: SmallObject v7 allocation (research box) + void* v7p = small_heap_alloc_fast_v7_stub(size, (uint8_t)class_idx); + if (TINY_HOT_LIKELY(v7p != NULL)) { + return v7p; + } + // V7 miss → fallback to LEGACY + break; + } + + case SMALL_ROUTE_MID_V3: { + // Phase MID-V3: MID v3 allocation (257-768B, C5-C6) + // Note: MID v3 uses same segment infrastructure as MID v3.5 + // For now, delegate to MID v3.5 which handles both + void* v3p = small_mid_v35_alloc(class_idx, size); + if (TINY_HOT_LIKELY(v3p != NULL)) { + return v3p; + } + break; + } + + case SMALL_ROUTE_LEGACY: + default: + break; + } + + // Fallback: LEGACY unified cache hot/cold path + void* ptr = tiny_hot_alloc_fast(class_idx); + if (TINY_HOT_LIKELY(ptr != NULL)) { + return ptr; + } + return tiny_cold_refill_and_alloc(class_idx); +} + // Phase ALLOC-GATE-SSOT-1: malloc_tiny_fast_for_class() - body (class_idx already known) __attribute__((always_inline)) static inline void* malloc_tiny_fast_for_class(size_t size, int class_idx) { @@ -182,10 +246,33 @@ static inline void* malloc_tiny_fast_for_class(size_t size, int class_idx) { } } - // 2. Policy snapshot (TLS cached, single read) - const SmallPolicyV7* policy = small_policy_v7_snapshot(); - SmallRouteKind route_kind = policy->route_kind[class_idx]; + // 2. 
Route selection: Static route table (Phase 3 C3) or policy snapshot (default) + SmallRouteKind route_kind; + if (tiny_static_route_ready_fast()) { + route_kind = tiny_static_route_get_kind_fast(class_idx); + } else { + const SmallPolicyV7* policy = small_policy_v7_snapshot(); + route_kind = policy->route_kind[class_idx]; + } + // Phase 2 B3: Routing dispatch (ENV gate HAKMEM_TINY_ALLOC_ROUTE_SHAPE) + // Optimized: LIKELY on LEGACY (common case), cold helper for rare routes + const tiny_env_cfg_t* env_cfg = tiny_env_cfg(); + if (TINY_HOT_LIKELY(env_cfg->alloc_route_shape)) { + // B3 optimized: Prioritize LEGACY with LIKELY hint + if (TINY_HOT_LIKELY(route_kind == SMALL_ROUTE_LEGACY)) { + // LEGACY fast path: Unified Cache hot/cold + void* ptr = tiny_hot_alloc_fast(class_idx); + if (TINY_HOT_LIKELY(ptr != NULL)) { + return ptr; + } + return tiny_cold_refill_and_alloc(class_idx); + } + // Rare routes: delegate to cold helper + return tiny_alloc_route_cold(route_kind, class_idx, size); + } + + // Original dispatch (backward compatible, default) // 3. 
Single switch on route_kind (all ENV checks moved to Policy init) switch (route_kind) { case SMALL_ROUTE_ULTRA: { diff --git a/core/smallobject_policy_v7.c b/core/smallobject_policy_v7.c index 7ad445a2..dff3722f 100644 --- a/core/smallobject_policy_v7.c +++ b/core/smallobject_policy_v7.c @@ -43,6 +43,10 @@ static uint32_t g_policy_v7_version = 0; // Global version, incremented by Lear static __thread SmallPolicyV7 g_small_policy_v7; static __thread uint32_t g_small_policy_v7_version = 0; // TLS cached version +void small_policy_v7_bump_version(void) { + __sync_fetch_and_add(&g_policy_v7_version, 1); +} + const SmallPolicyV7* small_policy_v7_snapshot(void) { // Check if TLS cache is stale (version mismatch or uninitialized) if (unlikely(g_small_policy_v7_version != g_policy_v7_version || g_policy_v7_version == 0)) { diff --git a/core/tiny_nextptr.h b/core/tiny_nextptr.h index 6746c3ad..1b986d33 100644 --- a/core/tiny_nextptr.h +++ b/core/tiny_nextptr.h @@ -59,7 +59,7 @@ static __thread void* g_tiny_next_ra2 __attribute__((unused)) = NULL; // P0.1 updated: C0 and C7 use offset 0, C1-C6 use offset 1 (header preserved) // Rationale for C0: 8B stride cannot fit [1B header][8B next pointer] without overflow static inline __attribute__((always_inline)) size_t tiny_next_off(int class_idx) { - return tiny_user_offset(class_idx); + return tiny_nextptr_offset(class_idx); } #if !HAKMEM_BUILD_RELEASE diff --git a/docs/analysis/ENV_PROFILE_PRESETS.md b/docs/analysis/ENV_PROFILE_PRESETS.md index 1a150700..0547e3b3 100644 --- a/docs/analysis/ENV_PROFILE_PRESETS.md +++ b/docs/analysis/ENV_PROFILE_PRESETS.md @@ -37,6 +37,9 @@ HAKMEM_BENCH_MAX_SIZE=1024 - `HAKMEM_POOL_V2_ENABLED=0` - `HAKMEM_TINY_FRONT_V3_ENABLED=1` - `HAKMEM_TINY_FRONT_V3_LUT_ENABLED=1` +- `HAKMEM_FREE_TINY_FAST_HOTCOLD=1`(Phase FREE-TINY-FAST-DUALHOT-1: free の第2ホット(C0-C3)を直行) +- `HAKMEM_WRAP_SHAPE=1`(Phase 2 B4: wrapper hot/cold split を default ON) +- `HAKMEM_TINY_ALLOC_ROUTE_SHAPE=1`(Phase 2 B3: alloc の route 
dispatch 形を最適化) - `HAKMEM_MID_V3_ENABLED=1`(Phase MID-V3: 257-768B, C6 only) - `HAKMEM_MID_V3_CLASSES=0x40`(C6 only, C7 は ULTRA に任せる) - `HAKMEM_MID_V35_ENABLED=0`(Phase v11a-5: Mixed では MID v3.5 OFF が最速) @@ -113,6 +116,7 @@ HAKMEM_MID_V3_ENABLED=1 # Phase MID-V3: 257-768B, C6 only HAKMEM_MID_V3_CLASSES=0x40 # C6 only (+11% on C6-heavy) HAKMEM_MID_V35_ENABLED=1 # Phase v11a-5: C6-heavy で +8% 改善 HAKMEM_MID_V35_CLASSES=0x40 # C6 only (53.1M ops/s) +HAKMEM_TINY_ALLOC_ROUTE_SHAPE=1 # Phase 2 B3: alloc route dispatch shape (ADOPT) # Phase MID-V35-HOTPATH-OPT-1: C6-heavy 最速セット(推奨ON) # 機能: header prefill + hot counts削除 + C6 fast path (組み合わせで +7.3%) diff --git a/docs/analysis/PHASE2_B1_HEADER_TAX_AB_TEST_RESULTS.md b/docs/analysis/PHASE2_B1_HEADER_TAX_AB_TEST_RESULTS.md new file mode 100644 index 00000000..2b3dba13 --- /dev/null +++ b/docs/analysis/PHASE2_B1_HEADER_TAX_AB_TEST_RESULTS.md @@ -0,0 +1,25 @@ +# Phase 2 B1: HAKMEM_TINY_HEADER_MODE=LIGHT A/B Test Report + +## 目的 + +Tiny の header 書き込み回数を減らし、store traffic を落とす(コード膨張なしで勝つ狙い)。 + +## 条件 + +- Baseline: `HAKMEM_TINY_HEADER_MODE=FULL` +- Opt: `HAKMEM_TINY_HEADER_MODE=LIGHT` + +## 結果(Mixed) + +- Mixed(10-run avg): 48.89M → 47.65M ops/s(**-2.54%**) + +## 判定 + +- ❌ NO-GO(Mixed の回帰が明確) +- Decision: **freeze**(research box として残し、本線 default は FULL のまま) + +## 所見 + +`LIGHT` の「条件分岐+追加ロード」のコストが、store 削減メリットを上回った。 +(Mixed は分岐/I-cache に敏感で、A3 と同じ学び) + diff --git a/docs/analysis/PHASE2_B4_WRAPPER_SHAPE_1_DESIGN.md b/docs/analysis/PHASE2_B4_WRAPPER_SHAPE_1_DESIGN.md new file mode 100644 index 00000000..0fa2a876 --- /dev/null +++ b/docs/analysis/PHASE2_B4_WRAPPER_SHAPE_1_DESIGN.md @@ -0,0 +1,57 @@ +# Phase 2 B4: WRAPPER-SHAPE-1(malloc/free wrapper のホット/コールド分割) + +## 目的 + +Phase 2 B3 で `malloc_tiny_fast_for_class()` の route dispatch 形は勝ち形に寄せられた。 +次の “芯” は wrapper 層(`malloc/free`)の I-cache / 分岐形で、稀なチェックを `noinline,cold` に押し出してホットを最短化する。 + +狙いは **アルゴリズム変更ではなく「形」**: +- 既存の安全チェックは削らない(移動/集約のみ) +- ホットは `return` で即抜け(重複 work を作らない) +- 
失敗/特殊条件は cold に委譲(境界は 1 箇所) + +## Box Theory(箱割り) + +### L0: WrapperHotBox + +責務: 通常ケースを最短で Tiny fast に到達させ、成功なら即 return。 + +- `malloc`: init wait → init → Tiny fast → hit で return +- `free`: BenchFast → Tiny free fast → hit で return + +### L1: WrapperColdBox + +責務: それ以外(LD/jemalloc/force_libc/分類/所有権/フォールバック/診断)を 1 箱に集約。 + +境界: `HAKMEM_WRAP_SHAPE=1` のとき、入口で 1 回だけ hot/cold を分岐する。 + +## 実装指示(ファイル / 形) + +- `core/box/hak_wrappers.inc.h` + - `malloc_cold()` / `free_cold()` を `__attribute__((noinline, cold))` で追加 + - `malloc()` / `free()` に `HAKMEM_WRAP_SHAPE` gate を追加 +- ENV gate: + - `HAKMEM_WRAP_SHAPE=0/1`(default 0) + - ただしプリセットで default=1 に昇格可能(bench_profile) + +## 可視化(最小) + +常時ログは禁止。必要なら: +- ワンショット(数回のみ) +- `HAKMEM_DEBUG_COUNTERS=1` ビルド時だけのカウンタ + +## Fail-Fast / 安全 + +- `g_hakmem_lock_depth` の ++/-- はどの return 経路でも対称に保つ +- safety check の削除は禁止(移動のみ) + +## A/B 結果(2025-12-13) + +Mixed 10-run: +- `HAKMEM_WRAP_SHAPE=0`: 34,750,578 ops/s +- `HAKMEM_WRAP_SHAPE=1`: 35,262,596 ops/s +- mean: **+1.47%** / median: **+1.39%** → **GO** + +決定: +- `HAKMEM_WRAP_SHAPE` は ENV gate のまま維持しつつ、`MIXED_TINYV3_C7_SAFE` プリセットで default=1 に昇格。 + diff --git a/docs/analysis/PHASE2_STRUCTURAL_CHANGES_NEXT_INSTRUCTIONS.md b/docs/analysis/PHASE2_STRUCTURAL_CHANGES_NEXT_INSTRUCTIONS.md index b7a24a5a..fff108ff 100644 --- a/docs/analysis/PHASE2_STRUCTURAL_CHANGES_NEXT_INSTRUCTIONS.md +++ b/docs/analysis/PHASE2_STRUCTURAL_CHANGES_NEXT_INSTRUCTIONS.md @@ -27,7 +27,7 @@ Phase 2 では **wrapper 層とアロケーション経路のルーティング **結果**: - Mixed (10-run): 48.41M → 49.80M ops/s (**+2.89%**, win) - C6-heavy (5-run): 8.97M → 9.79M ops/s (**+9.13%**, strong) -- 決定: **ADOPT as default** in MIXED_TINYV3_C7_SAFE +- 決定: **ADOPT as default** in `MIXED_TINYV3_C7_SAFE` / `C6_HEAVY_LEGACY_POOLV1` ### B4: Wrapper Layer Hot/Cold Split ✅ ADOPT @@ -41,7 +41,7 @@ Phase 2 では **wrapper 層とアロケーション経路のルーティング **結果**: - Mixed (10-run): 34,750,578 → 35,262,596 ops/s (**+1.47%**, average) -- 決定: **ADOPT as default** (HAKMEM_WRAP_SHAPE=1) +- 決定: 
**ADOPT as default** in `MIXED_TINYV3_C7_SAFE`(`HAKMEM_WRAP_SHAPE=1`) ## 累積効果 diff --git a/docs/analysis/PHASE3_C3_STATIC_ROUTING_1_DESIGN.md b/docs/analysis/PHASE3_C3_STATIC_ROUTING_1_DESIGN.md new file mode 100644 index 00000000..4edfa561 --- /dev/null +++ b/docs/analysis/PHASE3_C3_STATIC_ROUTING_1_DESIGN.md @@ -0,0 +1,90 @@ +# Phase 3 C3: Tiny Static Routing(policy snapshot bypass)設計メモ + +## 目的(なにを削るか) + +`malloc_tiny_fast_for_class()` / `tiny_alloc_gate_fast()` の hot path で毎回呼ばれている +`small_policy_v7_snapshot()`(TLS policy snapshot)と `route_kind[class]` ロードを **省略**し、 +「このプロファイルでは route が固定」という前提で **class→route を静的テーブル参照**に置換する。 + +狙いはアルゴリズム変更ではなく **形(instruction / cache / branch)** の削減。 + +## 非目標(やらないこと) + +- Learner の学習ロジックを hot path に混ぜない(Learning Layer の箱を汚さない) +- route の意味(ULTRA/MID/LEGACY/V7)や優先順位は変えない +- 既存の ENV の意味を変えない(A/B で戻せる) + +## Box Theory(箱割り) + +### L0: StaticRouteEnvBox(戻せる) + +- ENV: `HAKMEM_TINY_STATIC_ROUTE=0/1`(default: 0) +- 強制無効(安全フェンス): + - Learner が有効な場合は自動で OFF(static route は learner の更新を反映できない) + - LD safe / diagnostic モード等の特殊経路では OFF(wrapper 側の cold に回す) + +### L1: TinyStaticRouteBox(境界: 1 箇所) + +責務: `class_idx -> route_kind` の **静的スナップショット**を保持するだけ。 + +- データ: + - `SmallRouteKind route_kind[8]` + - `inited`(1 回だけ初期化) +- 初期化境界: + - `small_policy_v7_init_from_env(&tmp)` を 1 回呼び、`tmp.route_kind[]` をコピー + - ※ Learner の update は一切適用しない(Learner ON なら L0 で無効化) + +### L2: Front Integration(hot path の差し替え点) + +`malloc_tiny_fast_for_class()` で `route_kind` を取る部分だけを差し替える: + +``` +if (static_route_enabled) { + route_kind = g_tiny_static_route.route_kind[class_idx]; +} else { + route_kind = small_policy_v7_snapshot()->route_kind[class_idx]; +} +``` + +以降の dispatch(B3: route shape / 従来 switch)は **そのまま**。 + +## 実装指示(小パッチ順序) + +1. **ENV gate を追加** + - `HAKMEM_TINY_STATIC_ROUTE` を読み、cached にする(probe window 方式は不要推奨) + - 併せて「Learner 有効なら強制 OFF」を入れる(ENV を直に読むだけでよい) + +2. 
**TinyStaticRouteBox を追加** + - `core/box/tiny_static_route_box.h/.c` + - `tiny_static_route_init_once()` / `tiny_static_route_route_kind(class_idx)` を提供 + +3. **malloc_tiny_fast_for_class に統合** + - `route_kind` 取得を Box 経由に変更(static ON の時だけ bypass) + - 既存の B3 `HAKMEM_TINY_ALLOC_ROUTE_SHAPE` と **合成**できること(静的 route を feed するだけ) + +4. **ベンチ用の ENV 注入(putenv)対策** + - bench は `bench_apply_profile()` が putenv するため、必要なら: + - `small_policy_v7_bump_version()` を `bench_apply_profile()` 末尾で呼び、policy snapshot が「最終 ENV」を読むようにする + - `tiny_static_route_refresh_from_env()` を用意し、`bench_apply_profile()` の末尾で呼ぶ + - ただし `tiny_env_cfg()` と同様に「初回呼び出しが bench_apply_profile 後」なら refresh は不要。 + - 先に `tiny_static_route_init` が走らないことを確認してから追加する(余計な分岐は増やさない)。 + +5. **最小可視化(debug counters のみ)** + - `HAKMEM_DEBUG_COUNTERS` が 1 のときだけ: + - `static_route_hit` / `static_route_fallback` を 64-bit で増やす + - 常時ログは禁止(ワンショットのみ許可) + +## GO/NO-GO ゲート(A/B) + +- Mixed(`HAKMEM_PROFILE=MIXED_TINYV3_C7_SAFE`)10-run: + - **GO**: +1.0% 以上 + - **NO-GO**: -1.0% 以下(freeze) +- C6-heavy(`HAKMEM_PROFILE=C6_HEAVY_LEGACY_POOLV1`)5-run: + - 参考(勝っても Mixed で負けたら本線には入れない) + +## 想定される落とし穴 + +- `small_policy_v7_snapshot()` 自体が既に十分軽い場合: + - bypass のメリットが薄く、追加分岐が負ける(その場合は迷わず freeze) +- Learner ON の状態で static route を ON にすると: + - route 変更が反映されず挙動がズレる → L0 で強制 OFF diff --git a/docs/analysis/PHASE3_CACHE_LOCALITY_NEXT_INSTRUCTIONS.md b/docs/analysis/PHASE3_CACHE_LOCALITY_NEXT_INSTRUCTIONS.md index 1b9fcf4b..45350ff8 100644 --- a/docs/analysis/PHASE3_CACHE_LOCALITY_NEXT_INSTRUCTIONS.md +++ b/docs/analysis/PHASE3_CACHE_LOCALITY_NEXT_INSTRUCTIONS.md @@ -14,19 +14,22 @@ - 現在: 毎回 malloc 時に policy snapshot + learner evaluation → 大きな overhead - 案: malloc_tiny_fast() 呼び出し前に "static route" を init 時決定 +**設計メモ**: `docs/analysis/PHASE3_C3_STATIC_ROUTING_1_DESIGN.md` + **実装ステップ**: 1. 
**Profiling(現状把握)** ```bash - perf record -F 99 ./bench_random_mixed_hakmem - perf top --call-graph=flame -p [pid] + HAKMEM_PROFILE=MIXED_TINYV3_C7_SAFE ./bench_random_mixed_hakmem 1000000 400 1 + HAKMEM_PROFILE=MIXED_TINYV3_C7_SAFE perf record -F 99 --call-graph dwarf -- ./bench_random_mixed_hakmem 1000000 400 1 + perf report --stdio # → malloc/policy_snapshot/learner がどの程度か確認 ``` 2. **Static Route Detection (init 時)** - malloc_tiny_fast() が呼ばれる前に route を "決定" - 対象: C0-C7 の class 別に「LEGACY が dominant か」を判定 - - ENV gate: HAKMEM_STATIC_ROUTE=1/0 (default 0) + - ENV gate: `HAKMEM_TINY_STATIC_ROUTE=1/0` (default 0) 3. **Route Bypass** ```c @@ -50,7 +53,7 @@ **実装**: ```c -__builtin_prefetch(g_small_policy_v7, 0, 3); // L1 cache, write intent +__builtin_prefetch(g_small_policy_v7, 0, 3); // read intent (prefetch for load) ``` **期待**: +2-4% diff --git a/docs/analysis/TINY_HEADER_WRITE_ALWAYS_INLINE_A3_DESIGN.md b/docs/analysis/TINY_HEADER_WRITE_ALWAYS_INLINE_A3_DESIGN.md new file mode 100644 index 00000000..2b6c29bc --- /dev/null +++ b/docs/analysis/TINY_HEADER_WRITE_ALWAYS_INLINE_A3_DESIGN.md @@ -0,0 +1,26 @@ +# Phase 1 A3: tiny_region_id_write_header() always_inline(設計メモ + A/B) + +## 目的 + +Tiny の header write を “関数呼び出し” から解放し、ホットパスの命令数を削る(期待: +1〜2%)。 + +## 方針 + +- `tiny_region_id_write_header()` を `__attribute__((always_inline))` にする(研究箱) +- ENV/ビルドフラグで即切り戻せる前提 + +## A/B 結果 + +- Mixed(10-run): 49.53M → 47.55M ops/s(**-4.00%**)❌ +- C6-heavy(5-run): 23.49M → 24.93M ops/s(**+6.00%**)✅ + +## 判定 + +- ❌ NO-GO(Mixed 回帰が閾値超え) +- Decision: **research box freeze(default OFF)** + +## 所見 + +Mixed は I-cache 圧力が支配的で、強制 inline によるコード膨張が即回帰につながった。 +C6-heavy はワーキングセットが小さく、inline の勝ちが出やすい。 + diff --git a/docs/specs/ENV_VARS_COMPLETE.md b/docs/specs/ENV_VARS_COMPLETE.md index 6ce5abb1..a81b89b1 100644 --- a/docs/specs/ENV_VARS_COMPLETE.md +++ b/docs/specs/ENV_VARS_COMPLETE.md @@ -62,6 +62,12 @@ From `/mnt/workdisk/public_share/hakmem/core/hakmem_tiny_stats.h`: - **Usage**: `export 
HAKMEM_WRAP_TINY_REFILL=1` - **Safety**: OFF by default (avoids deadlock risk in recursive malloc) +#### HAKMEM_WRAP_SHAPE +- **Default**: 0 (disabled) +- **Purpose**: Experiment: reshape malloc/free wrappers into hot/cold boxes (push rare checks into `noinline,cold`) +- **Impact**: Phase 2 B4 (WRAPPER-SHAPE-1) の A/B gate。`MIXED_TINYV3_C7_SAFE` プリセットでは default=1 に昇格。 +- **Notes**: Intended to reduce wrapper I-cache / branch overhead while preserving exact behavior. + #### HAKMEM_TINY_USE_SUPERSLAB - **Default**: 1 (enabled) - **Purpose**: Enable SuperSlab allocator for Tiny Pool slabs @@ -73,6 +79,23 @@ From `/mnt/workdisk/public_share/hakmem/core/hakmem_tiny_stats.h`: - **Purpose**: Treat C0–C3 alloc as “second hot path” and skip policy snapshot/routing in `malloc_tiny_fast()` - **Impact**: Opt-in experiment; keep OFF unless you are A/B testing +#### HAKMEM_FREE_TINY_FAST_HOTCOLD +- **Default**: 0 (disabled) +- **Purpose**: Enable FREE DUALHOT (C0–C3 as “second hot path”) in Tiny free path +- **Impact**: Mixed workload winner; promoted as default in `MIXED_TINYV3_C7_SAFE` preset +- **Notes**: `HAKMEM_TINY_LARSON_FIX=1` で互換性のため最適化を無効化できる + +#### HAKMEM_TINY_ALLOC_ROUTE_SHAPE +- **Default**: 0 (disabled) +- **Purpose**: Optimize alloc route dispatch shape (LIKELY on LEGACY + cold helper for rare routes) +- **Impact**: Mixed winner; promoted as default in `MIXED_TINYV3_C7_SAFE` / `C6_HEAVY_LEGACY_POOLV1` presets + +#### HAKMEM_TINY_STATIC_ROUTE +- **Default**: 0 (disabled) +- **Purpose**: Phase 3 C3: bypass `small_policy_v7_snapshot()` by using a static per-class route table initialized from ENV +- **Impact**: A/B gate for policy snapshot cost removal (research box until GO) +- **Notes**: v7 learner が有効な場合(`HAKMEM_SMALL_HEAP_V7_ENABLED=1` かつ learner 無効化なし)は安全のため強制 OFF + --- ### 2. Tiny Pool TLS Caching (Performance Critical)