From e1a4561992ab3964f2d1bb53c8d86b5e0557a19a Mon Sep 17 00:00:00 2001 From: "Moe Charm (CI)" Date: Mon, 15 Dec 2025 12:50:16 +0900 Subject: [PATCH] Phase 19-3b: pass down env snapshot in hot paths --- CURRENT_TASK.md | 24 ++++ core/box/tiny_legacy_fallback_box.h | 23 ++-- core/box/tiny_metadata_cache_hot_box.h | 18 +-- core/front/malloc_tiny_fast.h | 57 +++------ ...B_ENV_SNAPSHOT_PASSDOWN_AB_TEST_RESULTS.md | 76 ++++++++++++ ...FASTLANE_INSTRUCTION_REDUCTION_3_DESIGN.md | 112 ++++++++---------- ...UCTION_REDUCTION_3_REVISED_INSTRUCTIONS.md | 23 +++- 7 files changed, 207 insertions(+), 126 deletions(-) create mode 100644 docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_3B_ENV_SNAPSHOT_PASSDOWN_AB_TEST_RESULTS.md diff --git a/CURRENT_TASK.md b/CURRENT_TASK.md index 9c0fb508..75e18d42 100644 --- a/CURRENT_TASK.md +++ b/CURRENT_TASK.md @@ -1,5 +1,29 @@ # 本線タスク(現在) +## 更新メモ(2025-12-15 Phase 19-3b ENV-SNAPSHOT-PASSDOWN) + +### Phase 19-3b ENV-SNAPSHOT-PASSDOWN: Consolidate ENV snapshot reads across hot helpers — ✅ GO (+2.76%) + +**A/B Test Results** (`scripts/run_mixed_10_cleanenv.sh`, iter=20M ws=400): +- Baseline (Phase 19-3a): mean **55.56M** ops/s, median **55.65M** +- Optimized (Phase 19-3b): mean **57.10M** ops/s, median **57.09M** +- Delta: **+2.76% mean** / **+2.57% median** → ✅ GO + +**Change**: +- `core/front/malloc_tiny_fast.h`: capture `env` once in `free_tiny_fast()` / `free_tiny_fast_hot()` and pass into cold/legacy helpers; use `tiny_policy_hot_get_route_with_env()` to avoid a second snapshot gate. +- `core/box/tiny_legacy_fallback_box.h`: add `tiny_legacy_fallback_free_base_with_env(...)` and use it from hot paths to avoid redundant `hakmem_env_snapshot_enabled()` checks. +- `core/box/tiny_metadata_cache_hot_box.h`: add `tiny_policy_hot_get_route_with_env(...)` so `malloc_tiny_fast_for_class()` can reuse the already-fetched snapshot. +- Remove dead `front_snap` computations (set-but-unused) from the free hot paths. + +**Why it works**: +- Hot call chains had multiple redundant `hakmem_env_snapshot_enabled()` gates (branch + loads) across nested helpers. +- Capture once → pass-down keeps the “ENV decision” at a single boundary per operation and removes duplicated work. + +**Next**: +- Phase 19-3c (optional): if needed, also pass `env` into alloc-side call chains to remove the remaining `malloc_tiny_fast_for_class()` gate. + +--- + ## 更新メモ(2025-12-15 Phase 19-3a UNLIKELY-HINT-REMOVAL) ### Phase 19-3a UNLIKELY-HINT-REMOVAL: ENV Snapshot UNLIKELY Hint Removal — ✅ GO (+4.42%) diff --git a/core/box/tiny_legacy_fallback_box.h b/core/box/tiny_legacy_fallback_box.h index 19c2886c..fcf5091a 100644 --- a/core/box/tiny_legacy_fallback_box.h +++ b/core/box/tiny_legacy_fallback_box.h @@ -21,18 +21,11 @@ // - Returns: void (always succeeds or falls back to tiny_hot_free_fast) // __attribute__((always_inline)) -static inline void tiny_legacy_fallback_free_base(void* base, uint32_t class_idx) { - // Phase 4 E1: Use ENV snapshot when enabled (consolidates 3 TLS reads → 1) - const TinyFrontV3Snapshot* front_snap; - bool metadata_cache_on; - if (__builtin_expect(hakmem_env_snapshot_enabled(), 0)) { - const HakmemEnvSnapshot* env = hakmem_env_snapshot(); - front_snap = env->tiny_front_v3_enabled ? tiny_front_v3_snapshot_get() : NULL; - metadata_cache_on = env->tiny_metadata_cache_eff; // Uses effective (cache && !learner) - } else { - front_snap = __builtin_expect(tiny_front_v3_enabled(), 0) ? tiny_front_v3_snapshot_get() : NULL; - metadata_cache_on = tiny_metadata_cache_enabled(); - } +static inline void tiny_legacy_fallback_free_base_with_env(void* base, uint32_t class_idx, const HakmemEnvSnapshot* env) { + const TinyFrontV3Snapshot* front_snap = + env ? (env->tiny_front_v3_enabled ? tiny_front_v3_snapshot_get() : NULL) + : (__builtin_expect(tiny_front_v3_enabled(), 0) ? tiny_front_v3_snapshot_get() : NULL); + const bool metadata_cache_on = env ? env->tiny_metadata_cache_eff : tiny_metadata_cache_enabled(); // Phase 3 C2 Patch 2: First page cache hint (optional fast-path) // Check if pointer is in cached page (avoids metadata lookup in future optimizations) @@ -64,4 +57,10 @@ static inline void tiny_legacy_fallback_free_base(void* base, uint32_t class_idx tiny_hot_free_fast(class_idx, base); } +__attribute__((always_inline)) +static inline void tiny_legacy_fallback_free_base(void* base, uint32_t class_idx) { + const HakmemEnvSnapshot* env = hakmem_env_snapshot_enabled() ? hakmem_env_snapshot() : NULL; + tiny_legacy_fallback_free_base_with_env(base, class_idx, env); +} + #endif // HAKMEM_TINY_LEGACY_FALLBACK_BOX_H diff --git a/core/box/tiny_metadata_cache_hot_box.h b/core/box/tiny_metadata_cache_hot_box.h index be57a774..7501413f 100644 --- a/core/box/tiny_metadata_cache_hot_box.h +++ b/core/box/tiny_metadata_cache_hot_box.h @@ -58,15 +58,9 @@ static inline void tiny_policy_hot_refresh(void) { /// @param class_idx: Size class (0-7) /// @return: Route kind for this class __attribute__((always_inline)) -static inline SmallRouteKind tiny_policy_hot_get_route(uint32_t class_idx) { - // Phase 4 E1: Use ENV snapshot when enabled (consolidates 3 TLS reads → 1) - bool metadata_cache_eff; - if (__builtin_expect(hakmem_env_snapshot_enabled(), 0)) { - const HakmemEnvSnapshot* env = hakmem_env_snapshot(); - metadata_cache_eff = env->tiny_metadata_cache_eff; // Already includes learner check - } else { - metadata_cache_eff = tiny_metadata_cache_enabled() && !g_policy_hot.learner_v7_enabled; - } +static inline SmallRouteKind tiny_policy_hot_get_route_with_env(uint32_t class_idx, const HakmemEnvSnapshot* env) { + const bool metadata_cache_eff = + env ? env->tiny_metadata_cache_eff : (tiny_metadata_cache_enabled() && !g_policy_hot.learner_v7_enabled); if (__builtin_expect(metadata_cache_eff, 0)) { // Fast path: use cached route_kind @@ -80,4 +74,10 @@ static inline SmallRouteKind tiny_policy_hot_get_route(uint32_t class_idx) { return policy->route_kind[class_idx]; } +__attribute__((always_inline)) +static inline SmallRouteKind tiny_policy_hot_get_route(uint32_t class_idx) { + const HakmemEnvSnapshot* env = hakmem_env_snapshot_enabled() ? hakmem_env_snapshot() : NULL; + return tiny_policy_hot_get_route_with_env(class_idx, env); +} + #endif // HAK_TINY_METADATA_CACHE_HOT_BOX_H diff --git a/core/front/malloc_tiny_fast.h b/core/front/malloc_tiny_fast.h index 4a9204f9..00d10303 100644 --- a/core/front/malloc_tiny_fast.h +++ b/core/front/malloc_tiny_fast.h @@ -233,9 +233,10 @@ static inline void* malloc_tiny_fast_for_class(size_t size, int class_idx) { // This is the most common hot path - avoids TLS policy overhead // Phase 4 E1: Use ENV snapshot when enabled (consolidates 3 TLS reads → 1) // Phase 19-3a: Remove UNLIKELY hint (snapshot is ON by default in presets, hint is backwards) + const HakmemEnvSnapshot* env = NULL; bool c7_ultra_on; if (hakmem_env_snapshot_enabled()) { - const HakmemEnvSnapshot* env = hakmem_env_snapshot(); + env = hakmem_env_snapshot(); c7_ultra_on = env->tiny_c7_ultra_enabled; } else { c7_ultra_on = tiny_c7_ultra_enabled_env(); @@ -269,7 +270,7 @@ static inline void* malloc_tiny_fast_for_class(size_t size, int class_idx) { route_kind = tiny_static_route_get_kind_fast(class_idx); } else { // Phase 3 C2: Use policy hot cache if enabled (eliminates policy_snapshot() call) - route_kind = tiny_policy_hot_get_route(class_idx); + route_kind = tiny_policy_hot_get_route_with_env((uint32_t)class_idx, env); } // Phase 2 B3: Routing dispatch (ENV gate HAKMEM_TINY_ALLOC_ROUTE_SHAPE) @@ -381,7 +382,7 @@ static inline void* malloc_tiny_fast(size_t size) { // Cold path: Cross-thread free, TinyHeap routes, and legacy fallback // (noinline,cold to keep hot path small and I-cache clean) __attribute__((noinline,cold)) -static int free_tiny_fast_cold(void* ptr, void* base, int class_idx) +static int free_tiny_fast_cold(void* ptr, void* base, int class_idx, const HakmemEnvSnapshot* env) { FREE_TINY_FAST_HOTCOLD_STAT_INC(cold_hit); @@ -399,15 +400,6 @@ static int free_tiny_fast_cold(void* ptr, void* base, int class_idx) route = tiny_route_for_class((uint8_t)class_idx); } const int use_tiny_heap = tiny_route_is_heap_kind(route); - // Phase 4 E1: Use ENV snapshot when enabled (consolidates 3 TLS reads → 1) - // Phase 19-3a: Remove UNLIKELY hint (snapshot is ON by default in presets, hint is backwards) - const TinyFrontV3Snapshot* front_snap; - if (hakmem_env_snapshot_enabled()) { - const HakmemEnvSnapshot* env = hakmem_env_snapshot(); - front_snap = env->tiny_front_v3_enabled ? tiny_front_v3_snapshot_get() : NULL; - } else { - front_snap = __builtin_expect(tiny_front_v3_enabled(), 0) ? tiny_front_v3_snapshot_get() : NULL; - } // TWO-SPEED: SuperSlab registration check is DEBUG-ONLY to keep HOT PATH fast. // In Release builds, we trust header magic (0xA0) as sufficient validation. @@ -571,7 +563,7 @@ static int free_tiny_fast_cold(void* ptr, void* base, int class_idx) // Phase REFACTOR-2: Legacy fallback (use unified helper) legacy_fallback: FREE_TINY_FAST_HOTCOLD_STAT_INC(cold_legacy_fallback); - tiny_legacy_fallback_free_base(base, class_idx); + tiny_legacy_fallback_free_base_with_env(base, (uint32_t)class_idx, env); return 1; } @@ -623,13 +615,8 @@ static inline int free_tiny_fast_hot(void* ptr) { // Phase v11b-1: C7 ULTRA early-exit (skip policy snapshot for most common case) // Phase 4 E1: Use ENV snapshot when enabled (consolidates 3 TLS reads → 1) // Phase 19-3a: Remove UNLIKELY hint (snapshot is ON by default in presets, hint is backwards) - bool c7_ultra_free; - if (hakmem_env_snapshot_enabled()) { - const HakmemEnvSnapshot* env = hakmem_env_snapshot(); - c7_ultra_free = env->tiny_c7_ultra_enabled; - } else { - c7_ultra_free = tiny_c7_ultra_enabled_env(); - } + const HakmemEnvSnapshot* env = hakmem_env_snapshot_enabled() ? hakmem_env_snapshot() : NULL; + const bool c7_ultra_free = env ? env->tiny_c7_ultra_enabled : tiny_c7_ultra_enabled_env(); if (class_idx == 7 && c7_ultra_free) { FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_c7_ultra); @@ -650,7 +637,7 @@ static inline int free_tiny_fast_hot(void* ptr) { if (__builtin_expect(class_idx <= 3 && !g_larson_fix, 1)) { // C0-C3 + Larson mode OFF → Direct to legacy (no policy snapshot overhead) - tiny_legacy_fallback_free_base(base, class_idx); + tiny_legacy_fallback_free_base_with_env(base, (uint32_t)class_idx, env); FREE_TINY_FAST_HOTCOLD_STAT_INC(hot_hit); return 1; } @@ -717,7 +704,7 @@ static inline int free_tiny_fast_hot(void* ptr) { cold_path: // Delegate to cold path for cross-thread, TinyHeap, and legacy handling - return free_tiny_fast_cold(ptr, base, class_idx); + return free_tiny_fast_cold(ptr, base, class_idx, env); #else // No header mode - fall back to normal free @@ -773,6 +760,9 @@ static inline int free_tiny_fast(void* ptr) { // Phase FREE-LEGACY-BREAKDOWN-1: カウンタ散布 (1. 関数入口) FREE_PATH_STAT_INC(total_calls); + // Phase 19-3b: Consolidate ENV snapshot reads (capture once per free_tiny_fast call). + const HakmemEnvSnapshot* env = hakmem_env_snapshot_enabled() ? hakmem_env_snapshot() : NULL; + // Phase 9: MONO DUALHOT early-exit for C0-C3 (skip policy snapshot, direct to legacy) // Conditions: // - ENV: HAKMEM_FREE_TINY_FAST_MONO_DUALHOT=1 @@ -792,7 +782,7 @@ static inline int free_tiny_fast(void* ptr) { g_tiny_route_class[class_idx] == TINY_ROUTE_LEGACY) { // Direct path: Skip policy snapshot, go straight to legacy fallback FREE_PATH_STAT_INC(mono_dualhot_hit); - tiny_legacy_fallback_free_base(base, class_idx); + tiny_legacy_fallback_free_base_with_env(base, (uint32_t)class_idx, env); return 1; } } @@ -820,7 +810,7 @@ static inline int free_tiny_fast(void* ptr) { if (!g_larson_fix) { // Direct path: Skip policy snapshot, go straight to legacy fallback FREE_PATH_STAT_INC(mono_legacy_direct_hit); - tiny_legacy_fallback_free_base(base, class_idx); + tiny_legacy_fallback_free_base_with_env(base, (uint32_t)class_idx, env); return 1; } } @@ -830,13 +820,7 @@ static inline int free_tiny_fast(void* ptr) { // Phase v11b-1: C7 ULTRA early-exit (skip policy snapshot for most common case) // Phase 4 E1: Use ENV snapshot when enabled (consolidates 3 TLS reads → 1) // Phase 19-3a: Remove UNLIKELY hint (snapshot is ON by default in presets, hint is backwards) - bool c7_ultra_free; - if (hakmem_env_snapshot_enabled()) { - const HakmemEnvSnapshot* env = hakmem_env_snapshot(); - c7_ultra_free = env->tiny_c7_ultra_enabled; - } else { - c7_ultra_free = tiny_c7_ultra_enabled_env(); - } + const bool c7_ultra_free = env ? env->tiny_c7_ultra_enabled : tiny_c7_ultra_enabled_env(); if (class_idx == 7 && c7_ultra_free) { tiny_c7_ultra_free(ptr); @@ -909,15 +893,6 @@ legacy_fallback: route = tiny_route_for_class((uint8_t)class_idx); } const int use_tiny_heap = tiny_route_is_heap_kind(route); - // Phase 4 E1: Use ENV snapshot when enabled (consolidates 3 TLS reads → 1) - // Phase 19-3a: Remove UNLIKELY hint (snapshot is ON by default in presets, hint is backwards) - const TinyFrontV3Snapshot* front_snap; - if (hakmem_env_snapshot_enabled()) { - const HakmemEnvSnapshot* env = hakmem_env_snapshot(); - front_snap = env->tiny_front_v3_enabled ? tiny_front_v3_snapshot_get() : NULL; - } else { - front_snap = __builtin_expect(tiny_front_v3_enabled(), 0) ? tiny_front_v3_snapshot_get() : NULL; - } // TWO-SPEED: SuperSlab registration check is DEBUG-ONLY to keep HOT PATH fast. // In Release builds, we trust header magic (0xA0) as sufficient validation. @@ -1055,7 +1030,7 @@ legacy_fallback: #endif // Phase REFACTOR-2: Legacy fallback (use unified helper) - tiny_legacy_fallback_free_base(base, class_idx); + tiny_legacy_fallback_free_base_with_env(base, (uint32_t)class_idx, env); return 1; #else // No header mode - fall back to normal free diff --git a/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_3B_ENV_SNAPSHOT_PASSDOWN_AB_TEST_RESULTS.md b/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_3B_ENV_SNAPSHOT_PASSDOWN_AB_TEST_RESULTS.md new file mode 100644 index 00000000..874ee95d --- /dev/null +++ b/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_3B_ENV_SNAPSHOT_PASSDOWN_AB_TEST_RESULTS.md @@ -0,0 +1,76 @@ +# Phase 19-3b: ENV Snapshot Pass-Down — A/B Test Results + +## Summary + +Verdict: ✅ **GO** + +- Baseline mean: **55.56M ops/s** +- Optimized mean: **57.10M ops/s** +- Delta (mean): **+2.76%** + +- Baseline median: **55.65M ops/s** +- Optimized median: **57.09M ops/s** +- Delta (median): **+2.57%** + +## What Changed + +- `core/front/malloc_tiny_fast.h` + - Capture `const HakmemEnvSnapshot* env` once per hot call and pass it down: + - `free_tiny_fast()` / `free_tiny_fast_hot()` capture once + - `free_tiny_fast_cold(..., env)` consumes it + - `tiny_legacy_fallback_free_base_with_env(..., env)` consumes it + - Reuse the same snapshot in the alloc route selection: + - `tiny_policy_hot_get_route_with_env(class_idx, env)` + - Remove dead `front_snap` computations (were set-but-unused). + +- `core/box/tiny_legacy_fallback_box.h` + - Add `tiny_legacy_fallback_free_base_with_env(...)` and keep the old wrapper for compatibility. + +- `core/box/tiny_metadata_cache_hot_box.h` + - Add `tiny_policy_hot_get_route_with_env(...)` and keep the old wrapper for compatibility. + +## Bench Setup + +- Command: `scripts/run_mixed_10_cleanenv.sh` +- Profile: `HAKMEM_PROFILE=MIXED_TINYV3_C7_SAFE` (script default) +- Params: `iter=20000000 ws=400` +- Host: same machine / same build flags, back-to-back runs. + +## Raw Results + +### Baseline (Phase 19-3a) + +``` +56215204 +55685609 +55968309 +55866150 +54795835 +55113419 +55659129 +55645869 +55286223 +55409488 +``` + +Mean: 55,564,523.5 +Median: 55,652,499.0 + +### Optimized (Phase 19-3b) + +``` +57413912 +57291780 +56913158 +57044292 +57219468 +56609810 +56995683 +57027125 +57350810 +57126094 +``` + +Mean: 57,099,213.2 +Median: 57,085,193.0 + diff --git a/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_3_DESIGN.md b/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_3_DESIGN.md index 3a827b5a..0c7e3eb3 100644 --- a/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_3_DESIGN.md +++ b/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_3_DESIGN.md @@ -3,7 +3,7 @@ ## 0. Goal **Objective**: Reduce ENV check overhead from per-operation 3+ TLS reads to 1 TLS read -**Expected Impact**: -10.0 instructions/op, -4.0 branches/op, +5-8% throughput +**Expected Impact (target)**: -10.0 instructions/op, -4.0 branches/op, +3-8% throughput **Risk Level**: MEDIUM (ENV invalidation handling required) **Box Name**: EnvSnapshotConsolidationBox (Phase 19-3) @@ -32,6 +32,12 @@ Phase 19-3a removed the call-site UNLIKELY hint: Observed impact: **GO (+4.42% throughput)** on Mixed. This validates that the remaining ENV work is dominated by branch/layout effects, not just raw "read cost". +### Phase 19-3b Result (validated) + +Phase 19-3b consolidated snapshot reads by capturing `env` once per hot call and passing it down into nested helpers. + +Observed impact: **GO (+2.76% mean / +2.57% median)** on Mixed 10-run (`scripts/run_mixed_10_cleanenv.sh`). + --- ## 1. Current State Analysis @@ -235,19 +241,20 @@ if (class_idx == 7 && g_c7_ultra_enabled_fixed) { **Files**: - Modified: `core/front/malloc_tiny_fast.h` - Phase 19-3a: remove backwards `__builtin_expect(..., 0)` hints (DONE, +4.42% GO). - - Phase 19-3b/c: thread `const HakmemEnvSnapshot* env` down to eliminate repeated `hakmem_env_snapshot_enabled()` checks. -- Modified: `core/box/hak_wrappers.inc.h` - - Compute `env` once per wrapper entry and pass it to hot helpers (especially when `HAKMEM_FASTLANE_DIRECT=1`). -- Optional (only if separate rollback gate is desired): - - New: `core/box/env_snapshot_consolidation_env_box.{h,c}` (cached gate + refresh hook) + - Phase 19-3b: thread `const HakmemEnvSnapshot* env` down to eliminate repeated `hakmem_env_snapshot_enabled()` checks (DONE, +2.76% GO). +- Modified: `core/box/tiny_legacy_fallback_box.h` + - Add `_with_env` helper (Phase 19-3b). +- Modified: `core/box/tiny_metadata_cache_hot_box.h` + - Add `_with_env` helper (Phase 19-3b). +- Optional: `core/box/hak_wrappers.inc.h` + - If needed, compute `env` once per wrapper entry and pass it down (removes the remaining alloc-side gate in `malloc_tiny_fast_for_class()`). **ENV Gate**: - Base: `HAKMEM_ENV_SNAPSHOT=0/1` (Phase 4 E1 gate; promoted ON in presets) -- Optional: `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0/1` (default 0, opt-in) — gates only the “pass-down” refactor for A/B safety. **Rollback**: -- If using `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION`: set it to `0`. -- Otherwise: `HAKMEM_FASTLANE_DIRECT=0` falls back to the wrapper/FastLane path (still safe). +- Snapshot behavior: set `HAKMEM_ENV_SNAPSHOT=0` to fall back to per-feature env gates. +- Pass-down refactor: revert the Phase 19-3b commit (or add a dedicated pass-down gate if future A/B is needed). ### 3.2 API Design @@ -268,29 +275,28 @@ int free_tiny_fast_with_env(void* ptr, const HakmemEnvSnapshot* env); - `__builtin_expect(hakmem_env_snapshot_enabled(), 0)` → `hakmem_env_snapshot_enabled()` - Measured: **GO (+4.42%)** -**Phase 19-3b (NEXT)**: wrapper entry computes `const HakmemEnvSnapshot* env` once and passes it down. -- Wrapper entry: - - `const HakmemEnvSnapshot* env = hakmem_env_snapshot_enabled() ? hakmem_env_snapshot() : NULL;` -- Hot helpers: - - Replace repeated `hakmem_env_snapshot_enabled()` checks with `if (env) { ... } else { ... }` - - Keep `env==NULL` fallback path unchanged. -- Target: reduce repeated gate checks across hot helpers (especially inside `free_tiny_fast*`). +**Phase 19-3b (DONE)**: capture `env` once per hot call and pass it down into nested helpers. +- In `core/front/malloc_tiny_fast.h`: + - `free_tiny_fast()` / `free_tiny_fast_hot()` capture `env` once and pass it to cold + legacy helpers. + - `malloc_tiny_fast_for_class()` reuses the same snapshot for `tiny_policy_hot_get_route_with_env(...)`. +- In `core/box/tiny_legacy_fallback_box.h` and `core/box/tiny_metadata_cache_hot_box.h`: + - add `_with_env` helpers to consume the pass-down pointer. +- Measured: **GO (+2.76% mean / +2.57% median)** on Mixed 10-run. **Phase 19-3c (OPTIONAL)**: propagate `env` into legacy fallback + metadata cache helpers to eliminate the remaining call sites: -- `core/box/tiny_legacy_fallback_box.h` -- `core/box/tiny_metadata_cache_hot_box.h` +- (Already done in Phase 19-3b.) Optional next: pass `env` down from wrapper entry to remove the remaining alloc-side gate. ### 3.4 Files to Modify 1. `core/front/malloc_tiny_fast.h` - - Phase 19-3b: add `_with_env` helper variants or thread `env` through internal helpers. - - Replace the remaining repeated `hakmem_env_snapshot_enabled()` call sites with `env`-based checks. -2. `core/box/hak_wrappers.inc.h` - - Compute `env` once per entry and pass it down (especially for `HAKMEM_FASTLANE_DIRECT` path). -3. (Optional) `core/box/tiny_legacy_fallback_box.h` - - Thread `env` into the legacy fallback helper to eliminate an extra gate check. -4. (Optional) `core/box/tiny_metadata_cache_hot_box.h` - - Same for metadata-cache effective checks. + - Phase 19-3a: UNLIKELY hint removal. + - Phase 19-3b: pass-down `env` to cold + legacy helpers. +2. `core/box/tiny_legacy_fallback_box.h` + - Phase 19-3b: add `_with_env` helper + keep wrapper. +3. `core/box/tiny_metadata_cache_hot_box.h` + - Phase 19-3b: add `_with_env` helper + keep wrapper. +4. (Optional) `core/box/hak_wrappers.inc.h` + - Pass `env` down from wrapper entry (alloc-side; removes one remaining gate). --- @@ -299,9 +305,9 @@ int free_tiny_fast_with_env(void* ptr, const HakmemEnvSnapshot* env); ### 4.1 Boundary Preservation **L0 (ENV gate)**: -- `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0` → `env==NULL` (or not passed down), fallback to existing path -- `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=1` → `env` is passed down, new path -- Compile-time flag: `#if HAKMEM_ENV_SNAPSHOT_CONSOLIDATION` for complete removal +- `HAKMEM_ENV_SNAPSHOT=0` → `env==NULL` → fallback to per-feature env gates +- `HAKMEM_ENV_SNAPSHOT=1` → `env!=NULL` → snapshot-based checks +- (Optional) A dedicated “pass-down gate” can be introduced for A/B safety, but avoid adding a new hot-branch unless needed. **L1 (Hot inline)**: - No algorithmic changes, only ENV check consolidation @@ -333,23 +339,13 @@ int free_tiny_fast_with_env(void* ptr, const HakmemEnvSnapshot* env); **Runtime rollback**: ```sh -HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0 # Disable new path -``` - -**Compile-time rollback**: -```c -#if HAKMEM_ENV_SNAPSHOT_CONSOLIDATION - // New context path -#else - // Old scattered ENV checks (preserved) -#endif +HAKMEM_ENV_SNAPSHOT=0 # Disable snapshot path (falls back to per-feature env gates) ``` **Gradual rollout**: 1. Phase 19-3a: UNLIKELY hint removal (DONE, GO) -2. Phase 19-3b: wrapper pass-down to hot helpers (measure) -3. Phase 19-3c: legacy + metadata pass-down (measure) -4. Graduate: add to `MIXED_TINYV3_C7_SAFE` preset if GO +2. Phase 19-3b: hot helper pass-down (DONE, GO) +3. Phase 19-3c: optional wrapper-entry pass-down (alloc-side; measure) ### 4.4 Observability @@ -366,15 +362,12 @@ typedef struct { **Perf validation**: - Before: `perf record` shows `hakmem_env_snapshot_enabled` at ~7% - After: `hakmem_env_snapshot_enabled` should drop to <1% -- Expected: deep helpers stop calling `hakmem_env_snapshot_enabled()` repeatedly (only wrapper entry remains) +- Expected: deep helpers stop calling `hakmem_env_snapshot_enabled()` repeatedly (single capture per hot call) **A/B testing**: ```sh -# Baseline (Phase 19-2 state) -HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0 ./bench_random_mixed_hakmem 200000000 400 1 - -# Optimized (Phase 19-3) -HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=1 ./bench_random_mixed_hakmem 200000000 400 1 +# Recommended: compare baseline vs optimized commits with the same bench script +scripts/run_mixed_10_cleanenv.sh ``` --- @@ -501,7 +494,7 @@ HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=1 ./bench_random_mixed_hakmem 200000000 400 1 - [ ] Add `malloc_tiny_fast_ctx()` variant (Phase 19-3a) - [ ] Add `free_tiny_fast_ctx()` variant (Phase 19-3b) - [ ] Propagate context to `tiny_legacy_fallback_box.h` (Phase 19-3c) -- [ ] Add ENV gate `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0/1` +- [ ] (Optional) Add a dedicated pass-down gate if A/B within a single binary is needed - [ ] Add stats counters (debug builds) ### 7.3 Testing (Per Phase) @@ -528,13 +521,8 @@ HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=1 ./bench_random_mixed_hakmem 200000000 400 1 **Benchmark suite**: ```sh -# Baseline (Phase 19-2) -HAKMEM_PROFILE=MIXED_TINYV3_C7_SAFE HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0 \ - scripts/run_mixed_10_cleanenv.sh - -# Optimized (Phase 19-3) -HAKMEM_PROFILE=MIXED_TINYV3_C7_SAFE HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=1 \ - scripts/run_mixed_10_cleanenv.sh +# Run the same cleanenv script on baseline vs optimized commits +scripts/run_mixed_10_cleanenv.sh ``` **GO/NO-GO criteria**: @@ -568,7 +556,7 @@ perf stat -e cycles,instructions,branches,branch-misses,L1-icache-load-misses \ **Timeline**: 4-6 hours implementation + 2 hours testing **Risk**: LOW (isolated to alloc path) -**Rollback**: `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0` +**Rollback**: revert Phase 19-3b commit (or set `HAKMEM_ENV_SNAPSHOT=0` to disable snapshot path) ### 8.2 Phase 19-3b: free Path (Week 1) @@ -580,7 +568,7 @@ perf stat -e cycles,instructions,branches,branch-misses,L1-icache-load-misses \ **Timeline**: 4-6 hours implementation + 2 hours testing **Risk**: LOW-MEDIUM (more call sites than malloc) -**Rollback**: `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0` +**Rollback**: revert Phase 19-3b commit (or set `HAKMEM_ENV_SNAPSHOT=0` to disable snapshot path) ### 8.3 Phase 19-3c: Legacy + Metadata (Week 2) @@ -591,7 +579,7 @@ perf stat -e cycles,instructions,branches,branch-misses,L1-icache-load-misses \ **Timeline**: 3-4 hours implementation + 2 hours testing **Risk**: MEDIUM (touches multiple boxes) -**Rollback**: `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0` +**Rollback**: revert Phase 19-3b commit (or set `HAKMEM_ENV_SNAPSHOT=0` to disable snapshot path) ### 8.4 Graduate (Week 2-3) @@ -602,15 +590,15 @@ perf stat -e cycles,instructions,branches,branch-misses,L1-icache-load-misses \ - Perf validation confirms instruction reduction **Promotion actions**: -1. Add to `MIXED_TINYV3_C7_SAFE` preset: `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=1` +1. Ensure `MIXED_TINYV3_C7_SAFE` preset keeps `HAKMEM_ENV_SNAPSHOT=1` (already) 2. Document in optimization roadmap 3. Update Box Theory index 4. Keep ENV default=0 (opt-in) until production validation **Rollback strategy**: - Preset level: Remove from preset, keep code -- Code level: `#if HAKMEM_ENV_SNAPSHOT_CONSOLIDATION` → disable at compile time -- Emergency: `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0` global default +- Code level: revert the Phase 19-3b commit +- Emergency: set `HAKMEM_ENV_SNAPSHOT=0` (falls back to per-feature env gates) --- diff --git a/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_3_REVISED_INSTRUCTIONS.md b/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_3_REVISED_INSTRUCTIONS.md index 0d750491..842b18a5 100644 --- a/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_3_REVISED_INSTRUCTIONS.md +++ b/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_3_REVISED_INSTRUCTIONS.md @@ -46,14 +46,33 @@ scripts/run_mixed_10_cleanenv.sh **GO Criteria**: No regression (±1%) +**Status**: DONE ✅ (GO +4.42%) + --- -### Phase 19-3b: Pass snapshot down from wrapper entry +### Phase 19-3b: Pass snapshot down (hot helper pass-down) + +**Status**: DONE ✅ (GO +2.76% mean / +2.57% median) +Results: `docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_3B_ENV_SNAPSHOT_PASSDOWN_AB_TEST_RESULTS.md` **Current state**: Each callee calls `hakmem_env_snapshot_enabled()` independently - 5 calls × 2 TLS reads each = **10 TLS reads/op** -**Proposed**: +**Implementation (landed)**: +- `core/front/malloc_tiny_fast.h` + - Capture `env` once per hot call and pass it down: + - `free_tiny_fast()` / `free_tiny_fast_hot()` capture `env` once + - `free_tiny_fast_cold(..., env)` consumes it + - `tiny_legacy_fallback_free_base_with_env(..., env)` consumes it + - Reuse the same snapshot for alloc route selection: + - `tiny_policy_hot_get_route_with_env(class_idx, env)` +- `core/box/tiny_legacy_fallback_box.h`: add `tiny_legacy_fallback_free_base_with_env(...)` +- `core/box/tiny_metadata_cache_hot_box.h`: add `tiny_policy_hot_get_route_with_env(...)` + +**Optional extension (if chasing the last alloc-side gate)**: +Pass `env` down from `core/box/hak_wrappers.inc.h` entry. Keep the invariant: malloc miss must fall through (do not call `malloc_cold()` directly). + +**Example (optional)**: ```c // core/box/hak_wrappers.inc.h (malloc wrapper) void* malloc(size_t size) {