From 8f4ada5bbdab5dfd8689e4d013c24101ec772aa0 Mon Sep 17 00:00:00 2001 From: "Moe Charm (CI)" Date: Mon, 15 Dec 2025 12:29:27 +0900 Subject: [PATCH] Phase 19-3a: remove backwards UNLIKELY env-snapshot hints --- CURRENT_TASK.md | 37 +- core/front/malloc_tiny_fast.h | 15 +- ...FASTLANE_INSTRUCTION_REDUCTION_3_DESIGN.md | 712 ++++++++++++++++++ ...UCTION_REDUCTION_3_REVISED_INSTRUCTIONS.md | 406 ++++++++++ 4 files changed, 1164 insertions(+), 6 deletions(-) create mode 100644 docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_3_DESIGN.md create mode 100644 docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_3_REVISED_INSTRUCTIONS.md diff --git a/CURRENT_TASK.md b/CURRENT_TASK.md index 26a3678b..9c0fb508 100644 --- a/CURRENT_TASK.md +++ b/CURRENT_TASK.md @@ -1,6 +1,41 @@ # 本線タスク(現在) -## 更新メモ(2025-12-15 Phase 19-1b FASTLANE-DIRECT-1B) +## 更新メモ(2025-12-15 Phase 19-3a UNLIKELY-HINT-REMOVAL) + +### Phase 19-3a UNLIKELY-HINT-REMOVAL: ENV Snapshot UNLIKELY Hint Removal — ✅ GO (+4.42%) + +**Result**: UNLIKELY hint (`__builtin_expect(..., 0)`) 削除により throughput **+4.42%** 達成。期待値(+0-2%)を大幅超過。 + +**A/B Test Results** (HAKMEM_PROFILE=MIXED_TINYV3_C7_SAFE, 20M ops, 3-run average): +- Baseline (Phase 19-1b): 52.06M ops/s +- Optimized (Phase 19-3a): 54.36M ops/s (53.99, 54.44, 54.66) +- Delta: **+4.42%** (GO判定、期待値 +0-2% を大幅超過) + +**修正内容**: +- File: `/mnt/workdisk/public_share/hakmem/core/front/malloc_tiny_fast.h` +- 修正箇所: 5箇所 + - Line 237: malloc_tiny_fast_for_class (C7 ULTRA alloc) + - Line 405: free_tiny_fast_cold (Front V3 free hotcold) + - Line 627: free_tiny_fast_hot (C7 ULTRA free) + - Line 834: free_tiny_fast (C7 ULTRA free larson) + - Line 915: free_tiny_fast (Front V3 free larson) +- 変更: `__builtin_expect(hakmem_env_snapshot_enabled(), 0)` → `hakmem_env_snapshot_enabled()` +- 理由: ENV snapshot は ON by default (MIXED_TINYV3_C7_SAFE preset) → UNLIKELY hint が逆効果 + +**Why it works**: +- Phase 19-1b で学んだ教訓: `__builtin_expect(..., 0)` は branch misprediction を誘発 +- ENV snapshot は 
MIXED_TINYV3_C7_SAFE で ON → "UNLIKELY" hint が backwards +- Hint 削除により compiler が正しい branch prediction を生成 → misprediction penalty 削減 + +**Impact**: +- Throughput: 52.06M → 54.36M ops/s (+4.42%) +- Expected future gains (from design doc Phase 19-3b/c): Additional +3-5% from ENV consolidation + +**Next**: Phase 19-3b (ENV Snapshot Consolidation) — Pass env snapshot down from wrapper entry to eliminate 8 additional TLS reads/op. + +--- + +## 前回タスク(2025-12-15 Phase 19-1b FASTLANE-DIRECT-1B) ### Phase 19-1b FASTLANE-DIRECT-1B: FastLane Direct (Revised) — ✅ GO (+5.88%) diff --git a/core/front/malloc_tiny_fast.h b/core/front/malloc_tiny_fast.h index 573a3a2d..4a9204f9 100644 --- a/core/front/malloc_tiny_fast.h +++ b/core/front/malloc_tiny_fast.h @@ -232,8 +232,9 @@ static inline void* malloc_tiny_fast_for_class(size_t size, int class_idx) { // Phase v11a-5b: C7 ULTRA early-exit (skip policy snapshot for common case) // This is the most common hot path - avoids TLS policy overhead // Phase 4 E1: Use ENV snapshot when enabled (consolidates 3 TLS reads → 1) + // Phase 19-3a: Remove UNLIKELY hint (snapshot is ON by default in presets, hint is backwards) bool c7_ultra_on; - if (__builtin_expect(hakmem_env_snapshot_enabled(), 0)) { + if (hakmem_env_snapshot_enabled()) { const HakmemEnvSnapshot* env = hakmem_env_snapshot(); c7_ultra_on = env->tiny_c7_ultra_enabled; } else { @@ -399,8 +400,9 @@ static int free_tiny_fast_cold(void* ptr, void* base, int class_idx) } const int use_tiny_heap = tiny_route_is_heap_kind(route); // Phase 4 E1: Use ENV snapshot when enabled (consolidates 3 TLS reads → 1) + // Phase 19-3a: Remove UNLIKELY hint (snapshot is ON by default in presets, hint is backwards) const TinyFrontV3Snapshot* front_snap; - if (__builtin_expect(hakmem_env_snapshot_enabled(), 0)) { + if (hakmem_env_snapshot_enabled()) { const HakmemEnvSnapshot* env = hakmem_env_snapshot(); front_snap = env->tiny_front_v3_enabled ? 
tiny_front_v3_snapshot_get() : NULL; } else { @@ -620,8 +622,9 @@ static inline int free_tiny_fast_hot(void* ptr) { // Phase v11b-1: C7 ULTRA early-exit (skip policy snapshot for most common case) // Phase 4 E1: Use ENV snapshot when enabled (consolidates 3 TLS reads → 1) + // Phase 19-3a: Remove UNLIKELY hint (snapshot is ON by default in presets, hint is backwards) bool c7_ultra_free; - if (__builtin_expect(hakmem_env_snapshot_enabled(), 0)) { + if (hakmem_env_snapshot_enabled()) { const HakmemEnvSnapshot* env = hakmem_env_snapshot(); c7_ultra_free = env->tiny_c7_ultra_enabled; } else { @@ -826,8 +829,9 @@ static inline int free_tiny_fast(void* ptr) { // Phase v11b-1: C7 ULTRA early-exit (skip policy snapshot for most common case) // Phase 4 E1: Use ENV snapshot when enabled (consolidates 3 TLS reads → 1) + // Phase 19-3a: Remove UNLIKELY hint (snapshot is ON by default in presets, hint is backwards) bool c7_ultra_free; - if (__builtin_expect(hakmem_env_snapshot_enabled(), 0)) { + if (hakmem_env_snapshot_enabled()) { const HakmemEnvSnapshot* env = hakmem_env_snapshot(); c7_ultra_free = env->tiny_c7_ultra_enabled; } else { @@ -906,8 +910,9 @@ legacy_fallback: } const int use_tiny_heap = tiny_route_is_heap_kind(route); // Phase 4 E1: Use ENV snapshot when enabled (consolidates 3 TLS reads → 1) + // Phase 19-3a: Remove UNLIKELY hint (snapshot is ON by default in presets, hint is backwards) const TinyFrontV3Snapshot* front_snap; - if (__builtin_expect(hakmem_env_snapshot_enabled(), 0)) { + if (hakmem_env_snapshot_enabled()) { const HakmemEnvSnapshot* env = hakmem_env_snapshot(); front_snap = env->tiny_front_v3_enabled ? 
tiny_front_v3_snapshot_get() : NULL; } else { diff --git a/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_3_DESIGN.md b/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_3_DESIGN.md new file mode 100644 index 00000000..3a827b5a --- /dev/null +++ b/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_3_DESIGN.md @@ -0,0 +1,712 @@ +# Phase 19-3: ENV Snapshot Consolidation — Design + +## 0. Goal + +**Objective**: Reduce ENV check overhead from per-operation 3+ TLS reads to 1 TLS read +**Expected Impact**: -10.0 instructions/op, -4.0 branches/op, +5-8% throughput +**Risk Level**: MEDIUM (ENV invalidation handling required) +**Box Name**: EnvSnapshotConsolidationBox (Phase 19-3) + +### Context + +Phase 19 perf analysis revealed that ENV checks are executed **3+ times per operation**: +- `hakmem_env_snapshot_enabled()`: called 5 times in malloc/free hot paths (lines 236, 403, 624, 830, 910 in malloc_tiny_fast.h) +- Each call triggers: + - TLS read of `g_hakmem_env_snapshot_ctor_mode` + - TLS read of `g_hakmem_env_snapshot_gate` + - Branch prediction overhead + - Potential lazy initialization check + +**Current overhead**: ~7% perf samples on `hakmem_env_snapshot_enabled()` and related ENV checks. + +**Phase 4 E1 Status**: ENV snapshot infrastructure exists (global default OFF, but promoted ON in presets like `MIXED_TINYV3_C7_SAFE`). Phase 19-3 aims to: +1. Eliminate redundant `hakmem_env_snapshot_enabled()` checks (5 calls → 1 call) +2. Make ENV snapshot the **default path** (not research box) +3. Further consolidate ENV reads into entry-point snapshot + +### Phase 19-3a Result (validated) + +Phase 19-3a removed the call-site UNLIKELY hint: +`__builtin_expect(hakmem_env_snapshot_enabled(), 0)` → `hakmem_env_snapshot_enabled()` + +Observed impact: **GO (+4.42% throughput)** on Mixed. +This validates that the remaining ENV work is dominated by branch/layout effects, not just raw "read cost". + +--- + +## 1. 
Current State Analysis + +### 1.1 ENV Check Locations (Per-Operation) + +Based on code analysis, ENV checks occur in these hot path locations: + +**malloc_tiny_fast() path**: +1. Line 236: C7 ULTRA check (`hakmem_env_snapshot_enabled()` → `hakmem_env_snapshot()`) +2. Line 403: Front V3 snapshot check for `free()` (in `free_tiny_fast_v4_hotcold`) +3. Line 910: Front V3 snapshot check for `free()` (in `free_tiny_fast_v4_larson`) + +**free_tiny_fast() paths**: +1. Line 624: C7 ULTRA check (`hakmem_env_snapshot_enabled()` → `hakmem_env_snapshot()`) +2. Line 830: C7 ULTRA check (duplicate in `free_tiny_fast_v4_larson`) + +**tiny_legacy_fallback_box.h**: +- Line 28: `hakmem_env_snapshot_enabled()` for front_snap + metadata_cache_on + +**tiny_metadata_cache_hot_box.h**: +- Line 64: `hakmem_env_snapshot_enabled()` for metadata cache effective check + +### 1.2 TLS Read Overhead Analysis + +Each `hakmem_env_snapshot_enabled()` call performs: +```c +int ctor_mode = g_hakmem_env_snapshot_ctor_mode; // TLS read #1 +if (ctor_mode == 1) { + return g_hakmem_env_snapshot_gate != 0; // TLS read #2 (ctor path) +} +// Legacy path +if (g_hakmem_env_snapshot_gate == -1) { // TLS read #2 (legacy path) + // Lazy init with getenv() +} +``` + +**Per-operation cost** (when snapshot enabled): +- **5 calls** × **2 TLS reads** = **10 TLS reads/op** +- Plus: 5× branch on `ctor_mode`, 5× branch on snapshot enabled +- Actual measurement: ~7% perf samples + +**Per-operation cost** (when snapshot disabled - current default): +- **5 calls** × **2-3 TLS reads** = **10-15 TLS reads/op** +- Plus: lazy init checks, getenv() overhead on first call per thread + +### 1.3 Redundancy Analysis + +**Problem**: Each hot path independently checks `hakmem_env_snapshot_enabled()`: +- malloc C7 ULTRA: check at line 236 +- free C7 ULTRA: check at line 624 (same operation, different code path) +- free front V3: check at line 403 and 910 (same snapshot needed) +- Legacy fallback: check at line 28 (called from above 
paths) +- Metadata cache: check at line 64 (called from above paths) + +**Redundancy**: For a typical malloc+free pair: +- Current: 5+ `hakmem_env_snapshot_enabled()` calls = 10-15 TLS reads +- Optimal: 1 entry-point snapshot = 1-2 TLS reads + +**Gap**: 8-13 redundant TLS reads per operation + +--- + +## 2. Design Options + +### Option A: Entry-Point Snapshot Pass-Down (Recommended) + +**Concept**: Capture the existing `HakmemEnvSnapshot` pointer once at malloc/free entry, and pass it down. +This avoids creating a new TLS context and automatically stays compatible with `hakmem_env_snapshot_refresh_from_env()` (refresh updates the snapshot in-place). + +**Architecture**: +```c +// At wrapper entry (malloc/free): +const HakmemEnvSnapshot* env = + hakmem_env_snapshot_enabled() ? hakmem_env_snapshot() : NULL; + +// In malloc_tiny_fast(): +void* malloc_tiny_fast_with_env(size_t size, const HakmemEnvSnapshot* env) { + // Use env->tiny_c7_ultra_enabled instead of calling hakmem_env_snapshot_enabled() + if (env && class_idx == 7 && env->tiny_c7_ultra_enabled) { + // Direct check, no TLS read + } +} +``` + +**Pros**: +- **Minimal refactoring**: Add context parameter to existing functions +- **Type safety**: Compiler enforces context passing +- **Clear boundary**: ENV decisions made at entry, logic below is pure +- **Easy rollback**: Context parameter can be NULL (fallback to old path) + +**Cons**: +- **API threading**: Some hot helpers need an extra pointer parameter (`env`) or `_with_env` variants. +- **Register pressure**: Extra parameter may affect register allocation (verify via perf stat). + +**Risk**: LOW-MEDIUM (mechanical threading, rollback is simple) + +--- + +### Option B: TLS Cached Context (Alternative) + +**Concept**: Maintain thread-local ENV context, refresh on invalidation events. 
+ +**Architecture**: +```c +// Global TLS context (replaces per-call ENV checks) +static __thread FastLaneEnvCtx g_fastlane_ctx; +static __thread int g_fastlane_ctx_version = 0; +extern int g_env_snapshot_version; // Incremented on ENV change + +static inline const FastLaneEnvCtx* fastlane_ctx_get(void) { + if (__builtin_expect(g_fastlane_ctx_version != g_env_snapshot_version, 0)) { + // Refresh from snapshot (rare) + const HakmemEnvSnapshot* snap = hakmem_env_snapshot(); + g_fastlane_ctx.c7_ultra_enabled = snap->tiny_c7_ultra_enabled; + // ... copy fields + g_fastlane_ctx_version = g_env_snapshot_version; + } + return &g_fastlane_ctx; +} + +// In hot path: +const FastLaneEnvCtx* ctx = fastlane_ctx_get(); // 1 TLS read + 1 branch +if (class_idx == 7 && ctx->c7_ultra_enabled) { // Direct struct access +``` + +**Pros**: +- **No API changes**: Existing functions unchanged +- **Single TLS read**: Version check is fast (1 global read + 1 TLS read) +- **Automatic invalidation**: Version bump triggers refresh +- **Easy integration**: Drop-in replacement for `hakmem_env_snapshot_enabled()` + +**Cons**: +- **Version management**: Need global version counter + invalidation hooks +- **Stale data risk**: If version check is missed, stale context used +- **Init complexity**: Each thread needs lazy init + version tracking +- **Debugging**: Harder to trace when context was last refreshed + +**Risk**: MEDIUM (version invalidation must be bulletproof) + +--- + +### Option C: Init-Time Fixed (High Risk) + +**Concept**: Read ENV once at process init, freeze configuration for lifetime. 
+ +**Architecture**: +```c +// Global constants (set in constructor) +static bool g_c7_ultra_enabled_fixed; +static bool g_front_v3_enabled_fixed; + +__attribute__((constructor)) +static void fastlane_env_init(void) { + const HakmemEnvSnapshot* snap = hakmem_env_snapshot(); + g_c7_ultra_enabled_fixed = snap->tiny_c7_ultra_enabled; + g_front_v3_enabled_fixed = snap->tiny_front_v3_enabled; +} + +// Hot path: direct global read (no TLS) +if (class_idx == 7 && g_c7_ultra_enabled_fixed) { +``` + +**Pros**: +- **Zero TLS reads**: Direct global variable access +- **Maximum performance**: Compiler can constant-fold if known at link time +- **Simple implementation**: No lazy init, no version tracking + +**Cons**: +- **No runtime ENV changes**: ENV toggles require process restart +- **Breaks bench_profile**: `putenv()` in benchmarks will not work +- **No A/B testing**: Cannot toggle ENV for same-binary comparison +- **Box Theory violation**: No rollback/toggle capability + +**Risk**: HIGH (breaks existing workflow, violates Box Theory) + +--- + +### Recommended: **Option A (Entry-Point Snapshot Pass-Down)** + +**Reasoning**: +1. **Preserves Box Theory**: `env==NULL` → fallback to old path +2. **Clear separation**: ENV decisions at entry, pure logic below +3. **Benchmark compatible**: Works with `bench_profile` putenv + `hakmem_env_snapshot_refresh_from_env()` (snapshot updates in-place) +4. **Performance**: Removes repeated `hakmem_env_snapshot_enabled()` checks inside deep helpers + +**Trade-off acceptance**: +- Accept API changes (mechanical, low risk) +- Accept extra parameter (register pressure acceptable for hot path) +- Reject Option B's version management complexity +- Reject Option C's inflexibility + +--- + +## 3. 
Implementation Plan (Option A) + +### 3.1 Box Design + +**Box Name**: `EnvSnapshotConsolidationBox` (Phase 19-3) + +**Files**: +- Modified: `core/front/malloc_tiny_fast.h` + - Phase 19-3a: remove backwards `__builtin_expect(..., 0)` hints (DONE, +4.42% GO). + - Phase 19-3b/c: thread `const HakmemEnvSnapshot* env` down to eliminate repeated `hakmem_env_snapshot_enabled()` checks. +- Modified: `core/box/hak_wrappers.inc.h` + - Compute `env` once per wrapper entry and pass it to hot helpers (especially when `HAKMEM_FASTLANE_DIRECT=1`). +- Optional (only if separate rollback gate is desired): + - New: `core/box/env_snapshot_consolidation_env_box.{h,c}` (cached gate + refresh hook) + +**ENV Gate**: +- Base: `HAKMEM_ENV_SNAPSHOT=0/1` (Phase 4 E1 gate; promoted ON in presets) +- Optional: `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0/1` (default 0, opt-in) — gates only the “pass-down” refactor for A/B safety. + +**Rollback**: +- If using `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION`: set it to `0`. +- Otherwise: `HAKMEM_FASTLANE_DIRECT=0` falls back to the wrapper/FastLane path (still safe). + +### 3.2 API Design + +**Pass-down API (recommended)**: +```c +// Wrapper entry (malloc/free): read snapshot ONCE, pass down. +const HakmemEnvSnapshot* env = + hakmem_env_snapshot_enabled() ? hakmem_env_snapshot() : NULL; + +// Hot helpers accept optional env pointer (NULL-safe). +void* malloc_tiny_fast_with_env(size_t size, const HakmemEnvSnapshot* env); +int free_tiny_fast_with_env(void* ptr, const HakmemEnvSnapshot* env); +``` + +### 3.3 Migration Plan (Incremental) + +**Phase 19-3a (DONE)**: remove backwards UNLIKELY hints at the 5 hottest call sites in `core/front/malloc_tiny_fast.h`. +- `__builtin_expect(hakmem_env_snapshot_enabled(), 0)` → `hakmem_env_snapshot_enabled()` +- Measured: **GO (+4.42%)** + +**Phase 19-3b (NEXT)**: wrapper entry computes `const HakmemEnvSnapshot* env` once and passes it down. +- Wrapper entry: + - `const HakmemEnvSnapshot* env = hakmem_env_snapshot_enabled() ? 
hakmem_env_snapshot() : NULL;` +- Hot helpers: + - Replace repeated `hakmem_env_snapshot_enabled()` checks with `if (env) { ... } else { ... }` + - Keep `env==NULL` fallback path unchanged. +- Target: reduce repeated gate checks across hot helpers (especially inside `free_tiny_fast*`). + +**Phase 19-3c (OPTIONAL)**: propagate `env` into legacy fallback + metadata cache helpers to eliminate the remaining call sites: +- `core/box/tiny_legacy_fallback_box.h` +- `core/box/tiny_metadata_cache_hot_box.h` + +### 3.4 Files to Modify + +1. `core/front/malloc_tiny_fast.h` + - Phase 19-3b: add `_with_env` helper variants or thread `env` through internal helpers. + - Replace the remaining repeated `hakmem_env_snapshot_enabled()` call sites with `env`-based checks. +2. `core/box/hak_wrappers.inc.h` + - Compute `env` once per entry and pass it down (especially for `HAKMEM_FASTLANE_DIRECT` path). +3. (Optional) `core/box/tiny_legacy_fallback_box.h` + - Thread `env` into the legacy fallback helper to eliminate an extra gate check. +4. (Optional) `core/box/tiny_metadata_cache_hot_box.h` + - Same for metadata-cache effective checks. + +--- + +## 4. 
Safety / Box Theory + +### 4.1 Boundary Preservation + +**L0 (ENV gate)**: +- `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0` → `env==NULL` (or not passed down), fallback to existing path +- `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=1` → `env` is passed down, new path +- Compile-time flag: `#if HAKMEM_ENV_SNAPSHOT_CONSOLIDATION` for complete removal + +**L1 (Hot inline)**: +- No algorithmic changes, only ENV check consolidation +- Existing `malloc_tiny_fast()` / `free_tiny_fast()` logic unchanged +- `env` is read-only (const pointer) + +**L2 (Cold fallback)**: +- Cold paths unchanged (no context propagation needed) +- Legacy fallback accepts optional `env` + +**L3 (Stats/Observability)**: +- Add counter: `ENV_CONSOLIDATION_STAT_INC(enabled_calls)` +- Track: pass-down hits, fallback path usage +- Perf verification: reduced `hakmem_env_snapshot_enabled()` hot samples + +### 4.2 Fail-Fast + +**NULL env handling**: +- All functions accept `env==NULL` → fallback to existing path +- No crashes, no undefined behavior +- Debug builds: assert(`env!=NULL`) only if the pass-down gate is enabled (optional) + +**ENV invalidation**: +- Snapshot refresh is handled by the existing Phase 4 E1 mechanism: + - `bench_profile` uses `hakmem_env_snapshot_refresh_from_env()` after `putenv()` + - Snapshot updates in-place, so the `env` pointer remains valid + +### 4.3 Rollback + +**Runtime rollback**: +```sh +HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0 # Disable new path +``` + +**Compile-time rollback**: +```c +#if HAKMEM_ENV_SNAPSHOT_CONSOLIDATION + // New context path +#else + // Old scattered ENV checks (preserved) +#endif +``` + +**Gradual rollout**: +1. Phase 19-3a: UNLIKELY hint removal (DONE, GO) +2. Phase 19-3b: wrapper pass-down to hot helpers (measure) +3. Phase 19-3c: legacy + metadata pass-down (measure) +4. 
Graduate: add to `MIXED_TINYV3_C7_SAFE` preset if GO + +### 4.4 Observability + +**Stats counters** (debug builds): +```c +typedef struct { + uint64_t env_passdown_hits; // wrapper passed non-NULL env + uint64_t env_null_fallback; // env==NULL, used old path + uint64_t malloc_env_path; // malloc used env pass-down + uint64_t free_env_path; // free used env pass-down +} EnvConsolidationStats; +``` + +**Perf validation**: +- Before: `perf record` shows `hakmem_env_snapshot_enabled` at ~7% +- After: `hakmem_env_snapshot_enabled` should drop to <1% +- Expected: deep helpers stop calling `hakmem_env_snapshot_enabled()` repeatedly (only wrapper entry remains) + +**A/B testing**: +```sh +# Baseline (Phase 19-2 state) +HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0 ./bench_random_mixed_hakmem 200000000 400 1 + +# Optimized (Phase 19-3) +HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=1 ./bench_random_mixed_hakmem 200000000 400 1 +``` + +--- + +## 5. Expected Performance + +### 5.1 Instruction Reduction Estimate + +**Current overhead** (per malloc+free operation): +- 5 calls to `hakmem_env_snapshot_enabled()`: + - Each: gate loads + branches (and legacy lazy-init path on first call) + - Total: **~5 gate checks** per operation across hot helpers + +**After Phase 19-3**: +- 1 call at wrapper entry: + - `hakmem_env_snapshot_enabled()` once + - `hakmem_env_snapshot()` once (when enabled) + - Deep helpers use `if (env)` + direct field reads (no further gate checks) + +**Reduction**: +- **Gate checks**: ~5 → 1 (wrapper entry only) +- **Branches**: reduce repeated gate branches inside hot helpers +- **Instructions**: target ~-10 instructions/op (order-of-magnitude) + +### 5.2 Branch Reduction Estimate + +**Current branching**: +- `hakmem_env_snapshot_enabled()`: 2 branches (ctor_mode check + gate check) +- Called 5 times = **10 branches/op** + +**After Phase 19-3**: +- Gate check is done once at wrapper entry; deep helpers reuse `env` pointer. 
+ +**Reduction**: 10 → 4 = **-6 branches/op** (conservative estimate: -4 branches/op accounting for overlap) + +### 5.3 Throughput Estimate + +**Phase 19-1 Design Doc** (Candidate B) estimates: +- Instructions: -10.0/op +- Branches: -4.0/op +- Throughput: **+5-8%** + +**Phase 19-3 targets** (aligned with Candidate B): +- Instructions: **-10.0/op** ✓ +- Branches: **-4.0/op** ✓ +- Throughput: **+5-8%** (expected on top of Phase 19-2 baseline) + +**Validation criteria**: +- Perf stat shows instruction count reduction: ≥8.0/op (80% of estimate) +- Perf stat shows branch count reduction: ≥3.0/op (75% of estimate) +- Throughput improvement: ≥4.0% (80% of lower bound estimate) + +--- + +## 6. Risk Assessment + +### 6.1 Technical Risks + +**MEDIUM: API Signature Changes** +- Risk: Adding context parameter changes function signatures +- Mitigation: Keep old signatures, add `_with_env` variants +- Rollback: NULL context → fallback to old implementation +- Timeline: 1 phase at a time (19-3a → 19-3b → 19-3c) + +**MEDIUM: ENV Invalidation** +- Risk: Runtime ENV changes (bench_profile putenv) may not refresh context +- Mitigation: Phase 19-3 inherits Phase 4 E1 refresh mechanism +- Limitation: Same as current ENV snapshot (requires explicit refresh) +- Future: Add version tracking (Option B) if runtime toggle needed + +**LOW: Register Pressure** +- Risk: Extra context parameter may increase register spills +- Mitigation: Context is const pointer (register-friendly) +- Validation: Check perf stat for stall increases +- Rollback: Disable via ENV if regression detected + +**LOW: Lazy Init Overhead** +- Risk: First call to `fastlane_env_ctx()` adds init cost +- Mitigation: One-time per thread (amortized over millions of ops) +- Measurement: Should be <0.1% overhead (verified via perf) + +### 6.2 Performance Risks + +**Risk: Overhead greater than savings** +- Scenario: Context struct access slower than optimized TLS reads +- Likelihood: LOW (struct access is 1-2 instructions, TLS read is 
5-10) +- Detection: Perf stat will show instruction count increase +- Rollback: ENV=0 immediately reverts + +**Risk: Branch predictor thrashing** +- Scenario: New branch patterns confuse CPU predictor +- Likelihood: LOW (reducing branches helps predictor) +- Detection: Branch miss rate increases in perf stat +- Rollback: ENV=0 immediately reverts + +### 6.3 Integration Risks + +**Risk: Breaks bench_profile ENV refresh** +- Scenario: Context cached before putenv(), stale values used +- Likelihood: MEDIUM (same issue as Phase 4 E1) +- Mitigation: Follow Phase 4 E1 pattern (explicit refresh hook) +- Validation: Run bench suite with ENV toggles + +**Risk: Conflicts with FastLane Direct (Phase 19-2)** +- Scenario: Phase 19-2 removed wrapper, context injection point unclear +- Likelihood: LOW (context added at new entry point) +- Mitigation: Phase 19-3 builds on Phase 19-2 baseline +- Validation: A/B test with FASTLANE_DIRECT=1 + ENV_CONSOLIDATION=1 + +--- + +## 7. Validation Checklist + +### 7.1 Pre-Implementation + +- [ ] Verify Phase 4 E1 (ENV snapshot) is stable and working +- [ ] Verify Phase 19-2 (FASTLANE_DIRECT) is stable baseline +- [ ] Document current `hakmem_env_snapshot_enabled()` call sites (5 locations) +- [ ] Create test plan for ENV refresh (bench_profile compatibility) + +### 7.2 Implementation + +- [ ] Implement `fastlane_env_ctx_box.h` (context struct + getter) +- [ ] Add `malloc_tiny_fast_ctx()` variant (Phase 19-3a) +- [ ] Add `free_tiny_fast_ctx()` variant (Phase 19-3b) +- [ ] Propagate context to `tiny_legacy_fallback_box.h` (Phase 19-3c) +- [ ] Add ENV gate `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0/1` +- [ ] Add stats counters (debug builds) + +### 7.3 Testing (Per Phase) + +**Phase 19-3a (malloc path)**: +- [ ] Correctness: Run `make test` suite (all tests pass) +- [ ] Perf stat: Measure instruction/branch reduction (ENV=0 vs ENV=1) +- [ ] Perf record: Verify `hakmem_env_snapshot_enabled` samples drop +- [ ] Benchmark: Mixed 10-run (expect +2-3% from 
malloc path alone) + +**Phase 19-3b (free path)**: +- [ ] Correctness: Run `make test` + Larson (all tests pass) +- [ ] Perf stat: Measure cumulative reduction (vs baseline) +- [ ] Perf record: Verify further reduction in ENV check samples +- [ ] Benchmark: Mixed 10-run (expect +3-5% cumulative) + +**Phase 19-3c (legacy + metadata)**: +- [ ] Correctness: Full test suite including multithreaded +- [ ] Perf stat: Verify -10.0 instr/op, -4.0 branches/op (goal) +- [ ] Perf record: `hakmem_env_snapshot_enabled` <1% samples +- [ ] Benchmark: Mixed 10-run (expect +5-8% cumulative) + +### 7.4 A/B Test (Final Validation) + +**Benchmark suite**: +```sh +# Baseline (Phase 19-2) +HAKMEM_PROFILE=MIXED_TINYV3_C7_SAFE HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0 \ + scripts/run_mixed_10_cleanenv.sh + +# Optimized (Phase 19-3) +HAKMEM_PROFILE=MIXED_TINYV3_C7_SAFE HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=1 \ + scripts/run_mixed_10_cleanenv.sh +``` + +**GO/NO-GO criteria**: +- **GO**: Mean throughput +5.0% or higher (within ±20% of +5-8% estimate) +- **NEUTRAL**: +2.0% to +5.0% → keep as research box, preset-only promotion +- **NO-GO**: <+2.0% or regression → revert, analyze perf data + +**Perf stat validation**: +```sh +perf stat -e cycles,instructions,branches,branch-misses,L1-icache-load-misses \ + -- ./bench_random_mixed_hakmem 200000000 400 1 +``` + +**Expected deltas**: +- Instructions/op: -8.0 to -12.0 (target: -10.0) +- Branches/op: -3.0 to -5.0 (target: -4.0) +- Branch-miss%: unchanged or slightly better (fewer branches) +- Throughput: +4.0% to +10.0% (target: +5-8%) + +--- + +## 8. 
Rollout Plan + +### 8.1 Phase 19-3a: malloc Path (Week 1) + +**Scope**: Add context to malloc hot path +- Modify `malloc_tiny_fast()` to accept context +- Update C7 ULTRA check (line 236) +- Add `fastlane_env_ctx_box.h` +- Update wrapper.c `malloc()` + +**Timeline**: 4-6 hours implementation + 2 hours testing +**Risk**: LOW (isolated to alloc path) +**Rollback**: `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0` + +### 8.2 Phase 19-3b: free Path (Week 1) + +**Scope**: Add context to free hot path +- Modify `free_tiny_fast()` to accept context +- Update C7 ULTRA checks (lines 624, 830) +- Update front V3 checks (lines 403, 910) +- Update wrapper.c `free()` + +**Timeline**: 4-6 hours implementation + 2 hours testing +**Risk**: LOW-MEDIUM (more call sites than malloc) +**Rollback**: `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0` + +### 8.3 Phase 19-3c: Legacy + Metadata (Week 2) + +**Scope**: Propagate context to helper boxes +- Update `tiny_legacy_fallback_box.h` (line 28) +- Update `tiny_metadata_cache_hot_box.h` (line 64) +- Add context parameter to helper functions + +**Timeline**: 3-4 hours implementation + 2 hours testing +**Risk**: MEDIUM (touches multiple boxes) +**Rollback**: `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0` + +### 8.4 Graduate (Week 2-3) + +**Promotion criteria**: +- All phases pass A/B testing (GO verdict) +- Cumulative throughput gain ≥+5.0% +- No correctness regressions (all tests pass) +- Perf validation confirms instruction reduction + +**Promotion actions**: +1. Add to `MIXED_TINYV3_C7_SAFE` preset: `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=1` +2. Document in optimization roadmap +3. Update Box Theory index +4. Keep ENV default=0 (opt-in) until production validation + +**Rollback strategy**: +- Preset level: Remove from preset, keep code +- Code level: `#if HAKMEM_ENV_SNAPSHOT_CONSOLIDATION` → disable at compile time +- Emergency: `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0` global default + +--- + +## 9. 
Future Optimization Opportunities + +### 9.1 Version-Based Invalidation (Option B) + +If runtime ENV changes become important: +- Add global `g_env_snapshot_version` counter +- Increment on ENV change (bench_profile, runtime toggle) +- Each thread checks version, refreshes context if stale +- Overhead: +1 global read per operation (still net win vs 10 TLS reads) + +### 9.2 Route Table Consolidation + +Extend context to include pre-computed routes: +```c +typedef struct { + bool c7_ultra_enabled; + bool front_v3_enabled; + bool metadata_cache_eff; + SmallRouteKind route_kind[8]; // Pre-computed per class +} FastLaneEnvCtx; +``` + +**Benefit**: Eliminate `tiny_static_route_get_kind_fast()` calls +**Impact**: Additional -3-4 instructions/op, -1-2 branches/op + +### 9.3 Constructor Init (Option C Hybrid) + +For production builds (no bench_profile): +- Use `__attribute__((constructor))` to init context at startup +- Eliminate lazy init check (g_init always 1) +- Benefit: -1 branch per operation (init check) +- Limitation: No runtime ENV changes (production-only optimization) + +--- + +## 10. 
Comparison to Phase 4 E1 + +### Phase 4 E1 (ENV Snapshot) + +**What it did**: +- Consolidated 3 ENV reads (`tiny_c7_ultra_enabled_env`, `tiny_front_v3_enabled`, `tiny_metadata_cache_enabled`) into 1 snapshot struct +- Result: +3.92% throughput (Mixed) +- Status: Promoted in presets (global default still OFF) + +**Limitation**: +- Still calls `hakmem_env_snapshot_enabled()` 5 times per operation +- Each call: gate loads + branches +- ENV check overhead remains: ~7% perf samples + +### Phase 19-3 (ENV Snapshot Consolidation) + +**What it does**: +- Eliminates repeated `hakmem_env_snapshot_enabled()` calls inside deep helpers: + - wrapper entry does the gate check once and passes `const HakmemEnvSnapshot* env` down + - deep helpers use `if (env)` + direct field reads + +**Benefit over Phase 4 E1**: +- Phase 4 E1: Consolidated ENV **values** (3 gates → 1 snapshot) +- Phase 19-3: Consolidates ENV **checks** (5 snapshot calls → 1 context call) +- Complementary: Phase 19-3 builds on Phase 4 E1 infrastructure + +**Combined impact**: +- Phase 4 E1: +3.92% (ENV value consolidation) +- Phase 19-3: +5-8% (ENV check consolidation) +- Not additive (overlap), but Phase 19-3 should subsume Phase 4 E1 gains + +--- + +## 11. 
Conclusion + +Phase 19-3 (ENV Snapshot Consolidation) targets a clear, measurable overhead: +- **Current**: repeated `hakmem_env_snapshot_enabled()` gate checks scattered across hot helpers +- **After**: wrapper entry gate check once + `env` pass-down +- **Reduction**: fewer gate branches + fewer loads + less code/layout churn + +**Expected outcome**: +5-8% throughput (aligned with Phase 19-1 Design Candidate B estimate) + +**Recommended approach**: **Option A (Entry-Point Snapshot)** +- Clear API, type-safe context passing +- Preserves Box Theory (NULL context → fallback) +- Gradual migration (3 sub-phases) +- Benchmark-compatible (bench_profile refresh works) + +**Risk**: MEDIUM (API changes, ENV invalidation handling) +**Effort**: 8-12 hours (implementation) + 6-8 hours (testing) +**Timeline**: 2 weeks (3 sub-phases + A/B validation) + +**Next steps**: +1. Phase 19-3a done (UNLIKELY hint removal, GO) +2. Implement Phase 19-3b (wrapper env pass-down to hot helpers) +3. A/B test (expect +1-3% incremental on top of 19-3a) +4. Implement Phase 19-3c (legacy + metadata pass-down) +5. Final A/B test +6. Graduate if GO (add to MIXED_TINYV3_C7_SAFE preset) + +This positions Phase 19-3 as a **high-ROI, medium-risk** optimization with clear measurement criteria and rollback strategy. diff --git a/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_3_REVISED_INSTRUCTIONS.md b/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_3_REVISED_INSTRUCTIONS.md new file mode 100644 index 00000000..0d750491 --- /dev/null +++ b/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_3_REVISED_INSTRUCTIONS.md @@ -0,0 +1,406 @@ +# Phase 19-3: ENV Snapshot Consolidation — Revised Implementation Instructions + +## 0. Design Corrections (from initial analysis) + +**Key mistakes in initial design**: + +1. **Wrong**: Create new TLS ctx + **Right**: Use existing `HakmemEnvSnapshot*` from `core/box/hakmem_env_snapshot_box.h` + +2. 
**Wrong**: Option A with static `__thread g_init` (doesn't respect `hakmem_env_snapshot_refresh_from_env()`) + **Right**: Pass `const HakmemEnvSnapshot* env` down the call stack (refresh works automatically) + +3. **Wrong**: Modify `core/wrapper.c` + **Right**: Modify `core/box/hak_wrappers.inc.h` (actual integration point) + +4. **Wrong**: Keep `__builtin_expect(hakmem_env_snapshot_enabled(), 0)` + **Right**: Remove UNLIKELY hint (Phase 19-1 trap: snapshot is now ON by default, hint is backwards) + +--- + +## 1. Strategy: Simplest Path (3 micro-phases) + +### Phase 19-3a: Remove UNLIKELY hint from ENV checks + +**Problem**: `__builtin_expect(hakmem_env_snapshot_enabled(), 0)` appears in: +- `core/front/malloc_tiny_fast.h:236` (C7 ULTRA alloc) +- `core/front/malloc_tiny_fast.h:403` (Front V3 free hotcold) +- `core/front/malloc_tiny_fast.h:624` (C7 ULTRA free) +- `core/front/malloc_tiny_fast.h:830` (C7 ULTRA free larson) +- `core/front/malloc_tiny_fast.h:910` (Front V3 free larson) + +**Issue**: Snapshot is now ON in presets → UNLIKELY hint is backwards (same trap as Phase 19-1 NO-GO) + +**Fix**: Replace `__builtin_expect(hakmem_env_snapshot_enabled(), 0)` with `hakmem_env_snapshot_enabled()` + +**Expected**: +0-2% from correct branch prediction + +**A/B Test**: +```sh +# Before (with UNLIKELY hint) +scripts/run_mixed_10_cleanenv.sh + +# After (without hint) +scripts/run_mixed_10_cleanenv.sh +``` + +**GO Criteria**: No regression (±1%) + +--- + +### Phase 19-3b: Pass snapshot down from wrapper entry + +**Current state**: Each callee calls `hakmem_env_snapshot_enabled()` independently +- 5 calls × 2 TLS reads each = **10 TLS reads/op** + +**Proposed**: +```c +// core/box/hak_wrappers.inc.h (malloc wrapper) +void* malloc(size_t size) { + // Entry-point: read ENV snapshot once + const HakmemEnvSnapshot* env = hakmem_env_snapshot_enabled() ? 
hakmem_env_snapshot() : NULL; + + if (fastlane_direct_enabled()) { + // NOTE: Keep FastLane safety rule: do not use fast paths before init. + if (g_initialized) { + void* ptr = malloc_tiny_fast_with_env(size, env); // NEW: pass env + if (ptr != NULL) return ptr; + // IMPORTANT: malloc miss must fall through to existing wrapper path + // (do NOT call malloc_cold() directly; it expects lock_depth to be incremented). + } + } + + void* ptr = front_fastlane_try_malloc(size); + if (__builtin_expect(ptr != NULL, 1)) return ptr; + // Not handled → continue to existing wrapper path below (wrap_shape / lock_depth / init waits / malloc_cold(...)). + // (Do not duplicate the full wrapper here; only the env pass-down is new.) + /* existing wrapper path */ +} + +// core/box/hak_wrappers.inc.h (free wrapper) +void free(void* ptr) { + if (__builtin_expect(!ptr, 0)) return; + + // Entry-point: read ENV snapshot once + const HakmemEnvSnapshot* env = hakmem_env_snapshot_enabled() ? hakmem_env_snapshot() : NULL; + + if (fastlane_direct_enabled()) { + // NOTE: Keep FastLane safety rule: do not use fast paths before init. + if (g_initialized) { + if (free_tiny_fast_with_env(ptr, env)) return; // NEW: pass env + free_cold(ptr, wrapper_env_cfg_fast()); + return; + } + } + + if (front_fastlane_try_free(ptr)) return; + free_cold(ptr, wrapper_env_cfg_fast()); +} +``` + +**Propagate env down**: +```c +// core/front/malloc_tiny_fast.h (example) +static inline void* malloc_tiny_fast_with_env(size_t size, const HakmemEnvSnapshot* env) { + // ... existing logic ... + + // OLD: if (__builtin_expect(hakmem_env_snapshot_enabled(), 0)) { + // NEW: if (env) { + if (env) { + // Use snapshot from env (NO additional TLS read) + if (tiny_c7_ultra_enabled_cached(env)) { // NEW: use cached check + // C7 ULTRA path + } + } + + // ... rest of logic ... 
+} +``` + +**Helper functions** (optional, if needed): +```c +// core/box/hakmem_env_snapshot_box.h +static inline bool tiny_c7_ultra_enabled_cached(const HakmemEnvSnapshot* env) { + if (!env) return 0; + // Read from snapshot (already in cache, no TLS read) + return env->tiny_c7_ultra_enabled; +} +``` + +**Expected**: +- TLS reads: 10/op → 2/op (just wrapper entry check) +- Instructions: -8.0 to -10.0/op +- Throughput: **+3-5%** + +**ENV Gate** (optional, if conservative rollout needed): +```c +// core/box/env_snapshot_consolidation_env_box.h (NEW, if needed) +extern _Atomic int g_env_snapshot_consolidation_enabled; + +static inline bool env_snapshot_consolidation_enabled(void) { + return atomic_load_explicit(&g_env_snapshot_consolidation_enabled, memory_order_relaxed); +} +``` + +**Alternative** (no new ENV gate): +- Just always pass env down when HAKMEM_FASTLANE_DIRECT=1 +- Rely on existing FASTLANE_DIRECT gate for rollback + +--- + +### Phase 19-3c: Propagate to all callees (if 19-3b is GO) + +**Targets**: +- `tiny_legacy_fallback_box.h:28` (ENV snapshot check) +- `tiny_metadata_cache_hot_box.h:64` (metadata cache check) +- Other helper functions that currently call `hakmem_env_snapshot_enabled()` + +**Expected cumulative**: **+5-8%** + +--- + +## 2. 
Box Theory Compliance + +**Boundary**: +- L0 (ENV gate): Optional `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0/1` OR reuse `HAKMEM_FASTLANE_DIRECT=1` +- L1 (Hot inline): `const HakmemEnvSnapshot* env` parameter (NULL-safe) +- L2 (Fallback): If `env == NULL`, use old path (call `hakmem_env_snapshot_enabled()`) + +**Rollback**: +- Runtime: `HAKMEM_ENV_SNAPSHOT_CONSOLIDATION=0` (or `HAKMEM_FASTLANE_DIRECT=0`) +- Compile-time: `#if` guards on env parameter propagation +- NULL-safe: `if (!env) { /* old path */ }` + +**Observability**: +- Perf stat: TLS read count reduction +- `hakmem_env_snapshot_enabled()` samples should drop from 7% → <1% + +**Refresh compatibility**: +- ✅ Works: Wrapper reads fresh snapshot on each operation +- ✅ `bench_profile` putenv() + refresh works (no static cache) +- ✅ No version tracking needed + +--- + +## 3. Implementation Steps + +### Step 1: Phase 19-3a (UNLIKELY hint removal) + +**Files to modify**: +```sh +# Find all instances +grep -n "__builtin_expect(hakmem_env_snapshot_enabled(), 0)" core/front/malloc_tiny_fast.h + +# Replace with +# hakmem_env_snapshot_enabled() +``` + +**Lines to change**: +- `malloc_tiny_fast.h:236` +- `malloc_tiny_fast.h:403` +- `malloc_tiny_fast.h:624` +- `malloc_tiny_fast.h:830` +- `malloc_tiny_fast.h:910` + +**A/B Test**: +```sh +# Baseline (before) +scripts/run_mixed_10_cleanenv.sh + +# Optimized (after) +scripts/run_mixed_10_cleanenv.sh +``` + +**GO Criteria**: No regression (±1%) + +--- + +### Step 2: Phase 19-3b (Pass env from wrapper) + +**2.1 Update wrappers** (`core/box/hak_wrappers.inc.h`) + +Find malloc wrapper (around line 674): +```c +void* malloc(size_t size) { + // ADD: Read snapshot once at entry + const HakmemEnvSnapshot* env = hakmem_env_snapshot_enabled() ? hakmem_env_snapshot() : NULL; + + // ... rest of existing logic ... 
+ // Replace malloc_tiny_fast(size) with malloc_tiny_fast_with_env(size, env) +} +``` + +Find free wrapper (around line 188): +```c +void free(void* ptr) { + // ADD: Read snapshot once at entry + const HakmemEnvSnapshot* env = hakmem_env_snapshot_enabled() ? hakmem_env_snapshot() : NULL; + + // ... rest of existing logic ... + // Replace free_tiny_fast(ptr) with free_tiny_fast_with_env(ptr, env) +} +``` + +**2.2 Add _with_env variants** (`core/front/malloc_tiny_fast.h`) + +Option A: Rename existing functions (breaking change, requires full propagation) +Option B: Add new `_with_env` variants, keep old functions as wrappers (safer) + +**Recommended: Option B** (incremental migration) + +```c +// malloc_tiny_fast.h (add new variant) +static inline void* malloc_tiny_fast_with_env(size_t size, const HakmemEnvSnapshot* env) { + // Replace hakmem_env_snapshot_enabled() checks with (env && ...) + // ... existing logic ... +} + +// Keep old function as wrapper (for gradual migration) +static inline void* malloc_tiny_fast(size_t size) { + const HakmemEnvSnapshot* env = hakmem_env_snapshot_enabled() ? 
hakmem_env_snapshot() : NULL; + return malloc_tiny_fast_with_env(size, env); +} +``` + +**2.3 Update ENV checks inside _with_env** + +Replace: +```c +if (__builtin_expect(hakmem_env_snapshot_enabled(), 0)) { + // C7 ULTRA check +} +``` + +With: +```c +if (env) { + // Use env snapshot (NO TLS read) + if (env->tiny_c7_ultra_enabled) { + // C7 ULTRA path + } +} +``` + +**2.4 Build & Verify** + +```sh +make clean && make -j bench_random_mixed_hakmem +# Should compile without errors +``` + +**2.5 A/B Test** + +```sh +# Baseline (19-3a only) +scripts/run_mixed_10_cleanenv.sh + +# Optimized (19-3b with env passing) +scripts/run_mixed_10_cleanenv.sh +``` + +**GO Criteria**: +3% minimum + +--- + +### Step 3: Phase 19-3c (Propagate to helpers) + +**If 19-3b is GO**, propagate env to: +- `tiny_legacy_fallback_box.h` +- `tiny_metadata_cache_hot_box.h` +- Other helper functions + +**Expected cumulative**: +5-8% + +--- + +## 4. Safety Checklist + +- [ ] NULL-safe: All `if (env)` checks handle NULL correctly +- [ ] Refresh works: Wrapper reads fresh snapshot each call +- [ ] Rollback: Can disable via ENV or compile flag +- [ ] No static cache: No version tracking needed +- [ ] Existing code paths preserved: Old functions still work + +--- + +## 5. Expected Performance (Cumulative) + +| Phase | TLS reads/op | Instructions/op | Throughput | +|-------|--------------|-----------------|------------| +| Baseline (19-1b) | 10 | 169.45 | 52.06M ops/s | +| 19-3a (hint fix) | 10 | ~169 | +0-2% | +| 19-3b (env pass) | 2 | ~159-161 | +3-5% | +| 19-3c (helpers) | 2 | ~157-159 | +5-8% | + +**Target after Phase 19-3c**: +- Throughput: **54.7-56.2M ops/s** (vs 52.06M baseline) +- Instructions/op: **157-159** (vs 169.45 baseline) +- Gap to libc (135.92): **+15-17%** (vs +24.6% before 19-3) + +--- + +## 6. 
Perf Validation + +**Before Phase 19-3**: +```sh +perf stat -e cycles,instructions -- ./bench_random_mixed_hakmem 200000000 400 1 +# Instructions: ~169.45/op +``` + +**After Phase 19-3c**: +```sh +perf stat -e cycles,instructions -- ./bench_random_mixed_hakmem 200000000 400 1 +# Instructions: ~157-159/op (target: -10.0 reduction) +``` + +**Perf record validation**: +```sh +perf record -g -- ./bench_random_mixed_hakmem 50000000 400 1 +perf report --stdio --no-children | grep hakmem_env_snapshot_enabled +# Should show <1% samples (down from 7%) +``` + +--- + +## 7. Risk Assessment + +**Phase 19-3a**: **LOW** +- Simple search-replace +- No algorithmic changes +- Worst case: ±0% (no-op) + +**Phase 19-3b**: **MEDIUM** +- API signature changes (add env parameter) +- Mechanical changes (low semantic risk) +- Rollback: Keep old functions as wrappers + +**Phase 19-3c**: **MEDIUM** +- Wider propagation +- More functions to update +- Same rollback strategy + +--- + +## 8. Timeline + +- **Phase 19-3a**: 1-2 hours (search-replace + A/B) +- **Phase 19-3b**: 4-6 hours (wrapper + _with_env variants + A/B) +- **Phase 19-3c**: 3-4 hours (helper propagation + A/B) + +**Total**: 8-12 hours (1-2 days part-time) + +--- + +## 9. Next Steps After This Phase + +If Phase 19-3 achieves +5-8%: +- Current: 52.06M ops/s +- After 19-3: ~54.7-56.2M ops/s +- Gap to libc (79.72M): ~+42-46% + +**Remaining candidates** (from Phase 19-1 Design): +- Candidate C: Stats removal (+3-5%, already done in BENCH_MINIMAL) +- Candidate D: Header inline (+2-3%) +- Candidate E: Route fast path (+2-3%) + +**Or**: Re-profile with `perf record` to find next hot path (self% ≥ 5%)