diff --git a/CURRENT_TASK.md b/CURRENT_TASK.md index 31d3522a..f91fbd0e 100644 --- a/CURRENT_TASK.md +++ b/CURRENT_TASK.md @@ -49,6 +49,39 @@ --- +## 更新メモ(2025-12-15 Phase 19-6 / 19-3c Alloc ENV-SNAPSHOT-PASSDOWN Attempt) + +### Phase 19-6 (aka 19-3c) Alloc ENV-SNAPSHOT-PASSDOWN: Symmetry attempt — ❌ NO-GO + +**Goal**: Alloc 側も free 側(19-3b)と同様に、既に読んでいる `HakmemEnvSnapshot` を下流へ pass-down して +`hakmem_env_snapshot_enabled()` の重複 work を削る。 + +**Result (Mixed 10-run)**: +- Mean: **-0.97%** +- Median: **-1.05%** + +**Decision**: +- NO-GO(revert) + +**Ref**: +- `docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_6_ALLOC_SNAPSHOT_PASSDOWN_AB_TEST_RESULTS.md` + +### Phase 19-6B Free Static Route for Free: bypass `small_policy_v7_snapshot()` — ✅ GO (+1.43%) + +**Change**: +- `free_tiny_fast_hot()` / `free_tiny_fast()`: + - `tiny_static_route_ready_fast()` → `tiny_static_route_get_kind_fast(class_idx)` + - else fallback: `small_policy_v7_snapshot()->route_kind[class_idx]` + +**A/B (Mixed 10-run)**: +- Mean: **+1.43%** +- Median: **+1.37%** + +**Ref**: +- `docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_6B_FREE_STATIC_ROUTE_FOR_FREE_AB_TEST_RESULTS.md` + +--- + ## 更新メモ(2025-12-15 Phase 19-3b ENV-SNAPSHOT-PASSDOWN) ### Phase 19-3b ENV-SNAPSHOT-PASSDOWN: Consolidate ENV snapshot reads across hot helpers — ✅ GO (+2.76%) @@ -69,7 +102,7 @@ - Capture once → pass-down keeps the “ENV decision” at a single boundary per operation and removes duplicated work. **Next**: -- Phase 19-3c (optional): if needed, also pass `env` into alloc-side call chains to remove the remaining `malloc_tiny_fast_for_class()` gate. 
+- Phase 19-6: alloc-side pass-down は NO-GO(上記 Ref)。次は “duplicate route lookup / dual policy snapshot” 系の冗長排除へ。 --- diff --git a/core/front/malloc_tiny_fast.h b/core/front/malloc_tiny_fast.h index 00d10303..cb659cb7 100644 --- a/core/front/malloc_tiny_fast.h +++ b/core/front/malloc_tiny_fast.h @@ -379,13 +379,12 @@ static inline void* malloc_tiny_fast(size_t size) { // Phase FREE-TINY-FAST-HOTCOLD-OPT-1: Hot/Cold split helpers // ============================================================================ -// Cold path: Cross-thread free, TinyHeap routes, and legacy fallback -// (noinline,cold to keep hot path small and I-cache clean) -__attribute__((noinline,cold)) -static int free_tiny_fast_cold(void* ptr, void* base, int class_idx, const HakmemEnvSnapshot* env) +// Phase 19-6C Helper: Compute tiny_route and use_tiny_heap once, reuse in hot/cold/legacy paths +static inline void free_tiny_fast_compute_route_and_heap( + int class_idx, + tiny_route_kind_t* out_route, + int* out_use_tiny_heap) { - FREE_TINY_FAST_HOTCOLD_STAT_INC(cold_hit); - // Phase 3 D1: Free path route cache (eliminate tiny_route_for_class overhead) tiny_route_kind_t route; if (__builtin_expect(tiny_free_static_route_enabled(), 0)) { @@ -399,7 +398,20 @@ static int free_tiny_fast_cold(void* ptr, void* base, int class_idx, const Hakme // Standard path route = tiny_route_for_class((uint8_t)class_idx); } - const int use_tiny_heap = tiny_route_is_heap_kind(route); + *out_route = route; + *out_use_tiny_heap = tiny_route_is_heap_kind(route); +} + +// ============================================================================ + +// Cold path: Cross-thread free, TinyHeap routes, and legacy fallback +// (noinline,cold to keep hot path small and I-cache clean) +// Phase 19-6C: Accept pre-computed route + use_tiny_heap to eliminate redundant computation +__attribute__((noinline,cold)) +static int free_tiny_fast_cold(void* ptr, void* base, int class_idx, const HakmemEnvSnapshot* env, + tiny_route_kind_t 
route, int use_tiny_heap) +{ + FREE_TINY_FAST_HOTCOLD_STAT_INC(cold_hit); // TWO-SPEED: SuperSlab registration check is DEBUG-ONLY to keep HOT PATH fast. // In Release builds, we trust header magic (0xA0) as sufficient validation. @@ -704,7 +716,11 @@ static inline int free_tiny_fast_hot(void* ptr) { cold_path: // Delegate to cold path for cross-thread, TinyHeap, and legacy handling - return free_tiny_fast_cold(ptr, base, class_idx, env); + // Phase 19-6C: Compute route once, pass to cold path to avoid redundant lookup + tiny_route_kind_t route_for_cold; + int use_tiny_heap_for_cold; + free_tiny_fast_compute_route_and_heap(class_idx, &route_for_cold, &use_tiny_heap_for_cold); + return free_tiny_fast_cold(ptr, base, class_idx, env, route_for_cold, use_tiny_heap_for_cold); #else // No header mode - fall back to normal free @@ -879,20 +895,10 @@ static inline int free_tiny_fast(void* ptr) { legacy_fallback: // LEGACY fallback path - // Phase 3 D1: Free path route cache (eliminate tiny_route_for_class overhead) + // Phase 19-6C: Compute route once using helper (avoid redundant tiny_route_for_class) tiny_route_kind_t route; - if (__builtin_expect(tiny_free_static_route_enabled(), 0)) { - // Use cached route (bypasses tiny_route_for_class()) - route = g_tiny_route_class[(unsigned)class_idx & 7u]; - if (__builtin_expect(route == TINY_ROUTE_LEGACY && !g_tiny_route_snapshot_done, 0)) { - // Fallback if uninitialized - route = tiny_route_for_class((uint8_t)class_idx); - } - } else { - // Standard path - route = tiny_route_for_class((uint8_t)class_idx); - } - const int use_tiny_heap = tiny_route_is_heap_kind(route); + int use_tiny_heap; + free_tiny_fast_compute_route_and_heap(class_idx, &route, &use_tiny_heap); // TWO-SPEED: SuperSlab registration check is DEBUG-ONLY to keep HOT PATH fast. // In Release builds, we trust header magic (0xA0) as sufficient validation. 
diff --git a/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_6B_FREE_STATIC_ROUTE_FOR_FREE_AB_TEST_RESULTS.md b/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_6B_FREE_STATIC_ROUTE_FOR_FREE_AB_TEST_RESULTS.md new file mode 100644 index 00000000..5a721b28 --- /dev/null +++ b/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_6B_FREE_STATIC_ROUTE_FOR_FREE_AB_TEST_RESULTS.md @@ -0,0 +1,36 @@ +## Phase 19-6B — Free path uses static route table (bypass policy snapshot) — ✅ GO + +### Goal + +In `free_tiny_fast_hot()` / `free_tiny_fast()` we were calling `small_policy_v7_snapshot()` to obtain `route_kind[class_idx]`. + +This change uses the already-existing Phase 3 C3 table: +- If `tiny_static_route_ready_fast()` → use `tiny_static_route_get_kind_fast(class_idx)` +- Else → fallback to `small_policy_v7_snapshot()->route_kind[class_idx]` (learner interlock preserved via `ready_fast`) + +### Code change + +- File: `core/front/malloc_tiny_fast.h` +- Replace `small_policy_v7_snapshot()` route fetch with a `tiny_static_route_ready_fast()` fast path in: + - `free_tiny_fast_hot()` + - `free_tiny_fast()` + +### A/B Test + +Method: +- `scripts/run_mixed_10_cleanenv.sh` (profile `MIXED_TINYV3_C7_SAFE`) +- `iter=20,000,000`, `ws=400`, `runs=10` + +Results: + +| Metric | Baseline | Optimized | Delta | +|---|---:|---:|---:| +| Mean | 55.605M ops/s | 56.400M ops/s | +1.43% | +| Median | 55.639M ops/s | 56.400M ops/s | +1.37% | + +### Decision + +- ✅ GO (>= +1.0% threshold) +- Safe default behavior: + - When static route is not ready (disabled/uninit/learner), behavior is unchanged (policy snapshot path). 
+ diff --git a/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_6C_DUPLICATE_ROUTE_DEDUP_DESIGN.md b/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_6C_DUPLICATE_ROUTE_DEDUP_DESIGN.md new file mode 100644 index 00000000..a98cac70 --- /dev/null +++ b/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_6C_DUPLICATE_ROUTE_DEDUP_DESIGN.md @@ -0,0 +1,258 @@ +# Phase 19-6C: Eliminate Duplicate tiny_route_for_class() Calls + +## Goal + +Consolidate redundant route computation in free path: +- `free_tiny_fast_hot()` line 654-661: Computes `route_kind_free` (SmallRouteKind) +- `free_tiny_fast_cold()` line 389-402: **RECOMPUTES** `route` (tiny_route_kind_t) — **REDUNDANT** +- `free_tiny_fast()` legacy_fallback line 894-905: **RECOMPUTES** same as cold — **REDUNDANT** + +Expected: **+0.5-1.0% throughput** from eliminating 2-3x route lookups per free operation + +--- + +## Problem Analysis + +### Redundancy Pattern + +``` +free_tiny_fast_hot() + ├─ line 654-661: Compute route_kind_free (SmallRouteKind) + │ ├─ tiny_static_route_ready_fast() → tiny_static_route_get_kind_fast(class_idx) + │ └─ else → small_policy_v7_snapshot()->route_kind[class_idx] + │ + └─ line 713: Call free_tiny_fast_cold(ptr, base, class_idx, env) + │ + └─ free_tiny_fast_cold() + ├─ line 389-402: **RECOMPUTE** route (tiny_route_kind_t) + │ ├─ tiny_free_static_route_enabled() → g_tiny_route_class[class_idx] + │ └─ else → tiny_route_for_class(class_idx) + │ + └─ line 402: use_tiny_heap = tiny_route_is_heap_kind(route) + +free_tiny_fast() legacy_fallback + └─ line 894-905: **RECOMPUTE** again (identical to cold) +``` + +### Why Redundant + +1. **SmallRouteKind** (free_tiny_fast_hot) vs **tiny_route_kind_t** (free_tiny_fast_cold): + - Different enums, same semantic concept (route_kind for the class) + - Both compute the same "which path should this free take?" + +2. 
**Two TinyRoute computations**: + - free_tiny_fast_cold computes tiny_route_kind_t + - free_tiny_fast legacy_fallback computes same again + +### Instruction Overhead + +Per free operation (assuming 50% go through cold path): +- Current: 2-3x route lookups (hot computes SmallRouteKind, cold recomputes tiny_route_kind_t, legacy recomputes again) +- After: 1x route lookup (compute once, pass down) +- Savings: ~15-20 instructions per cold-path free (depends on tiny_free_static_route_enabled vs tiny_route_for_class cost) + +--- + +## Solution: Pass-Down Pattern (No Function Split) + +**Strategy**: Compute `tiny_route_for_class()` once in caller context, pass as argument. + +### Step 1: Create Helper Function + +Add to `malloc_tiny_fast.h` (before free_tiny_fast_cold): + +```c +// Phase 19-6C Helper: Compute route + use_tiny_heap once, reuse in hot/cold paths +static inline void free_tiny_fast_compute_route_and_heap( + int class_idx, + tiny_route_kind_t* out_route, + int* out_use_tiny_heap) +{ + // Extract the route computation logic (currently lines 389-402) + tiny_route_kind_t route; + if (__builtin_expect(tiny_free_static_route_enabled(), 0)) { + route = g_tiny_route_class[(unsigned)class_idx & 7u]; + if (__builtin_expect(route == TINY_ROUTE_LEGACY && !g_tiny_route_snapshot_done, 0)) { + route = tiny_route_for_class((uint8_t)class_idx); + } + } else { + route = tiny_route_for_class((uint8_t)class_idx); + } + *out_route = route; + *out_use_tiny_heap = tiny_route_is_heap_kind(route); +} +``` + +### Step 2: Update free_tiny_fast_cold() Signature + +**Before**: +```c +static int free_tiny_fast_cold(void* ptr, void* base, int class_idx, const HakmemEnvSnapshot* env) +{ + // ... line 389-402: compute route ... + tiny_route_kind_t route; + // ... logic ... + const int use_tiny_heap = tiny_route_is_heap_kind(route); + // ... rest ... 
+} +``` + +**After**: +```c +static int free_tiny_fast_cold(void* ptr, void* base, int class_idx, const HakmemEnvSnapshot* env, + tiny_route_kind_t route, int use_tiny_heap) +{ + // Remove lines 389-402 (route computation) + // Use route and use_tiny_heap directly from parameters + // ... rest stays same ... +} +``` + +### Step 3: Update Callers + +**In free_tiny_fast_hot()** (line 713): +```c +// Before: +return free_tiny_fast_cold(ptr, base, class_idx, env); + +// After: +tiny_route_kind_t route_for_cold; +int use_tiny_heap_for_cold; +free_tiny_fast_compute_route_and_heap(class_idx, &route_for_cold, &use_tiny_heap_for_cold); +return free_tiny_fast_cold(ptr, base, class_idx, env, route_for_cold, use_tiny_heap_for_cold); +``` + +**In free_tiny_fast() legacy_fallback** (lines 892-913): +```c +// Before: +legacy_fallback: + // ... lines 894-905: recompute route ... + tiny_route_kind_t route; + if (__builtin_expect(tiny_free_static_route_enabled(), 0)) { + route = g_tiny_route_class[(unsigned)class_idx & 7u]; + // ... + } + // ... lines 906-913: use route to call free_tiny_fast_cold or other path ... + +// After: +legacy_fallback: + tiny_route_kind_t route; + int use_tiny_heap; + free_tiny_fast_compute_route_and_heap(class_idx, &route, &use_tiny_heap); + // ... lines 906-913: use route directly (no recomputation) ... 
+``` + +--- + +## Code Changes Summary + +| File | Lines | Change | Type | +|------|-------|--------|------| +| malloc_tiny_fast.h | +12 | Add helper function | New | +| malloc_tiny_fast.h | 385-402 | Remove route computation | Remove | +| malloc_tiny_fast.h | 385 (signature) | Add 2 parameters (route, use_tiny_heap) | Modify | +| malloc_tiny_fast.h | 713 | Add route computation + pass to cold | Modify | +| malloc_tiny_fast.h | 894-905 | Replace recomputation with helper call | Modify | + +--- + +## A/B Test Protocol + +### Baseline (Phase 19-6B state) +```sh +scripts/run_mixed_10_cleanenv.sh +# Expected: ~56.4M ops/s (19-6B state) +``` + +### Optimized (Phase 19-6C) +```sh +scripts/run_mixed_10_cleanenv.sh +# Expected: ~56.7-57.0M ops/s (+0.5-1.0% if instruction savings convert to throughput) +``` + +### GO Criteria +- Mean ≥ 56.4M ops/s (no regression) +- Optimal: +0.5-1.0% improvement + +### Validation +```sh +perf stat -e cycles,instructions -- ./bench_random_mixed_hakmem 50000000 400 1 +# Check: instructions/op should decrease (fewer route_for_class calls) +``` + +--- + +## Risk Assessment + +| Risk | Level | Mitigation | +|------|-------|-----------| +| Signature change breaks callers | LOW | Only 2 callers (hot + legacy), easy to verify | +| Route logic error in helper | LOW | Logic extracted, not modified | +| Parameter passing overhead | LOW | 2 ints on stack, negligible | +| Initialization order | LOW | route computed immediately before use | + +**Overall**: **GREEN** (very low risk, mechanical refactoring) + +--- + +## Expected Performance Gains + +**Per-operation cost model**: +- Baseline: 2-3x tiny_route_for_class() calls (hot determines route, cold recomputes, legacy recomputes) +- After: 1x route computation (passed down) +- Instruction delta: -15-25 instructions per cold-path free (~20% of cold path overhead) + +**Throughput gain**: +- If 40-50% of frees go through cold path: ~8-12 instructions/op saved (mixed workload) +- Expected: **+0.5-1.0% 
throughput** + +--- + +## Next Steps + +1. ✅ Design (complete) +2. 🔄 Implementation: Code changes as specified above +3. 🔄 Build: `make clean && make -j bench_random_mixed_hakmem` +4. 🔄 A/B Test: Run 10-run benchmark +5. 📊 Decision: GO/NO-GO based on throughput delta +6. 📋 Commit + Document + +--- + +## Appendix: Helper Function Detailed + +```c +// Phase 19-6C: Helper to compute tiny_route and use_tiny_heap (avoid duplication) +static inline void free_tiny_fast_compute_route_and_heap( + int class_idx, + tiny_route_kind_t* out_route, + int* out_use_tiny_heap) +{ + // Phase 3 D1: Free path route cache (eliminate tiny_route_for_class overhead) + tiny_route_kind_t route; + if (__builtin_expect(tiny_free_static_route_enabled(), 0)) { + // Use cached route (bypasses tiny_route_for_class()) + route = g_tiny_route_class[(unsigned)class_idx & 7u]; + if (__builtin_expect(route == TINY_ROUTE_LEGACY && !g_tiny_route_snapshot_done, 0)) { + // Fallback if uninitialized + route = tiny_route_for_class((uint8_t)class_idx); + } + } else { + // Standard path + route = tiny_route_for_class((uint8_t)class_idx); + } + *out_route = route; + *out_use_tiny_heap = tiny_route_is_heap_kind(route); +} +``` + +--- + +## Files to Modify + +1. 
`core/front/malloc_tiny_fast.h`: + - Add helper function before free_tiny_fast_cold() + - Modify free_tiny_fast_cold() signature (+2 parameters) + - Remove lines 389-402 from free_tiny_fast_cold() + - Update line 713 call in free_tiny_fast_hot() + - Update lines 894-905 in free_tiny_fast() legacy_fallback diff --git a/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_6_ALLOC_SNAPSHOT_PASSDOWN_AB_TEST_RESULTS.md b/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_6_ALLOC_SNAPSHOT_PASSDOWN_AB_TEST_RESULTS.md new file mode 100644 index 00000000..126dd2c7 --- /dev/null +++ b/docs/analysis/PHASE19_FASTLANE_INSTRUCTION_REDUCTION_6_ALLOC_SNAPSHOT_PASSDOWN_AB_TEST_RESULTS.md @@ -0,0 +1,49 @@ +## Phase 19-6 (aka 19-3c) — Alloc-side ENV snapshot pass-down (symmetry attempt) — ❌ NO-GO + +### Goal + +Alloc hot path (`malloc_tiny_fast_for_class`) already fetches `HakmemEnvSnapshot` for C7 ULTRA gating. +This experiment tried to **reuse that snapshot** to avoid additional snapshot/gate work inside nested helpers +(notably policy route selection / metadata-cache gating), mirroring Phase 19-3b on the free path. + +### Implementation (reverted) + +- Add `malloc_tiny_fast_for_class_with_env(size, class_idx, env)` + - Use `env->tiny_c7_ultra_enabled` instead of calling `tiny_c7_ultra_enabled_env()` + - Route selection uses `tiny_policy_hot_get_route_with_env(class_idx, env)` (avoids a second snapshot gate) +- Keep `malloc_tiny_fast_for_class(size, class_idx)` as a wrapper that fetches `env` once and delegates. + +No new ENV gate was introduced (pure refactor). + +### A/B Test + +Method: +- `scripts/run_mixed_10_cleanenv.sh` (profile `MIXED_TINYV3_C7_SAFE`) +- `iter=20,000,000`, `ws=400`, `runs=10` +- Two separate builds (baseline vs patch), with identical bench env. 
+ +Results: + +| Metric | Baseline | Optimized | Delta | +|---|---:|---:|---:| +| Mean | 56.156M ops/s | 55.611M ops/s | -0.97% | +| Median | 56.202M ops/s | 55.609M ops/s | -1.05% | + +### Decision + +- **NO-GO** (regression ~1% on Mixed; value is too close to threshold to justify code complexity). +- Patch **reverted**; stable state remains **Phase 19-4c**. + +### Notes / Hypothesis + +Likely causes: +- Extra argument / register pressure in the alloc hot path. +- The “redundant” snapshot gate was already cheap (and/or provided branch/constant-folding structure that the compiler exploited). + +### Next candidates (Phase 19-6) + +Proceed with the remaining redundancy items from the audit: +1. Duplicate route lookup cleanup (expected +0.5–1%) +2. Dual policy snapshot consolidation (expected +0.3–0.7%) +3. Hint mismatch audit for remaining nontrivial gates (A/B required) +