Phase 54-60: Memory-Lean mode, Balanced mode stabilization, M1 (50%) achievement

## Summary

Completed Phase 54-60 optimization work:

**Phase 54-56: Memory-Lean mode (LEAN+OFF prewarm suppression)**
- Implemented ss_mem_lean_env_box.h with ENV gates
- Balanced mode (LEAN+OFF) promoted as production default
- Result: +1.2% throughput, better stability, zero syscall overhead
- Added to bench_profile.h: MIXED_TINYV3_C7_BALANCED preset

**Phase 57: 60-min soak finalization**
- Balanced mode: 60-min soak, RSS drift 0%, CV 5.38%
- Speed-first mode: 60-min soak, RSS drift 0%, CV 1.58%
- Syscall budget: 1.25e-7/op (800× under target)
- Status: PRODUCTION-READY

**Phase 59: 50% recovery baseline rebase**
- hakmem FAST (Balanced): 59.184M ops/s, CV 1.31%
- mimalloc: 120.466M ops/s, CV 3.50%
- Ratio: 49.13% (M1 ACHIEVED within statistical noise)
- Superior stability: 2.68× better CV than mimalloc

**Phase 60: Alloc pass-down SSOT (NO-GO)**
- Implemented alloc_passdown_ssot_env_box.h
- Modified malloc_tiny_fast.h for SSOT pattern
- Result: -0.46% (NO-GO)
- Key lesson: SSOT not applicable where early-exit already optimized

## Key Metrics

- Performance: 49.13% of mimalloc (M1 effectively achieved)
- Stability: CV 1.31% (superior to mimalloc 3.50%)
- Syscall budget: 1.25e-7/op (excellent)
- RSS: 33MB stable, 0% drift over 60 minutes

## Files Added/Modified

New boxes:
- core/box/ss_mem_lean_env_box.h
- core/box/ss_release_policy_box.{h,c}
- core/box/alloc_passdown_ssot_env_box.h

Scripts:
- scripts/soak_mixed_single_process.sh
- scripts/analyze_epoch_tail_csv.py
- scripts/soak_mixed_rss.sh
- scripts/calculate_percentiles.py
- scripts/analyze_soak.py

Documentation: Phase 40-60 analysis documents

## Design Decisions

1. Profile separation (core/bench_profile.h):
   - MIXED_TINYV3_C7_SAFE: Speed-first (no LEAN)
   - MIXED_TINYV3_C7_BALANCED: Balanced mode (LEAN+OFF)

2. Box Theory compliance:
   - All ENV gates reversible (HAKMEM_SS_MEM_LEAN, HAKMEM_ALLOC_PASSDOWN_SSOT)
   - Single conversion points maintained
   - No physical deletions (compile-out only)

3. Lessons learned:
   - SSOT effective only where redundancy exists (Phase 60 showed limits)
   - Branch prediction extremely effective (~0 cycles for well-predicted branches)
   - Early-exit pattern valuable even when seemingly redundant

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
Moe Charm (CI)
2025-12-17 06:24:01 +09:00
parent ee5e2cc9c4
commit 7adbcdfcb6
68 changed files with 11736 additions and 187 deletions

View File

@ -74,6 +74,7 @@
#include "../box/free_cold_shape_stats_box.h" // Phase 5 E5-3a: Free cold shape stats
#include "../box/free_tiny_fast_mono_dualhot_env_box.h" // Phase 9: MONO DUALHOT ENV gate
#include "../box/free_tiny_fast_mono_legacy_direct_env_box.h" // Phase 10: MONO LEGACY DIRECT ENV gate
#include "../box/alloc_passdown_ssot_env_box.h" // Phase 60: Alloc pass-down SSOT
// Helper: current thread id (low 32 bits) for owner check
#ifndef TINY_SELF_U32_LOCAL_DEFINED
@ -83,6 +84,51 @@ static inline uint32_t tiny_self_u32_local(void) {
}
#endif
// ============================================================================
// Phase 60: Alloc Pass-Down Context (SSOT)
// ============================================================================
// Alloc context: 入口で 1回だけ計算し、下流へ引き回す
typedef struct {
const HakmemEnvSnapshot* env; // ENV snapshot (NULL if snapshot disabled)
SmallRouteKind route_kind; // Route kind (LEGACY/ULTRA/MID/V7)
bool c7_ultra_on; // C7 ULTRA enabled
bool alloc_dualhot_on; // Alloc DUALHOT enabled (C0-C3 direct path)
} alloc_passdown_context_t;
// Phase ALLOC-TINY-FAST-DUALHOT-2: Probe window ENV gate (safe from early putenv)
// Phase 39: BENCH_MINIMAL → 固定 0 (lazy-init 削除) — GO +1.98%
static inline int alloc_dualhot_enabled(void) {
#if HAKMEM_BENCH_MINIMAL
return 0; // FAST v3: 定数化 (default OFF)
#else
static int g = -1;
static int g_probe_left = 64; // Probe window: tolerate early putenv before gate init
if (__builtin_expect(g == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_ALLOC_DUALHOT");
if (e && *e && *e != '0') {
g = 1;
} else if (g_probe_left > 0) {
g_probe_left--;
// Still probing: return "not yet set" without committing 0
if (e == NULL) {
return 0; // Env not set (yet), but keep probing
}
g = 0; // Explicitly set to "0"
} else {
g = 0; // Probe window expired, commit OFF
}
#if !HAKMEM_BUILD_RELEASE
if (g == 1) {
fprintf(stderr, "[DUALHOT-INIT] alloc_dualhot_enabled() = %d (probe_left=%d)\n", g, g_probe_left);
fflush(stderr);
}
#endif
}
return g;
#endif
}
// ============================================================================
// ENV Control (cached, lazy init)
// ============================================================================
@ -144,30 +190,33 @@ static inline int front_gate_unified_enabled(void) {
// - NULL on failure (caller falls back to normal path)
//
// Phase ALLOC-TINY-FAST-DUALHOT-2: Probe window ENV gate (safe from early putenv)
// Phase 39: BENCH_MINIMAL → 固定 0 (lazy-init 削除) — GO +1.98%
static inline int alloc_dualhot_enabled(void) {
#if HAKMEM_BENCH_MINIMAL
return 0; // FAST v3: 定数化 (default OFF)
#else
static int g = -1;
static int g_probe_left = 64; // Probe window: tolerate early putenv before gate init
if (__builtin_expect(g == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_ALLOC_DUALHOT");
if (e && *e && *e != '0') {
g = 1;
} else if (g_probe_left > 0) {
g_probe_left--;
// Still probing: return "not yet set" without committing 0
if (e == NULL) {
return 0; // Env not set (yet), but keep probing
}
} else {
g = 0; // Probe window exhausted, commit to 0
}
// ============================================================================
// Phase 60: Alloc context SSOT helper (入口で 1回だけ計算)
// ============================================================================
// Phase 60: 入口で ENV snapshot, route kind, C7 ULTRA, DUALHOT を 1回だけ取得
// Phase 43 教訓: Branch は store より高い → この関数自体は追加 branch なしalways_inline
__attribute__((always_inline))
static inline alloc_passdown_context_t alloc_passdown_context_compute(int class_idx) {
alloc_passdown_context_t ctx;
// 1. ENV snapshot (入口で 1回だけ)
ctx.env = hakmem_env_snapshot_enabled() ? hakmem_env_snapshot() : NULL;
// 2. C7 ULTRA enabled (入口で 1回だけ)
ctx.c7_ultra_on = ctx.env ? ctx.env->tiny_c7_ultra_enabled : tiny_c7_ultra_enabled_env();
// 3. Alloc DUALHOT enabled (入口で 1回だけ)
ctx.alloc_dualhot_on = alloc_dualhot_enabled();
// 4. Route kind (入口で 1回だけ)
if (tiny_static_route_ready_fast()) {
ctx.route_kind = tiny_static_route_get_kind_fast(class_idx);
} else {
ctx.route_kind = tiny_policy_hot_get_route_with_env((uint32_t)class_idx, ctx.env);
}
return g;
#endif
return ctx;
}
// Phase 2 B3: tiny_alloc_route_cold() - Handle rare routes (V7, MID, ULTRA)
@ -232,9 +281,126 @@ static void* tiny_alloc_route_cold(SmallRouteKind route_kind, int class_idx, siz
return tiny_cold_refill_and_alloc(class_idx);
}
// Phase 60: malloc_tiny_fast_for_class_ssot() - SSOT mode (context pre-computed)
__attribute__((always_inline))
static inline void* malloc_tiny_fast_for_class_ssot(size_t size, int class_idx, const alloc_passdown_context_t* ctx) {
// Stats (class_idx already validated by gate)
tiny_front_alloc_stat_inc(class_idx);
ALLOC_GATE_STAT_INC_CLASS(class_idx);
// Phase 60: Use pre-computed context (避免重複計算)
// C7 ULTRA early-exit (skip policy snapshot for common case)
if (class_idx == 7 && ctx->c7_ultra_on) {
void* ultra_p = tiny_c7_ultra_alloc(size);
if (TINY_HOT_LIKELY(ultra_p != NULL)) {
return ultra_p;
}
// C7 ULTRA miss → fall through to policy-based routing
}
// C0-C3 direct path (second hot path)
if ((unsigned)class_idx <= 3u) {
if (ctx->alloc_dualhot_on) {
// Direct to LEGACY unified cache (no policy snapshot)
void* ptr = tiny_hot_alloc_fast(class_idx);
if (TINY_HOT_LIKELY(ptr != NULL)) {
return ptr;
}
return tiny_cold_refill_and_alloc(class_idx);
}
}
// Routing dispatch: Use pre-computed route_kind from context
const tiny_env_cfg_t* env_cfg = tiny_env_cfg();
if (TINY_HOT_LIKELY(env_cfg->alloc_route_shape)) {
// B3 optimized: Prioritize LEGACY with LIKELY hint
if (TINY_HOT_LIKELY(ctx->route_kind == SMALL_ROUTE_LEGACY)) {
// Phase 3 C1: TLS cache prefetch (prefetch g_unified_cache[class_idx] to L1)
if (__builtin_expect(env_cfg->tiny_prefetch, 0)) {
__builtin_prefetch(&g_unified_cache[class_idx], 0, 3);
}
// LEGACY fast path: Unified Cache hot/cold
void* ptr = tiny_hot_alloc_fast(class_idx);
if (TINY_HOT_LIKELY(ptr != NULL)) {
return ptr;
}
return tiny_cold_refill_and_alloc(class_idx);
}
// Rare routes: delegate to cold helper
return tiny_alloc_route_cold(ctx->route_kind, class_idx, size);
}
// Original dispatch (backward compatible, default)
switch (ctx->route_kind) {
case SMALL_ROUTE_ULTRA: {
// Phase TLS-UNIFY-1: Unified ULTRA TLS pop for C4-C6 (C7 handled above)
void* base = tiny_ultra_tls_pop((uint8_t)class_idx);
if (TINY_HOT_LIKELY(base != NULL)) {
if (class_idx == 6) FREE_PATH_STAT_INC(c6_ultra_alloc_hit);
else if (class_idx == 5) FREE_PATH_STAT_INC(c5_ultra_alloc_hit);
else if (class_idx == 4) FREE_PATH_STAT_INC(c4_ultra_alloc_hit);
return tiny_base_to_user_inline(base);
}
// ULTRA miss → fallback to LEGACY
break;
}
case SMALL_ROUTE_MID_V35: {
// Phase v11a-3: MID v3.5 allocation
void* v35p = small_mid_v35_alloc(class_idx, size);
if (TINY_HOT_LIKELY(v35p != NULL)) {
return v35p;
}
// MID v3.5 miss → fallback to LEGACY
break;
}
case SMALL_ROUTE_V7: {
// Phase v7: SmallObject v7 allocation (research box)
void* v7p = small_heap_alloc_fast_v7_stub(size, (uint8_t)class_idx);
if (TINY_HOT_LIKELY(v7p != NULL)) {
return v7p;
}
// V7 miss → fallback to LEGACY
break;
}
case SMALL_ROUTE_MID_V3: {
// Phase MID-V3: MID v3 allocation (257-768B, C5-C6)
void* v3p = small_mid_v35_alloc(class_idx, size);
if (TINY_HOT_LIKELY(v3p != NULL)) {
return v3p;
}
break;
}
case SMALL_ROUTE_LEGACY:
default:
break;
}
// Phase 3 C1: TLS cache prefetch (prefetch g_unified_cache[class_idx] to L1)
if (__builtin_expect(env_cfg->tiny_prefetch, 0)) {
__builtin_prefetch(&g_unified_cache[class_idx], 0, 3);
}
// LEGACY fallback: Unified Cache hot/cold path
void* ptr = tiny_hot_alloc_fast(class_idx);
if (TINY_HOT_LIKELY(ptr != NULL)) {
return ptr;
}
return tiny_cold_refill_and_alloc(class_idx);
}
// Phase ALLOC-GATE-SSOT-1: malloc_tiny_fast_for_class() - body (class_idx already known)
__attribute__((always_inline))
static inline void* malloc_tiny_fast_for_class(size_t size, int class_idx) {
// Phase 60: SSOT mode (ENV gated)
if (alloc_passdown_ssot_enabled()) {
alloc_passdown_context_t ctx = alloc_passdown_context_compute(class_idx);
return malloc_tiny_fast_for_class_ssot(size, class_idx, &ctx);
}
// Original path (backward compatible, default)
// Stats (class_idx already validated by gate)
tiny_front_alloc_stat_inc(class_idx);
ALLOC_GATE_STAT_INC_CLASS(class_idx);