Files
hakmem/core/bench_profile.h
Moe Charm (CI) 7adbcdfcb6 Phase 54-60: Memory-Lean mode, Balanced mode stabilization, M1 (50%) achievement
## Summary

Completed Phase 54-60 optimization work:

**Phase 54-56: Memory-Lean mode (LEAN+OFF prewarm suppression)**
- Implemented ss_mem_lean_env_box.h with ENV gates
- Balanced mode (LEAN+OFF) promoted as production default
- Result: +1.2% throughput, better stability, zero syscall overhead
- Added to bench_profile.h: MIXED_TINYV3_C7_BALANCED preset

**Phase 57: 60-min soak finalization**
- Balanced mode: 60-min soak, RSS drift 0%, CV 5.38%
- Speed-first mode: 60-min soak, RSS drift 0%, CV 1.58%
- Syscall budget: 1.25e-7/op (800× under target)
- Status: PRODUCTION-READY

**Phase 59: 50% recovery baseline rebase**
- hakmem FAST (Balanced): 59.184M ops/s, CV 1.31%
- mimalloc: 120.466M ops/s, CV 3.50%
- Ratio: 49.13% (M1 ACHIEVED within statistical noise)
- Superior stability: 2.68× better CV than mimalloc

**Phase 60: Alloc pass-down SSOT (NO-GO)**
- Implemented alloc_passdown_ssot_env_box.h
- Modified malloc_tiny_fast.h for SSOT pattern
- Result: -0.46% (NO-GO)
- Key lesson: SSOT not applicable where early-exit already optimized

## Key Metrics

- Performance: 49.13% of mimalloc (M1 effectively achieved)
- Stability: CV 1.31% (superior to mimalloc 3.50%)
- Syscall budget: 1.25e-7/op (excellent)
- RSS: 33MB stable, 0% drift over 60 minutes

## Files Added/Modified

New boxes:
- core/box/ss_mem_lean_env_box.h
- core/box/ss_release_policy_box.{h,c}
- core/box/alloc_passdown_ssot_env_box.h

Scripts:
- scripts/soak_mixed_single_process.sh
- scripts/analyze_epoch_tail_csv.py
- scripts/soak_mixed_rss.sh
- scripts/calculate_percentiles.py
- scripts/analyze_soak.py

Documentation: Phase 40-60 analysis documents

## Design Decisions

1. Profile separation (core/bench_profile.h):
   - MIXED_TINYV3_C7_SAFE: Speed-first (no LEAN)
   - MIXED_TINYV3_C7_BALANCED: Balanced mode (LEAN+OFF)

2. Box Theory compliance:
   - All ENV gates reversible (HAKMEM_SS_MEM_LEAN, HAKMEM_ALLOC_PASSDOWN_SSOT)
   - Single conversion points maintained
   - No physical deletions (compile-out only)

3. Lessons learned:
   - SSOT effective only where redundancy exists (Phase 60 showed limits)
   - Branch prediction extremely effective (~0 cycles for well-predicted branches)
   - Early-exit pattern valuable even when seemingly redundant

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-17 06:24:01 +09:00

226 lines
12 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#pragma once
#include <dlfcn.h>
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <unistd.h>
#ifdef USE_HAKMEM
#include "box/wrapper_env_box.h" // wrapper_env_refresh_from_env (Phase 2 B4)
#include "box/tiny_static_route_box.h" // tiny_static_route_refresh_from_env (Phase 3 C3)
#include "box/hakmem_env_snapshot_box.h" // hakmem_env_snapshot_refresh_from_env (Phase 4 E1)
#include "box/tiny_free_route_cache_env_box.h" // tiny_free_static_route_refresh_from_env (Phase 8)
#include "box/tiny_c7_preserve_header_env_box.h" // tiny_c7_preserve_header_env_refresh_from_env (Phase 13 v1)
#include "box/tiny_tcache_env_box.h" // tiny_tcache_env_refresh_from_env (Phase 14 v1)
#include "box/tiny_unified_lifo_env_box.h" // tiny_unified_lifo_env_refresh_from_env (Phase 15 v1)
#include "box/front_fastlane_alloc_legacy_direct_env_box.h" // front_fastlane_alloc_legacy_direct_env_refresh_from_env (Phase 16 v1)
#include "box/fastlane_direct_env_box.h" // fastlane_direct_env_refresh_from_env (Phase 19-1)
#include "box/tiny_header_hotfull_env_box.h" // tiny_header_hotfull_env_refresh_from_env (Phase 21)
#endif
// Install `key=val` into the environment only when `key` is not already set,
// and report the assignment on stderr as "[bench_profile] set KEY=VAL".
//
// Parameters:
//   key - NUL-terminated variable name; NULL is ignored.
//   val - NUL-terminated default value; NULL is ignored.
//
// The "key=val" buffer is obtained from the malloc resolved through
// dlsym(RTLD_NEXT, ...) (falling back to the visible malloc/putenv when the
// lookup fails) and is handed to putenv, which keeps the pointer. It is
// therefore never freed here.
// NOTE(review): the RTLD_NEXT lookup presumably exists to bypass an interposed
// allocator - confirm against the wrapper code.
static inline void bench_setenv_default(const char* key, const char* val) {
    // Guard against NULL arguments: getenv/strlen on NULL is undefined behavior.
    if (key == NULL || val == NULL) return;
    // An existing value (even an empty one) always wins over the default.
    if (getenv(key) != NULL) return;

    static void* (*real_malloc)(size_t) = NULL;
    static int (*real_putenv)(char*) = NULL;
    if (!real_malloc) {
        real_malloc = (void* (*)(size_t))dlsym(RTLD_NEXT, "malloc");
        if (!real_malloc) real_malloc = malloc;
    }
    if (!real_putenv) {
        real_putenv = (int (*)(char*))dlsym(RTLD_NEXT, "putenv");
        if (!real_putenv) real_putenv = putenv;
    }

    // Build "key=val\0" by hand: klen + '=' + vlen + NUL.
    size_t klen = strlen(key);
    size_t vlen = strlen(val);
    char* buf = (char*)real_malloc(klen + vlen + 2);
    if (!buf) return;
    memcpy(buf, key, klen);
    buf[klen] = '=';
    memcpy(buf + klen + 1, val, vlen);
    buf[klen + 1 + vlen] = '\0';

    {
        char msg[256];
        int n = snprintf(msg, sizeof(msg), "[bench_profile] set %s=%s\n", key, val);
        if (n > 0) {
            size_t len = (size_t)n;
            if (len >= sizeof(msg)) {
                // snprintf truncated: only sizeof(msg)-1 characters were stored,
                // followed by a NUL. Never emit that NUL, and keep the log
                // line-terminated by restoring the trailing newline.
                len = sizeof(msg) - 1;
                msg[len - 1] = '\n';
            }
            // Best-effort diagnostic; a failed or short write is ignored on purpose.
            ssize_t w = write(2, msg, len);
            (void)w;
        }
    }

    real_putenv(buf); // takes ownership; do not free
}
// Bench only: ENV presets are selected from HAKMEM_PROFILE by bench_apply_profile().
// This helper installs the defaults shared by the MIXED_TINYV3_C7_SAFE and
// MIXED_TINYV3_C7_BALANCED profiles. Entries are applied in table order through
// bench_setenv_default(), so a variable that is already present in the
// environment keeps its value.
static inline void bench_apply_mixed_tinyv3_c7_common(void) {
    static const struct {
        const char* key;
        const char* val;
    } k_common_defaults[] = {
        {"HAKMEM_TINY_HEAP_PROFILE", "C7_SAFE"},
        {"HAKMEM_TINY_C7_HOT", "1"},
        {"HAKMEM_TINY_HOTHEAP_V2", "0"},
        {"HAKMEM_SMALL_HEAP_V3_ENABLED", "1"},
        {"HAKMEM_SMALL_HEAP_V3_CLASSES", "0x80"},
        {"HAKMEM_SMALL_HEAP_V4_ENABLED", "0"},
        {"HAKMEM_SMALL_HEAP_V4_CLASSES", "0x0"},
        {"HAKMEM_TINY_PTR_FAST_CLASSIFY_V4_ENABLED", "0"},
        {"HAKMEM_SMALL_SEGMENT_V4_ENABLED", "0"},
        {"HAKMEM_POOL_V2_ENABLED", "0"},
        {"HAKMEM_TINY_FRONT_V3_ENABLED", "1"},
        {"HAKMEM_TINY_FRONT_V3_LUT_ENABLED", "1"},
        {"HAKMEM_TINY_PTR_FAST_CLASSIFY_ENABLED", "1"},
        // Phase FREE-TINY-FAST-DUALHOT-1: C0-C3 direct fast free (skip policy snapshot)
        {"HAKMEM_FREE_TINY_FAST_HOTCOLD", "1"},
        // Phase 2 B4: Wrapper hot/cold split (malloc/free wrapper shape)
        {"HAKMEM_WRAP_SHAPE", "1"},
        // Phase 4 E1: ENV Snapshot Consolidation (+3.92% proven on Mixed)
        {"HAKMEM_ENV_SNAPSHOT", "1"},
        // Phase 5 E4-1: Free wrapper ENV snapshot (+3.51% proven on Mixed, 10-run)
        {"HAKMEM_FREE_WRAPPER_ENV_SNAPSHOT", "1"},
        // Phase 5 E4-2: Malloc wrapper ENV snapshot (+21.83% proven on Mixed, 10-run)
        {"HAKMEM_MALLOC_WRAPPER_ENV_SNAPSHOT", "1"},
        // Phase 5 E5-1: Free Tiny Direct Path (+3.35% proven on Mixed, 10-run)
        {"HAKMEM_FREE_TINY_DIRECT", "1"},
        // Phase 6-1: Front FastLane (Layer Collapse) (+11.13% proven on Mixed, 10-run)
        {"HAKMEM_FRONT_FASTLANE", "1"},
        // Phase 6-2: Front FastLane Free DeDup (+5.18% proven on Mixed, 10-run)
        {"HAKMEM_FRONT_FASTLANE_FREE_DEDUP", "1"},
        // Phase 21: Tiny Header HotFull (alloc header hot/cold split; opt-out with 0)
        {"HAKMEM_TINY_HEADER_HOTFULL", "1"},
        // Phase 19-1b: FastLane Direct (wrapper layer bypass, +5.88% proven on Mixed, 10-run)
        {"HAKMEM_FASTLANE_DIRECT", "1"},
        // Phase 9: FREE-TINY-FAST MONO DUALHOT (+2.72% proven on Mixed, 10-run)
        {"HAKMEM_FREE_TINY_FAST_MONO_DUALHOT", "1"},
        // Phase 10: FREE-TINY-FAST MONO LEGACY DIRECT (+1.89% proven on Mixed, 10-run)
        {"HAKMEM_FREE_TINY_FAST_MONO_LEGACY_DIRECT", "1"},
        // Phase 4-4: C6 ULTRA free+alloc integration switch (default OFF, manual opt-in)
        {"HAKMEM_TINY_C6_ULTRA_FREE_ENABLED", "0"},
        // Phase MID-V3: Mid/Pool HotBox v3
        // On Mixed ("161024B" in the original comment; presumably the 16-1024B
        // range - TODO confirm) MID_V3(C6) is much slower, so it is pinned OFF
        // by default here. Turning it ON is recommended only for the C6-heavy
        // profiles (only C6-heavy is an optimization target).
        {"HAKMEM_MID_V3_ENABLED", "0"},
        {"HAKMEM_MID_V3_CLASSES", "0x0"},
        // Phase 2 B3: Routing branch shape optimization (LIKELY on LEGACY, cold helper for rare routes)
        {"HAKMEM_TINY_ALLOC_ROUTE_SHAPE", "1"},
        // Phase 3 C3: Static routing (policy_snapshot bypass, +2.2% proven)
        {"HAKMEM_TINY_STATIC_ROUTE", "1"},
        // Phase 3 D1: Free route cache (TLS cache for free path routing, +2.19% proven)
        {"HAKMEM_FREE_STATIC_ROUTE", "1"},
    };
    const size_t count = sizeof(k_common_defaults) / sizeof(k_common_defaults[0]);

    for (size_t i = 0; i < count; ++i) {
        bench_setenv_default(k_common_defaults[i].key, k_common_defaults[i].val);
    }
}
// Apply the ENV preset named by HAKMEM_PROFILE.
//
// Does nothing when HAKMEM_PROFILE is unset or empty. Otherwise the matching
// preset is applied through bench_setenv_default(), so variables that are
// already present in the environment are left untouched. A profile name that
// matches none of the known presets sets no defaults. When USE_HAKMEM is
// defined, the cached ENV views are refreshed afterwards so they observe the
// values installed here.
static inline void bench_apply_profile(void) {
    struct preset {
        const char* key;
        const char* val;
    };

    // Balanced mode (Phase 55/56): LEAN+OFF (prewarm suppression only).
    // Applied on top of the shared MIXED_TINYV3_C7 defaults.
    static const struct preset k_balanced_extra[] = {
        {"HAKMEM_SS_MEM_LEAN", "1"},
        {"HAKMEM_SS_MEM_LEAN_DECOMMIT", "OFF"},
        {"HAKMEM_SS_MEM_LEAN_TARGET_MB", "10"},
    };

    static const struct preset k_c6_heavy_legacy_poolv1[] = {
        {"HAKMEM_TINY_HEAP_PROFILE", "C7_SAFE"},
        {"HAKMEM_TINY_C6_HOT", "0"},
        {"HAKMEM_TINY_HOTHEAP_V2", "0"},
        {"HAKMEM_SMALL_HEAP_V3_ENABLED", "1"},
        {"HAKMEM_SMALL_HEAP_V3_CLASSES", "0x80"},
        {"HAKMEM_POOL_V2_ENABLED", "0"},
        {"HAKMEM_POOL_V1_FLATTEN_ENABLED", "0"},
        {"HAKMEM_MID_DESC_CACHE_ENABLED", "1"},
        // Phase 4-4: C6 ULTRA free+alloc integration switch (default OFF, manual opt-in)
        {"HAKMEM_TINY_C6_ULTRA_FREE_ENABLED", "0"},
        // Phase MID-V3: Mid/Pool HotBox v3 (257-768B, C6 only)
        {"HAKMEM_MID_V3_ENABLED", "1"},
        {"HAKMEM_MID_V3_CLASSES", "0x40"},
        // Phase 6-1: Front FastLane (Layer Collapse) (+11.13% proven on Mixed, 10-run)
        {"HAKMEM_FRONT_FASTLANE", "1"},
        // Phase 6-2: Front FastLane Free DeDup (+5.18% proven on Mixed, 10-run)
        {"HAKMEM_FRONT_FASTLANE_FREE_DEDUP", "1"},
        // Phase 21: Tiny Header HotFull (alloc header hot/cold split; opt-out with 0)
        {"HAKMEM_TINY_HEADER_HOTFULL", "1"},
        // Phase 19-1b: FastLane Direct (wrapper layer bypass)
        {"HAKMEM_FASTLANE_DIRECT", "1"},
        // Phase 2 B3: Routing branch shape optimization (LIKELY on LEGACY, cold helper for rare routes)
        {"HAKMEM_TINY_ALLOC_ROUTE_SHAPE", "1"},
    };

    // Phase v7-1: experimental C6-only v7 stub (MID v3 fallback).
    static const struct preset k_c6_v7_stub[] = {
        {"HAKMEM_TINY_HEAP_PROFILE", "C7_SAFE"},
        {"HAKMEM_TINY_C6_HOT", "0"},
        {"HAKMEM_TINY_HOTHEAP_V2", "0"},
        {"HAKMEM_SMALL_HEAP_V3_ENABLED", "1"},
        {"HAKMEM_SMALL_HEAP_V3_CLASSES", "0x80"},
        {"HAKMEM_POOL_V2_ENABLED", "0"},
        {"HAKMEM_MID_V3_ENABLED", "1"},
        {"HAKMEM_MID_V3_CLASSES", "0x40"},
        // v7 stub ON (C6-only)
        {"HAKMEM_SMALL_HEAP_V7_ENABLED", "1"},
        {"HAKMEM_SMALL_HEAP_V7_CLASSES", "0x40"},
    };

    // Dedicated to LEGACY mid/smallmid benchmarks (not used with C7_SAFE).
    static const struct preset k_c6_heavy_legacy_poolv1_flatten[] = {
        {"HAKMEM_TINY_HEAP_PROFILE", "LEGACY"},
        {"HAKMEM_TINY_C6_HOT", "0"},
        {"HAKMEM_TINY_HOTHEAP_V2", "0"},
        {"HAKMEM_SMALL_HEAP_V3_ENABLED", "1"},
        {"HAKMEM_SMALL_HEAP_V3_CLASSES", "0x80"},
        {"HAKMEM_POOL_V2_ENABLED", "0"},
        {"HAKMEM_POOL_V1_FLATTEN_ENABLED", "1"},
        {"HAKMEM_POOL_V1_FLATTEN_STATS", "1"},
        {"HAKMEM_POOL_ZERO_MODE", "header"},
    };

    static const struct preset k_debug_tiny_front_perf[] = {
        {"HAKMEM_TINY_HEAP_PROFILE", "C7_SAFE"},
        {"HAKMEM_TINY_C7_HOT", "1"},
        {"HAKMEM_TINY_HOTHEAP_V2", "0"},
        {"HAKMEM_SMALL_HEAP_V3_ENABLED", "1"},
        {"HAKMEM_SMALL_HEAP_V3_CLASSES", "0x80"},
        {"HAKMEM_POOL_V2_ENABLED", "0"},
        {"HAKMEM_TINY_FRONT_V3_ENABLED", "1"},
        {"HAKMEM_TINY_FRONT_V3_LUT_ENABLED", "1"},
        {"HAKMEM_TINY_PTR_FAST_CLASSIFY_ENABLED", "1"},
    };

    // Research profile that places C6 on SmallObject v3 (not used by default).
    static const struct preset k_c6_small_heap_v3_experiment[] = {
        {"HAKMEM_TINY_HEAP_PROFILE", "C7_SAFE"},
        {"HAKMEM_TINY_C6_HOT", "1"},
        {"HAKMEM_TINY_HOTHEAP_V2", "0"},
        {"HAKMEM_SMALL_HEAP_V3_ENABLED", "1"},
        {"HAKMEM_SMALL_HEAP_V3_CLASSES", "0x40"}, // C6 only
        {"HAKMEM_SMALL_HEAP_V4_ENABLED", "0"},
        {"HAKMEM_SMALL_HEAP_V4_CLASSES", "0x0"},
        {"HAKMEM_POOL_V2_ENABLED", "0"},
    };

    // Research profile that places C6 on SmallObject v4 (not used by default).
    static const struct preset k_c6_small_heap_v4_experiment[] = {
        {"HAKMEM_TINY_HEAP_PROFILE", "C7_SAFE"},
        {"HAKMEM_TINY_C6_HOT", "1"},
        {"HAKMEM_TINY_HOTHEAP_V2", "0"},
        {"HAKMEM_SMALL_HEAP_V3_ENABLED", "0"},
        {"HAKMEM_SMALL_HEAP_V3_CLASSES", "0x0"},
        {"HAKMEM_SMALL_HEAP_V4_ENABLED", "1"},
        {"HAKMEM_SMALL_HEAP_V4_CLASSES", "0x40"}, // C6 only
        {"HAKMEM_POOL_V2_ENABLED", "0"},
    };

    const char* profile = getenv("HAKMEM_PROFILE");
    if (profile == NULL || profile[0] == '\0') {
        return;
    }

    // Pick the table for the requested profile; the entries are applied by the
    // single loop below, after any shared defaults installed in the branch.
    const struct preset* table = NULL;
    size_t table_len = 0;

    if (strcmp(profile, "MIXED_TINYV3_C7_SAFE") == 0) {
        // Speed-first default (Phase 57): do not set HAKMEM_SS_MEM_LEAN here.
        bench_apply_mixed_tinyv3_c7_common();
    } else if (strcmp(profile, "MIXED_TINYV3_C7_BALANCED") == 0) {
        bench_apply_mixed_tinyv3_c7_common();
        table = k_balanced_extra;
        table_len = sizeof(k_balanced_extra) / sizeof(k_balanced_extra[0]);
    } else if (strcmp(profile, "C6_HEAVY_LEGACY_POOLV1") == 0) {
        table = k_c6_heavy_legacy_poolv1;
        table_len = sizeof(k_c6_heavy_legacy_poolv1) / sizeof(k_c6_heavy_legacy_poolv1[0]);
    } else if (strcmp(profile, "C6_V7_STUB") == 0) {
        table = k_c6_v7_stub;
        table_len = sizeof(k_c6_v7_stub) / sizeof(k_c6_v7_stub[0]);
    } else if (strcmp(profile, "C6_HEAVY_LEGACY_POOLV1_FLATTEN") == 0) {
        table = k_c6_heavy_legacy_poolv1_flatten;
        table_len = sizeof(k_c6_heavy_legacy_poolv1_flatten) / sizeof(k_c6_heavy_legacy_poolv1_flatten[0]);
    } else if (strcmp(profile, "DEBUG_TINY_FRONT_PERF") == 0) {
        table = k_debug_tiny_front_perf;
        table_len = sizeof(k_debug_tiny_front_perf) / sizeof(k_debug_tiny_front_perf[0]);
    } else if (strcmp(profile, "C6_SMALL_HEAP_V3_EXPERIMENT") == 0) {
        table = k_c6_small_heap_v3_experiment;
        table_len = sizeof(k_c6_small_heap_v3_experiment) / sizeof(k_c6_small_heap_v3_experiment[0]);
    } else if (strcmp(profile, "C6_SMALL_HEAP_V4_EXPERIMENT") == 0) {
        table = k_c6_small_heap_v4_experiment;
        table_len = sizeof(k_c6_small_heap_v4_experiment) / sizeof(k_c6_small_heap_v4_experiment[0]);
    }

    for (size_t i = 0; i < table_len; ++i) {
        bench_setenv_default(table[i].key, table[i].val);
    }

#ifdef USE_HAKMEM
    // The calls below re-sync cached ENV state after the putenv defaults above.
    // Phase 3 C3 Step 0: make the policy snapshot reflect the final ENV.
    small_policy_v7_bump_version();
    // Phase 2 B4: wrapper ENV cache.
    wrapper_env_refresh_from_env();
    // Phase 3 C3: static route cache.
    tiny_static_route_refresh_from_env();
    // Phase 4 E1: ENV snapshot cache.
    hakmem_env_snapshot_refresh_from_env();
    // Phase 8: free static route ENV cache.
    tiny_free_static_route_refresh_from_env();
    // Phase 13 v1: C7 preserve header ENV cache.
    tiny_c7_preserve_header_env_refresh_from_env();
    // Phase 14 v1: tcache ENV cache.
    tiny_tcache_env_refresh_from_env();
    // Phase 15 v1: LIFO ENV cache.
    tiny_unified_lifo_env_refresh_from_env();
    // Phase 16 v1: LEGACY direct ENV cache.
    front_fastlane_alloc_legacy_direct_env_refresh_from_env();
    // Phase 19-1: FastLane Direct ENV cache.
    fastlane_direct_env_refresh_from_env();
    // Phase 21: Tiny Header HotFull ENV cache.
    tiny_header_hotfull_env_refresh_from_env();
#endif
}