Phase 54-60: Memory-Lean mode, Balanced mode stabilization, M1 (50%) achievement
## Summary
Completed Phase 54-60 optimization work:
**Phase 54-56: Memory-Lean mode (LEAN+OFF prewarm suppression)**
- Implemented ss_mem_lean_env_box.h with ENV gates
- Balanced mode (LEAN+OFF) promoted as production default
- Result: +1.2% throughput, better stability, zero syscall overhead
- Added to bench_profile.h: MIXED_TINYV3_C7_BALANCED preset
**Phase 57: 60-min soak finalization**
- Balanced mode: 60-min soak, RSS drift 0%, CV 5.38%
- Speed-first mode: 60-min soak, RSS drift 0%, CV 1.58%
- Syscall budget: 1.25e-7/op (800× under target)
- Status: PRODUCTION-READY
**Phase 59: 50% recovery baseline rebase**
- hakmem FAST (Balanced): 59.184M ops/s, CV 1.31%
- mimalloc: 120.466M ops/s, CV 3.50%
- Ratio: 49.13% (M1 ACHIEVED within statistical noise)
- Superior stability: 2.68× better CV than mimalloc
**Phase 60: Alloc pass-down SSOT (NO-GO)**
- Implemented alloc_passdown_ssot_env_box.h
- Modified malloc_tiny_fast.h for SSOT pattern
- Result: -0.46% (NO-GO)
- Key lesson: SSOT not applicable where early-exit already optimized
## Key Metrics
- Performance: 49.13% of mimalloc (M1 effectively achieved)
- Stability: CV 1.31% (superior to mimalloc 3.50%)
- Syscall budget: 1.25e-7/op (excellent)
- RSS: 33MB stable, 0% drift over 60 minutes
## Files Added/Modified
New boxes:
- core/box/ss_mem_lean_env_box.h
- core/box/ss_release_policy_box.{h,c}
- core/box/alloc_passdown_ssot_env_box.h
Scripts:
- scripts/soak_mixed_single_process.sh
- scripts/analyze_epoch_tail_csv.py
- scripts/soak_mixed_rss.sh
- scripts/calculate_percentiles.py
- scripts/analyze_soak.py
Documentation: Phase 40-60 analysis documents
## Design Decisions
1. Profile separation (core/bench_profile.h):
- MIXED_TINYV3_C7_SAFE: Speed-first (no LEAN)
- MIXED_TINYV3_C7_BALANCED: Balanced mode (LEAN+OFF)
2. Box Theory compliance:
- All ENV gates reversible (HAKMEM_SS_MEM_LEAN, HAKMEM_ALLOC_PASSDOWN_SSOT)
- Single conversion points maintained
- No physical deletions (compile-out only)
3. Lessons learned:
- SSOT effective only where redundancy exists (Phase 60 showed limits)
- Branch prediction extremely effective (~0 cycles for well-predicted branches)
- Early-exit pattern valuable even when seemingly redundant
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@ -16,6 +16,7 @@
|
||||
#include <strings.h>
|
||||
#include <stdatomic.h>
|
||||
#include <sys/resource.h>
|
||||
#include <unistd.h>
|
||||
#include "core/bench_profile.h"
|
||||
|
||||
#ifdef USE_HAKMEM
|
||||
@ -52,6 +53,18 @@ static inline uint32_t xorshift32(uint32_t* s){
|
||||
uint32_t x=*s; x^=x<<13; x^=x>>17; x^=x<<5; *s=x; return x;
|
||||
}
|
||||
|
||||
static inline long read_rss_kb_current(void) {
|
||||
FILE* f = fopen("/proc/self/statm", "r");
|
||||
if (!f) return 0;
|
||||
unsigned long size_pages = 0, rss_pages = 0;
|
||||
int n = fscanf(f, "%lu %lu", &size_pages, &rss_pages);
|
||||
fclose(f);
|
||||
if (n != 2) return 0;
|
||||
long page_size = sysconf(_SC_PAGESIZE);
|
||||
if (page_size <= 0) return 0;
|
||||
return (long)((rss_pages * (unsigned long)page_size) / 1024ul);
|
||||
}
|
||||
|
||||
// Debug helper: C7 専用ベンチモード (ENV: HAKMEM_BENCH_C7_ONLY=1)
|
||||
static int bench_mode_c7_only = -1;
|
||||
static inline int bench_is_c7_only_mode(void) {
|
||||
@ -83,7 +96,7 @@ static inline int bench_is_c6_only_mode(void) {
|
||||
int main(int argc, char** argv){
|
||||
bench_apply_profile();
|
||||
|
||||
int cycles = (argc>1)? atoi(argv[1]) : 10000000; // total ops (10M for steady-state measurement)
|
||||
uint64_t cycles = (argc>1)? (uint64_t)strtoull(argv[1], NULL, 10) : 10000000ull; // total ops (10M for steady-state measurement)
|
||||
int ws = (argc>2)? atoi(argv[2]) : 8192; // working-set slots
|
||||
uint32_t seed = (argc>3)? (uint32_t)strtoul(argv[3],NULL,10) : 1234567u;
|
||||
struct rusage ru0 = {0}, ru1 = {0};
|
||||
@ -132,7 +145,7 @@ int main(int argc, char** argv){
|
||||
max_size = 1024;
|
||||
}
|
||||
|
||||
if (cycles <= 0) cycles = 1;
|
||||
if (cycles == 0) cycles = 1;
|
||||
if (ws <= 0) ws = 1024;
|
||||
|
||||
#ifdef USE_HAKMEM
|
||||
@ -142,6 +155,13 @@ int main(int argc, char** argv){
|
||||
if (prealloc_count > 0) {
|
||||
fprintf(stderr, "[BENCH] BenchFast mode: %d blocks preallocated\n", prealloc_count);
|
||||
}
|
||||
|
||||
// Phase 46A: Pre-initialize unified_cache (must be before alloc hot path)
|
||||
// Remove lazy-init check overhead from unified_cache_push/pop hot paths
|
||||
#if HAKMEM_BENCH_MINIMAL
|
||||
extern void unified_cache_init(void);
|
||||
unified_cache_init(); // Called once at startup (FAST-only)
|
||||
#endif
|
||||
#else
|
||||
// System malloc also needs warmup for fair comparison
|
||||
(void)malloc(1); // Force libc initialization
|
||||
@ -188,7 +208,10 @@ int main(int argc, char** argv){
|
||||
// the working set is insufficient - we need enough iterations to exhaust TLS caches and
|
||||
// force allocation of all SuperSlabs that will be used during the timed loop.
|
||||
const char* prefault_env = getenv("HAKMEM_BENCH_PREFAULT");
|
||||
int prefault_iters = prefault_env ? atoi(prefault_env) : (cycles / 10); // Default: 10% of main loop
|
||||
int prefault_iters = prefault_env ? atoi(prefault_env) : (int)(cycles / 10); // Default: 10% of main loop
|
||||
if (cycles > 0x7fffffffULL) {
|
||||
prefault_iters = prefault_env ? prefault_iters : 0x7fffffff; // clamp default
|
||||
}
|
||||
if (prefault_iters > 0) {
|
||||
fprintf(stderr, "[WARMUP] SuperSlab prefault: %d warmup iterations (not timed)...\n", prefault_iters);
|
||||
uint32_t warmup_seed = seed + 0xDEADBEEF; // Use DIFFERENT seed to avoid RNG sequence interference
|
||||
@ -221,46 +244,63 @@ int main(int argc, char** argv){
|
||||
// Main loop will use original 'seed' variable, ensuring reproducible sequence
|
||||
}
|
||||
|
||||
// Optional epoch mode (single-process soak):
|
||||
// - ENV: HAKMEM_BENCH_EPOCH_ITERS=N (default: 0=disabled)
|
||||
// - Prints per-epoch throughput + current RSS (from /proc) without exiting the process.
|
||||
uint64_t epoch_iters = 0;
|
||||
{
|
||||
const char* e = getenv("HAKMEM_BENCH_EPOCH_ITERS");
|
||||
if (e && *e) {
|
||||
epoch_iters = (uint64_t)strtoull(e, NULL, 10);
|
||||
}
|
||||
}
|
||||
|
||||
uint64_t start = now_ns();
|
||||
int frees = 0, allocs = 0;
|
||||
for (int i=0; i<cycles; i++){
|
||||
if (0 && (i >= 66000 || (i > 28000 && i % 1000 == 0))) { // DISABLED for perf
|
||||
fprintf(stderr, "[TEST] Iteration %d (allocs=%d frees=%d)\n", i, allocs, frees);
|
||||
}
|
||||
uint32_t r = xorshift32(&seed);
|
||||
int idx = (int)(r % (uint32_t)ws);
|
||||
if (slots[idx]){
|
||||
if (0 && i > 28300) { // DISABLED (Phase 2 perf)
|
||||
fprintf(stderr, "[FREE] i=%d ptr=%p idx=%d\n", i, slots[idx], idx);
|
||||
fflush(stderr);
|
||||
}
|
||||
free(slots[idx]);
|
||||
if (0 && i > 28300) { // DISABLED (Phase 2 perf)
|
||||
fprintf(stderr, "[FREE_DONE] i=%d\n", i);
|
||||
fflush(stderr);
|
||||
}
|
||||
slots[idx] = NULL;
|
||||
frees++;
|
||||
uint64_t remaining = cycles;
|
||||
uint64_t epoch_idx = 0;
|
||||
while (remaining > 0) {
|
||||
uint64_t nops = remaining;
|
||||
if (epoch_iters > 0 && epoch_iters < nops) nops = epoch_iters;
|
||||
if (nops > 0x7fffffffULL) nops = 0x7fffffffULL; // keep inner loop int-sized
|
||||
|
||||
uint64_t epoch_start = now_ns();
|
||||
for (int i = 0; i < (int)nops; i++) {
|
||||
uint32_t r = xorshift32(&seed);
|
||||
int idx = (int)(r % (uint32_t)ws);
|
||||
if (slots[idx]) {
|
||||
free(slots[idx]);
|
||||
slots[idx] = NULL;
|
||||
frees++;
|
||||
} else {
|
||||
// 16..1024 bytes (power-of-two-ish skew, thenクランプ)
|
||||
size_t sz = 16u + (r & 0x3FFu); // 16..1040 (approx 16..1024)
|
||||
if (sz < min_size) sz = min_size;
|
||||
if (sz > max_size) sz = max_size;
|
||||
if (0 && i > 28300) { // DISABLED (Phase 2 perf)
|
||||
fprintf(stderr, "[MALLOC] i=%d sz=%zu idx=%d\n", i, sz, idx);
|
||||
fflush(stderr);
|
||||
void* p = malloc(sz);
|
||||
if (!p) continue;
|
||||
((unsigned char*)p)[0] = (unsigned char)r;
|
||||
slots[idx] = p;
|
||||
allocs++;
|
||||
}
|
||||
void* p = malloc(sz);
|
||||
if (0 && i > 28300) { // DISABLED (Phase 2 perf)
|
||||
fprintf(stderr, "[MALLOC_DONE] i=%d p=%p\n", i, p);
|
||||
fflush(stderr);
|
||||
}
|
||||
if (!p) continue;
|
||||
// touch first byte to avoid optimizer artifacts
|
||||
((unsigned char*)p)[0] = (unsigned char)r;
|
||||
slots[idx] = p;
|
||||
allocs++;
|
||||
}
|
||||
uint64_t epoch_end = now_ns();
|
||||
|
||||
if (epoch_iters > 0) {
|
||||
double sec = (double)(epoch_end - epoch_start) / 1e9;
|
||||
double tput = (double)nops / (sec > 0.0 ? sec : 1e-9);
|
||||
long rss_kb = read_rss_kb_current();
|
||||
printf("[EPOCH] %llu Throughput = %9.0f ops/s [iter=%llu ws=%d] time=%.3fs rss_kb=%ld\n",
|
||||
(unsigned long long)epoch_idx,
|
||||
tput,
|
||||
(unsigned long long)nops,
|
||||
ws,
|
||||
sec,
|
||||
rss_kb);
|
||||
fflush(stdout);
|
||||
epoch_idx++;
|
||||
}
|
||||
|
||||
remaining -= nops;
|
||||
}
|
||||
// drain
|
||||
fprintf(stderr, "[TEST] Main loop completed. Starting drain phase...\n");
|
||||
@ -271,7 +311,8 @@ int main(int argc, char** argv){
|
||||
double sec = (double)(end-start)/1e9;
|
||||
double tput = (double)cycles / (sec>0.0?sec:1e-9);
|
||||
// Include params in output to avoid confusion about test conditions
|
||||
printf("Throughput = %9.0f ops/s [iter=%d ws=%d] time=%.3fs\n", tput, cycles, ws, sec);
|
||||
printf("Throughput = %9.0f ops/s [iter=%llu ws=%d] time=%.3fs\n",
|
||||
tput, (unsigned long long)cycles, ws, sec);
|
||||
long rss_kb = ru1.ru_maxrss;
|
||||
fprintf(stderr, "[RSS] max_kb=%ld\n", rss_kb);
|
||||
(void)allocs; (void)frees;
|
||||
|
||||
Reference in New Issue
Block a user