diff --git a/CLAUDE.md b/CLAUDE.md index 82a45d5d..2ab2e88b 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -13,24 +13,44 @@ ## 📊 現在の性能(2025-11-22) -### ベンチマーク結果(Random Mixed 256B, 10M iterations) +### ⚠️ **重要:正しいベンチマーク方法** + +**必ず 10M iterations を使うこと**(steady-state 測定): +```bash +# 正しい方法(10M iterations = デフォルト) +./out/release/bench_random_mixed_hakmem # 引数なしで 10M +./out/release/bench_random_mixed_hakmem 10000000 256 42 + +# 間違った方法(100K = cold-start、3-4倍遅い) +./out/release/bench_random_mixed_hakmem 100000 256 42 # ❌ 使わないこと +``` + +**統計要件**:最低 10 回実行して平均・標準偏差を計算すること + +### ベンチマーク結果(Steady-State, 10M iterations, 10回平均) ``` 🥇 mimalloc: 107.11M ops/s (最速) -🥈 System malloc: 93.87M ops/s (baseline) -🥉 HAKMEM: 65.24M ops/s (System比 69.5%) +🥈 System malloc: 88-94M ops/s (baseline) +🥉 HAKMEM: 58-61M ops/s (System比 62-69%) -HAKMEMの改善: 9.05M → 65.24M ops/s (+621%!) 🚀 +HAKMEMの改善: 9.05M → 60.5M ops/s (+569%!) 🚀 ``` -### 全ベンチマーク比較 +### 全ベンチマーク比較(10回平均) ``` ベンチマーク │ HAKMEM │ System malloc │ mimalloc │ 順位 ------------------+-------------+---------------+--------------+------ -Random Mixed 256B │ 65.24M ops/s│ 93.87M ops/s │ 107.11M ops/s│ 🥉 3位 +Random Mixed 256B │ 58-61M ops/s│ 88-94M ops/s │ 107.11M ops/s│ 🥉 3位 (62-69%) +Larson 1T │ 47.6M ops/s │ 14.2M ops/s │ 16.8M ops/s │ 🥇 1位 (+183-235%) +Larson 8T │ 48.2M ops/s │ - │ - │ ✅ MT安定 Fixed Size 256B │ 41.95M ops/s│ 105.7M ops/s │ - │ ❌ 要改善 Mid-Large 8KB │ 10.74M ops/s│ 7.85M ops/s │ - │ 🥇 1位 (+37%) ``` +**Larsonで圧勝している理由**: +- HAKMEM: Lock-free atomic freelist + Adaptive CAS(CV < 1%の安定性) +- System/mimalloc: Mutex contention が支配的 + ### 🔧 本日の修正と最適化(2025-11-21~22) **バグ修正**: diff --git a/bench_random_mixed.c b/bench_random_mixed.c new file mode 100644 index 00000000..2efb036f --- /dev/null +++ b/bench_random_mixed.c @@ -0,0 +1,182 @@ +// bench_random_mixed.c — Random mixed small allocations (16–1024B) +// Usage (direct-link builds via Makefile): +// ./bench_random_mixed_hakmem [cycles] [ws] [seed] +// ./bench_random_mixed_system [cycles] 
[ws] [seed] +// +// Default: 10M cycles for steady-state measurement (use 100K for quick smoke test) +// Recommended: Run 10 times and calculate mean/median/stddev for accurate results +// +// Prints: "Throughput = %9.0f operations per second, relative time: %.3fs." + +#include +#include +#include +#include +#include + +#ifdef USE_HAKMEM +#include "hakmem.h" + +// Box BenchMeta: Benchmark metadata management (bypass hakmem wrapper) +// Phase 15: Separate BenchMeta (slots array) from CoreAlloc (user workload) +extern void* __libc_calloc(size_t, size_t); +extern void __libc_free(void*); +#define BENCH_META_CALLOC __libc_calloc +#define BENCH_META_FREE __libc_free + +// Phase 20-2: BenchFast mode - prealloc pool init +#include "core/box/bench_fast_box.h" +#else +// System malloc build: use standard libc +#define BENCH_META_CALLOC calloc +#define BENCH_META_FREE free +#endif + +static inline uint64_t now_ns(void) { + struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); + return (uint64_t)ts.tv_sec*1000000000ull + (uint64_t)ts.tv_nsec; +} + +static inline uint32_t xorshift32(uint32_t* s){ + uint32_t x=*s; x^=x<<13; x^=x>>17; x^=x<<5; *s=x; return x; +} + +int main(int argc, char** argv){ + int cycles = (argc>1)? atoi(argv[1]) : 10000000; // total ops (10M for steady-state measurement) + int ws = (argc>2)? atoi(argv[2]) : 8192; // working-set slots + uint32_t seed = (argc>3)? 
(uint32_t)strtoul(argv[3],NULL,10) : 1234567u; + + if (cycles <= 0) cycles = 1; + if (ws <= 0) ws = 1024; + +#ifdef USE_HAKMEM + // Phase 20-2: BenchFast prealloc pool initialization + // Must be called BEFORE main benchmark loop to avoid recursion + int prealloc_count = bench_fast_init(); + if (prealloc_count > 0) { + fprintf(stderr, "[BENCH] BenchFast mode: %d blocks preallocated\n", prealloc_count); + } +#else + // System malloc also needs warmup for fair comparison + (void)malloc(1); // Force libc initialization +#endif + + // Box BenchMeta: Use __libc_calloc to bypass hakmem wrapper + void** slots = (void**)BENCH_META_CALLOC((size_t)ws, sizeof(void*)); + if (!slots) { fprintf(stderr, "alloc failed (slots)\n"); return 1; } + + // Warmup run (exclude from timing) - HAKMEM_BENCH_WARMUP=N + const char* warmup_env = getenv("HAKMEM_BENCH_WARMUP"); + int warmup_cycles = warmup_env ? atoi(warmup_env) : 0; + if (warmup_cycles > 0) { + fprintf(stderr, "[BENCH_WARMUP] Running %d warmup cycles (not timed)...\n", warmup_cycles); + uint32_t warmup_seed = seed; + for (int i=0; i= 66000 || (i > 28000 && i % 1000 == 0))) { // DISABLED for perf + fprintf(stderr, "[TEST] Iteration %d (allocs=%d frees=%d)\n", i, allocs, frees); + } + uint32_t r = xorshift32(&seed); + int idx = (int)(r % (uint32_t)ws); + if (slots[idx]){ + if (0 && i > 28300) { // DISABLED (Phase 2 perf) + fprintf(stderr, "[FREE] i=%d ptr=%p idx=%d\n", i, slots[idx], idx); + fflush(stderr); + } + free(slots[idx]); + if (0 && i > 28300) { // DISABLED (Phase 2 perf) + fprintf(stderr, "[FREE_DONE] i=%d\n", i); + fflush(stderr); + } + slots[idx] = NULL; + frees++; + } else { + // 16..1024 bytes (power-of-two-ish skew) + size_t sz = 16u + (r & 0x3FFu); // 16..1039 (approx 16..1024) + if (0 && i > 28300) { // DISABLED (Phase 2 perf) + fprintf(stderr, "[MALLOC] i=%d sz=%zu idx=%d\n", i, sz, idx); + fflush(stderr); + } + void* p = malloc(sz); + if (0 && i > 28300) { // DISABLED (Phase 2 perf) + fprintf(stderr, 
"[MALLOC_DONE] i=%d p=%p\n", i, p); + fflush(stderr); + } + if (!p) continue; + // touch first byte to avoid optimizer artifacts + ((unsigned char*)p)[0] = (unsigned char)r; + slots[idx] = p; + allocs++; + } + } + // drain + fprintf(stderr, "[TEST] Main loop completed. Starting drain phase...\n"); + for (int i=0;i0.0?sec:1e-9); + printf("Throughput = %9.0f operations per second, relative time: %.3fs.\n", tput, sec); + (void)allocs; (void)frees; + + // Box BenchMeta: Use __libc_free to bypass hakmem wrapper + BENCH_META_FREE(slots); + +#ifdef USE_HAKMEM + // Phase 20-2: Print BenchFast stats (verify pool wasn't exhausted) + bench_fast_stats(); + + // Phase 21-1: Ring cache - DELETED (A/B test: OFF is faster) + // extern void ring_cache_print_stats(void); + // ring_cache_print_stats(); + + // Phase 27: UltraHeap front statistics (experimental, UltraHeap ビルドのみ) + // ENV: HAKMEM_TINY_ULTRA_HEAP_DUMP=1 で出力有効化 + #if HAKMEM_TINY_ULTRA_HEAP + { + const char* dump = getenv("HAKMEM_TINY_ULTRA_HEAP_DUMP"); + if (dump && *dump && *dump != '0') { + extern void tiny_ultra_heap_stats_snapshot(uint64_t hit[8], + uint64_t refill[8], + uint64_t fallback[8], + int reset); + uint64_t hit[8] = {0}, refill[8] = {0}, fallback[8] = {0}; + tiny_ultra_heap_stats_snapshot(hit, refill, fallback, 0); + fprintf(stderr, "[ULTRA_HEAP_STATS] class hit refill fallback\n"); + for (int c = 0; c < 8; c++) { + if (hit[c] || refill[c] || fallback[c]) { + fprintf(stderr, " C%d: %llu %llu %llu\n", + c, + (unsigned long long)hit[c], + (unsigned long long)refill[c], + (unsigned long long)fallback[c]); + } + } + } + } + #endif +#endif + + return 0; +} diff --git a/benchmarks/src/fixed/bench_fixed_size.c b/benchmarks/src/fixed/bench_fixed_size.c new file mode 100644 index 00000000..3a14811d --- /dev/null +++ b/benchmarks/src/fixed/bench_fixed_size.c @@ -0,0 +1,82 @@ +// bench_fixed_size.c — Fixed-size alloc/free microbenchmark +// Usage: +// bench_fixed_size_[hakmem|system] +// +// Recommended for 
steady-state measurement: iterations >= 10M +// Quick smoke test: iterations = 100K +// Example: ./bench_fixed_size_hakmem 10000000 256 128 +// +// Pattern: +// Maintain a ring of pointers; each step frees slot[i%workset] if non-null, +// then allocates and stores it. Measures total ops (=iterations). + +#define _GNU_SOURCE +#include +#include +#include +#include +#include + +#ifdef USE_HAKMEM +# include "hakmem.h" +#endif + +static inline uint64_t now_ns(void) { + struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); + return (uint64_t)ts.tv_sec * 1000000000ull + (uint64_t)ts.tv_nsec; +} + +int main(int argc, char** argv) { + if (argc < 4) { + fprintf(stderr, "Usage: %s \n", argv[0]); + return 2; + } + const int iters = atoi(argv[1]); + const size_t sz = (size_t)atoi(argv[2]); + const int ws = atoi(argv[3]); + if (iters <= 0 || ws <= 0 || sz == 0) { fprintf(stderr, "Invalid args\n"); return 2; } + + void** ring = (void**)calloc((size_t)ws, sizeof(void*)); + if (!ring) { fprintf(stderr, "calloc ring failed\n"); return 1; } + + // Warmup: zero + for (int i = 0; i < ws; i++) ring[i] = NULL; + + uint64_t t0 = now_ns(); + for (int i = 0; i < iters; i++) { + int idx = i % ws; + void* p = ring[idx]; +#ifdef USE_HAKMEM + // HAKMEM path: use hak_free_at / hak_alloc_at directly + if (p) hak_free_at(p, sz, (uintptr_t)0xF00); + ring[idx] = hak_alloc_at(sz, (uintptr_t)0xF00); + if (!ring[idx]) { fprintf(stderr, "hak_alloc_at failed at %d\n", i); break; } +#else + // System path: use libc malloc/free + if (p) free(p); + ring[idx] = malloc(sz); + if (!ring[idx]) { fprintf(stderr, "malloc failed at %d\n", i); break; } +#endif + } + uint64_t t1 = now_ns(); + + // Cleanup +#ifdef USE_HAKMEM + for (int i = 0; i < ws; i++) { if (ring[i]) hak_free_at(ring[i], sz, (uintptr_t)0xF00); } + // Don't free ring itself (allocated with calloc, not HAKMEM) +#else + for (int i = 0; i < ws; i++) { if (ring[i]) free(ring[i]); } + free(ring); +#endif + + double secs = (double)(t1 - t0) / 1e9; + 
double ops_s = (double)iters / (secs > 0 ? secs : 1e-9); + printf("Throughput = %10.0f operations per second, relative time: %.3fs.\n", ops_s, secs); + +#ifdef USE_HAKMEM + // Stats (if any future stats functions exist) +#endif + + return 0; +} +