// bench_random_mixed.c — Random mixed small allocations (16–1024B) // Usage (direct-link builds via Makefile): // ./bench_random_mixed_hakmem [cycles] [ws] [seed] // ./bench_random_mixed_system [cycles] [ws] [seed] // // Default: 10M cycles for steady-state measurement (use 100K for quick smoke test) // Recommended: Run 10 times and calculate mean/median/stddev for accurate results // // Prints: "Throughput = operations per second, relative time: ." #include #include #include #include #include #ifdef USE_HAKMEM #include "hakmem.h" // Box BenchMeta: Benchmark metadata management (bypass hakmem wrapper) // Phase 15: Separate BenchMeta (slots array) from CoreAlloc (user workload) extern void* __libc_calloc(size_t, size_t); extern void __libc_free(void*); #define BENCH_META_CALLOC __libc_calloc #define BENCH_META_FREE __libc_free // Phase 20-2: BenchFast mode - prealloc pool init #include "core/box/bench_fast_box.h" #else // System malloc build: use standard libc #define BENCH_META_CALLOC calloc #define BENCH_META_FREE free #endif static inline uint64_t now_ns(void) { struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts); return (uint64_t)ts.tv_sec*1000000000ull + (uint64_t)ts.tv_nsec; } static inline uint32_t xorshift32(uint32_t* s){ uint32_t x=*s; x^=x<<13; x^=x>>17; x^=x<<5; *s=x; return x; } int main(int argc, char** argv){ int cycles = (argc>1)? atoi(argv[1]) : 10000000; // total ops (10M for steady-state measurement) int ws = (argc>2)? atoi(argv[2]) : 8192; // working-set slots uint32_t seed = (argc>3)? (uint32_t)strtoul(argv[3],NULL,10) : 1234567u; // サイズレンジ(Tiny-only / Non-Tiny-only の比較用) // 既定: 16..1040 bytes(元の挙動と同等) size_t min_size = 16u; size_t max_size = 16u + 0x3FFu; // 16..1040 ≒ 16..1024 // 優先順位: argv[4]/argv[5] → ENV → 既定 if (argc > 4) { long v = atol(argv[4]); if (v > 0) min_size = (size_t)v; } else { const char* e = getenv("HAKMEM_BENCH_MIN_SIZE"); if (e && *e) { long v = atol(e); if (v > 0) min_size = (size_t)v; } } if (argc > 5) { long v = atol(argv[5]); if (v > 0) max_size = (size_t)v; } else { const char* e = getenv("HAKMEM_BENCH_MAX_SIZE"); if (e && *e) { long v = atol(e); if (v > 0) max_size = (size_t)v; } } if (min_size < 1) min_size = 1; if (max_size < min_size) max_size = min_size; if (cycles <= 0) cycles = 1; if (ws <= 0) ws = 1024; #ifdef USE_HAKMEM // Phase 20-2: BenchFast prealloc pool initialization // Must be called BEFORE main benchmark loop to avoid recursion int prealloc_count = bench_fast_init(); if (prealloc_count > 0) { fprintf(stderr, "[BENCH] BenchFast mode: %d blocks preallocated\n", prealloc_count); } #else // System malloc also needs warmup for fair comparison (void)malloc(1); // Force libc initialization #endif // Box BenchMeta: Use __libc_calloc to bypass hakmem wrapper void** slots = (void**)BENCH_META_CALLOC((size_t)ws, sizeof(void*)); if (!slots) { fprintf(stderr, "alloc failed (slots)\n"); return 1; } // Warmup run (exclude from timing) - HAKMEM_BENCH_WARMUP=N const char* warmup_env = getenv("HAKMEM_BENCH_WARMUP"); int warmup_cycles = warmup_env ? atoi(warmup_env) : 0; if (warmup_cycles > 0) { fprintf(stderr, "[BENCH_WARMUP] Running %d warmup cycles (not timed)...\n", warmup_cycles); uint32_t warmup_seed = seed; for (int i=0; i max_size) sz = max_size; void* p = malloc(sz); if (p) { ((unsigned char*)p)[0] = (unsigned char)r; slots[idx] = p; } } } // Drain warmup allocations for (int i=0;i 2-4x throughput improvement // // Key insight: Page faults occur when allocating from NEW SuperSlabs. A single pass through // the working set is insufficient - we need enough iterations to exhaust TLS caches and // force allocation of all SuperSlabs that will be used during the timed loop. const char* prefault_env = getenv("HAKMEM_BENCH_PREFAULT"); int prefault_iters = prefault_env ? atoi(prefault_env) : (cycles / 10); // Default: 10% of main loop if (prefault_iters > 0) { fprintf(stderr, "[WARMUP] SuperSlab prefault: %d warmup iterations (not timed)...\n", prefault_iters); uint32_t warmup_seed = seed + 0xDEADBEEF; // Use DIFFERENT seed to avoid RNG sequence interference int warmup_allocs = 0, warmup_frees = 0; // Run same workload as main loop, but don't time it for (int i = 0; i < prefault_iters; i++) { uint32_t r = xorshift32(&warmup_seed); int idx = (int)(r % (uint32_t)ws); if (slots[idx]) { free(slots[idx]); slots[idx] = NULL; warmup_frees++; } else { size_t sz = 16u + (r & 0x3FFu); // 16..1040 bytes(後段でクランプ) if (sz < min_size) sz = min_size; if (sz > max_size) sz = max_size; void* p = malloc(sz); if (p) { ((unsigned char*)p)[0] = (unsigned char)r; slots[idx] = p; warmup_allocs++; } } } fprintf(stderr, "[WARMUP] Complete. Allocated=%d Freed=%d SuperSlabs populated.\n\n", warmup_allocs, warmup_frees); // Main loop will use original 'seed' variable, ensuring reproducible sequence } uint64_t start = now_ns(); int frees = 0, allocs = 0; for (int i=0; i= 66000 || (i > 28000 && i % 1000 == 0))) { // DISABLED for perf fprintf(stderr, "[TEST] Iteration %d (allocs=%d frees=%d)\n", i, allocs, frees); } uint32_t r = xorshift32(&seed); int idx = (int)(r % (uint32_t)ws); if (slots[idx]){ if (0 && i > 28300) { // DISABLED (Phase 2 perf) fprintf(stderr, "[FREE] i=%d ptr=%p idx=%d\n", i, slots[idx], idx); fflush(stderr); } free(slots[idx]); if (0 && i > 28300) { // DISABLED (Phase 2 perf) fprintf(stderr, "[FREE_DONE] i=%d\n", i); fflush(stderr); } slots[idx] = NULL; frees++; } else { // 16..1024 bytes (power-of-two-ish skew, thenクランプ) size_t sz = 16u + (r & 0x3FFu); // 16..1040 (approx 16..1024) if (sz < min_size) sz = min_size; if (sz > max_size) sz = max_size; if (0 && i > 28300) { // DISABLED (Phase 2 perf) fprintf(stderr, "[MALLOC] i=%d sz=%zu idx=%d\n", i, sz, idx); fflush(stderr); } void* p = malloc(sz); if (0 && i > 28300) { // DISABLED (Phase 2 perf) fprintf(stderr, "[MALLOC_DONE] i=%d p=%p\n", i, p); fflush(stderr); } if (!p) continue; // touch first byte to avoid optimizer artifacts ((unsigned char*)p)[0] = (unsigned char)r; slots[idx] = p; allocs++; } } // drain fprintf(stderr, "[TEST] Main loop completed. Starting drain phase...\n"); for (int i=0;i0.0?sec:1e-9); // Include params in output to avoid confusion about test conditions printf("Throughput = %9.0f ops/s [iter=%d ws=%d] time=%.3fs\n", tput, cycles, ws, sec); (void)allocs; (void)frees; // Box BenchMeta: Use __libc_free to bypass hakmem wrapper BENCH_META_FREE(slots); #ifdef USE_HAKMEM // Phase 20-2: Print BenchFast stats (verify pool wasn't exhausted) bench_fast_stats(); // Production Performance Measurements (ENV-gated: HAKMEM_MEASURE_UNIFIED_CACHE=1) extern void unified_cache_print_measurements(void); extern void tls_sll_print_measurements(void); extern void shared_pool_print_measurements(void); unified_cache_print_measurements(); tls_sll_print_measurements(); shared_pool_print_measurements(); // Warm Pool Stats (ENV-gated: HAKMEM_WARM_POOL_STATS=1) extern void tiny_warm_pool_print_stats_public(void); tiny_warm_pool_print_stats_public(); // Phase 21-1: Ring cache - DELETED (A/B test: OFF is faster) // extern void ring_cache_print_stats(void); // ring_cache_print_stats(); // Phase 27: UltraHeap front statistics (experimental, UltraHeap ビルドのみ) // ENV: HAKMEM_TINY_ULTRA_HEAP_DUMP=1 で出力有効化 #if HAKMEM_TINY_ULTRA_HEAP { const char* dump = getenv("HAKMEM_TINY_ULTRA_HEAP_DUMP"); if (dump && *dump && *dump != '0') { extern void tiny_ultra_heap_stats_snapshot(uint64_t hit[8], uint64_t refill[8], uint64_t fallback[8], int reset); uint64_t hit[8] = {0}, refill[8] = {0}, fallback[8] = {0}; tiny_ultra_heap_stats_snapshot(hit, refill, fallback, 0); fprintf(stderr, "[ULTRA_HEAP_STATS] class hit refill fallback\n"); for (int c = 0; c < 8; c++) { if (hit[c] || refill[c] || fallback[c]) { fprintf(stderr, " C%d: %llu %llu %llu\n", c, (unsigned long long)hit[c], (unsigned long long)refill[c], (unsigned long long)fallback[c]); } } } } #endif #endif return 0; }