Files
hakmem/bench_random_mixed.c
Moe Charm (CI) 093f362231 Add Page Box layer for C7 class optimization
- Implement tiny_page_box.c/h: per-thread page cache between UC and Shared Pool
- Integrate Page Box into Unified Cache refill path
- Remove legacy SuperSlab implementation (merged into smallmid)
- Add HAKMEM_TINY_PAGE_BOX_CLASSES env var for selective class enabling
- Update bench_random_mixed.c with Page Box statistics

Current status: Implementation safe, no regressions.
Page Box ON/OFF shows minimal difference - pool strategy needs tuning.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-05 15:31:44 +09:00

273 lines
9.9 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// bench_random_mixed.c — Random mixed small allocations (161024B)
// Usage (direct-link builds via Makefile):
// ./bench_random_mixed_hakmem [cycles] [ws] [seed]
// ./bench_random_mixed_system [cycles] [ws] [seed]
//
// Default: 10M cycles for steady-state measurement (use 100K for quick smoke test)
// Recommended: Run 10 times and calculate mean/median/stddev for accurate results
//
// Prints: "Throughput = <ops/s> operations per second, relative time: <s>."
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <string.h>
#ifdef USE_HAKMEM
#include "hakmem.h"
// Box BenchMeta: Benchmark metadata management (bypass hakmem wrapper)
// Phase 15: Separate BenchMeta (slots array) from CoreAlloc (user workload)
extern void* __libc_calloc(size_t, size_t);
extern void __libc_free(void*);
#define BENCH_META_CALLOC __libc_calloc
#define BENCH_META_FREE __libc_free
// Phase 20-2: BenchFast mode - prealloc pool init
#include "core/box/bench_fast_box.h"
#else
// System malloc build: use standard libc
#define BENCH_META_CALLOC calloc
#define BENCH_META_FREE free
#endif
static inline uint64_t now_ns(void) {
struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts);
return (uint64_t)ts.tv_sec*1000000000ull + (uint64_t)ts.tv_nsec;
}
static inline uint32_t xorshift32(uint32_t* s){
uint32_t x=*s; x^=x<<13; x^=x>>17; x^=x<<5; *s=x; return x;
}
int main(int argc, char** argv){
int cycles = (argc>1)? atoi(argv[1]) : 10000000; // total ops (10M for steady-state measurement)
int ws = (argc>2)? atoi(argv[2]) : 8192; // working-set slots
uint32_t seed = (argc>3)? (uint32_t)strtoul(argv[3],NULL,10) : 1234567u;
// サイズレンジTiny-only / Non-Tiny-only の比較用)
// 既定: 16..1040 bytes元の挙動と同等
size_t min_size = 16u;
size_t max_size = 16u + 0x3FFu; // 16..1040 ≒ 16..1024
// 優先順位: argv[4]/argv[5] → ENV → 既定
if (argc > 4) {
long v = atol(argv[4]);
if (v > 0) min_size = (size_t)v;
} else {
const char* e = getenv("HAKMEM_BENCH_MIN_SIZE");
if (e && *e) {
long v = atol(e);
if (v > 0) min_size = (size_t)v;
}
}
if (argc > 5) {
long v = atol(argv[5]);
if (v > 0) max_size = (size_t)v;
} else {
const char* e = getenv("HAKMEM_BENCH_MAX_SIZE");
if (e && *e) {
long v = atol(e);
if (v > 0) max_size = (size_t)v;
}
}
if (min_size < 1) min_size = 1;
if (max_size < min_size) max_size = min_size;
if (cycles <= 0) cycles = 1;
if (ws <= 0) ws = 1024;
#ifdef USE_HAKMEM
// Phase 20-2: BenchFast prealloc pool initialization
// Must be called BEFORE main benchmark loop to avoid recursion
int prealloc_count = bench_fast_init();
if (prealloc_count > 0) {
fprintf(stderr, "[BENCH] BenchFast mode: %d blocks preallocated\n", prealloc_count);
}
#else
// System malloc also needs warmup for fair comparison
(void)malloc(1); // Force libc initialization
#endif
// Box BenchMeta: Use __libc_calloc to bypass hakmem wrapper
void** slots = (void**)BENCH_META_CALLOC((size_t)ws, sizeof(void*));
if (!slots) { fprintf(stderr, "alloc failed (slots)\n"); return 1; }
// Warmup run (exclude from timing) - HAKMEM_BENCH_WARMUP=N
const char* warmup_env = getenv("HAKMEM_BENCH_WARMUP");
int warmup_cycles = warmup_env ? atoi(warmup_env) : 0;
if (warmup_cycles > 0) {
fprintf(stderr, "[BENCH_WARMUP] Running %d warmup cycles (not timed)...\n", warmup_cycles);
uint32_t warmup_seed = seed;
for (int i=0; i<warmup_cycles; i++){
uint32_t r = xorshift32(&warmup_seed);
int idx = (int)(r % (uint32_t)ws);
if (slots[idx]){
free(slots[idx]);
slots[idx] = NULL;
} else {
size_t sz = 16u + (r & 0x3FFu);
if (sz < min_size) sz = min_size;
if (sz > max_size) sz = max_size;
void* p = malloc(sz);
if (p) {
((unsigned char*)p)[0] = (unsigned char)r;
slots[idx] = p;
}
}
}
// Drain warmup allocations
for (int i=0;i<ws;i++){ if (slots[i]) { free(slots[i]); slots[i]=NULL; } }
fprintf(stderr, "[BENCH_WARMUP] Warmup completed. Starting timed run...\n");
}
// SuperSlab Prefault Phase: Pre-allocate SuperSlabs BEFORE timing starts
// Purpose: Trigger ALL page faults during warmup (cold path) instead of during timed loop (hot path)
// Strategy: Run warmup iterations matching the actual benchmark workload
// Expected: This eliminates ~132K page faults from timed section -> 2-4x throughput improvement
//
// Key insight: Page faults occur when allocating from NEW SuperSlabs. A single pass through
// the working set is insufficient - we need enough iterations to exhaust TLS caches and
// force allocation of all SuperSlabs that will be used during the timed loop.
const char* prefault_env = getenv("HAKMEM_BENCH_PREFAULT");
int prefault_iters = prefault_env ? atoi(prefault_env) : (cycles / 10); // Default: 10% of main loop
if (prefault_iters > 0) {
fprintf(stderr, "[WARMUP] SuperSlab prefault: %d warmup iterations (not timed)...\n", prefault_iters);
uint32_t warmup_seed = seed + 0xDEADBEEF; // Use DIFFERENT seed to avoid RNG sequence interference
int warmup_allocs = 0, warmup_frees = 0;
// Run same workload as main loop, but don't time it
for (int i = 0; i < prefault_iters; i++) {
uint32_t r = xorshift32(&warmup_seed);
int idx = (int)(r % (uint32_t)ws);
if (slots[idx]) {
free(slots[idx]);
slots[idx] = NULL;
warmup_frees++;
} else {
size_t sz = 16u + (r & 0x3FFu); // 16..1040 bytes後段でクランプ
if (sz < min_size) sz = min_size;
if (sz > max_size) sz = max_size;
void* p = malloc(sz);
if (p) {
((unsigned char*)p)[0] = (unsigned char)r;
slots[idx] = p;
warmup_allocs++;
}
}
}
fprintf(stderr, "[WARMUP] Complete. Allocated=%d Freed=%d SuperSlabs populated.\n\n",
warmup_allocs, warmup_frees);
// Main loop will use original 'seed' variable, ensuring reproducible sequence
}
uint64_t start = now_ns();
int frees = 0, allocs = 0;
for (int i=0; i<cycles; i++){
if (0 && (i >= 66000 || (i > 28000 && i % 1000 == 0))) { // DISABLED for perf
fprintf(stderr, "[TEST] Iteration %d (allocs=%d frees=%d)\n", i, allocs, frees);
}
uint32_t r = xorshift32(&seed);
int idx = (int)(r % (uint32_t)ws);
if (slots[idx]){
if (0 && i > 28300) { // DISABLED (Phase 2 perf)
fprintf(stderr, "[FREE] i=%d ptr=%p idx=%d\n", i, slots[idx], idx);
fflush(stderr);
}
free(slots[idx]);
if (0 && i > 28300) { // DISABLED (Phase 2 perf)
fprintf(stderr, "[FREE_DONE] i=%d\n", i);
fflush(stderr);
}
slots[idx] = NULL;
frees++;
} else {
// 16..1024 bytes (power-of-two-ish skew, thenクランプ)
size_t sz = 16u + (r & 0x3FFu); // 16..1040 (approx 16..1024)
if (sz < min_size) sz = min_size;
if (sz > max_size) sz = max_size;
if (0 && i > 28300) { // DISABLED (Phase 2 perf)
fprintf(stderr, "[MALLOC] i=%d sz=%zu idx=%d\n", i, sz, idx);
fflush(stderr);
}
void* p = malloc(sz);
if (0 && i > 28300) { // DISABLED (Phase 2 perf)
fprintf(stderr, "[MALLOC_DONE] i=%d p=%p\n", i, p);
fflush(stderr);
}
if (!p) continue;
// touch first byte to avoid optimizer artifacts
((unsigned char*)p)[0] = (unsigned char)r;
slots[idx] = p;
allocs++;
}
}
// drain
fprintf(stderr, "[TEST] Main loop completed. Starting drain phase...\n");
for (int i=0;i<ws;i++){ if (slots[i]) { free(slots[i]); slots[i]=NULL; } }
fprintf(stderr, "[TEST] Drain phase completed.\n");
uint64_t end = now_ns();
double sec = (double)(end-start)/1e9;
double tput = (double)cycles / (sec>0.0?sec:1e-9);
// Include params in output to avoid confusion about test conditions
printf("Throughput = %9.0f ops/s [iter=%d ws=%d] time=%.3fs\n", tput, cycles, ws, sec);
(void)allocs; (void)frees;
// Box BenchMeta: Use __libc_free to bypass hakmem wrapper
BENCH_META_FREE(slots);
#ifdef USE_HAKMEM
// Phase 20-2: Print BenchFast stats (verify pool wasn't exhausted)
bench_fast_stats();
// Production Performance Measurements (ENV-gated: HAKMEM_MEASURE_UNIFIED_CACHE=1)
extern void unified_cache_print_measurements(void);
extern void tls_sll_print_measurements(void);
extern void shared_pool_print_measurements(void);
unified_cache_print_measurements();
tls_sll_print_measurements();
shared_pool_print_measurements();
// Warm Pool Stats (ENV-gated: HAKMEM_WARM_POOL_STATS=1)
extern void tiny_warm_pool_print_stats_public(void);
tiny_warm_pool_print_stats_public();
// Phase 21-1: Ring cache - DELETED (A/B test: OFF is faster)
// extern void ring_cache_print_stats(void);
// ring_cache_print_stats();
// Phase 27: UltraHeap front statistics (experimental, UltraHeap ビルドのみ)
// ENV: HAKMEM_TINY_ULTRA_HEAP_DUMP=1 で出力有効化
#if HAKMEM_TINY_ULTRA_HEAP
{
const char* dump = getenv("HAKMEM_TINY_ULTRA_HEAP_DUMP");
if (dump && *dump && *dump != '0') {
extern void tiny_ultra_heap_stats_snapshot(uint64_t hit[8],
uint64_t refill[8],
uint64_t fallback[8],
int reset);
uint64_t hit[8] = {0}, refill[8] = {0}, fallback[8] = {0};
tiny_ultra_heap_stats_snapshot(hit, refill, fallback, 0);
fprintf(stderr, "[ULTRA_HEAP_STATS] class hit refill fallback\n");
for (int c = 0; c < 8; c++) {
if (hit[c] || refill[c] || fallback[c]) {
fprintf(stderr, " C%d: %llu %llu %llu\n",
c,
(unsigned long long)hit[c],
(unsigned long long)refill[c],
(unsigned long long)fallback[c]);
}
}
}
}
#endif
#endif
return 0;
}