// bench_random_mixed.c — Random mixed small allocations (16-1024B)
// Usage (direct-link builds via Makefile):
// ./bench_random_mixed_hakmem [cycles] [ws] [seed]
// ./bench_random_mixed_system [cycles] [ws] [seed]
//
// Default: 10M cycles for steady-state measurement (use 100K for quick smoke test)
// Recommended: Run 10 times and calculate mean/median/stddev for accurate results
//
// Prints: "Throughput = <N> ops/s [iter=<cycles> ws=<ws>] time=<sec>s"
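//
// Example invocations (illustrative only; all values are placeholders):
//   ./bench_random_mixed_hakmem 10000000 8192 42             # 10M ops, 8192-slot working set, seed 42
//   HAKMEM_BENCH_WARMUP=100000 ./bench_random_mixed_hakmem    # untimed warmup pass before the run
//   HAKMEM_BENCH_MIN_SIZE=256 HAKMEM_BENCH_MAX_SIZE=256 ./bench_random_mixed_system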
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <string.h>
#include <strings.h>
#include <stdatomic.h>
#include <sys/resource.h>
#include <unistd.h>
#include "core/bench_profile.h"
#ifdef USE_HAKMEM
#include "hakmem.h"
#include "hakmem_build_flags.h"
#include "core/box/c7_meta_used_counter_box.h"
#include "core/box/tiny_class_stats_box.h"
#include "core/box/tiny_class_policy_box.h"
#include "core/box/ss_stats_box.h"
#include "core/box/warm_pool_rel_counters_box.h"
#include "core/box/tiny_mem_stats_box.h"
#include "core/box/tiny_inline_slots_overflow_stats_box.h"
// Box BenchMeta: Benchmark metadata management (bypass hakmem wrapper)
// Phase 15: Separate BenchMeta (slots array) from CoreAlloc (user workload)
extern void* __libc_calloc(size_t, size_t);
extern void __libc_free(void*);
#define BENCH_META_CALLOC __libc_calloc
#define BENCH_META_FREE __libc_free
// Phase 20-2: BenchFast mode - prealloc pool init
#include "core/box/bench_fast_box.h"
#else
// System malloc build: use standard libc
#define BENCH_META_CALLOC calloc
#define BENCH_META_FREE free
#endif
static inline uint64_t now_ns(void) {
struct timespec ts; clock_gettime(CLOCK_MONOTONIC, &ts);
return (uint64_t)ts.tv_sec*1000000000ull + (uint64_t)ts.tv_nsec;
}
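// xorshift32: Marsaglia-style xorshift PRNG (13/17/5 shifts), full period 2^32 - 1
// over non-zero states. The state must never be seeded with zero or the sequence
// sticks at zero; the default seed below (1234567) avoids this.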
static inline uint32_t xorshift32(uint32_t* s){
uint32_t x=*s; x^=x<<13; x^=x>>17; x^=x<<5; *s=x; return x;
}
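// Reads current RSS from /proc/self/statm; the first two fields are total program
// size and resident set size, both counted in pages.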
static inline long read_rss_kb_current(void) {
FILE* f = fopen("/proc/self/statm", "r");
if (!f) return 0;
unsigned long size_pages = 0, rss_pages = 0;
int n = fscanf(f, "%lu %lu", &size_pages, &rss_pages);
fclose(f);
if (n != 2) return 0;
long page_size = sysconf(_SC_PAGESIZE);
if (page_size <= 0) return 0;
return (long)((rss_pages * (unsigned long)page_size) / 1024ul);
}
// Debug helper: C7-only benchmark mode (ENV: HAKMEM_BENCH_C7_ONLY=1)
static int bench_mode_c7_only = -1;
static inline int bench_is_c7_only_mode(void) {
if (bench_mode_c7_only == -1) {
const char* e = getenv("HAKMEM_BENCH_C7_ONLY");
bench_mode_c7_only = (e && *e && *e != '0') ? 1 : 0;
}
return bench_mode_c7_only;
}
// C5/C6-only benchmark modes (ENV: HAKMEM_BENCH_C5_ONLY / HAKMEM_BENCH_C6_ONLY)
static int bench_mode_c5_only = -1;
static int bench_mode_c6_only = -1;
static inline int bench_is_c5_only_mode(void) {
if (bench_mode_c5_only == -1) {
const char* e = getenv("HAKMEM_BENCH_C5_ONLY");
bench_mode_c5_only = (e && *e && *e != '0') ? 1 : 0;
}
return bench_mode_c5_only;
}
static inline int bench_is_c6_only_mode(void) {
if (bench_mode_c6_only == -1) {
const char* e = getenv("HAKMEM_BENCH_C6_ONLY");
bench_mode_c6_only = (e && *e && *e != '0') ? 1 : 0;
}
return bench_mode_c6_only;
}
int main(int argc, char** argv){
bench_apply_profile();
uint64_t cycles = (argc>1)? (uint64_t)strtoull(argv[1], NULL, 10) : 10000000ull; // total ops (10M for steady-state measurement)
int ws = (argc>2)? atoi(argv[2]) : 8192; // working-set slots
uint32_t seed = (argc>3)? (uint32_t)strtoul(argv[3],NULL,10) : 1234567u;
struct rusage ru0 = {0}, ru1 = {0};
getrusage(RUSAGE_SELF, &ru0);
// Size range (for comparing Tiny-only vs Non-Tiny-only runs)
// Default: 16..1039 bytes (equivalent to the original behavior)
size_t min_size = 16u;
size_t max_size = 16u + 0x3FFu; // 16..1039 ≈ 16..1024
// Priority: argv[4]/argv[5] → ENV → default
if (argc > 4) {
long v = atol(argv[4]);
if (v > 0) min_size = (size_t)v;
} else {
const char* e = getenv("HAKMEM_BENCH_MIN_SIZE");
if (e && *e) {
long v = atol(e);
if (v > 0) min_size = (size_t)v;
}
}
if (argc > 5) {
long v = atol(argv[5]);
if (v > 0) max_size = (size_t)v;
} else {
const char* e = getenv("HAKMEM_BENCH_MAX_SIZE");
if (e && *e) {
long v = atol(e);
if (v > 0) max_size = (size_t)v;
}
}
if (min_size < 1) min_size = 1;
if (max_size < min_size) max_size = min_size;
// C5/C6/C7-only modes: pin the size to each class band
if (bench_is_c5_only_mode()) {
min_size = 256;
max_size = 256;
} else if (bench_is_c6_only_mode()) {
min_size = 512;
max_size = 512;
} else if (bench_is_c7_only_mode()) {
min_size = 1024;
max_size = 1024;
}
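// (Note: 256/512/1024 B presumably correspond to the C5/C6/C7 tiny size classes;
// the mapping is inferred from the env-variable names above.)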
if (cycles == 0) cycles = 1;
if (ws <= 0) ws = 1024;
#ifdef USE_HAKMEM
// Phase 20-2: BenchFast prealloc pool initialization
// Must be called BEFORE main benchmark loop to avoid recursion
int prealloc_count = bench_fast_init();
if (prealloc_count > 0) {
fprintf(stderr, "[BENCH] BenchFast mode: %d blocks preallocated\n", prealloc_count);
}
// Phase 46A: Pre-initialize unified_cache (must be before alloc hot path)
// Remove lazy-init check overhead from unified_cache_push/pop hot paths
#if HAKMEM_BENCH_MINIMAL
extern void unified_cache_init(void);
unified_cache_init(); // Called once at startup (FAST-only)
#endif
#else
// System malloc also needs warmup for fair comparison
(void)malloc(1); // Force libc initialization
#endif
// Box BenchMeta: Use __libc_calloc to bypass hakmem wrapper
void** slots = (void**)BENCH_META_CALLOC((size_t)ws, sizeof(void*));
if (!slots) { fprintf(stderr, "alloc failed (slots)\n"); return 1; }
// Warmup run (exclude from timing) - HAKMEM_BENCH_WARMUP=N
const char* warmup_env = getenv("HAKMEM_BENCH_WARMUP");
int warmup_cycles = warmup_env ? atoi(warmup_env) : 0;
if (warmup_cycles > 0) {
fprintf(stderr, "[BENCH_WARMUP] Running %d warmup cycles (not timed)...\n", warmup_cycles);
uint32_t warmup_seed = seed;
for (int i=0; i<warmup_cycles; i++){
uint32_t r = xorshift32(&warmup_seed);
int idx = (int)(r % (uint32_t)ws);
if (slots[idx]){
free(slots[idx]);
slots[idx] = NULL;
} else {
size_t sz = 16u + (r & 0x3FFu);
if (sz < min_size) sz = min_size;
if (sz > max_size) sz = max_size;
void* p = malloc(sz);
if (p) {
((unsigned char*)p)[0] = (unsigned char)r;
slots[idx] = p;
}
}
}
// Drain warmup allocations
for (int i=0;i<ws;i++){ if (slots[i]) { free(slots[i]); slots[i]=NULL; } }
fprintf(stderr, "[BENCH_WARMUP] Warmup completed. Starting timed run...\n");
}
// SuperSlab Prefault Phase: Pre-allocate SuperSlabs BEFORE timing starts
// Purpose: Trigger ALL page faults during warmup (cold path) instead of during timed loop (hot path)
// Strategy: Run warmup iterations matching the actual benchmark workload
// Expected: This eliminates ~132K page faults from timed section -> 2-4x throughput improvement
//
// Key insight: Page faults occur when allocating from NEW SuperSlabs. A single pass through
// the working set is insufficient - we need enough iterations to exhaust TLS caches and
// force allocation of all SuperSlabs that will be used during the timed loop.
const char* prefault_env = getenv("HAKMEM_BENCH_PREFAULT");
int prefault_iters = prefault_env ? atoi(prefault_env) : (int)(cycles / 10); // Default: 10% of main loop
if (cycles > 0x7fffffffULL) {
prefault_iters = prefault_env ? prefault_iters : 0x7fffffff; // clamp default
}
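// (With the default 10M cycles this yields a 1M-iteration prefault phase.)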
if (prefault_iters > 0) {
fprintf(stderr, "[WARMUP] SuperSlab prefault: %d warmup iterations (not timed)...\n", prefault_iters);
uint32_t warmup_seed = seed + 0xDEADBEEF; // Use DIFFERENT seed to avoid RNG sequence interference
int warmup_allocs = 0, warmup_frees = 0;
// Run same workload as main loop, but don't time it
for (int i = 0; i < prefault_iters; i++) {
uint32_t r = xorshift32(&warmup_seed);
int idx = (int)(r % (uint32_t)ws);
if (slots[idx]) {
free(slots[idx]);
slots[idx] = NULL;
warmup_frees++;
} else {
size_t sz = 16u + (r & 0x3FFu); // 16..1039 bytes (clamped to [min_size, max_size] below)
if (sz < min_size) sz = min_size;
if (sz > max_size) sz = max_size;
void* p = malloc(sz);
if (p) {
((unsigned char*)p)[0] = (unsigned char)r;
slots[idx] = p;
warmup_allocs++;
}
}
}
fprintf(stderr, "[WARMUP] Complete. Allocated=%d Freed=%d SuperSlabs populated.\n\n",
warmup_allocs, warmup_frees);
// Main loop will use original 'seed' variable, ensuring reproducible sequence
}
// Optional epoch mode (single-process soak):
// - ENV: HAKMEM_BENCH_EPOCH_ITERS=N (default: 0=disabled)
// - Prints per-epoch throughput + current RSS (from /proc) without exiting the process.
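// Example: HAKMEM_BENCH_EPOCH_ITERS=1000000 splits the default 10M-cycle run into
// 10 reported epochs (illustrative value).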
uint64_t epoch_iters = 0;
{
const char* e = getenv("HAKMEM_BENCH_EPOCH_ITERS");
if (e && *e) {
epoch_iters = (uint64_t)strtoull(e, NULL, 10);
}
}
uint64_t start = now_ns();
int frees = 0, allocs = 0;
uint64_t remaining = cycles;
uint64_t epoch_idx = 0;
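// Timed loop: each op picks a random slot; if it is occupied, free it, otherwise
// allocate a random-sized block into it. Once the working set fills, this settles
// into an approximately 50/50 alloc/free mix.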
while (remaining > 0) {
uint64_t nops = remaining;
if (epoch_iters > 0 && epoch_iters < nops) nops = epoch_iters;
if (nops > 0x7fffffffULL) nops = 0x7fffffffULL; // keep inner loop int-sized
uint64_t epoch_start = now_ns();
for (int i = 0; i < (int)nops; i++) {
uint32_t r = xorshift32(&seed);
int idx = (int)(r % (uint32_t)ws);
if (slots[idx]) {
free(slots[idx]);
slots[idx] = NULL;
frees++;
} else {
size_t sz = 16u + (r & 0x3FFu); // 16..1039 (approx 16..1024)
if (sz < min_size) sz = min_size;
if (sz > max_size) sz = max_size;
void* p = malloc(sz);
if (!p) continue;
((unsigned char*)p)[0] = (unsigned char)r;
slots[idx] = p;
allocs++;
}
}
uint64_t epoch_end = now_ns();
if (epoch_iters > 0) {
double sec = (double)(epoch_end - epoch_start) / 1e9;
double tput = (double)nops / (sec > 0.0 ? sec : 1e-9);
long rss_kb = read_rss_kb_current();
printf("[EPOCH] %llu Throughput = %9.0f ops/s [iter=%llu ws=%d] time=%.3fs rss_kb=%ld\n",
(unsigned long long)epoch_idx,
tput,
(unsigned long long)nops,
ws,
sec,
rss_kb);
fflush(stdout);
epoch_idx++;
}
remaining -= nops;
}
// drain
fprintf(stderr, "[TEST] Main loop completed. Starting drain phase...\n");
for (int i=0;i<ws;i++){ if (slots[i]) { free(slots[i]); slots[i]=NULL; } }
fprintf(stderr, "[TEST] Drain phase completed.\n");
uint64_t end = now_ns();
getrusage(RUSAGE_SELF, &ru1);
double sec = (double)(end-start)/1e9;
double tput = (double)cycles / (sec>0.0?sec:1e-9);
// Include params in output to avoid confusion about test conditions
printf("Throughput = %9.0f ops/s [iter=%llu ws=%d] time=%.3fs\n",
tput, (unsigned long long)cycles, ws, sec);
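// On Linux, getrusage() reports ru_maxrss in kilobytes, so it is used as-is.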
long rss_kb = ru1.ru_maxrss;
fprintf(stderr, "[RSS] max_kb=%ld\n", rss_kb);
(void)allocs; (void)frees;
// Box BenchMeta: Use __libc_free to bypass hakmem wrapper
BENCH_META_FREE(slots);
#ifdef USE_HAKMEM
// Phase 20-2: Print BenchFast stats (verify pool wasn't exhausted)
bench_fast_stats();
// Production Performance Measurements (ENV-gated: HAKMEM_MEASURE_UNIFIED_CACHE=1)
extern void unified_cache_print_measurements(void);
extern void tls_sll_print_measurements(void);
extern void shared_pool_print_measurements(void);
unified_cache_print_measurements();
tls_sll_print_measurements();
shared_pool_print_measurements();
// OBSERVE: per-class class stats (thread/global) for policy tuning
const char* stats_dump_env = getenv("HAKMEM_TINY_STATS_DUMP");
const char* policy_profile_env = getenv("HAKMEM_TINY_POLICY_PROFILE");
int policy_is_auto = (policy_profile_env &&
strcasecmp(policy_profile_env, "auto") == 0);
int dump_stats = (stats_dump_env && *stats_dump_env && *stats_dump_env != '0') || policy_is_auto;
if (dump_stats) {
tiny_class_stats_dump_thread(stderr, "[CLASS_STATS_THREAD]");
tiny_class_stats_dump_global(stderr, "[CLASS_STATS_GLOBAL]");
}
const char* tiny_mem_dump_env = getenv("HAKMEM_TINY_MEM_DUMP");
if (tiny_mem_dump_env && *tiny_mem_dump_env && *tiny_mem_dump_env != '0') {
tiny_mem_stats_dump();
}
// Superslab/slab counters (ENV: HAKMEM_SS_STATS_DUMP=1)
ss_stats_dump_if_requested();
// Warm Pool Stats (ENV-gated: HAKMEM_WARM_POOL_STATS=1)
extern void tiny_warm_pool_print_stats_public(void);
tiny_warm_pool_print_stats_public();
if (policy_is_auto) {
tiny_class_policy_refresh_auto();
tiny_class_policy_dump("[POLICY_AUTO]");
}
#if HAKMEM_BUILD_RELEASE
// Minimal Release-side telemetry to verify Warm path usage (C7-only)
extern _Atomic uint64_t g_rel_c7_warm_pop;
extern _Atomic uint64_t g_rel_c7_warm_push;
fprintf(stderr,
"[REL_C7_CARVE] attempts=%llu success=%llu zero=%llu\n",
(unsigned long long)warm_pool_rel_c7_carve_attempts(),
(unsigned long long)warm_pool_rel_c7_carve_successes(),
(unsigned long long)warm_pool_rel_c7_carve_zeroes());
fprintf(stderr,
"[REL_C7_WARM] pop=%llu push=%llu\n",
(unsigned long long)atomic_load_explicit(&g_rel_c7_warm_pop, memory_order_relaxed),
(unsigned long long)atomic_load_explicit(&g_rel_c7_warm_push, memory_order_relaxed));
fprintf(stderr,
"[REL_C7_WARM_PREFILL] calls=%llu slabs=%llu\n",
(unsigned long long)warm_pool_rel_c7_prefill_calls(),
(unsigned long long)warm_pool_rel_c7_prefill_slabs());
fprintf(stderr,
"[REL_C7_META_USED_INC] total=%llu backend=%llu tls=%llu front=%llu\n",
(unsigned long long)c7_meta_used_total(),
(unsigned long long)c7_meta_used_backend(),
(unsigned long long)c7_meta_used_tls(),
(unsigned long long)c7_meta_used_front());
#else
fprintf(stderr,
"[DBG_C7_META_USED_INC] total=%llu backend=%llu tls=%llu front=%llu\n",
(unsigned long long)c7_meta_used_total(),
(unsigned long long)c7_meta_used_backend(),
(unsigned long long)c7_meta_used_tls(),
(unsigned long long)c7_meta_used_front());
#endif
// Phase 21-1: Ring cache - DELETED (A/B test: OFF is faster)
// extern void ring_cache_print_stats(void);
// ring_cache_print_stats();
// Phase 27: UltraHeap front statistics (experimental, UltraHeap builds only)
// ENV: HAKMEM_TINY_ULTRA_HEAP_DUMP=1 enables this output
#if HAKMEM_TINY_ULTRA_HEAP
{
const char* dump = getenv("HAKMEM_TINY_ULTRA_HEAP_DUMP");
if (dump && *dump && *dump != '0') {
extern void tiny_ultra_heap_stats_snapshot(uint64_t hit[8],
uint64_t refill[8],
uint64_t fallback[8],
int reset);
uint64_t hit[8] = {0}, refill[8] = {0}, fallback[8] = {0};
tiny_ultra_heap_stats_snapshot(hit, refill, fallback, 0);
fprintf(stderr, "[ULTRA_HEAP_STATS] class hit refill fallback\n");
for (int c = 0; c < 8; c++) {
if (hit[c] || refill[c] || fallback[c]) {
fprintf(stderr, " C%d: %llu %llu %llu\n",
c,
(unsigned long long)hit[c],
(unsigned long long)refill[c],
(unsigned long long)fallback[c]);
}
}
}
}
#endif
#endif
// Phase 87: Print overflow statistics
#ifdef USE_HAKMEM
tiny_inline_slots_overflow_report_stats();
#endif
return 0;
}