Files
hakmem/bench_random_mixed.c
Moe Charm (CI) 860991ee50 Performance Measurement Framework: Unified Cache, TLS SLL, Shared Pool Analysis
## Summary

Implemented production-grade measurement infrastructure to quantify top 3 bottlenecks:
- Unified cache hit/miss rates + refill cost
- TLS SLL usage patterns
- Shared pool lock contention distribution

## Changes

### 1. Unified Cache Metrics (tiny_unified_cache.h/c)
- Added atomic counters:
  - g_unified_cache_hits_global: successful cache pops
  - g_unified_cache_misses_global: refill triggers
  - g_unified_cache_refill_cycles_global: refill cost in CPU cycles (rdtsc)
- Instrumented `unified_cache_pop_or_refill()` to count hits
- Instrumented `unified_cache_refill()` with cycle measurement
- ENV-gated: HAKMEM_MEASURE_UNIFIED_CACHE=1 (default: off)
- Added unified_cache_print_measurements() output function

### 2. TLS SLL Metrics (tls_sll_box.h)
- Added atomic counters:
  - g_tls_sll_push_count_global: total pushes
  - g_tls_sll_pop_count_global: successful pops
  - g_tls_sll_pop_empty_count_global: empty list conditions
- Instrumented push/pop paths
- Added tls_sll_print_measurements() output function

### 3. Shared Pool Contention (hakmem_shared_pool_acquire.c)
- Added atomic counters:
  - g_sp_stage2_lock_acquired_global: Stage 2 locks
  - g_sp_stage3_lock_acquired_global: Stage 3 allocations
  - g_sp_alloc_lock_contention_global: total lock acquisitions
- Instrumented all pthread_mutex_lock calls in hot paths
- Added shared_pool_print_measurements() output function

### 4. Benchmark Integration (bench_random_mixed.c)
- Called all 3 print functions after benchmark loop
- Functions are active only when HAKMEM_MEASURE_UNIFIED_CACHE=1 is set

## Design Principles

- **Zero overhead when disabled**: Inline checks with __builtin_expect hints
- **Atomic relaxed memory order**: Minimal synchronization overhead
- **ENV-gated**: Single flag controls all measurements
- **Production-safe**: Compiles in release builds, no functional changes

## Usage

```bash
HAKMEM_MEASURE_UNIFIED_CACHE=1 ./bench_allocators_hakmem bench_random_mixed_hakmem 1000000 256 42
```

Output (when enabled):
```
========================================
Unified Cache Statistics
========================================
Hits:        1234567
Misses:      56789
Hit Rate:    95.6%
Avg Refill Cycles: 1234

========================================
TLS SLL Statistics
========================================
Total Pushes:     1234567
Total Pops:       345678
Pop Empty Count:  12345
Hit Rate:         98.8%

========================================
Shared Pool Contention Statistics
========================================
Stage 2 Locks:    123456 (33%)
Stage 3 Locks:    234567 (67%)
Total Contention: 357 locks per 1M ops
```

## Next Steps

1. **Enable measurements** and run benchmarks to gather data
2. **Analyze miss rates**: Which bottleneck dominates?
3. **Profile hottest stage**: Focus optimization on top contributor
4. Possible targets:
   - Increase unified cache capacity if miss rate >5%
   - Profile if TLS SLL is unused (potential legacy code removal)
   - Analyze if Stage 2 lock can be replaced with CAS

## Makefile Updates

Added core/box/tiny_route_box.o to:
- OBJS_BASE (test build)
- SHARED_OBJS (shared library)
- BENCH_HAKMEM_OBJS_BASE (benchmark)
- TINY_BENCH_OBJS_BASE (tiny benchmark)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-04 18:26:39 +09:00

192 lines
6.8 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// bench_random_mixed.c — Random mixed small allocations (nominally 16-1024B; actual request sizes are 16..1039 bytes)
// Usage (direct-link builds via Makefile):
// ./bench_random_mixed_hakmem [cycles] [ws] [seed]
// ./bench_random_mixed_system [cycles] [ws] [seed]
//
// Default: 10M cycles for steady-state measurement (use 100K for quick smoke test)
// Recommended: Run 10 times and calculate mean/median/stddev for accurate results
//
// Prints: "Throughput = <ops/s> ops/s [iter=<cycles> ws=<ws>] time=<seconds>s"
#include <stdio.h>
#include <stdlib.h>
#include <stdint.h>
#include <time.h>
#include <string.h>
// Allocator selection for benchmark bookkeeping.
// BENCH_META_CALLOC / BENCH_META_FREE are used for the slots array that tracks
// live pointers. In USE_HAKMEM builds they map to __libc_calloc/__libc_free so
// the bookkeeping does not go through the hakmem wrapper; in system builds they
// map to plain calloc/free.
#ifdef USE_HAKMEM
#include "hakmem.h"
// Box BenchMeta: Benchmark metadata management (bypass hakmem wrapper)
// Phase 15: Separate BenchMeta (slots array) from CoreAlloc (user workload)
// NOTE(review): __libc_calloc/__libc_free are presumably the C library's own
// entry points (glibc exports these names) — confirm for other target libcs.
extern void* __libc_calloc(size_t, size_t);
extern void __libc_free(void*);
#define BENCH_META_CALLOC __libc_calloc
#define BENCH_META_FREE __libc_free
// Phase 20-2: BenchFast mode - prealloc pool init
#include "core/box/bench_fast_box.h"
#else
// System malloc build: use standard libc
#define BENCH_META_CALLOC calloc
#define BENCH_META_FREE free
#endif
// Reads CLOCK_MONOTONIC and returns the reading converted to nanoseconds.
// The return value of clock_gettime() is not checked.
static inline uint64_t now_ns(void) {
    struct timespec t;
    clock_gettime(CLOCK_MONOTONIC, &t);
    const uint64_t whole_seconds_ns = (uint64_t)t.tv_sec * 1000000000ull;
    return whole_seconds_ns + (uint64_t)t.tv_nsec;
}
// Advances the 32-bit xorshift state stored in *s (shifts: <<13, >>17, <<5),
// writes the new state back, and returns it. A state of 0 stays 0.
static inline uint32_t xorshift32(uint32_t* s){
    uint32_t state = *s;
    state ^= state << 13;
    state ^= state >> 17;
    state ^= state << 5;
    *s = state;
    return state;
}
// Benchmark driver: random mixed malloc/free traffic over a table of slots.
//
// Positional arguments (all optional):
//   argv[1]  cycles  number of timed operations (default 10000000; <= 0 becomes 1)
//   argv[2]  ws      number of working-set slots (default 8192; <= 0 becomes 1024)
//   argv[3]  seed    xorshift32 seed (default 1234567; 0 falls back to the default)
// Environment:
//   HAKMEM_BENCH_WARMUP=N  run N untimed operations first, then free everything
//
// Each operation draws a pseudo-random slot. An occupied slot is freed; an
// empty slot receives a new allocation of 16..1039 bytes. The timed region
// spans the main loop plus the final drain, including the two progress
// messages written to stderr in between. The result line goes to stdout.
//
// Returns 0 on success, 1 if the slot table cannot be allocated.
int main(int argc, char** argv){
    int cycles = (argc>1)? atoi(argv[1]) : 10000000; // total ops (10M for steady-state measurement)
    int ws = (argc>2)? atoi(argv[2]) : 8192;         // working-set slots
    uint32_t seed = (argc>3)? (uint32_t)strtoul(argv[3],NULL,10) : 1234567u;
    if (cycles <= 0) cycles = 1;
    if (ws <= 0) ws = 1024;
    // xorshift32 never leaves state 0: every draw would be 0, pinning all
    // traffic to slot 0 with a 16-byte request. Use the default seed instead.
    if (seed == 0u) seed = 1234567u;

#ifdef USE_HAKMEM
    // Phase 20-2: BenchFast prealloc pool initialization
    // Must be called BEFORE main benchmark loop to avoid recursion
    int prealloc_count = bench_fast_init();
    if (prealloc_count > 0) {
        fprintf(stderr, "[BENCH] BenchFast mode: %d blocks preallocated\n", prealloc_count);
    }
#else
    // System malloc also needs warmup for fair comparison.
    // The pointer goes through a volatile object so the optimizer cannot
    // discard the call (a malloc whose result is unused may be removed).
    void* volatile libc_warmup = malloc(1);
    free(libc_warmup);
#endif

    // Box BenchMeta: slot table is bookkeeping, allocated via BENCH_META_CALLOC
    void** slots = (void**)BENCH_META_CALLOC((size_t)ws, sizeof(void*));
    if (!slots) { fprintf(stderr, "alloc failed (slots)\n"); return 1; }

    // Warmup run (excluded from timing) - HAKMEM_BENCH_WARMUP=N
    const char* warmup_env = getenv("HAKMEM_BENCH_WARMUP");
    int warmup_cycles = warmup_env ? atoi(warmup_env) : 0;
    if (warmup_cycles > 0) {
        fprintf(stderr, "[BENCH_WARMUP] Running %d warmup cycles (not timed)...\n", warmup_cycles);
        // Work on a copy so the timed run starts from the caller's seed.
        uint32_t warmup_seed = seed;
        for (int i = 0; i < warmup_cycles; i++) {
            uint32_t r = xorshift32(&warmup_seed);
            int idx = (int)(r % (uint32_t)ws);
            if (slots[idx]) {
                free(slots[idx]);
                slots[idx] = NULL;
            } else {
                size_t sz = 16u + (r & 0x3FFu);
                void* p = malloc(sz);
                if (p) {
                    ((unsigned char*)p)[0] = (unsigned char)r;
                    slots[idx] = p;
                }
            }
        }
        // Drain warmup allocations so the timed run starts with empty slots
        for (int i = 0; i < ws; i++) {
            if (slots[i]) { free(slots[i]); slots[i] = NULL; }
        }
        fprintf(stderr, "[BENCH_WARMUP] Warmup completed. Starting timed run...\n");
    }

    uint64_t start = now_ns();
    for (int i = 0; i < cycles; i++) {
        uint32_t r = xorshift32(&seed);
        int idx = (int)(r % (uint32_t)ws);
        if (slots[idx]) {
            free(slots[idx]);
            slots[idx] = NULL;
        } else {
            // Request size is 16 + (low 10 bits of r): 16..1039 bytes
            size_t sz = 16u + (r & 0x3FFu);
            void* p = malloc(sz);
            if (!p) continue;
            // touch first byte to avoid optimizer artifacts
            ((unsigned char*)p)[0] = (unsigned char)r;
            slots[idx] = p;
        }
    }

    // Drain: release everything still held (part of the timed region)
    fprintf(stderr, "[TEST] Main loop completed. Starting drain phase...\n");
    for (int i = 0; i < ws; i++) {
        if (slots[i]) { free(slots[i]); slots[i] = NULL; }
    }
    fprintf(stderr, "[TEST] Drain phase completed.\n");
    uint64_t end = now_ns();

    double sec = (double)(end-start)/1e9;
    // Guard against a zero elapsed time before dividing
    double tput = (double)cycles / (sec>0.0?sec:1e-9);
    // Include params in output to avoid confusion about test conditions
    printf("Throughput = %9.0f ops/s [iter=%d ws=%d] time=%.3fs\n", tput, cycles, ws, sec);

    // Box BenchMeta: release the slot table with the matching BENCH_META_FREE
    BENCH_META_FREE(slots);

#ifdef USE_HAKMEM
    // Phase 20-2: Print BenchFast stats (verify pool wasn't exhausted)
    bench_fast_stats();

    // Production Performance Measurements (ENV-gated: HAKMEM_MEASURE_UNIFIED_CACHE=1)
    extern void unified_cache_print_measurements(void);
    extern void tls_sll_print_measurements(void);
    extern void shared_pool_print_measurements(void);
    unified_cache_print_measurements();
    tls_sll_print_measurements();
    shared_pool_print_measurements();

    // Phase 21-1: Ring cache stats call deleted (A/B test: OFF is faster)

    // Phase 27: UltraHeap front statistics (experimental, UltraHeap builds only)
    // ENV: HAKMEM_TINY_ULTRA_HEAP_DUMP enables the dump when set to a
    // non-empty value that does not start with '0'
#if HAKMEM_TINY_ULTRA_HEAP
    {
        const char* dump = getenv("HAKMEM_TINY_ULTRA_HEAP_DUMP");
        if (dump && *dump && *dump != '0') {
            extern void tiny_ultra_heap_stats_snapshot(uint64_t hit[8],
                                                       uint64_t refill[8],
                                                       uint64_t fallback[8],
                                                       int reset);
            uint64_t hit[8] = {0}, refill[8] = {0}, fallback[8] = {0};
            tiny_ultra_heap_stats_snapshot(hit, refill, fallback, 0);
            fprintf(stderr, "[ULTRA_HEAP_STATS] class hit refill fallback\n");
            for (int c = 0; c < 8; c++) {
                // Only print classes that recorded any activity
                if (hit[c] || refill[c] || fallback[c]) {
                    fprintf(stderr, "  C%d: %llu %llu %llu\n",
                            c,
                            (unsigned long long)hit[c],
                            (unsigned long long)refill[c],
                            (unsigned long long)fallback[c]);
                }
            }
        }
    }
#endif
#endif
    return 0;
}