Performance Measurement Framework: Unified Cache, TLS SLL, Shared Pool Analysis
## Summary Implemented production-grade measurement infrastructure to quantify top 3 bottlenecks: - Unified cache hit/miss rates + refill cost - TLS SLL usage patterns - Shared pool lock contention distribution ## Changes ### 1. Unified Cache Metrics (tiny_unified_cache.h/c) - Added atomic counters: - g_unified_cache_hits_global: successful cache pops - g_unified_cache_misses_global: refill triggers - g_unified_cache_refill_cycles_global: refill cost in CPU cycles (rdtsc) - Instrumented `unified_cache_pop_or_refill()` to count hits - Instrumented `unified_cache_refill()` with cycle measurement - ENV-gated: HAKMEM_MEASURE_UNIFIED_CACHE=1 (default: off) - Added unified_cache_print_measurements() output function ### 2. TLS SLL Metrics (tls_sll_box.h) - Added atomic counters: - g_tls_sll_push_count_global: total pushes - g_tls_sll_pop_count_global: successful pops - g_tls_sll_pop_empty_count_global: empty list conditions - Instrumented push/pop paths - Added tls_sll_print_measurements() output function ### 3. Shared Pool Contention (hakmem_shared_pool_acquire.c) - Added atomic counters: - g_sp_stage2_lock_acquired_global: Stage 2 locks - g_sp_stage3_lock_acquired_global: Stage 3 allocations - g_sp_alloc_lock_contention_global: total lock acquisitions - Instrumented all pthread_mutex_lock calls in hot paths - Added shared_pool_print_measurements() output function ### 4. 
Benchmark Integration (bench_random_mixed.c) - Called all 3 print functions after benchmark loop - Functions active only when HAKMEM_MEASURE_UNIFIED_CACHE=1 is set ## Design Principles - **Zero overhead when disabled**: Inline checks with __builtin_expect hints - **Atomic relaxed memory order**: Minimal synchronization overhead - **ENV-gated**: Single flag controls all measurements - **Production-safe**: Compiles in release builds, no functional changes ## Usage ```bash HAKMEM_MEASURE_UNIFIED_CACHE=1 ./bench_allocators_hakmem bench_random_mixed_hakmem 1000000 256 42 ``` Output (when enabled): ``` ======================================== Unified Cache Statistics ======================================== Hits: 1234567 Misses: 56789 Hit Rate: 95.6% Avg Refill Cycles: 1234 ======================================== TLS SLL Statistics ======================================== Total Pushes: 1234567 Total Pops: 345678 Pop Empty Count: 12345 Hit Rate: 98.8% ======================================== Shared Pool Contention Statistics ======================================== Stage 2 Locks: 123456 (34.5%) Stage 3 Locks: 234567 (65.5%) Total Contention: 358023 lock acquisitions ``` ## Next Steps 1. **Enable measurements** and run benchmarks to gather data 2. **Analyze miss rates**: Which bottleneck dominates? 3. **Profile hottest stage**: Focus optimization on top contributor 4. Possible targets: - Increase unified cache capacity if miss rate >5% - Profile if TLS SLL is unused (potential legacy code removal) - Analyze if Stage 2 lock can be replaced with CAS ## Makefile Updates Added core/box/tiny_route_box.o to: - OBJS_BASE (test build) - SHARED_OBJS (shared library) - BENCH_HAKMEM_OBJS_BASE (benchmark) - TINY_BENCH_OBJS_BASE (tiny benchmark) 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@ -14,6 +14,28 @@
|
||||
#include <stdio.h>
|
||||
#include <stdatomic.h>
|
||||
|
||||
// ============================================================================
|
||||
// Performance Measurement: Shared Pool Lock Contention (ENV-gated)
|
||||
// ============================================================================
|
||||
// Global atomic counters for shared-pool lock measurement.
// They are only incremented when sp_measure_enabled() returns non-zero.
|
||||
// ENV: HAKMEM_MEASURE_UNIFIED_CACHE=1 to enable (default: OFF)
|
||||
// Bumped right after alloc_lock is taken on the two Stage 2 paths
// (stage2_fallback and stage2_scan).
_Atomic uint64_t g_sp_stage2_lock_acquired_global = 0;
|
||||
// Bumped right after alloc_lock is taken on the Stage 3 (new SuperSlab) path.
_Atomic uint64_t g_sp_stage3_lock_acquired_global = 0;
|
||||
// Bumped together with either counter above, so it is the total number of
// measured lock acquisitions (every acquisition, not only contended ones).
_Atomic uint64_t g_sp_alloc_lock_contention_global = 0;
|
||||
|
||||
// Report whether shared-pool lock measurement is enabled (result is cached).
//
// HAKMEM_MEASURE_UNIFIED_CACHE is read from the environment on the first call:
// any non-empty value whose first character is not '0' enables measurement;
// an unset, empty, or '0'-prefixed value disables it.
//
// The cached flag is an atomic int accessed with relaxed ordering so that
// concurrent calls from several threads do not constitute a data race.
// Threads racing on the first call may each run getenv(), but they store the
// same result as long as the environment is not modified concurrently.
//
// Returns 1 when measurement is enabled, 0 otherwise.
static inline int sp_measure_enabled(void) {
    static atomic_int g_measure = -1;  // -1 = environment not inspected yet

    int enabled = atomic_load_explicit(&g_measure, memory_order_relaxed);
    if (__builtin_expect(enabled == -1, 0)) {
        const char* e = getenv("HAKMEM_MEASURE_UNIFIED_CACHE");
        enabled = (e && *e && *e != '0') ? 1 : 0;
        atomic_store_explicit(&g_measure, enabled, memory_order_relaxed);
    }
    return enabled;
}
|
||||
|
||||
// Print statistics function
|
||||
void shared_pool_print_measurements(void);
|
||||
|
||||
// Stage 0.5: EMPTY slab direct scan (registry-based reuse of EMPTY slabs)
|
||||
// Scan existing SuperSlabs for EMPTY slabs (highest reuse priority) to
|
||||
// avoid Stage 3 (mmap) when freed slabs are available.
|
||||
@ -266,6 +288,12 @@ stage2_fallback:
|
||||
|
||||
pthread_mutex_lock(&g_shared_pool.alloc_lock);
|
||||
|
||||
// Performance measurement: count Stage 2 lock acquisitions
|
||||
if (__builtin_expect(sp_measure_enabled(), 0)) {
|
||||
atomic_fetch_add_explicit(&g_sp_stage2_lock_acquired_global, 1, memory_order_relaxed);
|
||||
atomic_fetch_add_explicit(&g_sp_alloc_lock_contention_global, 1, memory_order_relaxed);
|
||||
}
|
||||
|
||||
// Update SuperSlab metadata under mutex
|
||||
ss->slab_bitmap |= (1u << claimed_idx);
|
||||
ss_slab_meta_class_idx_set(ss, claimed_idx, (uint8_t)class_idx);
|
||||
@ -349,6 +377,12 @@ stage2_scan:
|
||||
|
||||
pthread_mutex_lock(&g_shared_pool.alloc_lock);
|
||||
|
||||
// Performance measurement: count Stage 2 scan lock acquisitions
|
||||
if (__builtin_expect(sp_measure_enabled(), 0)) {
|
||||
atomic_fetch_add_explicit(&g_sp_stage2_lock_acquired_global, 1, memory_order_relaxed);
|
||||
atomic_fetch_add_explicit(&g_sp_alloc_lock_contention_global, 1, memory_order_relaxed);
|
||||
}
|
||||
|
||||
// Update SuperSlab metadata under mutex
|
||||
ss->slab_bitmap |= (1u << claimed_idx);
|
||||
ss_slab_meta_class_idx_set(ss, claimed_idx, (uint8_t)class_idx);
|
||||
@ -421,6 +455,12 @@ stage2_scan:
|
||||
|
||||
pthread_mutex_lock(&g_shared_pool.alloc_lock);
|
||||
|
||||
// Performance measurement: count Stage 3 lock acquisitions
|
||||
if (__builtin_expect(sp_measure_enabled(), 0)) {
|
||||
atomic_fetch_add_explicit(&g_sp_stage3_lock_acquired_global, 1, memory_order_relaxed);
|
||||
atomic_fetch_add_explicit(&g_sp_alloc_lock_contention_global, 1, memory_order_relaxed);
|
||||
}
|
||||
|
||||
// ========== Stage 3: Get new SuperSlab ==========
|
||||
// Try LRU cache first, then mmap
|
||||
SuperSlab* new_ss = NULL;
|
||||
@ -541,3 +581,39 @@ stage2_scan:
|
||||
}
|
||||
return 0; // ✅ Stage 3 success
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Performance Measurement: Print Statistics
|
||||
// ============================================================================
|
||||
void shared_pool_print_measurements(void) {
|
||||
if (!sp_measure_enabled()) {
|
||||
return; // Measurement disabled
|
||||
}
|
||||
|
||||
uint64_t stage2 = atomic_load_explicit(&g_sp_stage2_lock_acquired_global, memory_order_relaxed);
|
||||
uint64_t stage3 = atomic_load_explicit(&g_sp_stage3_lock_acquired_global, memory_order_relaxed);
|
||||
uint64_t total_locks = atomic_load_explicit(&g_sp_alloc_lock_contention_global, memory_order_relaxed);
|
||||
|
||||
if (total_locks == 0) {
|
||||
fprintf(stderr, "\n========================================\n");
|
||||
fprintf(stderr, "Shared Pool Contention Statistics\n");
|
||||
fprintf(stderr, "========================================\n");
|
||||
fprintf(stderr, "No lock acquisitions recorded\n");
|
||||
fprintf(stderr, "========================================\n\n");
|
||||
return;
|
||||
}
|
||||
|
||||
double stage2_pct = (100.0 * stage2) / total_locks;
|
||||
double stage3_pct = (100.0 * stage3) / total_locks;
|
||||
|
||||
fprintf(stderr, "\n========================================\n");
|
||||
fprintf(stderr, "Shared Pool Contention Statistics\n");
|
||||
fprintf(stderr, "========================================\n");
|
||||
fprintf(stderr, "Stage 2 Locks: %llu (%.1f%%)\n",
|
||||
(unsigned long long)stage2, stage2_pct);
|
||||
fprintf(stderr, "Stage 3 Locks: %llu (%.1f%%)\n",
|
||||
(unsigned long long)stage3, stage3_pct);
|
||||
fprintf(stderr, "Total Contention: %llu lock acquisitions\n",
|
||||
(unsigned long long)total_locks);
|
||||
fprintf(stderr, "========================================\n\n");
|
||||
}
|
||||
|
||||
Reference in New Issue
Block a user