Files
hakmem/analyze_results.py
Moe Charm (CI) 5685c2f4c9 Implement Warm Pool Secondary Prefill Optimization (Phase B-2c Complete)
Problem: Warm pool had 0% hit rate (only 1 hit per 3976 misses) despite being
implemented, causing all cache misses to go through expensive superslab_refill
registry scans.

Root Cause Analysis:
- Warm pool was initialized once and pushed a single slab after each refill
- When that slab was exhausted, it was discarded (not pushed back)
- Next refill would push another single slab, which was immediately exhausted
- Pool would oscillate between 0 and 1 items, yielding 0% hit rate

Solution: Secondary Prefill on Cache Miss
When warm pool becomes empty, we now do multiple superslab_refills and prefill
the pool with 3 additional HOT superslabs before attempting to carve. This
builds a working set of slabs that can sustain allocation pressure.

Implementation Details:
- Modified unified_cache_refill() cold path to detect empty pool
- Added prefill loop: when pool count == 0, load 3 extra superslabs
- Store extra slabs in warm pool, keep 1 in TLS for immediate carving
- Track prefill events in g_warm_pool_stats[].prefilled counter

Results (1M Random Mixed 256B allocations):
- Before: C7 hits=1, misses=3976, hit_rate=0.0%
- After:  C7 hits=3929, misses=3143, hit_rate=55.6%
- Throughput: 4.055M ops/s (maintained vs 4.07M baseline)
- Stability: Consistent 55.6% hit rate at 5M allocations (4.102M ops/s)

Performance Impact:
- No regression: throughput remained stable at ~4.1M ops/s
- Registry scan avoided in 55.6% of cache misses (significant savings)
- Warm pool now functioning as intended with strong locality

Configuration:
- TINY_WARM_POOL_MAX_PER_CLASS increased from 4 to 16 to support prefill
- Prefill budget hardcoded to 3 (tunable via env var if needed later)
- All statistics always compiled, ENV-gated printing via HAKMEM_WARM_POOL_STATS=1

Next Steps:
- Monitor for further optimization opportunities (prefill budget tuning)
- Consider adaptive prefill budget based on class-specific hit rates
- Validate at larger allocation counts (10M+ pending registry size fix)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-04 23:31:54 +09:00

300 lines
13 KiB
Python
Executable File

#!/usr/bin/env python3
"""
Statistical analysis of Gatekeeper inlining optimization benchmark results.
"""
import math
import statistics
# ---------------------------------------------------------------------------
# Raw benchmark samples: 5 runs per build.
# "with_inline" = BUILD A (optimization on), "no_inline" = BUILD B (off).
# ---------------------------------------------------------------------------
# Test 1: Standard benchmark (random_mixed 1000000 256 42)
# Format: ops/s (last value in CSV line); higher is better.
test1_with_inline = [1009752.7, 1003150.9, 967146.5, 1031062.8, 1264682.2]
test1_no_inline = [1084443.4, 830483.4, 1025638.4, 849866.1, 980895.1]
# Test 2: Conservative profile (HAKMEM_TINY_PROFILE=conservative HAKMEM_SS_PREFAULT=0)
test2_with_inline = [906469.6, 1160466.4, 1175722.3, 1034643.5, 1199156.5]
# NOTE: the 721k sample below is a visible outlier; it drives the high CV
# flagged later in the significance section.
test2_no_inline = [1079955.0, 1215846.1, 1214056.3, 1040608.7, 721006.3]
# Perf data - cycles (lower is better)
perf_cycles_with_inline = [72150892, 71930022, 70943072, 71028571, 71558451]
perf_cycles_no_inline = [75052700, 72509966, 72566977, 72510434, 72740722]
# Perf data - cache misses (lower is better)
perf_cache_with_inline = [257935, 255109, 239513, 253996, 273547]
perf_cache_no_inline = [338291, 279162, 279528, 281449, 301940]
# Perf data - L1 dcache load misses (lower is better)
perf_l1_with_inline = [737567, 722272, 736433, 720829, 746993]
perf_l1_no_inline = [764846, 707294, 748172, 731684, 737196]
def calc_stats(data):
    """Return summary statistics for a sample of numbers.

    Args:
        data: Non-empty sequence of numbers.

    Returns:
        dict with keys:
        - 'mean': arithmetic mean
        - 'min' / 'max': extremes
        - 'stdev': sample standard deviation (0 for a single sample)
        - 'cv': coefficient of variation in percent (0 when undefined,
          i.e. single sample or zero mean)
    """
    # Hoist the statistics calls: the original recomputed mean up to three
    # times and stdev twice per invocation.
    mean = statistics.mean(data)
    stdev = statistics.stdev(data) if len(data) > 1 else 0
    return {
        'mean': mean,
        'min': min(data),
        'max': max(data),
        'stdev': stdev,
        # stdev is already 0 for single samples, so only the zero-mean
        # case needs an explicit guard to match the original behavior.
        'cv': (stdev / mean * 100) if mean != 0 else 0,
    }
def calc_improvement(with_inline, no_inline):
    """Return the percentage change of *with_inline* relative to *no_inline*.

    Positive means with_inline is larger. Interpret direction per metric:
    for ops/s higher is better; for cycles/cache-misses lower is better
    (callers negate the result for those).
    """
    delta = with_inline - no_inline
    return (delta / no_inline) * 100
def t_test_welch(data1, data2):
    """Welch's two-sample t-test for samples with unequal variances.

    Returns a tuple (|t|, df): the absolute t-statistic (sign is discarded)
    and the Welch-Satterthwaite approximation of the degrees of freedom.
    """
    n1, n2 = len(data1), len(data2)
    m1, m2 = statistics.mean(data1), statistics.mean(data2)
    v1, v2 = statistics.variance(data1), statistics.variance(data2)
    # Per-sample squared standard errors, shared by t and df below.
    se1 = v1 / n1
    se2 = v2 / n2
    t = (m1 - m2) / math.sqrt(se1 + se2)
    # Welch-Satterthwaite degrees of freedom.
    df = (se1 + se2) ** 2 / ((se1 ** 2) / (n1 - 1) + (se2 ** 2) / (n2 - 1))
    return abs(t), df
print("=" * 80)
print("GATEKEEPER INLINING OPTIMIZATION - PERFORMANCE ANALYSIS")
print("=" * 80)
print()
# Test 1 Analysis
print("TEST 1: Standard Benchmark (random_mixed 1000000 256 42)")
print("-" * 80)
stats_t1_inline = calc_stats(test1_with_inline)
stats_t1_no_inline = calc_stats(test1_no_inline)
improvement_t1 = calc_improvement(stats_t1_inline['mean'], stats_t1_no_inline['mean'])
print(f"BUILD A (WITH inlining):")
print(f" Mean ops/s: {stats_t1_inline['mean']:,.2f}")
print(f" Min ops/s: {stats_t1_inline['min']:,.2f}")
print(f" Max ops/s: {stats_t1_inline['max']:,.2f}")
print(f" Std Dev: {stats_t1_inline['stdev']:,.2f}")
print(f" CV: {stats_t1_inline['cv']:.2f}%")
print()
print(f"BUILD B (WITHOUT inlining):")
print(f" Mean ops/s: {stats_t1_no_inline['mean']:,.2f}")
print(f" Min ops/s: {stats_t1_no_inline['min']:,.2f}")
print(f" Max ops/s: {stats_t1_no_inline['max']:,.2f}")
print(f" Std Dev: {stats_t1_no_inline['stdev']:,.2f}")
print(f" CV: {stats_t1_no_inline['cv']:.2f}%")
print()
print(f"IMPROVEMENT: {improvement_t1:+.2f}%")
t_stat_t1, df_t1 = t_test_welch(test1_with_inline, test1_no_inline)
print(f"t-statistic: {t_stat_t1:.3f}, df: {df_t1:.2f}")
print()
# Test 2 Analysis
print("TEST 2: Conservative Profile (HAKMEM_TINY_PROFILE=conservative)")
print("-" * 80)
stats_t2_inline = calc_stats(test2_with_inline)
stats_t2_no_inline = calc_stats(test2_no_inline)
improvement_t2 = calc_improvement(stats_t2_inline['mean'], stats_t2_no_inline['mean'])
print(f"BUILD A (WITH inlining):")
print(f" Mean ops/s: {stats_t2_inline['mean']:,.2f}")
print(f" Min ops/s: {stats_t2_inline['min']:,.2f}")
print(f" Max ops/s: {stats_t2_inline['max']:,.2f}")
print(f" Std Dev: {stats_t2_inline['stdev']:,.2f}")
print(f" CV: {stats_t2_inline['cv']:.2f}%")
print()
print(f"BUILD B (WITHOUT inlining):")
print(f" Mean ops/s: {stats_t2_no_inline['mean']:,.2f}")
print(f" Min ops/s: {stats_t2_no_inline['min']:,.2f}")
print(f" Max ops/s: {stats_t2_no_inline['max']:,.2f}")
print(f" Std Dev: {stats_t2_no_inline['stdev']:,.2f}")
print(f" CV: {stats_t2_no_inline['cv']:.2f}%")
print()
print(f"IMPROVEMENT: {improvement_t2:+.2f}%")
t_stat_t2, df_t2 = t_test_welch(test2_with_inline, test2_no_inline)
print(f"t-statistic: {t_stat_t2:.3f}, df: {df_t2:.2f}")
print()
# Perf Analysis - Cycles
print("PERF ANALYSIS: CPU CYCLES")
print("-" * 80)
stats_cycles_inline = calc_stats(perf_cycles_with_inline)
stats_cycles_no_inline = calc_stats(perf_cycles_no_inline)
# For cycles, lower is better, so negate the improvement
improvement_cycles = -calc_improvement(stats_cycles_inline['mean'], stats_cycles_no_inline['mean'])
print(f"BUILD A (WITH inlining):")
print(f" Mean cycles: {stats_cycles_inline['mean']:,.0f}")
print(f" Min cycles: {stats_cycles_inline['min']:,.0f}")
print(f" Max cycles: {stats_cycles_inline['max']:,.0f}")
print(f" Std Dev: {stats_cycles_inline['stdev']:,.0f}")
print(f" CV: {stats_cycles_inline['cv']:.2f}%")
print()
print(f"BUILD B (WITHOUT inlining):")
print(f" Mean cycles: {stats_cycles_no_inline['mean']:,.0f}")
print(f" Min cycles: {stats_cycles_no_inline['min']:,.0f}")
print(f" Max cycles: {stats_cycles_no_inline['max']:,.0f}")
print(f" Std Dev: {stats_cycles_no_inline['stdev']:,.0f}")
print(f" CV: {stats_cycles_no_inline['cv']:.2f}%")
print()
print(f"REDUCTION: {improvement_cycles:+.2f}% (lower is better)")
t_stat_cycles, df_cycles = t_test_welch(perf_cycles_with_inline, perf_cycles_no_inline)
print(f"t-statistic: {t_stat_cycles:.3f}, df: {df_cycles:.2f}")
print()
# Perf Analysis - Cache Misses
print("PERF ANALYSIS: CACHE MISSES")
print("-" * 80)
stats_cache_inline = calc_stats(perf_cache_with_inline)
stats_cache_no_inline = calc_stats(perf_cache_no_inline)
improvement_cache = -calc_improvement(stats_cache_inline['mean'], stats_cache_no_inline['mean'])
print(f"BUILD A (WITH inlining):")
print(f" Mean misses: {stats_cache_inline['mean']:,.0f}")
print(f" Min misses: {stats_cache_inline['min']:,.0f}")
print(f" Max misses: {stats_cache_inline['max']:,.0f}")
print(f" Std Dev: {stats_cache_inline['stdev']:,.0f}")
print(f" CV: {stats_cache_inline['cv']:.2f}%")
print()
print(f"BUILD B (WITHOUT inlining):")
print(f" Mean misses: {stats_cache_no_inline['mean']:,.0f}")
print(f" Min misses: {stats_cache_no_inline['min']:,.0f}")
print(f" Max misses: {stats_cache_no_inline['max']:,.0f}")
print(f" Std Dev: {stats_cache_no_inline['stdev']:,.0f}")
print(f" CV: {stats_cache_no_inline['cv']:.2f}%")
print()
print(f"REDUCTION: {improvement_cache:+.2f}% (lower is better)")
t_stat_cache, df_cache = t_test_welch(perf_cache_with_inline, perf_cache_no_inline)
print(f"t-statistic: {t_stat_cache:.3f}, df: {df_cache:.2f}")
print()
# Perf Analysis - L1 Cache Misses
print("PERF ANALYSIS: L1 D-CACHE LOAD MISSES")
print("-" * 80)
stats_l1_inline = calc_stats(perf_l1_with_inline)
stats_l1_no_inline = calc_stats(perf_l1_no_inline)
improvement_l1 = -calc_improvement(stats_l1_inline['mean'], stats_l1_no_inline['mean'])
print(f"BUILD A (WITH inlining):")
print(f" Mean misses: {stats_l1_inline['mean']:,.0f}")
print(f" Min misses: {stats_l1_inline['min']:,.0f}")
print(f" Max misses: {stats_l1_inline['max']:,.0f}")
print(f" Std Dev: {stats_l1_inline['stdev']:,.0f}")
print(f" CV: {stats_l1_inline['cv']:.2f}%")
print()
print(f"BUILD B (WITHOUT inlining):")
print(f" Mean misses: {stats_l1_no_inline['mean']:,.0f}")
print(f" Min misses: {stats_l1_no_inline['min']:,.0f}")
print(f" Max misses: {stats_l1_no_inline['max']:,.0f}")
print(f" Std Dev: {stats_l1_no_inline['stdev']:,.0f}")
print(f" CV: {stats_l1_no_inline['cv']:.2f}%")
print()
print(f"REDUCTION: {improvement_l1:+.2f}% (lower is better)")
t_stat_l1, df_l1 = t_test_welch(perf_l1_with_inline, perf_l1_no_inline)
print(f"t-statistic: {t_stat_l1:.3f}, df: {df_l1:.2f}")
print()
# Summary Table
print("=" * 80)
print("SUMMARY TABLE")
print("=" * 80)
print()
print(f"{'Metric':<30} {'BUILD A':<15} {'BUILD B':<15} {'Difference':<12} {'% Change':>10}")
print("-" * 80)
print(f"{'Test 1: Avg ops/s':<30} {stats_t1_inline['mean']:>13,.0f} {stats_t1_no_inline['mean']:>13,.0f} {stats_t1_inline['mean']-stats_t1_no_inline['mean']:>10,.0f} {improvement_t1:>9.2f}%")
print(f"{'Test 1: Std Dev':<30} {stats_t1_inline['stdev']:>13,.0f} {stats_t1_no_inline['stdev']:>13,.0f} {stats_t1_inline['stdev']-stats_t1_no_inline['stdev']:>10,.0f} {'':>10}")
print(f"{'Test 1: CV %':<30} {stats_t1_inline['cv']:>12.2f}% {stats_t1_no_inline['cv']:>12.2f}% {'':>12} {'':>10}")
print()
print(f"{'Test 2: Avg ops/s':<30} {stats_t2_inline['mean']:>13,.0f} {stats_t2_no_inline['mean']:>13,.0f} {stats_t2_inline['mean']-stats_t2_no_inline['mean']:>10,.0f} {improvement_t2:>9.2f}%")
print(f"{'Test 2: Std Dev':<30} {stats_t2_inline['stdev']:>13,.0f} {stats_t2_no_inline['stdev']:>13,.0f} {stats_t2_inline['stdev']-stats_t2_no_inline['stdev']:>10,.0f} {'':>10}")
print(f"{'Test 2: CV %':<30} {stats_t2_inline['cv']:>12.2f}% {stats_t2_no_inline['cv']:>12.2f}% {'':>12} {'':>10}")
print()
print(f"{'CPU Cycles (avg)':<30} {stats_cycles_inline['mean']:>13,.0f} {stats_cycles_no_inline['mean']:>13,.0f} {stats_cycles_inline['mean']-stats_cycles_no_inline['mean']:>10,.0f} {improvement_cycles:>9.2f}%")
print(f"{'Cache Misses (avg)':<30} {stats_cache_inline['mean']:>13,.0f} {stats_cache_no_inline['mean']:>13,.0f} {stats_cache_inline['mean']-stats_cache_no_inline['mean']:>10,.0f} {improvement_cache:>9.2f}%")
print(f"{'L1 D-Cache Misses (avg)':<30} {stats_l1_inline['mean']:>13,.0f} {stats_l1_no_inline['mean']:>13,.0f} {stats_l1_inline['mean']-stats_l1_no_inline['mean']:>10,.0f} {improvement_l1:>9.2f}%")
print()
# Statistical Significance Analysis
print("=" * 80)
print("STATISTICAL SIGNIFICANCE ANALYSIS")
print("=" * 80)
print()
print("Coefficient of Variation (CV) Assessment:")
print(f" Test 1 WITH inlining: {stats_t1_inline['cv']:.2f}% {'[GOOD]' if stats_t1_inline['cv'] < 10 else '[HIGH VARIANCE]'}")
print(f" Test 1 WITHOUT inlining: {stats_t1_no_inline['cv']:.2f}% {'[GOOD]' if stats_t1_no_inline['cv'] < 10 else '[HIGH VARIANCE]'}")
print(f" Test 2 WITH inlining: {stats_t2_inline['cv']:.2f}% {'[GOOD]' if stats_t2_inline['cv'] < 10 else '[HIGH VARIANCE]'}")
print(f" Test 2 WITHOUT inlining: {stats_t2_no_inline['cv']:.2f}% {'[HIGH VARIANCE]' if stats_t2_no_inline['cv'] > 10 else '[GOOD]'}")
print()
print("t-test Results (Welch's t-test for unequal variances):")
print(f" Test 1: t = {t_stat_t1:.3f}, df = {df_t1:.2f}")
print(f" Test 2: t = {t_stat_t2:.3f}, df = {df_t2:.2f}")
print(f" CPU Cycles: t = {t_stat_cycles:.3f}, df = {df_cycles:.2f}")
print(f" Cache Misses: t = {t_stat_cache:.3f}, df = {df_cache:.2f}")
print(f" L1 Misses: t = {t_stat_l1:.3f}, df = {df_l1:.2f}")
print()
print("Note: For 5 samples, t > 2.776 suggests significance at p < 0.05 level")
print()
# Conclusion
print("=" * 80)
print("CONCLUSION")
print("=" * 80)
print()
# Determine if results are significant
cv_acceptable = all([
stats_t1_inline['cv'] < 15,
stats_t1_no_inline['cv'] < 15,
stats_t2_inline['cv'] < 15,
])
if improvement_t1 > 0 and improvement_t2 > 0:
print("INLINING OPTIMIZATION IS EFFECTIVE:")
print(f" - Test 1 shows {improvement_t1:.2f}% throughput improvement")
print(f" - Test 2 shows {improvement_t2:.2f}% throughput improvement")
print(f" - CPU cycles reduced by {improvement_cycles:.2f}%")
print(f" - Cache misses reduced by {improvement_cache:.2f}%")
print()
if cv_acceptable and t_stat_t1 > 1.5:
print("Results show GOOD CONSISTENCY with acceptable variance.")
else:
print("Results show HIGH VARIANCE - consider additional runs for confirmation.")
print()
if improvement_cycles >= 1.0:
print(f"The {improvement_cycles:.2f}% cycle reduction confirms the optimization is effective.")
print()
print("RECOMMENDATION: KEEP inlining optimization.")
print("NEXT STEP: Proceed with 'Batch Tier Checks' optimization.")
else:
print("Cycle reduction is marginal. Monitor in production workloads.")
print()
print("RECOMMENDATION: Keep inlining but verify with production benchmarks.")
else:
print("WARNING: INLINING SHOWS NO CLEAR BENEFIT OR REGRESSION")
print(f" - Test 1: {improvement_t1:.2f}%")
print(f" - Test 2: {improvement_t2:.2f}%")
print()
print("RECOMMENDATION: Re-evaluate inlining strategy or investigate variance.")
print()
print("=" * 80)