Implement Warm Pool Secondary Prefill Optimization (Phase B-2c Complete)
Problem: The warm pool had a 0% hit rate (1 hit per 3976 misses) despite being implemented, so every cache miss went through an expensive superslab_refill registry scan.

Root Cause Analysis:
- The warm pool was initialized once and received a single slab after each refill
- When that slab was exhausted, it was discarded (not pushed back)
- The next refill pushed another single slab, which was immediately exhausted
- The pool oscillated between 0 and 1 items, yielding a 0% hit rate

Solution: Secondary Prefill on Cache Miss
When the warm pool becomes empty, we now perform multiple superslab_refills and prefill the pool with 3 additional HOT superslabs before attempting to carve. This builds a working set of slabs that can sustain allocation pressure (see the sketch below).

Implementation Details:
- Modified the unified_cache_refill() cold path to detect an empty pool
- Added a prefill loop: when the pool count == 0, load 3 extra superslabs
- Store the extra slabs in the warm pool; keep 1 in TLS for immediate carving
- Track prefill events in the g_warm_pool_stats[].prefilled counter

Results (1M Random Mixed 256B allocations):
- Before: C7 hits=1, misses=3976, hit_rate=0.0%
- After: C7 hits=3929, misses=3143, hit_rate=55.6%
- Throughput: 4.055M ops/s (maintained vs 4.07M baseline)
- Stability: consistent 55.6% hit rate at 5M allocations (4.102M ops/s)

Performance Impact:
- No regression: throughput remained stable at ~4.1M ops/s
- The registry scan is avoided on 55.6% of cache misses (significant savings)
- The warm pool now functions as intended, with strong locality

Configuration:
- TINY_WARM_POOL_MAX_PER_CLASS increased from 4 to 16 to support prefill
- Prefill budget hardcoded to 3 (tunable via an env var if needed later)
- Statistics are always compiled; printing is ENV-gated via HAKMEM_WARM_POOL_STATS=1

Next Steps:
- Monitor for further optimization opportunities (prefill budget tuning)
- Consider an adaptive prefill budget based on per-class hit rates
- Validate at larger allocation counts (10M+, pending registry size fix)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
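The sketch below is a minimal Python model of the policy described above, not the allocator's C implementation; `simulate`, `objs_per_slab`, and the counts are invented for illustration. It shows why a pool fed one slab per refill oscillates between 0 and 1 entries (~0% hits), while a prefill budget of 3 keeps spare slabs available between cold refills:

```python
from collections import deque

def simulate(allocs, objs_per_slab=64, prefill_budget=0, pool_max=16):
    """Toy model: count warm-pool hits vs. cold superslab_refill scans."""
    pool = deque()
    hits = misses = 0
    remaining = 0  # objects left in the slab currently held in TLS
    for _ in range(allocs):
        if remaining == 0:          # TLS slab exhausted -> cache miss path
            if pool:
                hits += 1           # warm-pool hit: no registry scan needed
            else:
                misses += 1         # cold path: superslab_refill registry scan
                for _ in range(prefill_budget):  # secondary prefill
                    if len(pool) < pool_max:
                        pool.append(objs_per_slab)
            remaining = pool.popleft() if pool else objs_per_slab
        remaining -= 1
    return hits, misses

# Without prefill the pool never holds a spare slab -> 0% hit rate.
# With a budget of 3, roughly two of every three slab exhaustions are
# served from the pool (~67% hit rate in this toy model).
print(simulate(1_000_000, prefill_budget=0))
print(simulate(1_000_000, prefill_budget=3))
```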
analyze_results.py (370 lines changed, Normal file → Executable file)
@@ -1,89 +1,299 @@
#!/usr/bin/env python3
"""
analyze_results.py - Analyze benchmark results for paper
Statistical analysis of Gatekeeper inlining optimization benchmark results.
"""

import csv
import sys
from collections import defaultdict
import math
import statistics


def load_results(filename):
    """Load CSV results into data structure"""
    data = defaultdict(lambda: defaultdict(list))

    with open(filename, 'r') as f:
        reader = csv.DictReader(f)
        for row in reader:
            allocator = row['allocator']
            scenario = row['scenario']
            avg_ns = int(row['avg_ns'])
            soft_pf = int(row['soft_pf'])
            hard_pf = int(row['hard_pf'])
            ops_per_sec = int(row['ops_per_sec'])

            data[scenario][allocator].append({
                'avg_ns': avg_ns,
                'soft_pf': soft_pf,
                'hard_pf': hard_pf,
                'ops_per_sec': ops_per_sec
            })

    return data

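# The CSV schema below is inferred from the fields read above; the sample
# row is illustrative, not a real measurement:
#
#   allocator,scenario,avg_ns,soft_pf,hard_pf,ops_per_sec
#   hakmem-baseline,json,812,142,0,1231527
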
# Test 1: Standard benchmark (random_mixed 1000000 256 42)
# Format: ops/s (last value in CSV line)
test1_with_inline = [1009752.7, 1003150.9, 967146.5, 1031062.8, 1264682.2]
test1_no_inline = [1084443.4, 830483.4, 1025638.4, 849866.1, 980895.1]

# Test 2: Conservative profile (HAKMEM_TINY_PROFILE=conservative HAKMEM_SS_PREFAULT=0)
test2_with_inline = [906469.6, 1160466.4, 1175722.3, 1034643.5, 1199156.5]
test2_no_inline = [1079955.0, 1215846.1, 1214056.3, 1040608.7, 721006.3]

# Perf data - cycles
perf_cycles_with_inline = [72150892, 71930022, 70943072, 71028571, 71558451]
perf_cycles_no_inline = [75052700, 72509966, 72566977, 72510434, 72740722]

# Perf data - cache misses
perf_cache_with_inline = [257935, 255109, 239513, 253996, 273547]
perf_cache_no_inline = [338291, 279162, 279528, 281449, 301940]

# Perf data - L1 dcache load misses
perf_l1_with_inline = [737567, 722272, 736433, 720829, 746993]
perf_l1_no_inline = [764846, 707294, 748172, 731684, 737196]

def calc_stats(data):
    """Calculate mean, min, max, and standard deviation."""
    return {
        'mean': statistics.mean(data),
        'min': min(data),
        'max': max(data),
        'stdev': statistics.stdev(data) if len(data) > 1 else 0,
        'cv': (statistics.stdev(data) / statistics.mean(data) * 100) if len(data) > 1 and statistics.mean(data) != 0 else 0
    }

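# Example (illustrative numbers): calc_stats([2, 4]) returns
#   {'mean': 3, 'min': 2, 'max': 4, 'stdev': 1.4142..., 'cv': 47.14...}
# since stdev = sqrt(2) for this sample and cv = stdev / mean * 100.
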
def calc_improvement(with_inline, no_inline):
    """Calculate percentage improvement (positive = better)."""
    # For ops/s: higher is better
    # For cycles/cache-misses: lower is better
    return ((with_inline - no_inline) / no_inline) * 100

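# Sign convention (illustrative): calc_improvement(1_050_000, 1_000_000)
# == 5.0, a +5% gain for a higher-is-better metric such as ops/s. For
# lower-is-better metrics (cycles, misses) the call sites below negate it.
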
def t_test_welch(data1, data2):
    """Welch's t-test for unequal variances."""
    n1, n2 = len(data1), len(data2)
    mean1, mean2 = statistics.mean(data1), statistics.mean(data2)
    var1, var2 = statistics.variance(data1), statistics.variance(data2)

    # Calculate t-statistic
    t = (mean1 - mean2) / math.sqrt((var1 / n1) + (var2 / n2))

    # Degrees of freedom (Welch-Satterthwaite)
    df_num = ((var1 / n1) + (var2 / n2)) ** 2
    df_denom = ((var1 / n1) ** 2) / (n1 - 1) + ((var2 / n2) ** 2) / (n2 - 1)
    df = df_num / df_denom

    return abs(t), df

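# Worked example (hypothetical numbers, not benchmark data): two 5-run
# samples with means 10.1 and 9.6 and equal sample variance 0.025 give
#   t_test_welch([10.1, 10.3, 9.9, 10.2, 10.0], [9.5, 9.7, 9.6, 9.8, 9.4])
# ≈ (5.0, 8.0): |t| = 5.0 at df = 8, which clears even the conservative
# 2.776 cutoff quoted in the significance note printed below.
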
print("=" * 80)
print("GATEKEEPER INLINING OPTIMIZATION - PERFORMANCE ANALYSIS")
print("=" * 80)
print()

# Test 1 Analysis
print("TEST 1: Standard Benchmark (random_mixed 1000000 256 42)")
print("-" * 80)

stats_t1_inline = calc_stats(test1_with_inline)
stats_t1_no_inline = calc_stats(test1_no_inline)
improvement_t1 = calc_improvement(stats_t1_inline['mean'], stats_t1_no_inline['mean'])

print(f"BUILD A (WITH inlining):")
print(f"  Mean ops/s: {stats_t1_inline['mean']:,.2f}")
print(f"  Min ops/s:  {stats_t1_inline['min']:,.2f}")
print(f"  Max ops/s:  {stats_t1_inline['max']:,.2f}")
print(f"  Std Dev:    {stats_t1_inline['stdev']:,.2f}")
print(f"  CV:         {stats_t1_inline['cv']:.2f}%")
print()

print(f"BUILD B (WITHOUT inlining):")
print(f"  Mean ops/s: {stats_t1_no_inline['mean']:,.2f}")
print(f"  Min ops/s:  {stats_t1_no_inline['min']:,.2f}")
print(f"  Max ops/s:  {stats_t1_no_inline['max']:,.2f}")
print(f"  Std Dev:    {stats_t1_no_inline['stdev']:,.2f}")
print(f"  CV:         {stats_t1_no_inline['cv']:.2f}%")
print()

print(f"IMPROVEMENT: {improvement_t1:+.2f}%")
t_stat_t1, df_t1 = t_test_welch(test1_with_inline, test1_no_inline)
print(f"t-statistic: {t_stat_t1:.3f}, df: {df_t1:.2f}")
print()

# Test 2 Analysis
print("TEST 2: Conservative Profile (HAKMEM_TINY_PROFILE=conservative)")
print("-" * 80)

stats_t2_inline = calc_stats(test2_with_inline)
stats_t2_no_inline = calc_stats(test2_no_inline)
improvement_t2 = calc_improvement(stats_t2_inline['mean'], stats_t2_no_inline['mean'])

print(f"BUILD A (WITH inlining):")
print(f"  Mean ops/s: {stats_t2_inline['mean']:,.2f}")
print(f"  Min ops/s:  {stats_t2_inline['min']:,.2f}")
print(f"  Max ops/s:  {stats_t2_inline['max']:,.2f}")
print(f"  Std Dev:    {stats_t2_inline['stdev']:,.2f}")
print(f"  CV:         {stats_t2_inline['cv']:.2f}%")
print()

print(f"BUILD B (WITHOUT inlining):")
print(f"  Mean ops/s: {stats_t2_no_inline['mean']:,.2f}")
print(f"  Min ops/s:  {stats_t2_no_inline['min']:,.2f}")
print(f"  Max ops/s:  {stats_t2_no_inline['max']:,.2f}")
print(f"  Std Dev:    {stats_t2_no_inline['stdev']:,.2f}")
print(f"  CV:         {stats_t2_no_inline['cv']:.2f}%")
print()

print(f"IMPROVEMENT: {improvement_t2:+.2f}%")
t_stat_t2, df_t2 = t_test_welch(test2_with_inline, test2_no_inline)
print(f"t-statistic: {t_stat_t2:.3f}, df: {df_t2:.2f}")
print()

# Perf Analysis - Cycles
print("PERF ANALYSIS: CPU CYCLES")
print("-" * 80)

stats_cycles_inline = calc_stats(perf_cycles_with_inline)
stats_cycles_no_inline = calc_stats(perf_cycles_no_inline)
# For cycles, lower is better, so negate the improvement
improvement_cycles = -calc_improvement(stats_cycles_inline['mean'], stats_cycles_no_inline['mean'])

print(f"BUILD A (WITH inlining):")
print(f"  Mean cycles: {stats_cycles_inline['mean']:,.0f}")
print(f"  Min cycles:  {stats_cycles_inline['min']:,.0f}")
print(f"  Max cycles:  {stats_cycles_inline['max']:,.0f}")
print(f"  Std Dev:     {stats_cycles_inline['stdev']:,.0f}")
print(f"  CV:          {stats_cycles_inline['cv']:.2f}%")
print()

print(f"BUILD B (WITHOUT inlining):")
print(f"  Mean cycles: {stats_cycles_no_inline['mean']:,.0f}")
print(f"  Min cycles:  {stats_cycles_no_inline['min']:,.0f}")
print(f"  Max cycles:  {stats_cycles_no_inline['max']:,.0f}")
print(f"  Std Dev:     {stats_cycles_no_inline['stdev']:,.0f}")
print(f"  CV:          {stats_cycles_no_inline['cv']:.2f}%")
print()

print(f"REDUCTION: {improvement_cycles:+.2f}% (lower is better)")
t_stat_cycles, df_cycles = t_test_welch(perf_cycles_with_inline, perf_cycles_no_inline)
print(f"t-statistic: {t_stat_cycles:.3f}, df: {df_cycles:.2f}")
print()

# Perf Analysis - Cache Misses
print("PERF ANALYSIS: CACHE MISSES")
print("-" * 80)

stats_cache_inline = calc_stats(perf_cache_with_inline)
stats_cache_no_inline = calc_stats(perf_cache_no_inline)
improvement_cache = -calc_improvement(stats_cache_inline['mean'], stats_cache_no_inline['mean'])

print(f"BUILD A (WITH inlining):")
print(f"  Mean misses: {stats_cache_inline['mean']:,.0f}")
print(f"  Min misses:  {stats_cache_inline['min']:,.0f}")
print(f"  Max misses:  {stats_cache_inline['max']:,.0f}")
print(f"  Std Dev:     {stats_cache_inline['stdev']:,.0f}")
print(f"  CV:          {stats_cache_inline['cv']:.2f}%")
print()

print(f"BUILD B (WITHOUT inlining):")
print(f"  Mean misses: {stats_cache_no_inline['mean']:,.0f}")
print(f"  Min misses:  {stats_cache_no_inline['min']:,.0f}")
print(f"  Max misses:  {stats_cache_no_inline['max']:,.0f}")
print(f"  Std Dev:     {stats_cache_no_inline['stdev']:,.0f}")
print(f"  CV:          {stats_cache_no_inline['cv']:.2f}%")
print()

print(f"REDUCTION: {improvement_cache:+.2f}% (lower is better)")
t_stat_cache, df_cache = t_test_welch(perf_cache_with_inline, perf_cache_no_inline)
print(f"t-statistic: {t_stat_cache:.3f}, df: {df_cache:.2f}")
print()

# Perf Analysis - L1 Cache Misses
print("PERF ANALYSIS: L1 D-CACHE LOAD MISSES")
print("-" * 80)

stats_l1_inline = calc_stats(perf_l1_with_inline)
stats_l1_no_inline = calc_stats(perf_l1_no_inline)
improvement_l1 = -calc_improvement(stats_l1_inline['mean'], stats_l1_no_inline['mean'])

print(f"BUILD A (WITH inlining):")
print(f"  Mean misses: {stats_l1_inline['mean']:,.0f}")
print(f"  Min misses:  {stats_l1_inline['min']:,.0f}")
print(f"  Max misses:  {stats_l1_inline['max']:,.0f}")
print(f"  Std Dev:     {stats_l1_inline['stdev']:,.0f}")
print(f"  CV:          {stats_l1_inline['cv']:.2f}%")
print()

print(f"BUILD B (WITHOUT inlining):")
print(f"  Mean misses: {stats_l1_no_inline['mean']:,.0f}")
print(f"  Min misses:  {stats_l1_no_inline['min']:,.0f}")
print(f"  Max misses:  {stats_l1_no_inline['max']:,.0f}")
print(f"  Std Dev:     {stats_l1_no_inline['stdev']:,.0f}")
print(f"  CV:          {stats_l1_no_inline['cv']:.2f}%")
print()

print(f"REDUCTION: {improvement_l1:+.2f}% (lower is better)")
t_stat_l1, df_l1 = t_test_welch(perf_l1_with_inline, perf_l1_no_inline)
print(f"t-statistic: {t_stat_l1:.3f}, df: {df_l1:.2f}")
print()

# Summary Table
print("=" * 80)
print("SUMMARY TABLE")
print("=" * 80)
print()
print(f"{'Metric':<30} {'BUILD A':<15} {'BUILD B':<15} {'Difference':<12} {'% Change':>10}")
print("-" * 80)
print(f"{'Test 1: Avg ops/s':<30} {stats_t1_inline['mean']:>13,.0f} {stats_t1_no_inline['mean']:>13,.0f} {stats_t1_inline['mean'] - stats_t1_no_inline['mean']:>10,.0f} {improvement_t1:>9.2f}%")
print(f"{'Test 1: Std Dev':<30} {stats_t1_inline['stdev']:>13,.0f} {stats_t1_no_inline['stdev']:>13,.0f} {stats_t1_inline['stdev'] - stats_t1_no_inline['stdev']:>10,.0f} {'':>10}")
print(f"{'Test 1: CV %':<30} {stats_t1_inline['cv']:>12.2f}% {stats_t1_no_inline['cv']:>12.2f}% {'':>12} {'':>10}")
print()
print(f"{'Test 2: Avg ops/s':<30} {stats_t2_inline['mean']:>13,.0f} {stats_t2_no_inline['mean']:>13,.0f} {stats_t2_inline['mean'] - stats_t2_no_inline['mean']:>10,.0f} {improvement_t2:>9.2f}%")
print(f"{'Test 2: Std Dev':<30} {stats_t2_inline['stdev']:>13,.0f} {stats_t2_no_inline['stdev']:>13,.0f} {stats_t2_inline['stdev'] - stats_t2_no_inline['stdev']:>10,.0f} {'':>10}")
print(f"{'Test 2: CV %':<30} {stats_t2_inline['cv']:>12.2f}% {stats_t2_no_inline['cv']:>12.2f}% {'':>12} {'':>10}")
print()
print(f"{'CPU Cycles (avg)':<30} {stats_cycles_inline['mean']:>13,.0f} {stats_cycles_no_inline['mean']:>13,.0f} {stats_cycles_inline['mean'] - stats_cycles_no_inline['mean']:>10,.0f} {improvement_cycles:>9.2f}%")
print(f"{'Cache Misses (avg)':<30} {stats_cache_inline['mean']:>13,.0f} {stats_cache_no_inline['mean']:>13,.0f} {stats_cache_inline['mean'] - stats_cache_no_inline['mean']:>10,.0f} {improvement_cache:>9.2f}%")
print(f"{'L1 D-Cache Misses (avg)':<30} {stats_l1_inline['mean']:>13,.0f} {stats_l1_no_inline['mean']:>13,.0f} {stats_l1_inline['mean'] - stats_l1_no_inline['mean']:>10,.0f} {improvement_l1:>9.2f}%")
print()

# Statistical Significance Analysis
print("=" * 80)
print("STATISTICAL SIGNIFICANCE ANALYSIS")
print("=" * 80)
print()
print("Coefficient of Variation (CV) Assessment:")
print(f"  Test 1 WITH inlining:    {stats_t1_inline['cv']:.2f}% {'[GOOD]' if stats_t1_inline['cv'] < 10 else '[HIGH VARIANCE]'}")
print(f"  Test 1 WITHOUT inlining: {stats_t1_no_inline['cv']:.2f}% {'[GOOD]' if stats_t1_no_inline['cv'] < 10 else '[HIGH VARIANCE]'}")
print(f"  Test 2 WITH inlining:    {stats_t2_inline['cv']:.2f}% {'[GOOD]' if stats_t2_inline['cv'] < 10 else '[HIGH VARIANCE]'}")
print(f"  Test 2 WITHOUT inlining: {stats_t2_no_inline['cv']:.2f}% {'[GOOD]' if stats_t2_no_inline['cv'] < 10 else '[HIGH VARIANCE]'}")
print()

print("t-test Results (Welch's t-test for unequal variances):")
print(f"  Test 1:       t = {t_stat_t1:.3f}, df = {df_t1:.2f}")
print(f"  Test 2:       t = {t_stat_t2:.3f}, df = {df_t2:.2f}")
print(f"  CPU Cycles:   t = {t_stat_cycles:.3f}, df = {df_cycles:.2f}")
print(f"  Cache Misses: t = {t_stat_cache:.3f}, df = {df_cache:.2f}")
print(f"  L1 Misses:    t = {t_stat_l1:.3f}, df = {df_l1:.2f}")
print()
print("Note: For 5 samples, t > 2.776 suggests significance at p < 0.05 level")
print()

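# The 2.776 threshold quoted above is the two-tailed 5% critical value of
# Student's t at df = 4, the worst case for two 5-sample groups. If SciPy
# is available, an exact p-value could be computed instead, e.g.:
#   from scipy.stats import t as t_dist
#   p = 2 * t_dist.sf(t_stat_t1, df_t1)
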
# Conclusion
print("=" * 80)
print("CONCLUSION")
print("=" * 80)
print()

# Determine if results are significant
cv_acceptable = all([
    stats_t1_inline['cv'] < 15,
    stats_t1_no_inline['cv'] < 15,
    stats_t2_inline['cv'] < 15,
])

if improvement_t1 > 0 and improvement_t2 > 0:
    print("INLINING OPTIMIZATION IS EFFECTIVE:")
    print(f"  - Test 1 shows {improvement_t1:.2f}% throughput improvement")
    print(f"  - Test 2 shows {improvement_t2:.2f}% throughput improvement")
    print(f"  - CPU cycles reduced by {improvement_cycles:.2f}%")
    print(f"  - Cache misses reduced by {improvement_cache:.2f}%")
    print()

    if cv_acceptable and t_stat_t1 > 1.5:
        print("Results show GOOD CONSISTENCY with acceptable variance.")
    else:
        print("Results show HIGH VARIANCE - consider additional runs for confirmation.")
    print()

    if improvement_cycles >= 1.0:
        print(f"The {improvement_cycles:.2f}% cycle reduction confirms the optimization is effective.")
        print()
        print("RECOMMENDATION: KEEP inlining optimization.")
        print("NEXT STEP: Proceed with 'Batch Tier Checks' optimization.")
    else:
        print("Cycle reduction is marginal. Monitor in production workloads.")
        print()
        print("RECOMMENDATION: Keep inlining but verify with production benchmarks.")
else:
    print("WARNING: INLINING SHOWS NO CLEAR BENEFIT OR REGRESSION")
    print(f"  - Test 1: {improvement_t1:.2f}%")
    print(f"  - Test 2: {improvement_t2:.2f}%")
    print()
    print("RECOMMENDATION: Re-evaluate inlining strategy or investigate variance.")

print()
print("=" * 80)


def analyze(data):
    """Analyze and print statistics"""
    print("=" * 80)
    print("📊 FULL BENCHMARK RESULTS (50 runs)")
    print("=" * 80)

    for scenario in ['json', 'mir', 'vm', 'mixed']:
        print(f"## {scenario.upper()} Scenario")
        print("-" * 80)

        allocators = ['hakmem-baseline', 'hakmem-evolving', 'system']

        # Header
        print(f"{'Allocator':<20} {'Median (ns)':<15} {'P95 (ns)':<15} {'P99 (ns)':<15} {'PF (median)':<15}")
        print("-" * 80)

        results = {}
        for allocator in allocators:
            if allocator not in data[scenario]:
                continue

            latencies = [r['avg_ns'] for r in data[scenario][allocator]]
            page_faults = [r['soft_pf'] for r in data[scenario][allocator]]

            median_ns = statistics.median(latencies)
            p95_ns = statistics.quantiles(latencies, n=20)[18]  # 95th percentile
            p99_ns = statistics.quantiles(latencies, n=100)[98] if len(latencies) >= 100 else max(latencies)
            median_pf = statistics.median(page_faults)

            results[allocator] = median_ns

            print(f"{allocator:<20} {median_ns:<15.1f} {p95_ns:<15.1f} {p99_ns:<15.1f} {median_pf:<15.1f}")

        # Winner analysis
        if 'hakmem-baseline' in results and 'system' in results:
            baseline = results['hakmem-baseline']
            system = results['system']
            improvement = ((system - baseline) / system) * 100

            if improvement > 0:
                print(f"\n🥇 Winner: hakmem-baseline ({improvement:+.1f}% faster than system)")
            elif improvement < -2:  # Allow 2% margin
                print(f"\n🥈 Winner: system ({-improvement:+.1f}% faster than hakmem)")
            else:
                print(f"\n🤝 Tie: hakmem ≈ system (within 2%)")

        print()


if __name__ == '__main__':
    if len(sys.argv) != 2:
        print(f"Usage: {sys.argv[0]} <results.csv>")
        sys.exit(1)

    data = load_results(sys.argv[1])
    analyze(data)