#!/usr/bin/env python3
"""
Statistical analysis of Gatekeeper inlining optimization benchmark results.

Compares BUILD A (with inlining) against BUILD B (without inlining) over two
throughput benchmarks and three perf counters, then prints per-metric
statistics, a summary table, Welch t-statistics and a recommendation.
The report is produced at module level, so importing this file prints it.
"""

import math
import statistics

# Test 1: Standard benchmark (random_mixed 1000000 256 42)
# Format: ops/s (last value in CSV line)
test1_with_inline = [1009752.7, 1003150.9, 967146.5, 1031062.8, 1264682.2]
test1_no_inline = [1084443.4, 830483.4, 1025638.4, 849866.1, 980895.1]

# Test 2: Conservative profile (HAKMEM_TINY_PROFILE=conservative HAKMEM_SS_PREFAULT=0)
test2_with_inline = [906469.6, 1160466.4, 1175722.3, 1034643.5, 1199156.5]
test2_no_inline = [1079955.0, 1215846.1, 1214056.3, 1040608.7, 721006.3]

# Perf data - cycles
perf_cycles_with_inline = [72150892, 71930022, 70943072, 71028571, 71558451]
perf_cycles_no_inline = [75052700, 72509966, 72566977, 72510434, 72740722]

# Perf data - cache misses
perf_cache_with_inline = [257935, 255109, 239513, 253996, 273547]
perf_cache_no_inline = [338291, 279162, 279528, 281449, 301940]

# Perf data - L1 dcache load misses
perf_l1_with_inline = [737567, 722272, 736433, 720829, 746993]
perf_l1_no_inline = [764846, 707294, 748172, 731684, 737196]

# Width of the horizontal rules used throughout the report.
_RULE_WIDTH = 80
# A sample whose CV is below this percentage is labelled [GOOD].
_CV_GOOD_PCT = 10
# Every throughput sample must have a CV below this percentage for the
# conclusion to call the results consistent.
_CV_ACCEPTABLE_PCT = 15


def calc_stats(data):
    """Calculate mean, min, max, sample standard deviation and CV.

    Args:
        data: Non-empty sequence of numbers.

    Returns:
        Dict with keys 'mean', 'min', 'max', 'stdev' and 'cv'. 'cv' is the
        coefficient of variation in percent (stdev / mean * 100). 'stdev'
        and 'cv' are 0 for a single-element sample, and 'cv' is 0 when the
        mean is 0.

    Raises:
        statistics.StatisticsError: If data is empty.
    """
    # Compute each statistic once; the other fields are derived from them.
    mean = statistics.mean(data)
    has_spread = len(data) > 1
    stdev = statistics.stdev(data) if has_spread else 0
    cv = (stdev / mean * 100) if has_spread and mean != 0 else 0
    return {
        'mean': mean,
        'min': min(data),
        'max': max(data),
        'stdev': stdev,
        'cv': cv,
    }


def calc_improvement(with_inline, no_inline):
    """Calculate the percentage change of with_inline relative to no_inline.

    A positive result means with_inline is larger than the baseline. That is
    an improvement for higher-is-better metrics (ops/s); for lower-is-better
    metrics (cycles, cache misses) the caller negates the result.

    Raises:
        ZeroDivisionError: If no_inline is 0.
    """
    return ((with_inline - no_inline) / no_inline) * 100


def t_test_welch(data1, data2):
    """Welch's t-test for unequal variances.

    Args:
        data1: First sample, at least two values.
        data2: Second sample, at least two values.

    Returns:
        Tuple (abs_t, df): the absolute value of the t-statistic (the sign,
        and therefore the direction of the difference, is discarded) and the
        Welch-Satterthwaite degrees of freedom. When both samples have zero
        variance the statistic is 0.0 for equal means and math.inf
        otherwise, and df is reported as n1 + n2 - 2.

    Raises:
        statistics.StatisticsError: If either sample has fewer than two
            values.
    """
    n1, n2 = len(data1), len(data2)
    mean1, mean2 = statistics.mean(data1), statistics.mean(data2)
    var1, var2 = statistics.variance(data1), statistics.variance(data2)

    # Squared standard error contributed by each sample.
    se1_sq = var1 / n1
    se2_sq = var2 / n2
    se_sq = se1_sq + se2_sq

    if se_sq == 0:
        # Both samples are constant: the usual formulas would divide by zero.
        t = 0.0 if mean1 == mean2 else math.inf
        return t, float(n1 + n2 - 2)

    # Calculate t-statistic
    t = (mean1 - mean2) / math.sqrt(se_sq)

    # Degrees of freedom (Welch-Satterthwaite)
    df = se_sq ** 2 / (se1_sq ** 2 / (n1 - 1) + se2_sq ** 2 / (n2 - 1))

    return abs(t), df


def _print_rule(char="="):
    """Print a full-width horizontal rule made of char."""
    print(char * _RULE_WIDTH)


def _print_banner(title):
    """Print title framed by '=' rules and followed by a blank line."""
    _print_rule()
    print(title)
    _print_rule()
    print()


def _print_build_stats(title, stats, unit, spec):
    """Print the statistics block of one build, followed by a blank line.

    unit is the label after Mean/Min/Max; spec is the format spec applied to
    mean, min, max and stdev.
    """
    print(title)
    print(f" Mean {unit}: {stats['mean']:{spec}}")
    print(f" Min {unit}: {stats['min']:{spec}}")
    print(f" Max {unit}: {stats['max']:{spec}}")
    print(f" Std Dev: {stats['stdev']:{spec}}")
    print(f" CV: {stats['cv']:.2f}%")
    print()


def _report_section(heading, unit, spec, with_inline, no_inline,
                    lower_is_better=False):
    """Print one metric section and return its computed values.

    Returns:
        Tuple (stats_with, stats_without, change_pct, abs_t, df). change_pct
        is negated when lower_is_better so that a positive value always
        means BUILD A is better.
    """
    print(heading)
    _print_rule("-")

    stats_with = calc_stats(with_inline)
    stats_without = calc_stats(no_inline)
    change_pct = calc_improvement(stats_with['mean'], stats_without['mean'])
    if lower_is_better:
        # For cycles/misses, lower is better, so negate the improvement
        change_pct = -change_pct

    _print_build_stats("BUILD A (WITH inlining):", stats_with, unit, spec)
    _print_build_stats("BUILD B (WITHOUT inlining):", stats_without, unit, spec)

    if lower_is_better:
        print(f"REDUCTION: {change_pct:+.2f}% (lower is better)")
    else:
        print(f"IMPROVEMENT: {change_pct:+.2f}%")

    t_stat, df = t_test_welch(with_inline, no_inline)
    print(f"t-statistic: {t_stat:.3f}, df: {df:.2f}")
    print()
    return stats_with, stats_without, change_pct, t_stat, df


def _print_mean_row(label, stats_with, stats_without, change_pct):
    """Print a summary-table row comparing the two means.

    The Difference column is the raw A - B delta, while change_pct is passed
    in by the caller (already negated for lower-is-better metrics), so the
    two columns have opposite signs on the perf-counter rows.
    """
    delta = stats_with['mean'] - stats_without['mean']
    print(f"{label:<30} {stats_with['mean']:>13,.0f} {stats_without['mean']:>13,.0f} {delta:>10,.0f} {change_pct:>9.2f}%")


def _print_spread_rows(prefix, stats_with, stats_without):
    """Print the Std Dev and CV summary-table rows for one throughput test."""
    stdev_label = f"{prefix}: Std Dev"
    cv_label = f"{prefix}: CV %"
    stdev_delta = stats_with['stdev'] - stats_without['stdev']
    print(f"{stdev_label:<30} {stats_with['stdev']:>13,.0f} {stats_without['stdev']:>13,.0f} {stdev_delta:>10,.0f} {'':>10}")
    print(f"{cv_label:<30} {stats_with['cv']:>12.2f}% {stats_without['cv']:>12.2f}% {'':>12} {'':>10}")


def _print_cv_assessment(rows):
    """Print one CV line per (label, stats) pair using a single threshold."""
    for label, stats in rows:
        verdict = '[GOOD]' if stats['cv'] < _CV_GOOD_PCT else '[HIGH VARIANCE]'
        print(f" {label}: {stats['cv']:.2f}% {verdict}")


def _print_t_results(rows):
    """Print one t-test line per (label, abs_t, df) triple."""
    for label, t_stat, df in rows:
        print(f" {label}: t = {t_stat:.3f}, df = {df:.2f}")


_print_banner("GATEKEEPER INLINING OPTIMIZATION - PERFORMANCE ANALYSIS")

# Test 1 Analysis
(stats_t1_inline, stats_t1_no_inline,
 improvement_t1, t_stat_t1, df_t1) = _report_section(
    "TEST 1: Standard Benchmark (random_mixed 1000000 256 42)",
    "ops/s", ",.2f", test1_with_inline, test1_no_inline)

# Test 2 Analysis
(stats_t2_inline, stats_t2_no_inline,
 improvement_t2, t_stat_t2, df_t2) = _report_section(
    "TEST 2: Conservative Profile (HAKMEM_TINY_PROFILE=conservative)",
    "ops/s", ",.2f", test2_with_inline, test2_no_inline)

# Perf Analysis - Cycles
(stats_cycles_inline, stats_cycles_no_inline,
 improvement_cycles, t_stat_cycles, df_cycles) = _report_section(
    "PERF ANALYSIS: CPU CYCLES",
    "cycles", ",.0f", perf_cycles_with_inline, perf_cycles_no_inline,
    lower_is_better=True)

# Perf Analysis - Cache Misses
(stats_cache_inline, stats_cache_no_inline,
 improvement_cache, t_stat_cache, df_cache) = _report_section(
    "PERF ANALYSIS: CACHE MISSES",
    "misses", ",.0f", perf_cache_with_inline, perf_cache_no_inline,
    lower_is_better=True)

# Perf Analysis - L1 Cache Misses
(stats_l1_inline, stats_l1_no_inline,
 improvement_l1, t_stat_l1, df_l1) = _report_section(
    "PERF ANALYSIS: L1 D-CACHE LOAD MISSES",
    "misses", ",.0f", perf_l1_with_inline, perf_l1_no_inline,
    lower_is_better=True)

# Summary Table
_print_banner("SUMMARY TABLE")
print(f"{'Metric':<30} {'BUILD A':<15} {'BUILD B':<15} {'Difference':<12} {'% Change':>10}")
_print_rule("-")
_print_mean_row("Test 1: Avg ops/s", stats_t1_inline, stats_t1_no_inline, improvement_t1)
_print_spread_rows("Test 1", stats_t1_inline, stats_t1_no_inline)
print()
_print_mean_row("Test 2: Avg ops/s", stats_t2_inline, stats_t2_no_inline, improvement_t2)
_print_spread_rows("Test 2", stats_t2_inline, stats_t2_no_inline)
print()
_print_mean_row("CPU Cycles (avg)", stats_cycles_inline, stats_cycles_no_inline, improvement_cycles)
_print_mean_row("Cache Misses (avg)", stats_cache_inline, stats_cache_no_inline, improvement_cache)
_print_mean_row("L1 D-Cache Misses (avg)", stats_l1_inline, stats_l1_no_inline, improvement_l1)
print()

# Statistical Significance Analysis
_print_banner("STATISTICAL SIGNIFICANCE ANALYSIS")
print("Coefficient of Variation (CV) Assessment:")
_print_cv_assessment([
    ("Test 1 WITH inlining", stats_t1_inline),
    ("Test 1 WITHOUT inlining", stats_t1_no_inline),
    ("Test 2 WITH inlining", stats_t2_inline),
    ("Test 2 WITHOUT inlining", stats_t2_no_inline),
])
print()
print("t-test Results (Welch's t-test for unequal variances):")
_print_t_results([
    ("Test 1", t_stat_t1, df_t1),
    ("Test 2", t_stat_t2, df_t2),
    ("CPU Cycles", t_stat_cycles, df_cycles),
    ("Cache Misses", t_stat_cache, df_cache),
    ("L1 Misses", t_stat_l1, df_l1),
])
print()
# 2.776 is the two-tailed 5% critical value of Student's t at df = 4, the
# smallest Welch df possible for two samples of five; it is therefore a
# conservative bound whenever the df printed above is larger.
print("Note: For 5 samples, t > 2.776 suggests significance at p < 0.05 level")
print()

# Conclusion
_print_banner("CONCLUSION")

# Determine if results are significant.
# All four throughput samples must be checked, including Test 2 WITHOUT
# inlining; leaving one out would let a noisy baseline pass unnoticed.
cv_acceptable = all(
    stats['cv'] < _CV_ACCEPTABLE_PCT
    for stats in (
        stats_t1_inline,
        stats_t1_no_inline,
        stats_t2_inline,
        stats_t2_no_inline,
    )
)

if improvement_t1 > 0 and improvement_t2 > 0:
    print("INLINING OPTIMIZATION IS EFFECTIVE:")
    print(f" - Test 1 shows {improvement_t1:.2f}% throughput improvement")
    print(f" - Test 2 shows {improvement_t2:.2f}% throughput improvement")
    print(f" - CPU cycles reduced by {improvement_cycles:.2f}%")
    print(f" - Cache misses reduced by {improvement_cache:.2f}%")
    print()

    # NOTE(review): 1.5 is a looser bar than the 2.776 significance threshold
    # quoted in the report - confirm this is the intended consistency gate.
    if cv_acceptable and t_stat_t1 > 1.5:
        print("Results show GOOD CONSISTENCY with acceptable variance.")
    else:
        print("Results show HIGH VARIANCE - consider additional runs for confirmation.")
    print()

    if improvement_cycles >= 1.0:
        print(f"The {improvement_cycles:.2f}% cycle reduction confirms the optimization is effective.")
        print()
        print("RECOMMENDATION: KEEP inlining optimization.")
        print("NEXT STEP: Proceed with 'Batch Tier Checks' optimization.")
    else:
        print("Cycle reduction is marginal. Monitor in production workloads.")
        print()
        print("RECOMMENDATION: Keep inlining but verify with production benchmarks.")
else:
    print("WARNING: INLINING SHOWS NO CLEAR BENEFIT OR REGRESSION")
    print(f" - Test 1: {improvement_t1:.2f}%")
    print(f" - Test 2: {improvement_t2:.2f}%")
    print()
    print("RECOMMENDATION: Re-evaluate inlining strategy or investigate variance.")

print()
_print_rule()