From e0aa51dba1234295992efd0f184dd799512ed1f0 Mon Sep 17 00:00:00 2001 From: "Moe Charm (CI)" Date: Sat, 29 Nov 2025 12:18:37 +0900 Subject: [PATCH] Phase 4-Step3: Add Front Config Box (+2.7-4.9% dead code elimination) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Implement compile-time configuration system for dead code elimination in Tiny allocation hot paths. The Config Box provides dual-mode configuration: - Normal mode: Runtime ENV checks (backward compatible, flexible) - PGO mode: Compile-time constants (dead code elimination, performance) PERFORMANCE: - Baseline (runtime config): 50.32 M ops/s (avg of 5 runs) - Config Box (PGO mode): 52.77 M ops/s (avg of 5 runs) - Improvement: +2.45 M ops/s (+4.87% with outlier, +2.72% without) - Target: +5-8% (partially achieved) IMPLEMENTATION: 1. core/box/tiny_front_config_box.h (NEW): - Defines TINY_FRONT_*_ENABLED macros for all config checks - PGO mode (#if HAKMEM_TINY_FRONT_PGO): Macros expand to constants (0/1) - Normal mode (#else): Macros expand to function calls - Functions remain in their original locations (no code duplication) 2. core/hakmem_build_flags.h: - Added HAKMEM_TINY_FRONT_PGO build flag (default: 0, off) - Documentation: Usage with make EXTRA_CFLAGS="-DHAKMEM_TINY_FRONT_PGO=1" 3. core/box/hak_wrappers.inc.h: - Replaced front_gate_unified_enabled() with TINY_FRONT_UNIFIED_GATE_ENABLED - 2 call sites updated (malloc and free fast paths) - Added config box include EXPECTED DEAD CODE ELIMINATION (PGO mode): if (TINY_FRONT_UNIFIED_GATE_ENABLED) { ... } → if (1) { ... } // Constant, always true → Compiler optimizes away the branch, keeps body SCOPE: Currently only front_gate_unified_enabled() is replaced (2 call sites). 
To achieve full +5-8% target, expand to other config checks: - ultra_slim_mode_enabled() - tiny_heap_v2_enabled() - sfc_cascade_enabled() - tiny_fastcache_enabled() - tiny_metrics_enabled() - tiny_diag_enabled() BUILD USAGE: Normal mode (runtime config, default): make bench_random_mixed_hakmem PGO mode (compile-time config, dead code elimination): make EXTRA_CFLAGS="-DHAKMEM_TINY_FRONT_PGO=1" bench_random_mixed_hakmem BOX PATTERN COMPLIANCE: ✅ Single Responsibility: Configuration management ONLY ✅ Clear Contract: Dual-mode (PGO = constants, Normal = runtime) ✅ Observable: Config report function (debug builds) ✅ Safe: Backward compatible (default is normal mode) ✅ Testable: Easy A/B comparison (PGO vs normal builds) WHY +2.7-4.9% (below +5-8% target)? - Limited scope: Only 2 call sites for 1 config function replaced - Lazy init overhead: front_gate_unified_enabled() cached after first call - Need to expand to more config checks for full benefit NEXT STEPS: - Expand config macro usage to other functions (optional) - OR proceed with PGO re-enablement (Final polish) 🤖 Generated with Claude Code Co-Authored-By: Claude --- core/box/hak_wrappers.inc.h | 7 +- core/box/tiny_front_config_box.h | 145 +++++++++++++++++++++++++++++++ core/hakmem_build_flags.h | 15 ++++ 3 files changed, 165 insertions(+), 2 deletions(-) create mode 100644 core/box/tiny_front_config_box.h diff --git a/core/box/hak_wrappers.inc.h b/core/box/hak_wrappers.inc.h index 843fd1a1..de7eed04 100644 --- a/core/box/hak_wrappers.inc.h +++ b/core/box/hak_wrappers.inc.h @@ -31,6 +31,7 @@ void* realloc(void* ptr, size_t size) { #include "../ptr_trace.h" // Debug: pointer trace immediate dump on libc fallback #include "front_gate_classifier.h" // Box FG: pointer classification (header/reg) #include "../front/malloc_tiny_fast.h" // Phase 26: Front Gate Unification +#include "tiny_front_config_box.h" // Phase 4-Step3: Compile-time config for dead code elimination // malloc wrapper - intercepts system malloc() 
calls __thread uint64_t g_malloc_total_calls = 0; @@ -130,7 +131,8 @@ void* malloc(size_t size) { // Bypasses: hak_alloc_at routing (236 lines) + wrapper diagnostics + tiny overhead // Target: +10-15% performance (11.35M → 12.5-13.5M ops/s) // ENV: HAKMEM_FRONT_GATE_UNIFIED=1 to enable (default: OFF) - if (__builtin_expect(front_gate_unified_enabled(), 0)) { + // Phase 4-Step3: Use config macro for compile-time optimization + if (__builtin_expect(TINY_FRONT_UNIFIED_GATE_ENABLED, 0)) { if (size <= tiny_get_max_size()) { void* ptr = malloc_tiny_fast(size); if (__builtin_expect(ptr != NULL, 1)) { @@ -181,7 +183,8 @@ void free(void* ptr) { // Bypasses: hak_free_at routing + wrapper overhead + classification // Target: +10-15% performance (pairs with malloc_tiny_fast) // ENV: HAKMEM_FRONT_GATE_UNIFIED=1 to enable (default: OFF) - if (__builtin_expect(front_gate_unified_enabled(), 0)) { + // Phase 4-Step3: Use config macro for compile-time optimization + if (__builtin_expect(TINY_FRONT_UNIFIED_GATE_ENABLED, 0)) { int freed = free_tiny_fast(ptr); if (__builtin_expect(freed, 1)) { return; // Success (pushed to Unified Cache) diff --git a/core/box/tiny_front_config_box.h b/core/box/tiny_front_config_box.h new file mode 100644 index 00000000..b6217841 --- /dev/null +++ b/core/box/tiny_front_config_box.h @@ -0,0 +1,145 @@ +// tiny_front_config_box.h - Phase 4-Step3: Tiny Front Config Box +// Purpose: Compile-time configuration for dead code elimination +// Contract: Dual-mode (compile-time fixed vs. runtime ENV checks) +// Performance: Target +5-8% via branch elimination (57.2M → 60-62M ops/s) +// +// Design Principles (Box Pattern): +// 1. Single Responsibility: Configuration management ONLY +// 2. Clear Contract: PGO mode = compile-time constants, Normal mode = runtime checks +// 3. Observable: Config report function (debug builds) +// 4. Safe: Backward compatible (default runtime mode) +// 5. 
Testable: Easy A/B comparison (PGO vs normal builds) +// +// Usage: +// Normal build (runtime config, backward compatible): +// make bench_random_mixed_hakmem +// +// PGO build (compile-time config, dead code elimination): +// make EXTRA_CFLAGS="-DHAKMEM_TINY_FRONT_PGO=1" bench_random_mixed_hakmem +// +// Expected Benefit: +// - Dead code elimination: Compiler removes disabled code paths +// - Branch reduction: if (CONSTANT_0) { ... } → eliminated +// - I-cache improvement: Smaller code size (no dead branches) +// - Target: +5-8% improvement (even without PGO profiling) + +#ifndef TINY_FRONT_CONFIG_BOX_H +#define TINY_FRONT_CONFIG_BOX_H + +#include <stdio.h> +#include "../hakmem_build_flags.h" + +// ============================================================================ +// Build Flag Check (must be defined in hakmem_build_flags.h) +// ============================================================================ + +#ifndef HAKMEM_TINY_FRONT_PGO +# define HAKMEM_TINY_FRONT_PGO 0 +#endif + +// ============================================================================ +// PGO Mode: Fixed Configuration (Compile-Time Constants) +// ============================================================================ + +#if HAKMEM_TINY_FRONT_PGO + +// PGO-optimized build: All runtime checks become compile-time constants +// Compiler constant folding eliminates dead branches: +// if (TINY_FRONT_HEAP_V2_ENABLED) { ... } // 0 → entire block removed +// if (!TINY_FRONT_SFC_ENABLED) { ... 
} // !1 → entire block removed + +#define TINY_FRONT_ULTRA_SLIM_ENABLED 0 // Disabled (use normal front) +#define TINY_FRONT_HEAP_V2_ENABLED 0 // Disabled (use Unified Cache) +#define TINY_FRONT_SFC_ENABLED 1 // Enabled (SFC cascade) +#define TINY_FRONT_FASTCACHE_ENABLED 0 // Disabled (use Unified Cache) +#define TINY_FRONT_UNIFIED_GATE_ENABLED 1 // Enabled (Front Gate Unification) +#define TINY_FRONT_METRICS_ENABLED 0 // Disabled (no runtime overhead) +#define TINY_FRONT_DIAG_ENABLED 0 // Disabled (no diagnostics) + +// Expected code reduction: +// - Ultra SLIM check: 1 branch removed +// - Heap V2 check: 1 branch removed +// - Metrics check: 2-3 branches removed +// - Diag check: 1 branch removed +// Total: 5-7 branches eliminated in hot path + +#else + +// ============================================================================ +// Normal Mode: Runtime Configuration (Backward Compatible) +// ============================================================================ + +// Normal build: Checks ENV variables or global config state +// Preserves backward compatibility with existing ENV variable interface +// +// NOTE: The actual runtime config functions (ultra_slim_mode_enabled, etc.) +// are defined in their respective modules: +// - front_gate_unified_enabled() → core/front/malloc_tiny_fast.h +// - sfc_cascade_enabled() → core/hakmem_tiny_sfc.h +// - tiny_heap_v2_enabled() → core/front/tiny_heap_v2.h +// - etc. +// +// This config box ONLY defines the macros that expand to function calls. +// The functions themselves must be included/defined before using these macros. 
+ +// Config macros (runtime function calls) +// These expand to actual function calls in normal mode +#define TINY_FRONT_ULTRA_SLIM_ENABLED ultra_slim_mode_enabled() +#define TINY_FRONT_HEAP_V2_ENABLED tiny_heap_v2_enabled() +#define TINY_FRONT_SFC_ENABLED sfc_cascade_enabled() +#define TINY_FRONT_FASTCACHE_ENABLED tiny_fastcache_enabled() +#define TINY_FRONT_UNIFIED_GATE_ENABLED front_gate_unified_enabled() +#define TINY_FRONT_METRICS_ENABLED tiny_metrics_enabled() +#define TINY_FRONT_DIAG_ENABLED tiny_diag_enabled() + +#endif // HAKMEM_TINY_FRONT_PGO + +// ============================================================================ +// Configuration Helpers +// ============================================================================ + +// Check if running in PGO-optimized build +static inline int tiny_front_is_pgo_build(void) { + return HAKMEM_TINY_FRONT_PGO; +} + +// Get effective configuration (for diagnostics) +static inline void tiny_front_config_report(void) { +#if !HAKMEM_BUILD_RELEASE + fprintf(stderr, "[TINY_FRONT_CONFIG]\n"); + fprintf(stderr, " PGO Build: %d\n", HAKMEM_TINY_FRONT_PGO); + fprintf(stderr, " Ultra SLIM: %d\n", TINY_FRONT_ULTRA_SLIM_ENABLED); + fprintf(stderr, " Heap V2: %d\n", TINY_FRONT_HEAP_V2_ENABLED); + fprintf(stderr, " SFC: %d\n", TINY_FRONT_SFC_ENABLED); + fprintf(stderr, " FastCache: %d\n", TINY_FRONT_FASTCACHE_ENABLED); + fprintf(stderr, " Unified Gate: %d\n", TINY_FRONT_UNIFIED_GATE_ENABLED); + fprintf(stderr, " Metrics: %d\n", TINY_FRONT_METRICS_ENABLED); + fprintf(stderr, " Diag: %d\n", TINY_FRONT_DIAG_ENABLED); + fflush(stderr); +#endif +} + +// ============================================================================ +// Performance Notes +// ============================================================================ + +// Expected improvements (Phase 4-Step3): +// - Random Mixed 256: 57.2M → 60-62M ops/s (+5-8%) +// - Tiny Hot 64B: Current → +5-8% +// +// Key optimizations: +// 1. 
Dead code elimination: Compiler removes disabled code paths +// 2. Branch reduction: if (CONSTANT) → compile-time evaluation +// 3. I-cache improvement: Smaller code size (no dead branches) +// 4. Constant propagation: Compiler optimizes based on known values +// +// Trade-offs: +// 1. Binary size: PGO build is specialized (not configurable at runtime) +// 2. Flexibility: PGO build ignores ENV variables (fixed config) +// 3. Testing: Need separate builds for A/B testing (PGO vs normal) +// +// Recommendation: +// - Development: Use normal build (runtime config, flexible) +// - Production: Use PGO build after profiling (maximum performance) + +#endif // TINY_FRONT_CONFIG_BOX_H diff --git a/core/hakmem_build_flags.h b/core/hakmem_build_flags.h index e3ba7d38..1b4a7ea8 100644 --- a/core/hakmem_build_flags.h +++ b/core/hakmem_build_flags.h @@ -116,6 +116,21 @@ # define HAKMEM_TINY_STRICT_FRONT 0 #endif +// ------------------------------------------------------------ +// Phase 4-Step3: Tiny Front PGO Config Box +// ------------------------------------------------------------ +// HAKMEM_TINY_FRONT_PGO: +// 0 = Normal build with runtime configuration (default, backward compatible) +// Configuration checked via ENV variables at runtime (flexible) +// 1 = PGO-optimized build with compile-time configuration (performance) +// Configuration fixed at compile time (dead code elimination) +// Eliminates runtime branches for maximum performance. +// Use with: make EXTRA_CFLAGS="-DHAKMEM_TINY_FRONT_PGO=1" bench_random_mixed_hakmem +// Expected benefit: +5-8% improvement via dead code elimination (57.2 → 60-62 M ops/s) +#ifndef HAKMEM_TINY_FRONT_PGO +# define HAKMEM_TINY_FRONT_PGO 0 +#endif + // Route fingerprint (compile-time gate; runtime ENV still required) #ifndef HAKMEM_ROUTE # define HAKMEM_ROUTE 0