Files
hakmem/core/hakmem_build_flags.h
Moe Charm (CI) 84f5034e45 Phase 68: PGO training set diversification (seed/WS expansion)
Changes:
- scripts/box/pgo_fast_profile_config.sh: Expanded WS patterns (3→5) and seeds (1→3)
  for reduced overfitting and better production workload representativeness
- PERFORMANCE_TARGETS_SCORECARD.md: Phase 68 baseline promoted (61.614M = 50.93%)
- CURRENT_TASK.md: Phase 68 marked complete, Phase 67a (layout tax forensics) set Active

Results:
- 10-run verification: +1.19% vs Phase 66 baseline (GO, >+1.0% threshold)
- M1 milestone: 50.93% of mimalloc (target 50%, exceeded by +0.93pp)
- Stability: 10-run mean/median with <2.1% CV

🤖 Generated with Claude Code

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-17 21:08:17 +09:00

451 lines
19 KiB
C

// hakmem_build_flags.h - Centralized compile-time feature switches
// Purpose: Define all build-time toggles in one place with safe defaults.
// Usage: Include from common public headers (e.g., hakmem.h / hakmem_tiny.h).
#ifndef HAKMEM_BUILD_FLAGS_H
#define HAKMEM_BUILD_FLAGS_H
// ------------------------------------------------------------
// Phase 2: Headerless Mode Override
// ------------------------------------------------------------
// A non-zero HAKMEM_TINY_HEADERLESS discards whatever value
// HAKMEM_TINY_HEADER_CLASSIDX had so far and pins it to 0.
// The #ifndef fallback for HAKMEM_TINY_HEADER_CLASSIDX later in this
// header then leaves that 0 in place.
#ifdef HAKMEM_TINY_HEADERLESS
#  if HAKMEM_TINY_HEADERLESS
#    undef  HAKMEM_TINY_HEADER_CLASSIDX
#    define HAKMEM_TINY_HEADER_CLASSIDX 0
#  endif
#endif
// ------------------------------------------------------------
// Release/debug detection
// ------------------------------------------------------------
// HAKMEM_BUILD_RELEASE mirrors NDEBUG unless the build already set it:
//   NDEBUG defined     -> 1 (release-like build)
//   NDEBUG not defined -> 0
#if !defined(HAKMEM_BUILD_RELEASE)
#  ifdef NDEBUG
#    define HAKMEM_BUILD_RELEASE 1
#  else
#    define HAKMEM_BUILD_RELEASE 0
#  endif
#endif
// ------------------------------------------------------------
// Phase 35-A: Benchmark Minimal Mode
// ------------------------------------------------------------
// HAKMEM_BENCH_MINIMAL removes gate-function overhead for benchmarks.
//   1: gate functions return compile-time constants (no lazy init check)
//   0: normal runtime gate behavior (default)
// Benchmark-only binaries are built with -DHAKMEM_BENCH_MINIMAL=1.
#if !defined(HAKMEM_BENCH_MINIMAL)
#  define HAKMEM_BENCH_MINIMAL 0
#endif
// ------------------------------------------------------------
// Instrumentation & counters (compile-time)
// ------------------------------------------------------------
// HAKMEM_DEBUG_COUNTERS: lightweight path/debug counters, compiled out at 0.
// When not preset it follows NDEBUG: 0 with NDEBUG (release builds),
// 1 without it (debug builds).
#if !defined(HAKMEM_DEBUG_COUNTERS)
#  ifdef NDEBUG
#    define HAKMEM_DEBUG_COUNTERS 0
#  else
#    define HAKMEM_DEBUG_COUNTERS 1
#  endif
#endif
// HAKMEM_DEBUG_MEMORY: extended memory profiling, compiled out at 0 (default).
#if !defined(HAKMEM_DEBUG_MEMORY)
#  define HAKMEM_DEBUG_MEMORY 0
#endif
// HAKMEM_TINY_REFILL_OPT: Tiny refill optimization helpers (header-only).
// Fallback: 1.
#if !defined(HAKMEM_TINY_REFILL_OPT)
#  define HAKMEM_TINY_REFILL_OPT 1
#endif
// HAKMEM_TINY_P0_BATCH_REFILL: batch refill P0, kept switchable for A/B runs.
// Fallback: 0.
#if !defined(HAKMEM_TINY_P0_BATCH_REFILL)
#  define HAKMEM_TINY_P0_BATCH_REFILL 0
#endif
// HAKMEM_TINY_PHASE6_BOX_REFACTOR: Box refactor (Phase 6-1.7).
// Usually injected from the build system; fallback here is 1.
#if !defined(HAKMEM_TINY_PHASE6_BOX_REFACTOR)
#  define HAKMEM_TINY_PHASE6_BOX_REFACTOR 1
#endif
// HAKMEM_TINY_USE_SUPERSLAB: SuperSlab backend toggle (compile-time).
// Default: 1 (ON) - SuperSlab is the core architecture.
// Use 0 only for legacy/compat testing.
#if !defined(HAKMEM_TINY_USE_SUPERSLAB)
#  define HAKMEM_TINY_USE_SUPERSLAB 1
#endif
// ------------------------------------------------------------
// Phase 7: Region-ID Direct Lookup (Header-based optimization)
// ------------------------------------------------------------
// Phase 7 Task 1: Header-based class_idx for O(1) free
// Default: ON (1) when the build does not set it.
// Headerless builds (non-zero HAKMEM_TINY_HEADERLESS) force it to 0 through
// the override near the top of this header, which takes precedence over this
// fallback because the macro is then already defined.
// Build: make HEADER_CLASSIDX=1 or make phase7
// NOTE(review): an earlier revision of this comment said "Default: OFF";
// the fallback value below is 1 - confirm 1 is the intended default.
#if !defined(HAKMEM_TINY_HEADER_CLASSIDX)
#  define HAKMEM_TINY_HEADER_CLASSIDX 1
#endif
// Phase 7 Task 2: Aggressive inline TLS cache access
// Default: OFF (enable after full validation in Task 5)
// Build: make AGGRESSIVE_INLINE=1 or make phase7
// Requires: HAKMEM_TINY_HEADER_CLASSIDX=1 (not enforced by this header)
#if !defined(HAKMEM_TINY_AGGRESSIVE_INLINE)
#  define HAKMEM_TINY_AGGRESSIVE_INLINE 0
#endif
// HAKMEM_TINY_INLINE_SLL: inline TLS SLL pop (experimental, A/B only).
// Default: OFF (0), which keeps the Box TLS-SLL API as the standard path.
// Opt in explicitly with -DHAKMEM_TINY_INLINE_SLL=1 (bench/debug only).
#if !defined(HAKMEM_TINY_INLINE_SLL)
#  define HAKMEM_TINY_INLINE_SLL 0
#endif
// Phase 1A3: HAKMEM_TINY_HEADER_WRITE_ALWAYS_INLINE
// Always-inline tiny_region_id_write_header().
// Default: OFF (0) - enable after A/B validation.
// Purpose: force inline expansion of the header write to reduce alloc path
//          overhead.
// Expected impact: +0.5-2% on Mixed workloads
// Build: make EXTRA_CFLAGS=-DHAKMEM_TINY_HEADER_WRITE_ALWAYS_INLINE=1 [target]
#if !defined(HAKMEM_TINY_HEADER_WRITE_ALWAYS_INLINE)
#  define HAKMEM_TINY_HEADER_WRITE_ALWAYS_INLINE 0
#endif
// Phase 7 Task 3: HAKMEM_TINY_PREWARM_TLS - pre-warm TLS cache at init.
// Default: OFF (enable after implementation)
// Build: make PREWARM_TLS=1 or make phase7
#if !defined(HAKMEM_TINY_PREWARM_TLS)
#  define HAKMEM_TINY_PREWARM_TLS 0
#endif
// ------------------------------------------------------------
// Phase 1: Headerless Optimization - TLS SuperSlab Hint Cache
// ------------------------------------------------------------
// Purpose: Accelerate ptr→SuperSlab lookup in Headerless mode
// Default: 0 (disabled during development and testing)
// Target: 1 (enabled after validation in Phase 1 rollout)
//
// Performance Impact:
// - Cache hit: 2-5 cycles (vs 10-50 cycles for hak_super_lookup)
// - Expected hit rate: 85-95% (single-threaded), 70-85% (multi-threaded)
// - Expected throughput improvement: 15-20%
//
// Memory Overhead:
// - 112 bytes per thread (TLS)
// - Negligible for typical workloads (1000 threads = 112KB)
//
// Dependencies:
// - Requires HAKMEM_TINY_HEADERLESS=1 (hint is no-op in header mode)
// - No other dependencies (self-contained Box)
//
// Build: make EXTRA_CFLAGS="-DHAKMEM_TINY_SS_TLS_HINT=1"
#ifndef HAKMEM_TINY_SS_TLS_HINT
# define HAKMEM_TINY_SS_TLS_HINT 0
#endif
// Validation: the hint Box is only active in Headerless mode.
// Headerless mode means HAKMEM_TINY_HEADERLESS is defined AND non-zero (the
// same test the Headerless override in this header uses). Checking only
// defined() would stay silent for -DHAKMEM_TINY_HEADERLESS=0, which is
// header mode and therefore a no-op configuration for the hint.
// NOTE: #warning is a GCC/Clang extension before C23.
#if HAKMEM_TINY_SS_TLS_HINT && !(defined(HAKMEM_TINY_HEADERLESS) && HAKMEM_TINY_HEADERLESS)
#warning "HAKMEM_TINY_SS_TLS_HINT enabled but HAKMEM_TINY_HEADERLESS is not enabled - hint will have no effect"
#endif
// HAKMEM_DEBUG_VERBOSE: runtime verbosity (printf-heavy diagnostics).
// Leave at 0 for benchmark builds.
#if !defined(HAKMEM_DEBUG_VERBOSE)
#  define HAKMEM_DEBUG_VERBOSE 0
#endif
// HAKMEM_TINY_SAFE_FREE: Tiny/Mid safety checks on the free path
// (mincore header validation).
//   0 = performance (boundary-only)   [default]
//   1 = strict (mincore for all)
#if !defined(HAKMEM_TINY_SAFE_FREE)
#  define HAKMEM_TINY_SAFE_FREE 0
#endif
// Phase 10: Aggressive refill count defaults (tunable via env vars)
// Goal: fewer backend transitions by refilling in larger batches.
// Related env vars and their documented defaults:
//   HAKMEM_TINY_REFILL_COUNT     : global default (128)
//   HAKMEM_TINY_REFILL_COUNT_HOT : class 0-3      (128)
//   HAKMEM_TINY_REFILL_COUNT_MID : class 4-7      (96)
// Larson Fix (Priority 1): raised from 64 to 128 to reduce lock contention.
// Expected impact: lock frequency reduction 19K → ~1.6K locks/sec (12x)
// NOTE: Multi-threaded Larson has a pre-existing crash bug (not caused by
//       this change).
#if !defined(HAKMEM_TINY_REFILL_DEFAULT)
#  define HAKMEM_TINY_REFILL_DEFAULT 128
#endif
// ------------------------------------------------------------
// Tiny front architecture toggles (compile-time defaults)
// ------------------------------------------------------------
// HAKMEM_TINY_USE_NEW_3LAYER: new 3-layer Tiny front, selectable for A/B
// through a build flag. Fallback: 0.
#if !defined(HAKMEM_TINY_USE_NEW_3LAYER)
#  define HAKMEM_TINY_USE_NEW_3LAYER 0
#endif
// Minimal/strict front variants (bench/debug only).
// Fallbacks: HAKMEM_TINY_MINIMAL_FRONT = 1, HAKMEM_TINY_STRICT_FRONT = 0.
// NOTE(review): MINIMAL_FRONT is described as bench/debug only yet falls
// back to 1 - confirm this is intended.
#if !defined(HAKMEM_TINY_MINIMAL_FRONT)
#  define HAKMEM_TINY_MINIMAL_FRONT 1
#endif
#if !defined(HAKMEM_TINY_STRICT_FRONT)
#  define HAKMEM_TINY_STRICT_FRONT 0
#endif
// ------------------------------------------------------------
// Phase 4-Step3: Tiny Front PGO Config Box
// ------------------------------------------------------------
// HAKMEM_TINY_FRONT_PGO:
//   0 = Normal build with runtime configuration (default, backward compatible).
//       Configuration is checked via ENV variables at runtime (flexible).
//   1 = PGO-optimized build with compile-time configuration (performance).
//       Configuration is fixed at compile time (dead code elimination),
//       eliminating runtime branches for maximum performance.
//       Use with: make CFLAGS="-DHAKMEM_TINY_FRONT_PGO=1" bench_random_mixed_hakmem
//       Expected benefit: +5-8% improvement via dead code elimination
//       (57.2 → 60-62 M ops/s)
#if !defined(HAKMEM_TINY_FRONT_PGO)
#  define HAKMEM_TINY_FRONT_PGO 0
#endif
// ------------------------------------------------------------
// Phase 5-Step3: Mid/Large PGO Config Box
// ------------------------------------------------------------
// HAKMEM_MID_LARGE_PGO:
//   0 = Normal build with runtime configuration (default, backward compatible).
//       Configuration is checked via ENV variables at runtime (flexible).
//   1 = PGO-optimized build with compile-time configuration (performance).
//       Configuration is fixed at compile time (dead code elimination),
//       eliminating runtime branches for Mid/Large allocation paths.
//       Use with: make EXTRA_CFLAGS="-DHAKMEM_MID_LARGE_PGO=1" bench_random_mixed_hakmem
//       Expected benefit: +2-4% improvement via dead code elimination
#if !defined(HAKMEM_MID_LARGE_PGO)
#  define HAKMEM_MID_LARGE_PGO 0
#endif
// HAKMEM_ROUTE: route fingerprint (compile-time gate; the runtime ENV is
// still required). Fallback: 0.
#if !defined(HAKMEM_ROUTE)
#  define HAKMEM_ROUTE 0
#endif
// Phase 63: FAST Profile-Fixed Build (compile-time constant gates)
// HAKMEM_FAST_PROFILE_FIXED fixes all MIXED_TINYV3_C7_SAFE gates to
// compile-time constants.
//   1: top 5-8 gates (tiny_front_v3_enabled, front_fastlane_enabled, etc.)
//      return compile-time constants and the learning layer is disabled
//      (DCE expected +5-10%)
//   0: normal runtime gate behavior (default, backward compatible)
// Usage: build with -DHAKMEM_FAST_PROFILE_FIXED=1 for speed-first FAST
//        binaries. Only for FAST builds; Standard/OBSERVE keep runtime gates
//        unchanged.
#if !defined(HAKMEM_FAST_PROFILE_FIXED)
#  define HAKMEM_FAST_PROFILE_FIXED 0
#endif
// Phase 64: Backend Pruning (compile-time unreachable code elimination)
// HAKMEM_FAST_PROFILE_PRUNE_BACKENDS disables unused backends in the Mixed
// workload.
//   1: backend gates (mid_v3_enabled, pool_v2_enabled, etc.) return false at
//      compile time; LTO DCE eliminates unreachable code paths
//      (expected +5-10%)
//   0: normal runtime gate behavior (default, backward compatible)
// Usage: build with -DHAKMEM_FAST_PROFILE_PRUNE_BACKENDS=1 for ultra-fast
//        FAST binaries.
// Backends disabled: MID_V3, POOL_V2, SMALL_HEAP_V4, LEARNER, etc.
#if !defined(HAKMEM_FAST_PROFILE_PRUNE_BACKENDS)
#  define HAKMEM_FAST_PROFILE_PRUNE_BACKENDS 0
#endif
// Bench-only knobs (default values; each can be overridden via build flags).
//
// Refill: HAKMEM_TINY_BENCH_REFILL is the base value (8). The per-size
// variants (8/16/32/64) expand to the base macro unless set individually.
#if !defined(HAKMEM_TINY_BENCH_REFILL)
#  define HAKMEM_TINY_BENCH_REFILL 8
#endif
#if !defined(HAKMEM_TINY_BENCH_REFILL8)
#  define HAKMEM_TINY_BENCH_REFILL8 HAKMEM_TINY_BENCH_REFILL
#endif
#if !defined(HAKMEM_TINY_BENCH_REFILL16)
#  define HAKMEM_TINY_BENCH_REFILL16 HAKMEM_TINY_BENCH_REFILL
#endif
#if !defined(HAKMEM_TINY_BENCH_REFILL32)
#  define HAKMEM_TINY_BENCH_REFILL32 HAKMEM_TINY_BENCH_REFILL
#endif
#if !defined(HAKMEM_TINY_BENCH_REFILL64)
#  define HAKMEM_TINY_BENCH_REFILL64 HAKMEM_TINY_BENCH_REFILL
#endif
// Warmup: per-size fallbacks are 64 / 96 / 160 / 192 for 8 / 16 / 32 / 64.
#if !defined(HAKMEM_TINY_BENCH_WARMUP8)
#  define HAKMEM_TINY_BENCH_WARMUP8 64
#endif
#if !defined(HAKMEM_TINY_BENCH_WARMUP16)
#  define HAKMEM_TINY_BENCH_WARMUP16 96
#endif
#if !defined(HAKMEM_TINY_BENCH_WARMUP32)
#  define HAKMEM_TINY_BENCH_WARMUP32 160
#endif
#if !defined(HAKMEM_TINY_BENCH_WARMUP64)
#  define HAKMEM_TINY_BENCH_WARMUP64 192
#endif
// ------------------------------------------------------------
// Phase 22: Research Box Prune (Compile-out default-OFF boxes)
// ------------------------------------------------------------
// Phase 14 Tcache compile gate. Default OFF (0) = compiled out.
// Use 1 for research builds that need tcache experimentation.
#if !defined(HAKMEM_TINY_TCACHE_COMPILED)
#  define HAKMEM_TINY_TCACHE_COMPILED 0
#endif
// Phase 15 Unified LIFO compile gate. Default OFF (0) = compiled out.
// Use 1 for research builds that need LIFO/FIFO mode switching.
#if !defined(HAKMEM_TINY_UNIFIED_LIFO_COMPILED)
#  define HAKMEM_TINY_UNIFIED_LIFO_COMPILED 0
#endif
// ------------------------------------------------------------
// Phase 23: Per-op Default-OFF Tax Prune (Compile-out per-op research knobs)
// ------------------------------------------------------------
// Phase E5-2 Header Write-Once compile gate. Default OFF (0) = compiled out.
// Use 1 for research builds that need the write-once header optimization.
#if !defined(HAKMEM_TINY_HEADER_WRITE_ONCE_COMPILED)
#  define HAKMEM_TINY_HEADER_WRITE_ONCE_COMPILED 0
#endif
// Unified Cache Measurement compile gate. Default OFF (0) = compiled out.
// Use 1 for research builds that need cache measurement instrumentation.
#if !defined(HAKMEM_TINY_UNIFIED_CACHE_MEASURE_COMPILED)
#  define HAKMEM_TINY_UNIFIED_CACHE_MEASURE_COMPILED 0
#endif
// ------------------------------------------------------------
// Phase 24: OBSERVE Tax Prune (Compile-out hot-path stats atomics)
// ------------------------------------------------------------
// Tiny Class Stats compile gate. Default OFF (0) = compiled out.
// Use 1 for research builds that need per-class stats observation.
#if !defined(HAKMEM_TINY_CLASS_STATS_COMPILED)
#  define HAKMEM_TINY_CLASS_STATS_COMPILED 0
#endif
// ------------------------------------------------------------
// Phase 25: Tiny Free Stats Atomic Prune (Compile-out g_free_ss_enter)
// ------------------------------------------------------------
// Tiny Free Stats compile gate. Default OFF (0) = compiled out.
// Use 1 for research builds that need free path telemetry.
// Target: g_free_ss_enter atomic in core/tiny_superslab_free.inc.h
#if !defined(HAKMEM_TINY_FREE_STATS_COMPILED)
#  define HAKMEM_TINY_FREE_STATS_COMPILED 0
#endif
// ------------------------------------------------------------
// Phase 26A: C7 Free Count Atomic Prune (Compile-out c7_free_count)
// ------------------------------------------------------------
// C7 Free Count compile gate. Default OFF (0) = compiled out.
// Use 1 for research builds that need C7 free path diagnostics.
// Target: c7_free_count atomic in core/tiny_superslab_free.inc.h:51
#if !defined(HAKMEM_C7_FREE_COUNT_COMPILED)
#  define HAKMEM_C7_FREE_COUNT_COMPILED 0
#endif
// ------------------------------------------------------------
// Phase 26B: Header Mismatch Log Atomic Prune (Compile-out g_hdr_mismatch_log)
// ------------------------------------------------------------
// Header Mismatch Log compile gate. Default OFF (0) = compiled out.
// Use 1 for research builds that need header validation diagnostics.
// Target: g_hdr_mismatch_log atomic in core/tiny_superslab_free.inc.h:147
#if !defined(HAKMEM_HDR_MISMATCH_LOG_COMPILED)
#  define HAKMEM_HDR_MISMATCH_LOG_COMPILED 0
#endif
// ------------------------------------------------------------
// Phase 26C: Header Meta Mismatch Atomic Prune (Compile-out g_hdr_meta_mismatch)
// ------------------------------------------------------------
// Header Meta Mismatch compile gate. Default OFF (0) = compiled out.
// Use 1 for research builds that need metadata validation diagnostics.
// Target: g_hdr_meta_mismatch atomic in core/tiny_superslab_free.inc.h:182
#if !defined(HAKMEM_HDR_META_MISMATCH_COMPILED)
#  define HAKMEM_HDR_META_MISMATCH_COMPILED 0
#endif
// ------------------------------------------------------------
// Phase 26D: Metric Bad Class Atomic Prune (Compile-out g_metric_bad_class_once)
// ------------------------------------------------------------
// Metric Bad Class compile gate. Default OFF (0) = compiled out.
// Use 1 for research builds that need bad class index diagnostics.
// Target: g_metric_bad_class_once atomic in core/hakmem_tiny_alloc.inc:22
#if !defined(HAKMEM_METRIC_BAD_CLASS_COMPILED)
#  define HAKMEM_METRIC_BAD_CLASS_COMPILED 0
#endif
// ------------------------------------------------------------
// Phase 26E: Header Meta Fast Atomic Prune (Compile-out g_hdr_meta_fast)
// ------------------------------------------------------------
// Header Meta Fast compile gate. Default OFF (0) = compiled out.
// Use 1 for research builds that need fast-path metadata telemetry.
// Target: g_hdr_meta_fast atomic in core/tiny_free_fast_v2.inc.h:181
#if !defined(HAKMEM_HDR_META_FAST_COMPILED)
#  define HAKMEM_HDR_META_FAST_COMPILED 0
#endif
// ------------------------------------------------------------
// Phase 27: Unified Cache Stats Atomic Prune (Compile-out observation atomics)
// ------------------------------------------------------------
// Unified Cache Stats compile gate. Default OFF (0) = compiled out.
// Use 1 for research builds that need cache telemetry.
// Target: g_cache_unified_stats atomics in core/hakmem_tiny.c
#if !defined(HAKMEM_UNIFIED_CACHE_STATS_COMPILED)
#  define HAKMEM_UNIFIED_CACHE_STATS_COMPILED 0
#endif
// ------------------------------------------------------------
// Phase 29: Pool Hotbox v2 Stats Prune (Compile-out telemetry atomics)
// ------------------------------------------------------------
// Pool Hotbox v2 Stats compile gate. Default OFF (0) = compiled out.
// Use 1 for research builds that need Pool v2 telemetry.
// Target: g_pool_hotbox_v2_stats[ci].* atomics in core/hakmem_pool.c
// Impact: 12 atomic counters on HOT+WARM path (alloc_fast, free_fast, etc.)
#if !defined(HAKMEM_POOL_HOTBOX_V2_STATS_COMPILED)
#  define HAKMEM_POOL_HOTBOX_V2_STATS_COMPILED 0
#endif
// ------------------------------------------------------------
// Phase 31: Tiny Free Trace Atomic Prune (Compile-out trace atomic)
// ------------------------------------------------------------
// Tiny Free Trace compile gate. Default OFF (0) = compiled out.
// Use 1 for research builds that need free path trace diagnostics.
// Target: g_tiny_free_trace atomic in core/hakmem_tiny_free.inc:326
// Impact: HOT path atomic (every free operation)
// Expected improvement: +0.5% to +1.0% (similar to Phase 25: +1.07%)
#if !defined(HAKMEM_TINY_FREE_TRACE_COMPILED)
#  define HAKMEM_TINY_FREE_TRACE_COMPILED 0
#endif
// ------------------------------------------------------------
// Phase 32: Tiny Free Calls Atomic Prune (Compile-out diagnostic counter)
// ------------------------------------------------------------
// Tiny Free Calls compile gate. Default OFF (0) = compiled out.
// Use 1 for research builds that need free path call counting.
// Target: g_hak_tiny_free_calls atomic in core/hakmem_tiny_free.inc:335
// Impact: HOT path atomic (every free operation, unconditional)
// Expected improvement: +0.3% to +0.7% (diagnostic counter, less critical
// than Phase 25)
#if !defined(HAKMEM_TINY_FREE_CALLS_COMPILED)
#  define HAKMEM_TINY_FREE_CALLS_COMPILED 0
#endif
// ------------------------------------------------------------
// Phase 34: Batch Atomic Prune (Compile-out remaining WARM path atomics)
// ------------------------------------------------------------
// Phase 34A: Splice Debug Counter (WARM path, refill). Fallback: 0.
// Target: g_splice_count in core/tiny_refill_opt.h:79
// Impact: WARM path atomic (every refill splice operation)
#if !defined(HAKMEM_SPLICE_DEBUG_COMPILED)
#  define HAKMEM_SPLICE_DEBUG_COMPILED 0
#endif
// Phase 34B: Alloc Gate Class Mismatch (ERROR path, rare). Fallback: 0.
// Target: g_alloc_gate_cls_mis in core/box/tiny_alloc_gate_box.h:95
// Impact: ERROR path atomic (class mismatch detection, rare)
#if !defined(HAKMEM_ALLOC_GATE_CLS_MIS_COMPILED)
#  define HAKMEM_ALLOC_GATE_CLS_MIS_COMPILED 0
#endif
// ------------------------------------------------------------
// Helper enum (for documentation / logging)
// ------------------------------------------------------------
// Each enumerator takes the value of the build-flag macro it is named after,
// so the resolved settings are visible as named constants.
// Several enumerators can share the same numeric value (the flags resolve to
// 0 or 1 by default), so they are snapshots of settings, not unique IDs.
typedef enum {
    HAK_FLAG_BUILD_RELEASE  = HAKMEM_BUILD_RELEASE,            // release-like build?
    HAK_FLAG_DEBUG_COUNTERS = HAKMEM_DEBUG_COUNTERS,           // path/debug counters
    HAK_FLAG_DEBUG_MEMORY   = HAKMEM_DEBUG_MEMORY,             // extended memory profiling
    HAK_FLAG_REFILL_OPT     = HAKMEM_TINY_REFILL_OPT,          // tiny refill helpers
    HAK_FLAG_P0_BATCH       = HAKMEM_TINY_P0_BATCH_REFILL,     // batch refill P0
    HAK_FLAG_BOX_REFACTOR   = HAKMEM_TINY_PHASE6_BOX_REFACTOR, // Phase 6-1.7 box refactor
    HAK_FLAG_NEW_3LAYER     = HAKMEM_TINY_USE_NEW_3LAYER       // new 3-layer tiny front
} hak_build_flags_t;
#endif // HAKMEM_BUILD_FLAGS_H