2025-11-05 12:31:14 +09:00
|
|
|
// hakmem_build_flags.h - Centralized compile-time feature switches
|
|
|
|
|
// Purpose: Define all build-time toggles in one place with safe defaults.
|
|
|
|
|
// Usage: Include from common public headers (e.g., hakmem.h / hakmem_tiny.h).
|
|
|
|
|
|
|
|
|
|
#ifndef HAKMEM_BUILD_FLAGS_H
|
|
|
|
|
#define HAKMEM_BUILD_FLAGS_H
|
|
|
|
|
|
2025-12-03 12:11:27 +09:00
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 2: Headerless Mode Override
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// If Headerless is enabled, force HEADER_CLASSIDX to 0
|
|
|
|
|
// When HAKMEM_TINY_HEADERLESS is defined and non-zero, discard any earlier
// HAKMEM_TINY_HEADER_CLASSIDX setting and pin it to 0.
#ifdef HAKMEM_TINY_HEADERLESS
#  if HAKMEM_TINY_HEADERLESS
#    undef HAKMEM_TINY_HEADER_CLASSIDX
#    define HAKMEM_TINY_HEADER_CLASSIDX 0
#  endif
#endif
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Release/debug detection
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// HAKMEM_BUILD_RELEASE: 1 in release-like builds, 0 otherwise
|
|
|
|
|
// Unless the build already chose a value, derive it from NDEBUG:
// NDEBUG absent -> 0 (debug-like), NDEBUG present -> 1 (release-like).
#if !defined(HAKMEM_BUILD_RELEASE)
#  ifndef NDEBUG
#    define HAKMEM_BUILD_RELEASE 0
#  else
#    define HAKMEM_BUILD_RELEASE 1
#  endif
#endif
|
|
|
|
|
|
2025-12-16 15:01:56 +09:00
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 35-A: Benchmark Minimal Mode
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// HAKMEM_BENCH_MINIMAL: Eliminate gate function overhead for benchmarks
|
|
|
|
|
// When =1: Gate functions return compile-time constants (no lazy init check)
|
|
|
|
|
// When =0: Normal runtime gate behavior (default)
|
|
|
|
|
// Usage: Build with -DHAKMEM_BENCH_MINIMAL=1 for benchmark-only binaries
|
|
|
|
|
// Fallback when not supplied by the build: 0 (normal runtime gate behavior).
#if !defined(HAKMEM_BENCH_MINIMAL)
#define HAKMEM_BENCH_MINIMAL 0
#endif
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Instrumentation & counters (compile-time)
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Enable lightweight path/debug counters (compiled out when 0)
|
2025-12-05 06:16:12 +09:00
|
|
|
// Default: 0 in release builds (NDEBUG set), 1 in debug builds
|
2025-11-05 12:31:14 +09:00
|
|
|
// Unless the build already chose a value, follow NDEBUG:
// NDEBUG absent -> counters on (1), NDEBUG present -> counters off (0).
#if !defined(HAKMEM_DEBUG_COUNTERS)
#  ifndef NDEBUG
#    define HAKMEM_DEBUG_COUNTERS 1
#  else
#    define HAKMEM_DEBUG_COUNTERS 0
#  endif
#endif
|
|
|
|
|
|
|
|
|
|
// Enable extended memory profiling (compiled out when 0)
|
|
|
|
|
// Fallback when not supplied by the build: 0 (compiled out).
#if !defined(HAKMEM_DEBUG_MEMORY)
#define HAKMEM_DEBUG_MEMORY 0
#endif
|
|
|
|
|
|
|
|
|
|
// Tiny refill optimization helpers (header-only)
|
|
|
|
|
// Fallback when not supplied by the build: 1 (enabled).
#if !defined(HAKMEM_TINY_REFILL_OPT)
#define HAKMEM_TINY_REFILL_OPT 1
#endif
|
|
|
|
|
|
|
|
|
|
// Batch refill P0 (can be toggled for A/B)
|
|
|
|
|
// Fallback when not supplied by the build: 0 (batch refill disabled).
#if !defined(HAKMEM_TINY_P0_BATCH_REFILL)
#define HAKMEM_TINY_P0_BATCH_REFILL 0
#endif
|
|
|
|
|
|
|
|
|
|
// Box refactor (Phase 6-1.7) — usually injected from build system
|
|
|
|
|
// Fallback when the build system did not inject a value: 1 (enabled).
#if !defined(HAKMEM_TINY_PHASE6_BOX_REFACTOR)
#define HAKMEM_TINY_PHASE6_BOX_REFACTOR 1
#endif
|
|
|
|
|
|
2025-11-30 15:08:45 +09:00
|
|
|
// SuperSlab backend toggle (compile-time)
|
|
|
|
|
// Default: 1 (ON) - SuperSlab is the core architecture.
|
|
|
|
|
// Set to 0 only for legacy/compat testing.
|
|
|
|
|
// Fallback when not supplied by the build: 1 (SuperSlab backend on).
#if !defined(HAKMEM_TINY_USE_SUPERSLAB)
#define HAKMEM_TINY_USE_SUPERSLAB 1
#endif
|
|
|
|
|
|
2025-11-08 12:54:52 +09:00
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 7: Region-ID Direct Lookup (Header-based optimization)
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 7 Task 1: Header-based class_idx for O(1) free
|
|
|
|
|
// Default: ON (1). Forced to 0 when HAKMEM_TINY_HEADERLESS is enabled.
|
|
|
|
|
// Build: make HEADER_CLASSIDX=1 or make phase7
|
|
|
|
|
// Fallback when nothing defined it earlier (build flag or an override in
// this header): 1 (header-based class index enabled).
#if !defined(HAKMEM_TINY_HEADER_CLASSIDX)
#define HAKMEM_TINY_HEADER_CLASSIDX 1
#endif
|
|
|
|
|
|
|
|
|
|
// Phase 7 Task 2: Aggressive inline TLS cache access
|
|
|
|
|
// Default: OFF (enable after full validation in Task 5)
|
|
|
|
|
// Build: make AGGRESSIVE_INLINE=1 or make phase7
|
|
|
|
|
// Requires: HAKMEM_TINY_HEADER_CLASSIDX=1
|
|
|
|
|
// Fallback when not supplied by the build: 0 (off).
#if !defined(HAKMEM_TINY_AGGRESSIVE_INLINE)
#define HAKMEM_TINY_AGGRESSIVE_INLINE 0
#endif
|
|
|
|
|
|
2025-11-14 06:09:02 +09:00
|
|
|
// Inline TLS SLL pop (experimental, A/B only)
|
|
|
|
|
// Default: OFF (HAKMEM_TINY_INLINE_SLL=0) to keep Box TLS-SLL API as the standard path.
|
|
|
|
|
// Enable explicitly via build flag: -DHAKMEM_TINY_INLINE_SLL=1 (bench/debug only).
|
|
|
|
|
// Fallback when not supplied by the build: 0 (inline SLL pop off).
#if !defined(HAKMEM_TINY_INLINE_SLL)
#define HAKMEM_TINY_INLINE_SLL 0
#endif
|
|
|
|
|
|
2025-12-13 15:31:08 +09:00
|
|
|
// Phase 1A3: Always-inline tiny_region_id_write_header()
|
|
|
|
|
// Default: OFF (HAKMEM_TINY_HEADER_WRITE_ALWAYS_INLINE=0) - enable after A/B validation
|
|
|
|
|
// Purpose: Force inline expansion of header write to reduce alloc path overhead
|
|
|
|
|
// Expected impact: +0.5-2% on Mixed workloads
|
|
|
|
|
// Build: make EXTRA_CFLAGS=-DHAKMEM_TINY_HEADER_WRITE_ALWAYS_INLINE=1 [target]
|
|
|
|
|
// Fallback when not supplied by the build: 0 (off).
#if !defined(HAKMEM_TINY_HEADER_WRITE_ALWAYS_INLINE)
#define HAKMEM_TINY_HEADER_WRITE_ALWAYS_INLINE 0
#endif
|
|
|
|
|
|
2025-11-08 12:54:52 +09:00
|
|
|
// Phase 7 Task 3: Pre-warm TLS cache at init
|
|
|
|
|
// Default: OFF (enable after implementation)
|
|
|
|
|
// Build: make PREWARM_TLS=1 or make phase7
|
|
|
|
|
// Fallback when not supplied by the build: 0 (no TLS pre-warm).
#if !defined(HAKMEM_TINY_PREWARM_TLS)
#define HAKMEM_TINY_PREWARM_TLS 0
#endif
|
|
|
|
|
|
Implement Phase 1: TLS SuperSlab Hint Box for Headerless performance
Design: Cache recently-used SuperSlab references in TLS to accelerate
ptr→SuperSlab resolution in Headerless mode free() path.
## Implementation
### New Box: core/box/tls_ss_hint_box.h
- Header-only Box (4-slot FIFO cache per thread)
- Functions: tls_ss_hint_init(), tls_ss_hint_update(), tls_ss_hint_lookup(), tls_ss_hint_clear()
- Memory overhead: 112 bytes per thread (negligible)
- Statistics API for debug builds (hit/miss counters)
### Integration Points
1. **Free path** (core/hakmem_tiny_free.inc):
- Lines 477-481: Fast path hint lookup before hak_super_lookup()
- Lines 550-555: Second lookup location (fallback path)
- Expected savings: 10-50 cycles → 2-5 cycles on cache hit
2. **Allocation path** (core/tiny_superslab_alloc.inc.h):
- Lines 115-122: Linear allocation return path
- Lines 179-186: Freelist allocation return path
- Cache update on successful allocation
3. **TLS variable** (core/hakmem_tiny_tls_state_box.inc):
- `__thread TlsSsHintCache g_tls_ss_hint = {0};`
### Build System
- **Build flag** (core/hakmem_build_flags.h):
- HAKMEM_TINY_SS_TLS_HINT (default: 0, disabled)
- Validation: requires HAKMEM_TINY_HEADERLESS=1
- **Makefile**:
- Removed old ss_tls_hint_box.o (conflicting implementation)
- Header-only design eliminates compiled object files
### Testing
- **Unit tests** (tests/test_tls_ss_hint.c):
- 6 test functions covering init, lookup, FIFO rotation, duplicates, clear, stats
- All tests PASSING
- **Build validation**:
- ✅ Compiles with hint disabled (default)
- ✅ Compiles with hint enabled (HAKMEM_TINY_SS_TLS_HINT=1)
### Documentation
- **Benchmark report** (docs/PHASE1_TLS_HINT_BENCHMARK.md):
- Implementation summary
- Build validation results
- Benchmark methodology (to be executed)
- Performance analysis framework
## Expected Performance
- **Hit rate**: 85-95% (single-threaded), 70-85% (multi-threaded)
- **Cycle savings**: 80-95% on cache hit (10-50 cycles → 2-5 cycles)
- **Target improvement**: 15-20% throughput increase vs Headerless baseline
- **Memory overhead**: 112 bytes per thread
## Box Theory
**Mission**: Cache hot SuperSlabs to avoid global registry lookup
**Boundary**: ptr → SuperSlab* or NULL (miss)
**Invariant**: hint.base ≤ ptr < hint.end → hit is valid
**Fallback**: Always safe to miss (triggers hak_super_lookup)
**Thread Safety**: TLS storage, no synchronization required
**Risk**: Low (read-only cache, fail-safe fallback, magic validation)
## Next Steps
1. Run full benchmark suite (sh8bench, cfrac, larson)
2. Measure actual hit rate with stats enabled
3. If performance target met (15-20% improvement), enable by default
4. Consider increasing cache slots if hit rate < 80%
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
2025-12-03 18:06:24 +09:00
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 1: Headerless Optimization - TLS SuperSlab Hint Cache
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Purpose: Accelerate ptr→SuperSlab lookup in Headerless mode
|
|
|
|
|
// Default: 0 (disabled during development and testing)
|
|
|
|
|
// Target: 1 (enabled after validation in Phase 1 rollout)
|
|
|
|
|
//
|
|
|
|
|
// Performance Impact:
|
|
|
|
|
// - Cache hit: 2-5 cycles (vs 10-50 cycles for hak_super_lookup)
|
|
|
|
|
// - Expected hit rate: 85-95% (single-threaded), 70-85% (multi-threaded)
|
|
|
|
|
// - Expected throughput improvement: 15-20%
|
|
|
|
|
//
|
|
|
|
|
// Memory Overhead:
|
|
|
|
|
// - 112 bytes per thread (TLS)
|
|
|
|
|
// - Negligible for typical workloads (1000 threads = 112KB)
|
|
|
|
|
//
|
|
|
|
|
// Dependencies:
|
|
|
|
|
// - Requires HAKMEM_TINY_HEADERLESS=1 (hint is no-op in header mode)
|
|
|
|
|
// - No other dependencies (self-contained Box)
|
|
|
|
|
//
|
|
|
|
|
// Build: make EXTRA_CFLAGS="-DHAKMEM_TINY_SS_TLS_HINT=1"
|
|
|
|
|
// Fallback when not supplied by the build: 0 (hint cache disabled).
#if !defined(HAKMEM_TINY_SS_TLS_HINT)
#define HAKMEM_TINY_SS_TLS_HINT 0
#endif
|
|
|
|
|
|
|
|
|
|
// Validation: Hint Box only active in Headerless mode
|
|
|
|
|
// Warn when the hint is switched on but Headerless mode is not actually
// enabled. Both "HAKMEM_TINY_HEADERLESS undefined" and "defined as 0" count as
// not enabled; checking only defined() would let -DHAKMEM_TINY_HEADERLESS=0
// pass silently.
#if HAKMEM_TINY_SS_TLS_HINT && !(defined(HAKMEM_TINY_HEADERLESS) && HAKMEM_TINY_HEADERLESS)
#warning "HAKMEM_TINY_SS_TLS_HINT enabled but HAKMEM_TINY_HEADERLESS is not enabled - hint will have no effect"
#endif
|
|
|
|
|
|
2025-11-09 11:50:18 +09:00
|
|
|
// Runtime verbosity (printf-heavy diagnostics). Keep OFF for benches.
|
|
|
|
|
// Fallback when not supplied by the build: 0 (verbose diagnostics off).
#if !defined(HAKMEM_DEBUG_VERBOSE)
#define HAKMEM_DEBUG_VERBOSE 0
#endif
|
|
|
|
|
|
|
|
|
|
// Tiny/Mid safety checks on free path (mincore header validation).
|
|
|
|
|
// 0 = performance (boundary-only), 1 = strict (mincore for all)
|
|
|
|
|
// Fallback when not supplied by the build: 0 (performance mode).
#if !defined(HAKMEM_TINY_SAFE_FREE)
#define HAKMEM_TINY_SAFE_FREE 0
#endif
|
|
|
|
|
|
2025-11-13 14:25:54 +09:00
|
|
|
// Phase 10: Aggressive refill count defaults (tunable via env vars)
|
|
|
|
|
// Goal: Reduce backend transitions by refilling in larger batches
|
2025-11-14 22:09:14 +09:00
|
|
|
// HAKMEM_TINY_REFILL_COUNT: global default (default: 128)
|
2025-11-13 14:25:54 +09:00
|
|
|
// HAKMEM_TINY_REFILL_COUNT_HOT: class 0-3 (default: 128)
|
|
|
|
|
// HAKMEM_TINY_REFILL_COUNT_MID: class 4-7 (default: 96)
|
2025-11-14 22:09:14 +09:00
|
|
|
// Larson Fix (Priority 1): Increased from 64 to 128 to reduce lock contention
|
|
|
|
|
// Expected impact: Lock frequency reduction 19K → ~1.6K locks/sec (12x)
|
|
|
|
|
// NOTE: Multi-threaded Larson has pre-existing crash bug (not caused by this change)
|
2025-11-08 12:54:52 +09:00
|
|
|
// Fallback when not supplied by the build: refill count of 128.
#if !defined(HAKMEM_TINY_REFILL_DEFAULT)
#define HAKMEM_TINY_REFILL_DEFAULT 128
#endif
|
|
|
|
|
|
2025-11-07 01:27:04 +09:00
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Tiny front architecture toggles (compile-time defaults)
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// New 3-layer Tiny front (A/B via build flag)
|
|
|
|
|
// Fallback when not supplied by the build: 0 (new 3-layer front off).
#if !defined(HAKMEM_TINY_USE_NEW_3LAYER)
#define HAKMEM_TINY_USE_NEW_3LAYER 0
#endif
|
|
|
|
|
|
|
|
|
|
// Minimal/strict front variants (introduced for bench/debug; note MINIMAL_FRONT defaults to 1, STRICT_FRONT to 0)
|
|
|
|
|
// Fallback when not supplied by the build: 1 (minimal front selected).
#if !defined(HAKMEM_TINY_MINIMAL_FRONT)
#define HAKMEM_TINY_MINIMAL_FRONT 1
#endif
|
|
|
|
|
// Fallback when not supplied by the build: 0 (strict front off).
#if !defined(HAKMEM_TINY_STRICT_FRONT)
#define HAKMEM_TINY_STRICT_FRONT 0
#endif
|
|
|
|
|
|
Phase 4-Step3: Add Front Config Box (+2.7-4.9% dead code elimination)
Implement compile-time configuration system for dead code elimination in Tiny
allocation hot paths. The Config Box provides dual-mode configuration:
- Normal mode: Runtime ENV checks (backward compatible, flexible)
- PGO mode: Compile-time constants (dead code elimination, performance)
PERFORMANCE:
- Baseline (runtime config): 50.32 M ops/s (avg of 5 runs)
- Config Box (PGO mode): 52.77 M ops/s (avg of 5 runs)
- Improvement: +2.45 M ops/s (+4.87% with outlier, +2.72% without)
- Target: +5-8% (partially achieved)
IMPLEMENTATION:
1. core/box/tiny_front_config_box.h (NEW):
- Defines TINY_FRONT_*_ENABLED macros for all config checks
- PGO mode (#if HAKMEM_TINY_FRONT_PGO): Macros expand to constants (0/1)
- Normal mode (#else): Macros expand to function calls
- Functions remain in their original locations (no code duplication)
2. core/hakmem_build_flags.h:
- Added HAKMEM_TINY_FRONT_PGO build flag (default: 0, off)
- Documentation: Usage with make EXTRA_CFLAGS="-DHAKMEM_TINY_FRONT_PGO=1"
3. core/box/hak_wrappers.inc.h:
- Replaced front_gate_unified_enabled() with TINY_FRONT_UNIFIED_GATE_ENABLED
- 2 call sites updated (malloc and free fast paths)
- Added config box include
EXPECTED DEAD CODE ELIMINATION (PGO mode):
if (TINY_FRONT_UNIFIED_GATE_ENABLED) { ... }
→ if (1) { ... } // Constant, always true
→ Compiler optimizes away the branch, keeps body
SCOPE:
Currently only front_gate_unified_enabled() is replaced (2 call sites).
To achieve full +5-8% target, expand to other config checks:
- ultra_slim_mode_enabled()
- tiny_heap_v2_enabled()
- sfc_cascade_enabled()
- tiny_fastcache_enabled()
- tiny_metrics_enabled()
- tiny_diag_enabled()
BUILD USAGE:
Normal mode (runtime config, default):
make bench_random_mixed_hakmem
PGO mode (compile-time config, dead code elimination):
make EXTRA_CFLAGS="-DHAKMEM_TINY_FRONT_PGO=1" bench_random_mixed_hakmem
BOX PATTERN COMPLIANCE:
✅ Single Responsibility: Configuration management ONLY
✅ Clear Contract: Dual-mode (PGO = constants, Normal = runtime)
✅ Observable: Config report function (debug builds)
✅ Safe: Backward compatible (default is normal mode)
✅ Testable: Easy A/B comparison (PGO vs normal builds)
WHY +2.7-4.9% (below +5-8% target)?
- Limited scope: Only 2 call sites for 1 config function replaced
- Lazy init overhead: front_gate_unified_enabled() cached after first call
- Need to expand to more config checks for full benefit
NEXT STEPS:
- Expand config macro usage to other functions (optional)
- OR proceed with PGO re-enablement (Final polish)
🤖 Generated with Claude Code
Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-29 12:18:37 +09:00
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 4-Step3: Tiny Front PGO Config Box
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// HAKMEM_TINY_FRONT_PGO:
|
|
|
|
|
// 0 = Normal build with runtime configuration (default, backward compatible)
|
|
|
|
|
// Configuration checked via ENV variables at runtime (flexible)
|
|
|
|
|
// 1 = PGO-optimized build with compile-time configuration (performance)
|
|
|
|
|
// Configuration fixed at compile time (dead code elimination)
|
|
|
|
|
// Eliminates runtime branches for maximum performance.
|
|
|
|
|
//     Use with: make EXTRA_CFLAGS="-DHAKMEM_TINY_FRONT_PGO=1" bench_random_mixed_hakmem
|
|
|
|
|
// Expected benefit: +5-8% improvement via dead code elimination (57.2 → 60-62 M ops/s)
|
|
|
|
|
// Fallback when not supplied by the build: 0 (runtime configuration).
#if !defined(HAKMEM_TINY_FRONT_PGO)
#define HAKMEM_TINY_FRONT_PGO 0
#endif
|
|
|
|
|
|
2025-11-29 14:39:07 +09:00
|
|
|
// Phase 5-Step3: Mid/Large PGO Config Box
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// HAKMEM_MID_LARGE_PGO:
|
|
|
|
|
// 0 = Normal build with runtime configuration (default, backward compatible)
|
|
|
|
|
// Configuration checked via ENV variables at runtime (flexible)
|
|
|
|
|
// 1 = PGO-optimized build with compile-time configuration (performance)
|
|
|
|
|
// Configuration fixed at compile time (dead code elimination)
|
|
|
|
|
// Eliminates runtime branches for Mid/Large allocation paths.
|
|
|
|
|
// Use with: make EXTRA_CFLAGS="-DHAKMEM_MID_LARGE_PGO=1" bench_random_mixed_hakmem
|
|
|
|
|
// Expected benefit: +2-4% improvement via dead code elimination
|
|
|
|
|
// Fallback when not supplied by the build: 0 (runtime configuration).
#if !defined(HAKMEM_MID_LARGE_PGO)
#define HAKMEM_MID_LARGE_PGO 0
#endif
|
|
|
|
|
|
2025-11-07 01:27:04 +09:00
|
|
|
// Route fingerprint (compile-time gate; runtime ENV still required)
|
|
|
|
|
// Fallback when not supplied by the build: 0 (route fingerprint compiled out).
#if !defined(HAKMEM_ROUTE)
#define HAKMEM_ROUTE 0
#endif
|
|
|
|
|
|
|
|
|
|
// Bench-only knobs (default values; can be overridden via build flags)
|
|
|
|
|
// Base bench refill knob: 8 unless the build overrides it.
#if !defined(HAKMEM_TINY_BENCH_REFILL)
#define HAKMEM_TINY_BENCH_REFILL 8
#endif

// Per-size knobs (8/16/32/64): each one falls back to the base knob above
// unless it was overridden individually.
#if !defined(HAKMEM_TINY_BENCH_REFILL8)
#define HAKMEM_TINY_BENCH_REFILL8 HAKMEM_TINY_BENCH_REFILL
#endif
#if !defined(HAKMEM_TINY_BENCH_REFILL16)
#define HAKMEM_TINY_BENCH_REFILL16 HAKMEM_TINY_BENCH_REFILL
#endif
#if !defined(HAKMEM_TINY_BENCH_REFILL32)
#define HAKMEM_TINY_BENCH_REFILL32 HAKMEM_TINY_BENCH_REFILL
#endif
#if !defined(HAKMEM_TINY_BENCH_REFILL64)
#define HAKMEM_TINY_BENCH_REFILL64 HAKMEM_TINY_BENCH_REFILL
#endif
|
|
|
|
|
|
|
|
|
|
// Bench warm-up knobs per size (8/16/32/64); each keeps its own fallback
// value unless the build overrides it.
#if !defined(HAKMEM_TINY_BENCH_WARMUP8)
#define HAKMEM_TINY_BENCH_WARMUP8 64
#endif
#if !defined(HAKMEM_TINY_BENCH_WARMUP16)
#define HAKMEM_TINY_BENCH_WARMUP16 96
#endif
#if !defined(HAKMEM_TINY_BENCH_WARMUP32)
#define HAKMEM_TINY_BENCH_WARMUP32 160
#endif
#if !defined(HAKMEM_TINY_BENCH_WARMUP64)
#define HAKMEM_TINY_BENCH_WARMUP64 192
#endif
|
|
|
|
|
|
2025-12-16 05:35:11 +09:00
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 22: Research Box Prune (Compile-out default-OFF boxes)
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 14 Tcache: Compile gate (default OFF = compile-out)
|
|
|
|
|
// Set to 1 for research builds that need tcache experimentation
|
|
|
|
|
// Compile gate fallback: 0 (compiled out) unless the build sets it.
#if !defined(HAKMEM_TINY_TCACHE_COMPILED)
#define HAKMEM_TINY_TCACHE_COMPILED 0
#endif
|
|
|
|
|
|
|
|
|
|
// Phase 15 Unified LIFO: Compile gate (default OFF = compile-out)
|
|
|
|
|
// Set to 1 for research builds that need LIFO/FIFO mode switching
|
|
|
|
|
// Compile gate fallback: 0 (compiled out) unless the build sets it.
#if !defined(HAKMEM_TINY_UNIFIED_LIFO_COMPILED)
#define HAKMEM_TINY_UNIFIED_LIFO_COMPILED 0
#endif
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 23: Per-op Default-OFF Tax Prune (Compile-out per-op research knobs)
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase E5-2 Header Write-Once: Compile gate (default OFF = compile-out)
|
|
|
|
|
// Set to 1 for research builds that need write-once header optimization
|
|
|
|
|
// Compile gate fallback: 0 (compiled out) unless the build sets it.
#if !defined(HAKMEM_TINY_HEADER_WRITE_ONCE_COMPILED)
#define HAKMEM_TINY_HEADER_WRITE_ONCE_COMPILED 0
#endif
|
|
|
|
|
|
|
|
|
|
// Unified Cache Measurement: Compile gate (default OFF = compile-out)
|
|
|
|
|
// Set to 1 for research builds that need cache measurement instrumentation
|
|
|
|
|
// Compile gate fallback: 0 (compiled out) unless the build sets it.
#if !defined(HAKMEM_TINY_UNIFIED_CACHE_MEASURE_COMPILED)
#define HAKMEM_TINY_UNIFIED_CACHE_MEASURE_COMPILED 0
#endif
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 24: OBSERVE Tax Prune (Compile-out hot-path stats atomics)
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Tiny Class Stats: Compile gate (default OFF = compile-out)
|
|
|
|
|
// Set to 1 for research builds that need per-class stats observation
|
|
|
|
|
// Compile gate fallback: 0 (compiled out) unless the build sets it.
#if !defined(HAKMEM_TINY_CLASS_STATS_COMPILED)
#define HAKMEM_TINY_CLASS_STATS_COMPILED 0
#endif
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 25: Tiny Free Stats Atomic Prune (Compile-out g_free_ss_enter)
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Tiny Free Stats: Compile gate (default OFF = compile-out)
|
|
|
|
|
// Set to 1 for research builds that need free path telemetry
|
|
|
|
|
// Target: g_free_ss_enter atomic in core/tiny_superslab_free.inc.h
|
|
|
|
|
// Compile gate fallback: 0 (compiled out) unless the build sets it.
#if !defined(HAKMEM_TINY_FREE_STATS_COMPILED)
#define HAKMEM_TINY_FREE_STATS_COMPILED 0
#endif
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 26A: C7 Free Count Atomic Prune (Compile-out c7_free_count)
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// C7 Free Count: Compile gate (default OFF = compile-out)
|
|
|
|
|
// Set to 1 for research builds that need C7 free path diagnostics
|
|
|
|
|
// Target: c7_free_count atomic in core/tiny_superslab_free.inc.h:51
|
|
|
|
|
// Compile gate fallback: 0 (compiled out) unless the build sets it.
#if !defined(HAKMEM_C7_FREE_COUNT_COMPILED)
#define HAKMEM_C7_FREE_COUNT_COMPILED 0
#endif
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 26B: Header Mismatch Log Atomic Prune (Compile-out g_hdr_mismatch_log)
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Header Mismatch Log: Compile gate (default OFF = compile-out)
|
|
|
|
|
// Set to 1 for research builds that need header validation diagnostics
|
|
|
|
|
// Target: g_hdr_mismatch_log atomic in core/tiny_superslab_free.inc.h:147
|
|
|
|
|
// Compile gate fallback: 0 (compiled out) unless the build sets it.
#if !defined(HAKMEM_HDR_MISMATCH_LOG_COMPILED)
#define HAKMEM_HDR_MISMATCH_LOG_COMPILED 0
#endif
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 26C: Header Meta Mismatch Atomic Prune (Compile-out g_hdr_meta_mismatch)
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Header Meta Mismatch: Compile gate (default OFF = compile-out)
|
|
|
|
|
// Set to 1 for research builds that need metadata validation diagnostics
|
|
|
|
|
// Target: g_hdr_meta_mismatch atomic in core/tiny_superslab_free.inc.h:182
|
|
|
|
|
// Compile gate fallback: 0 (compiled out) unless the build sets it.
#if !defined(HAKMEM_HDR_META_MISMATCH_COMPILED)
#define HAKMEM_HDR_META_MISMATCH_COMPILED 0
#endif
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 26D: Metric Bad Class Atomic Prune (Compile-out g_metric_bad_class_once)
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Metric Bad Class: Compile gate (default OFF = compile-out)
|
|
|
|
|
// Set to 1 for research builds that need bad class index diagnostics
|
|
|
|
|
// Target: g_metric_bad_class_once atomic in core/hakmem_tiny_alloc.inc:22
|
|
|
|
|
// Compile gate fallback: 0 (compiled out) unless the build sets it.
#if !defined(HAKMEM_METRIC_BAD_CLASS_COMPILED)
#define HAKMEM_METRIC_BAD_CLASS_COMPILED 0
#endif
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 26E: Header Meta Fast Atomic Prune (Compile-out g_hdr_meta_fast)
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Header Meta Fast: Compile gate (default OFF = compile-out)
|
|
|
|
|
// Set to 1 for research builds that need fast-path metadata telemetry
|
|
|
|
|
// Target: g_hdr_meta_fast atomic in core/tiny_free_fast_v2.inc.h:181
|
|
|
|
|
// Compile gate fallback: 0 (compiled out) unless the build sets it.
#if !defined(HAKMEM_HDR_META_FAST_COMPILED)
#define HAKMEM_HDR_META_FAST_COMPILED 0
#endif
|
|
|
|
|
|
Phase 29: Pool Hotbox v2 Stats Prune - NO-OP (infrastructure ready)
Target: g_pool_hotbox_v2_stats atomics (12 total) in Pool v2
Result: 0.00% impact (code path inactive by default, ENV-gated)
Verdict: NO-OP - Maintain compile-out for future-proofing
Audit Results:
- Classification: 12/12 TELEMETRY (100% observational)
- Counters: alloc_calls, alloc_fast, alloc_refill, alloc_refill_fail,
alloc_fallback_v1, free_calls, free_fast, free_fallback_v1,
page_of_fail_* (4 failure counters)
- Verification: All stats/logging only, zero flow control usage
- Phase 28 lesson applied: Traced all usages, confirmed no CORRECTNESS
Key Finding: Pool v2 OFF by default
- Requires HAKMEM_POOL_V2_ENABLED=1 to activate
- Benchmark never executes Pool v2 code paths
- Compile-out has zero performance impact (code never runs)
Implementation (future-ready):
- Added HAKMEM_POOL_HOTBOX_V2_STATS_COMPILED (default: 0)
- Wrapped 13 atomic write sites in core/hakmem_pool.c
- Pattern: #if HAKMEM_POOL_HOTBOX_V2_STATS_COMPILED ... #endif
- Expected impact if Pool v2 enabled: +0.3~0.8% (HOT+WARM atomics)
A/B Test Results:
- Baseline (COMPILED=0): 52.98 M ops/s (±0.43M, 0.81% stdev)
- Research (COMPILED=1): 53.31 M ops/s (±0.80M, 1.50% stdev)
- Delta: -0.62% (noise, not real effect - code path not active)
Critical Lesson Learned (NEW):
Phase 29 revealed ENV-gated features can appear on hot paths but never
execute. Updated audit checklist:
1. Classify atomics (CORRECTNESS vs TELEMETRY)
2. Verify no flow control usage
3. NEW: Verify code path is ACTIVE in benchmark (check ENV gates)
4. Implement compile-out
5. A/B test
Verification methods added to documentation:
- rg "getenv.*FEATURE" to check ENV gates
- perf record/report to verify execution
- Debug printf for quick validation
Cumulative Progress (Phase 24-29):
- Phase 24 (class stats): +0.93% GO
- Phase 25 (free stats): +1.07% GO
- Phase 26 (diagnostics): -0.33% NEUTRAL
- Phase 27 (unified cache): +0.74% GO
- Phase 28 (bg spill): NO-OP (all CORRECTNESS)
- Phase 29 (pool v2): NO-OP (inactive code path)
- Total: 17 atomics removed, +2.74% improvement
Documentation:
- PHASE29_POOL_HOTBOX_V2_AUDIT.md: Complete audit with TELEMETRY classification
- PHASE29_POOL_HOTBOX_V2_STATS_RESULTS.md: Results + new lesson learned
- ATOMIC_PRUNE_CUMULATIVE_SUMMARY.md: Updated with Phase 29 + new checklist
- PHASE29_COMPLETE.md: Completion summary with recommendations
Decision: Keep compile-out despite NO-OP
- Code cleanliness (binary size reduction)
- Future-proofing (ready when Pool v2 enabled)
- Consistency with Phase 24-28 pattern
Generated with Claude Code
https://claude.com/claude-code
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
2025-12-16 06:33:41 +09:00
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 27: Unified Cache Stats Atomic Prune (Compile-out observation atomics)
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Unified Cache Stats: Compile gate (default OFF = compile-out)
|
|
|
|
|
// Set to 1 for research builds that need cache telemetry
|
|
|
|
|
// Target: g_cache_unified_stats atomics in core/hakmem_tiny.c
|
|
|
|
|
// Compile gate fallback: 0 (compiled out) unless the build sets it.
#if !defined(HAKMEM_UNIFIED_CACHE_STATS_COMPILED)
#define HAKMEM_UNIFIED_CACHE_STATS_COMPILED 0
#endif
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 29: Pool Hotbox v2 Stats Prune (Compile-out telemetry atomics)
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Pool Hotbox v2 Stats: Compile gate (default OFF = compile-out)
|
|
|
|
|
// Set to 1 for research builds that need Pool v2 telemetry
|
|
|
|
|
// Target: g_pool_hotbox_v2_stats[ci].* atomics in core/hakmem_pool.c
|
|
|
|
|
// Impact: 12 atomic counters on HOT+WARM path (alloc_fast, free_fast, etc.)
|
|
|
|
|
// Compile gate fallback: 0 (compiled out) unless the build sets it.
#if !defined(HAKMEM_POOL_HOTBOX_V2_STATS_COMPILED)
#define HAKMEM_POOL_HOTBOX_V2_STATS_COMPILED 0
#endif
|
|
|
|
|
|
2025-12-16 07:31:15 +09:00
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 31: Tiny Free Trace Atomic Prune (Compile-out trace atomic)
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Tiny Free Trace: Compile gate (default OFF = compile-out)
|
|
|
|
|
// Set to 1 for research builds that need free path trace diagnostics
|
|
|
|
|
// Target: g_tiny_free_trace atomic in core/hakmem_tiny_free.inc:326
|
|
|
|
|
// Impact: HOT path atomic (every free operation)
|
|
|
|
|
// Expected improvement: +0.5% to +1.0% (similar to Phase 25: +1.07%)
|
|
|
|
|
// Compile gate fallback: 0 (compiled out) unless the build sets it.
#if !defined(HAKMEM_TINY_FREE_TRACE_COMPILED)
#define HAKMEM_TINY_FREE_TRACE_COMPILED 0
#endif
|
|
|
|
|
|
2025-12-16 15:01:56 +09:00
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 32: Tiny Free Calls Atomic Prune (Compile-out diagnostic counter)
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Tiny Free Calls: Compile gate (default OFF = compile-out)
|
|
|
|
|
// Set to 1 for research builds that need free path call counting
|
|
|
|
|
// Target: g_hak_tiny_free_calls atomic in core/hakmem_tiny_free.inc:335
|
|
|
|
|
// Impact: HOT path atomic (every free operation, unconditional)
|
|
|
|
|
// Expected improvement: +0.3% to +0.7% (diagnostic counter, less critical than Phase 25)
|
|
|
|
|
// Compile gate fallback: 0 (compiled out) unless the build sets it.
#if !defined(HAKMEM_TINY_FREE_CALLS_COMPILED)
#define HAKMEM_TINY_FREE_CALLS_COMPILED 0
#endif
|
|
|
|
|
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 34: Batch Atomic Prune (Compile-out remaining WARM path atomics)
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Phase 34A: Splice Debug Counter (WARM path, refill)
|
|
|
|
|
// Target: g_splice_count in core/tiny_refill_opt.h:79
|
|
|
|
|
// Impact: WARM path atomic (every refill splice operation)
|
|
|
|
|
// Compile gate fallback: 0 (compiled out) unless the build sets it.
#if !defined(HAKMEM_SPLICE_DEBUG_COMPILED)
#define HAKMEM_SPLICE_DEBUG_COMPILED 0
#endif
|
|
|
|
|
|
|
|
|
|
// Phase 34B: Alloc Gate Class Mismatch (ERROR path, rare)
|
|
|
|
|
// Target: g_alloc_gate_cls_mis in core/box/tiny_alloc_gate_box.h:95
|
|
|
|
|
// Impact: ERROR path atomic (class mismatch detection, rare)
|
|
|
|
|
// Compile gate fallback: 0 (compiled out) unless the build sets it.
#if !defined(HAKMEM_ALLOC_GATE_CLS_MIS_COMPILED)
#define HAKMEM_ALLOC_GATE_CLS_MIS_COMPILED 0
#endif
|
|
|
|
|
|
2025-11-05 12:31:14 +09:00
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
// Helper enum (for documentation / logging)
|
|
|
|
|
// ------------------------------------------------------------
|
|
|
|
|
typedef enum {
|
|
|
|
|
HAK_FLAG_BUILD_RELEASE = HAKMEM_BUILD_RELEASE,
|
|
|
|
|
HAK_FLAG_DEBUG_COUNTERS = HAKMEM_DEBUG_COUNTERS,
|
|
|
|
|
HAK_FLAG_DEBUG_MEMORY = HAKMEM_DEBUG_MEMORY,
|
|
|
|
|
HAK_FLAG_REFILL_OPT = HAKMEM_TINY_REFILL_OPT,
|
|
|
|
|
HAK_FLAG_P0_BATCH = HAKMEM_TINY_P0_BATCH_REFILL,
|
|
|
|
|
HAK_FLAG_BOX_REFACTOR = HAKMEM_TINY_PHASE6_BOX_REFACTOR,
|
2025-11-07 01:27:04 +09:00
|
|
|
HAK_FLAG_NEW_3LAYER = HAKMEM_TINY_USE_NEW_3LAYER,
|
2025-11-05 12:31:14 +09:00
|
|
|
} hak_build_flags_t;
|
|
|
|
|
|
|
|
|
|
#endif // HAKMEM_BUILD_FLAGS_H
|