// bench_fast_box.h - BenchFast Mode (Phase 20-2)
// Purpose: Measure HAKMEM's structural performance ceiling by removing ALL safety costs
// WARNING: UNSAFE - Benchmark-only mode, DO NOT use in production
//
// Design Philosophy:
// - Alloc: Trust size → instant Tiny path (no classify_ptr, no Pool/Mid checks)
// - Free:  Trust header → instant Tiny path (no registry, no mincore, no guards)
// - Goal:  Minimal instruction count (6-8 alloc, 3-5 free) to measure structural limits
//
// Enable with: HAKMEM_BENCH_FAST_MODE=1
// Expected: +65-100% performance (15.7M → 25-30M ops/s)
//
// ============================================================================
// Box Contract (Phase 8 Root Cause Fix)
// ============================================================================
//
// BenchFast Box uses TLS SLL allocation strategy, NOT Unified Cache.
// This is a critical design decision that affects all BenchFast code.
//
// Scope Separation:
// 1. WORKLOAD allocations (measured):
//    - User malloc/free calls in benchmark loop
//    - Contract: ALL are Tiny (size <= 1024B)
//    - Path: bench_fast_alloc() → bench_fast_free()
//    - Strategy: TLS SLL (g_tls_sll[])
//
// 2. INFRASTRUCTURE allocations (not measured):
//    - Benchmark metadata (slots[] array in bench_random_mixed.c)
//    - Cache arrays (if any infrastructure needs allocation)
//    - Contract: Bypass HAKMEM entirely (use __libc_calloc/__libc_free)
//    - Path: __libc_calloc() → __libc_free()
//
// Preconditions:
// - bench_fast_init() called before workload
// - Infrastructure uses __libc_* directly (NO mixing with HAKMEM paths)
//
// Guarantees:
// - Workload: Ultra-fast (6-8 instructions alloc, 3-5 instructions free)
// - Infrastructure: Isolated (no interference with BenchFast paths)
// - No path crossing (enforced by using different allocation functions)
//
// Contract Violation Example (Phase 8 Bug):
//   ❌ bench_fast_init() called unified_cache_init()
//   ❌ unified_cache_init() used calloc() (went through HAKMEM wrapper)
//   ❌ 16KB allocation went through mmap path (not Tiny)
//   ❌ Later free() misclassified it as Tiny → CRASH
//
//   ✅ Fixed: Removed unified_cache_init() call (BenchFast uses TLS SLL, not UC)
//   ✅ Defensive: unified_cache_init() now uses __libc_calloc (infrastructure isolation)
//
// ============================================================================

#ifndef HAK_BOX_BENCH_FAST_H
#define HAK_BOX_BENCH_FAST_H

#include <stddef.h>     // size_t
#include <stdio.h>      // fprintf, stderr
#include <stdlib.h>     // getenv
#include <stdatomic.h>  // atomic_int

// BenchFast mode enabled (ENV cached at first call)
//
// Reads HAKMEM_BENCH_FAST_MODE once and remembers the answer. The mode counts
// as enabled when the variable is set, non-empty, and its first character is
// not '0'. When enabled, a two-line warning is written to stderr at the moment
// the value is first resolved.
//
// Notes:
// - This is a `static inline` function with a function-local `static`, so every
//   translation unit that includes this header keeps its own cached value (and
//   emits its own warning the first time it resolves to "enabled").
// - The cache is a plain int with no synchronization.
//
// Returns: 1 if enabled, 0 if disabled
static inline int bench_fast_enabled(void) {
    static int cached = -1;  // -1 = environment not consulted yet
    if (__builtin_expect(cached == -1, 0)) {
        // Cold path: taken only until the first lookup has completed.
        const char* env = getenv("HAKMEM_BENCH_FAST_MODE");
        cached = (env && *env && *env != '0') ? 1 : 0;
        if (cached) {
            fprintf(stderr, "[HAKMEM][BENCH_FAST] WARNING: Unsafe benchmark mode enabled!\n");
            fprintf(stderr, "[HAKMEM][BENCH_FAST] DO NOT use in production - safety costs removed\n");
        }
    }
    return cached;
}

// Exposed init guard so wrappers can avoid BenchFast during preallocation
// Phase 8-TLS-Fix: Changed from __thread to atomic_int (works across ALL threads)
// Declared here only; the definition lives in a source file outside this header.
extern atomic_int g_bench_fast_init_in_progress;

// BenchFast alloc (Tiny-only, no safety checks)
// Preconditions: size <= 1024 (Tiny range)
// Returns: pointer on success, NULL on failure
void* bench_fast_alloc(size_t size);

// BenchFast free (header-based, no validation)
// Preconditions: ptr from bench_fast_alloc(), header is valid
void bench_fast_free(void* ptr);

// BenchFast init - Preallocate pool before benchmark
// Purpose: Avoid recursion by pre-populating TLS SLL with blocks
// Call this BEFORE starting benchmark (uses normal allocator path)
// Returns: Total number of blocks preallocated, or 0 if disabled
// Recommended: 50,000 blocks per class (C2-C7) = 300,000 total
int bench_fast_init(void);

// BenchFast stats - Print remaining blocks per class (debug/verification)
// Optional: Use after benchmark to verify pool wasn't exhausted
void bench_fast_stats(void);

#endif // HAK_BOX_BENCH_FAST_H