// hakmem/core/box/bench_fast_box.h

// bench_fast_box.h - BenchFast Mode (Phase 20-2)
// Purpose: Measure HAKMEM's structural performance ceiling by removing ALL safety costs
// WARNING: UNSAFE - Benchmark-only mode, DO NOT use in production
//
// Design Philosophy:
//   - Alloc: Trust size → instant Tiny path (no classify_ptr, no Pool/Mid checks)
//   - Free:  Trust header → instant Tiny path (no registry, no mincore, no guards)
//   - Goal:  Minimal instruction count (6-8 alloc, 3-5 free) to measure structural limits
//
// Enable with: HAKMEM_BENCH_FAST_MODE=1
// Expected: +65-100% performance (15.7M → 25-30M ops/s)

#ifndef HAK_BOX_BENCH_FAST_H
#define HAK_BOX_BENCH_FAST_H

#include <stddef.h>
#include <stdlib.h>
#include <stdio.h>

// Reports whether BenchFast mode was requested via HAKMEM_BENCH_FAST_MODE.
// The environment is read on the first call only; the answer is then kept in
// a function-local static (one copy per translation unit, since this function
// is static inline) and reused by every later call. No synchronization is
// performed around that cache.
// The mode counts as enabled when the variable is set, non-empty, and its
// first character is not '0'. When enabled, a two-line warning is written to
// stderr at the time the environment is read.
// Returns: 1 if enabled, 0 if disabled
static inline int bench_fast_enabled(void) {
    static int mode_flag = -1;  // -1 means "environment not consulted yet"

    // Common case: the decision was already made on an earlier call.
    if (__builtin_expect(mode_flag != -1, 1)) {
        return mode_flag;
    }

    const char* value = getenv("HAKMEM_BENCH_FAST_MODE");
    if (value == NULL || value[0] == '\0' || value[0] == '0') {
        // Unset, empty, or starting with '0' all mean "disabled".
        mode_flag = 0;
        return mode_flag;
    }

    mode_flag = 1;
    // Loud one-time notice: this mode strips the allocator's safety checks.
    fputs("[HAKMEM][BENCH_FAST] WARNING: Unsafe benchmark mode enabled!\n", stderr);
    fputs("[HAKMEM][BENCH_FAST] DO NOT use in production - safety costs removed\n", stderr);
    return mode_flag;
}

// Init guard, exposed so wrappers can avoid the BenchFast path during
// preallocation.
// Thread-local (__thread): every thread has its own copy of this flag.
// Declaration only (extern): the definition is in another translation unit.
// NOTE(review): presumably non-zero while bench_fast_init() is running on the
// current thread and zero otherwise - confirm against the definition.
extern __thread int bench_fast_init_in_progress;

// BenchFast alloc (Tiny-only, no safety checks)
// Parameters:
//   size - requested allocation size in bytes
// Preconditions: size <= 1024 (Tiny range)
//   NOTE(review): per the file header the size is trusted rather than
//   validated; what happens for size > 1024 is not visible from this
//   header - confirm in the implementation.
// Returns: pointer on success, NULL on failure
void* bench_fast_alloc(size_t size);

// BenchFast free (header-based, no validation)
// Parameters:
//   ptr - block to release
// Preconditions: ptr from bench_fast_alloc(), header is valid
//   NOTE(review): handling of NULL or foreign pointers is not documented
//   here; since no validation is performed, assume callers must not pass
//   them - confirm in the implementation.
void bench_fast_free(void* ptr);

// BenchFast init - Preallocate pool before benchmark
// Purpose: Avoid recursion by pre-populating TLS SLL with blocks
// Call this BEFORE starting the benchmark (uses the normal allocator path).
// Returns: Total number of blocks preallocated, or 0 if disabled
// Recommended: 50,000 blocks per class (C2-C7) = 300,000 total
// NOTE(review): the per-class block count is not a parameter of this
// function, so the recommendation above presumably describes a setting in
// the implementation - confirm where it is configured.
int bench_fast_init(void);

// BenchFast stats - Print remaining blocks per class (debug/verification)
// Optional: Use after the benchmark to verify the pool wasn't exhausted.
// Takes no arguments and returns nothing.
// NOTE(review): the output stream and format are not visible from this
// header - confirm in the implementation.
void bench_fast_stats(void);

#endif // HAK_BOX_BENCH_FAST_H