// hakmem.c - Minimal PoC Implementation
// Purpose: Verify call-site profiling concept

#define _GNU_SOURCE  // For mincore, madvise on Linux

#include "hakmem.h"
#include "hakmem_config.h"         // NEW Phase 6.8: Mode-based configuration
#include "hakmem_internal.h"       // NEW Phase 6.8: Static inline helpers
#include "hakmem_bigcache.h"       // NEW: BigCache Box
#include "hakmem_pool.h"           // NEW Phase 6.9: L2 Hybrid Pool (2-32KiB)
#include "hakmem_l25_pool.h"       // NEW Phase 6.13: L2.5 LargePool (64KB-1MB)
#include "hakmem_policy.h"         // NEW Phase 6.16: FrozenPolicy (SACS-3)
#include "hakmem_learner.h"        // NEW: CAP auto-tuner (background)
#include "hakmem_size_hist.h"      // NEW: size histogram sampling (off hot path)
#include "hakmem_ace.h"            // NEW Phase 6.16: ACE layer (L1)
#include "hakmem_site_rules.h"     // NEW Phase 6.10: Site-Aware Cache Routing
#include "hakmem_tiny.h"           // NEW Phase 6.12: Tiny Pool (≤1KB)
#include "hakmem_tiny_superslab.h" // NEW Phase 7.6: SuperSlab for Tiny Pool
#include "tiny_fastcache.h"        // NEW Phase 6-3: Tiny Fast Path (System tcache style)
#include "hakmem_mid_mt.h"         // NEW Phase Hybrid: Mid Range MT (8-32KB, mimalloc-style)
#include "hakmem_super_registry.h" // NEW Phase 1: SuperSlab Registry (mincore elimination)
#include "hakmem_elo.h"            // NEW: ELO Strategy Selection (Phase 6.2)
#include "hakmem_ace_stats.h"      // NEW: ACE lightweight stats (avoid implicit decl warnings)
#include "hakmem_batch.h"          // NEW: madvise Batching (Phase 6.3)
#include "hakmem_evo.h"            // NEW: Learning Lifecycle (Phase 6.5)
#include "hakmem_debug.h"          // NEW Phase 6.11.1: Debug Timing
#include "hakmem_sys.h"            // NEW Phase 6.11.1: Syscall Wrappers
#include "hakmem_whale.h"          // NEW Phase 6.11.1: Whale Fast-Path (≥2MB)
#include "hakmem_prof.h"           // NEW Phase 6.16: Sampling profiler
#include "hakmem_syscall.h"        // NEW Phase 6.X P0 FIX: Box 3 (dlsym direct libc)
#include "hakmem_ace_controller.h" // NEW Phase ACE: Adaptive Control Engine
#include "hakmem_ace_metrics.h"    // NEW Phase ACE: Metrics tracking (inline helpers)

#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdint.h>
#include <time.h>
#include <stdatomic.h> // NEW Phase 6.5: For atomic tick counter
#include <pthread.h>   // Phase 6.15: Threading primitives (recursion guard only)
#include <errno.h>     // calloc overflow handling

// For mmap (Linux)
#ifdef __linux__
#include <sys/mman.h>
#include <unistd.h>
// MADV_FREE support (Linux kernel 4.5+)
#ifndef MADV_FREE
#define MADV_FREE 8  // Linux MADV_FREE
#endif
#endif

// ============================================================================
// Configuration
// ============================================================================

#define MAX_SITES 256      // Hash table size (power of 2)
#define SAMPLING_RATE 1    // Sample ALL (PoC demo: no sampling)
#define HASH_MASK (MAX_SITES - 1)

// Phase 6.8: FREE_POLICY/FreePolicy moved to hakmem_config.h
// Phase 6.8: FreeThermal/THERMAL_* constants moved to hakmem_internal.h
// Phase 6.8: THP_POLICY/THPPolicy moved to hakmem_config.h

// ============================================================================
// Global State
// ============================================================================

// NEW Phase ACE: Adaptive Control Engine
static struct hkm_ace_controller g_ace_controller;

static int g_initialized = 0;
static int g_strict_free = 0;  // runtime: HAKMEM_SAFE_FREE=1 enables extra safety checks
int g_invalid_free_log = 0;    // runtime: HAKMEM_INVALID_FREE_LOG=1 to log invalid-free messages (extern visible)

// Phase 7.4: Cache HAKMEM_INVALID_FREE to eliminate 44% CPU overhead (getenv on hot path)
// Perf analysis showed getenv("HAKMEM_INVALID_FREE") consumed
43.96% of CPU time! static int g_invalid_free_mode = 1; // 1 = skip invalid-free check (default), 0 = fallback to libc // Statistics static uint64_t g_malloc_count = 0; // Used for optimization stats display // Phase 6.11.4 P0-2: Cached Strategy (atomic, updated by hak_evo_tick) static _Atomic int g_cached_strategy_id = 0; // Cached strategy ID (updated every window closure) // Phase 6.15 P0.3: EVO Sampling Control (environment variable) static uint64_t g_evo_sample_mask = 0; // 0 = disabled (default), (1< 0; // Simple and correct! } // Initialization guard static int g_initializing = 0; int hak_is_initializing(void) { return g_initializing; } // ============================================================================ // Phase 6-1.5: Ultra-Simple Fast Path Forward Declarations // ============================================================================ // Forward declarations for Phase 6 fast path variants // Phase 6-1.5: Alignment guessing (hakmem_tiny_ultra_simple.inc) #ifdef HAKMEM_TINY_PHASE6_ULTRA_SIMPLE extern void* hak_tiny_alloc_ultra_simple(size_t size); extern void hak_tiny_free_ultra_simple(void* ptr); #endif // Phase 6-1.6: Metadata header (hakmem_tiny_metadata.inc) #ifdef HAKMEM_TINY_PHASE6_METADATA extern void* hak_tiny_alloc_metadata(size_t size); extern void hak_tiny_free_metadata(void* ptr); #endif // Phase 6-1.7: Box Theory Refactoring - Wrapper function declarations #ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR extern void* hak_tiny_alloc_fast_wrapper(size_t size); extern void hak_tiny_free_fast_wrapper(void* ptr); #endif static void hak_flush_tiny_exit(void) { // Best-effort: flush Tiny magazines at process exit if (g_flush_tiny_on_exit) { hak_tiny_magazine_flush_all(); hak_tiny_trim(); } if (g_ultra_debug_on_exit) { hak_tiny_ultra_debug_dump(); } // Path debug dump (optional): HAKMEM_TINY_PATH_DEBUG=1 hak_tiny_path_debug_dump(); // Extended counters (optional): HAKMEM_TINY_COUNTERS_DUMP=1 extern void hak_tiny_debug_counters_dump(void); hak_tiny_debug_counters_dump(); } // ============================================================================ // KPI Measurement (for UCB1) - NEW! // ============================================================================ #ifdef __linux__ // Latency histogram (simple buckets for P50/P95/P99) #define LATENCY_BUCKETS 100 static uint64_t g_latency_histogram[LATENCY_BUCKETS]; static uint64_t g_latency_samples = 0; // Baseline page faults (at init) static uint64_t g_baseline_soft_pf = 0; static uint64_t g_baseline_hard_pf = 0; static uint64_t g_baseline_rss_kb = 0; // Get page faults from /proc/self/stat static void get_page_faults(uint64_t* soft_pf, uint64_t* hard_pf) { FILE* f = fopen("/proc/self/stat", "r"); if (!f) { *soft_pf = 0; *hard_pf = 0; return; } // Format: pid (comm) state ... minflt cminflt majflt cmajflt ... // Fields: 1 2 3 ... 
10(minflt) 11(cminflt) 12(majflt) 13(cmajflt) unsigned long minflt = 0, majflt = 0; unsigned long dummy; char comm[256], state; (void)fscanf(f, "%lu %s %c %lu %lu %lu %lu %lu %lu %lu %lu %lu", &dummy, comm, &state, &dummy, &dummy, &dummy, &dummy, &dummy, &dummy, &minflt, &dummy, &majflt); fclose(f); *soft_pf = minflt; *hard_pf = majflt; } // Get RSS from /proc/self/statm (in KB) static uint64_t get_rss_kb(void) { FILE* f = fopen("/proc/self/statm", "r"); if (!f) return 0; // Format: size resident shared text lib data dt // We want 'resident' (field 2) in pages unsigned long size, resident; (void)fscanf(f, "%lu %lu", &size, &resident); fclose(f); long page_size = sysconf(_SC_PAGESIZE); return (resident * page_size) / 1024; // Convert to KB } // NOTE: Latency measurement functions (currently unused, for future use) /* static inline uint64_t measure_latency_ns(void (*func)(void*), void* arg) { struct timespec start, end; clock_gettime(CLOCK_MONOTONIC, &start); func(arg); // Execute function clock_gettime(CLOCK_MONOTONIC, &end); uint64_t ns = (end.tv_sec - start.tv_sec) * 1000000000ULL + (end.tv_nsec - start.tv_nsec); return ns; } static void record_latency(uint64_t ns) { // Bucket: 0-10ns, 10-20ns, ..., 990-1000ns, 1000+ns size_t bucket = ns / 10; if (bucket >= LATENCY_BUCKETS) bucket = LATENCY_BUCKETS - 1; g_latency_histogram[bucket]++; g_latency_samples++; } */ // Calculate percentile from histogram static uint64_t calculate_percentile(double percentile) { if (g_latency_samples == 0) return 0; uint64_t target = (uint64_t)(g_latency_samples * percentile); uint64_t cumulative = 0; for (size_t i = 0; i < LATENCY_BUCKETS; i++) { cumulative += g_latency_histogram[i]; if (cumulative >= target) { return i * 10; // Return bucket midpoint (ns) } } return (LATENCY_BUCKETS - 1) * 10; } // Implement hak_get_kpi() void hak_get_kpi(hak_kpi_t* out) { memset(out, 0, sizeof(hak_kpi_t)); // Latency (from histogram) out->p50_alloc_ns = calculate_percentile(0.50); out->p95_alloc_ns = calculate_percentile(0.95); out->p99_alloc_ns = calculate_percentile(0.99); // Page Faults (delta from baseline) uint64_t soft_pf, hard_pf; get_page_faults(&soft_pf, &hard_pf); out->soft_page_faults = soft_pf - g_baseline_soft_pf; out->hard_page_faults = hard_pf - g_baseline_hard_pf; // RSS (delta from baseline, in MB) uint64_t rss_kb = get_rss_kb(); int64_t rss_delta_kb = (int64_t)rss_kb - (int64_t)g_baseline_rss_kb; out->rss_delta_mb = rss_delta_kb / 1024; } #else // Non-Linux: stub implementation void hak_get_kpi(hak_kpi_t* out) { memset(out, 0, sizeof(hak_kpi_t)); } #endif // ============================================================================ // Internal Helpers // ============================================================================ // Phase 6.8: All legacy profiling functions removed // - hash_site(), get_site_profile(), infer_policy(), record_alloc(), allocate_with_policy() // Replaced by ELO-based allocation (hakmem_elo.c) // ============================================================================ // BigCache eviction callback // ============================================================================ // BigCache eviction callback (called when cache is full and needs to evict) static void bigcache_free_callback(void* ptr, size_t size) { (void)size; // Not used if (!ptr) return; // Get raw pointer and header void* raw = (char*)ptr - HEADER_SIZE; AllocHeader* hdr = (AllocHeader*)raw; // Verify magic before accessing method field if (hdr->magic != HAKMEM_MAGIC) { fprintf(stderr, "[hakmem] BigCache 
eviction: invalid magic, fallback to free()\n"); free(raw); return; } // Dispatch based on allocation method switch (hdr->method) { case ALLOC_METHOD_MALLOC: free(raw); break; case ALLOC_METHOD_MMAP: // Cold eviction: route through batch for large blocks // This completes Phase 6.3 architecture #ifdef __linux__ if (hdr->size >= BATCH_MIN_SIZE) { // Large blocks: use batch (deferred munmap + TLB optimization) hak_batch_add(raw, hdr->size); } else { // Small blocks: direct munmap (not worth batching) // Phase 6.11.1: Try whale cache first if (hkm_whale_put(raw, hdr->size) != 0) { // Whale cache full or not a whale: munmap madvise(raw, hdr->size, MADV_FREE); // Best-effort hkm_sys_munmap(raw, hdr->size); } // else: Successfully cached in whale cache (no munmap!) } #else free(raw); // Fallback (should not happen) #endif break; default: fprintf(stderr, "[hakmem] BigCache eviction: unknown method %d\n", hdr->method); free(raw); // Fallback break; } } // ============================================================================ // Public API // ============================================================================ // Thread-safe one-time initialization static void hak_init_impl(void); static pthread_once_t g_init_once = PTHREAD_ONCE_INIT; void hak_init(void) { (void)pthread_once(&g_init_once, hak_init_impl); } static void hak_init_impl(void) { g_initializing = 1; // Phase 6.X P0 FIX (2025-10-24): Initialize Box 3 (Syscall Layer) FIRST! // This MUST be called before ANY allocation (Tiny/Mid/Large/Learner) // dlsym() initializes function pointers to real libc (bypasses LD_PRELOAD) hkm_syscall_init(); // NEW Phase 6.11.1: Initialize debug timing hkm_timing_init(); // NEW Phase 6.11.1: Initialize whale fast-path cache hkm_whale_init(); // NEW Phase Hybrid: Initialize Mid Range MT allocator (8-32KB, mimalloc-style) mid_mt_init(); // NEW Phase 6.8: Initialize configuration system (replaces init_free_policy + init_thp_policy) hak_config_init(); // Phase 6.16: Initialize FrozenPolicy (SACS-3) hkm_policy_init(); // Phase 6.15 P0.3: Configure EVO sampling from environment variable // HAKMEM_EVO_SAMPLE: 0=disabled (default), N=sample every 2^N calls // Example: HAKMEM_EVO_SAMPLE=10 → sample every 1024 calls // HAKMEM_EVO_SAMPLE=16 → sample every 65536 calls char* evo_sample_str = getenv("HAKMEM_EVO_SAMPLE"); if (evo_sample_str && atoi(evo_sample_str) > 0) { int freq = atoi(evo_sample_str); if (freq >= 64) { fprintf(stderr, "[hakmem] Warning: HAKMEM_EVO_SAMPLE=%d too large, using 63\n", freq); freq = 63; } g_evo_sample_mask = (1ULL << freq) - 1; HAKMEM_LOG("EVO sampling enabled: every 2^%d = %llu calls\n", freq, (unsigned long long)(g_evo_sample_mask + 1)); } else { g_evo_sample_mask = 0; // Disabled by default HAKMEM_LOG("EVO sampling disabled (HAKMEM_EVO_SAMPLE not set or 0)\n"); } #ifdef __linux__ // Record baseline KPIs memset(g_latency_histogram, 0, sizeof(g_latency_histogram)); g_latency_samples = 0; get_page_faults(&g_baseline_soft_pf, &g_baseline_hard_pf); g_baseline_rss_kb = get_rss_kb(); HAKMEM_LOG("Baseline: soft_pf=%lu, hard_pf=%lu, rss=%lu KB\n", (unsigned long)g_baseline_soft_pf, (unsigned long)g_baseline_hard_pf, (unsigned long)g_baseline_rss_kb); #endif HAKMEM_LOG("Initialized (PoC version)\n"); HAKMEM_LOG("Sampling rate: 1/%d\n", SAMPLING_RATE); HAKMEM_LOG("Max sites: %d\n", MAX_SITES); // Bench preset: Tiny-only (disable non-essential subsystems) { char* bt = getenv("HAKMEM_BENCH_TINY_ONLY"); if (bt && atoi(bt) != 0) { g_bench_tiny_only = 1; } } // Under LD_PRELOAD, enforce safer 
defaults for Tiny path unless overridden { char* ldpre = getenv("LD_PRELOAD"); if (ldpre && strstr(ldpre, "libhakmem.so")) { g_ldpreload_mode = 1; // Default LD-safe mode if not set: 1 (Tiny-only) char* lds = getenv("HAKMEM_LD_SAFE"); if (lds) { /* NOP used in wrappers */ } else { setenv("HAKMEM_LD_SAFE", "1", 0); } if (!getenv("HAKMEM_TINY_TLS_SLL")) { setenv("HAKMEM_TINY_TLS_SLL", "0", 0); // disable TLS SLL by default } if (!getenv("HAKMEM_TINY_USE_SUPERSLAB")) { setenv("HAKMEM_TINY_USE_SUPERSLAB", "0", 0); // disable SuperSlab path by default } } } // Runtime safety toggle char* safe_free_env = getenv("HAKMEM_SAFE_FREE"); if (safe_free_env && atoi(safe_free_env) != 0) { g_strict_free = 1; HAKMEM_LOG("Strict free safety enabled (HAKMEM_SAFE_FREE=1)\n"); } else { // Heuristic: if loaded via LD_PRELOAD, enable strict free by default char* ldpre = getenv("LD_PRELOAD"); if (ldpre && strstr(ldpre, "libhakmem.so")) { g_ldpreload_mode = 1; g_strict_free = 1; HAKMEM_LOG("Strict free safety auto-enabled under LD_PRELOAD\n"); } } // Invalid free logging toggle (default off to avoid spam under LD_PRELOAD) char* invlog = getenv("HAKMEM_INVALID_FREE_LOG"); if (invlog && atoi(invlog) != 0) { g_invalid_free_log = 1; HAKMEM_LOG("Invalid free logging enabled (HAKMEM_INVALID_FREE_LOG=1)\n"); } // Phase 7.4: Cache HAKMEM_INVALID_FREE to eliminate 44% CPU overhead // Perf showed getenv() on hot path consumed 43.96% CPU time (26.41% strcmp + 17.55% getenv) char* inv = getenv("HAKMEM_INVALID_FREE"); if (inv && strcmp(inv, "fallback") == 0) { g_invalid_free_mode = 0; // fallback mode: route invalid frees to libc HAKMEM_LOG("Invalid free mode: fallback to libc (HAKMEM_INVALID_FREE=fallback)\n"); } else { // Under LD_PRELOAD, prefer safety: default to fallback unless explicitly overridden char* ldpre = getenv("LD_PRELOAD"); if (ldpre && strstr(ldpre, "libhakmem.so")) { g_ldpreload_mode = 1; g_invalid_free_mode = 0; HAKMEM_LOG("Invalid free mode: fallback to libc (auto under LD_PRELOAD)\n"); } else { g_invalid_free_mode = 1; // default: skip invalid-free check HAKMEM_LOG("Invalid free mode: skip check (default)\n"); } } // NEW Phase 6.8: Feature-gated initialization (check g_hakem_config flags) if (HAK_ENABLED_ALLOC(HAKMEM_FEATURE_POOL)) { hak_pool_init(); } // NEW Phase 6.13: L2.5 LargePool (64KB-1MB allocations) hak_l25_pool_init(); if (!g_bench_tiny_only && HAK_ENABLED_CACHE(HAKMEM_FEATURE_BIGCACHE)) { hak_bigcache_init(); hak_bigcache_set_free_callback(bigcache_free_callback); } if (!g_bench_tiny_only && HAK_ENABLED_LEARNING(HAKMEM_FEATURE_ELO)) { hak_elo_init(); // Phase 6.11.4 P0-2: Initialize cached strategy to default (strategy 0) atomic_store(&g_cached_strategy_id, 0); } if (!g_bench_tiny_only && HAK_ENABLED_MEMORY(HAKMEM_FEATURE_BATCH_MADVISE)) { hak_batch_init(); } if (!g_bench_tiny_only && HAK_ENABLED_LEARNING(HAKMEM_FEATURE_EVOLUTION)) { hak_evo_init(); } if (!g_bench_tiny_only) { // Phase 6.16: Initialize ACE stats (sampling) – default off hkm_ace_stats_init(); // Phase 6.16: Initialize sampling profiler – default off hkm_prof_init(); // Size histogram sampling (optional) hkm_size_hist_init(); } if (!g_bench_tiny_only) { // Start CAP learner (optional, env-gated) hkm_learner_init(); } // NEW Phase 6.10: Site Rules (MVP: always ON) // MT note: default disabled unless HAKMEM_SITE_RULES=1 char* sr_env = getenv("HAKMEM_SITE_RULES"); g_site_rules_enabled = (sr_env && atoi(sr_env) != 0); if (!g_bench_tiny_only && g_site_rules_enabled) { hak_site_rules_init(); } // NEW Phase 6.12: Tiny Pool (≤1KB 
allocations) hak_tiny_init(); // Env: optional Tiny flush on exit (memory efficiency evaluation) { char* tf = getenv("HAKMEM_TINY_FLUSH_ON_EXIT"); if (tf && atoi(tf) != 0) { g_flush_tiny_on_exit = 1; } char* ud = getenv("HAKMEM_TINY_ULTRA_DEBUG"); if (ud && atoi(ud) != 0) { g_ultra_debug_on_exit = 1; } // Register exit hook if any of the debug/flush toggles are on // or when path debug is requested. if (g_flush_tiny_on_exit || g_ultra_debug_on_exit || getenv("HAKMEM_TINY_PATH_DEBUG")) { atexit(hak_flush_tiny_exit); } } // NEW Phase ACE: Initialize Adaptive Control Engine hkm_ace_controller_init(&g_ace_controller); if (g_ace_controller.enabled) { hkm_ace_controller_start(&g_ace_controller); HAKMEM_LOG("ACE Learning Layer enabled and started\n"); } g_initializing = 0; // Publish that initialization is complete atomic_thread_fence(memory_order_seq_cst); g_initialized = 1; } void hak_shutdown(void) { if (!g_initialized) return; // NEW Phase ACE: Shutdown Adaptive Control Engine FIRST (before other subsystems) hkm_ace_controller_destroy(&g_ace_controller); if (!g_bench_tiny_only) { printf("[hakmem] Shutting down...\n"); hak_print_stats(); } // NEW Phase 6.9: Shutdown L2 Pool if (!g_bench_tiny_only) hak_pool_shutdown(); // NEW Phase 6.13: Shutdown L2.5 LargePool if (!g_bench_tiny_only) hak_l25_pool_shutdown(); // NEW: Shutdown BigCache Box if (!g_bench_tiny_only) hak_bigcache_shutdown(); // NEW Phase 6.2: Shutdown ELO Strategy Selection if (!g_bench_tiny_only) hak_elo_shutdown(); // NEW Phase 6.3: Shutdown madvise Batching if (!g_bench_tiny_only) hak_batch_shutdown(); // NEW Phase 6.10: Shutdown Site Rules if (!g_bench_tiny_only) hak_site_rules_shutdown(); // NEW Phase 6.12: Print Tiny Pool statistics if (!g_bench_tiny_only) hak_tiny_print_stats(); // NEW Phase 6.11.1: Print whale cache statistics if (!g_bench_tiny_only) { hkm_whale_dump_stats(); // NEW Phase 6.11.1: Shutdown whale cache hkm_whale_shutdown(); } // NEW Phase 6.11.1: Shutdown debug timing (must be last!) 
if (!g_bench_tiny_only) hkm_timing_shutdown(); // Phase 6.16: Dump sampling profiler if (!g_bench_tiny_only) hkm_prof_shutdown(); // Stop learner thread if (!g_bench_tiny_only) hkm_learner_shutdown(); // Stop Tiny background components (e.g., Intelligence Engine) hak_tiny_shutdown(); g_initialized = 0; } // Phase 9.1: Force inline for performance (reduces call overhead by ~30-40%) __attribute__((always_inline)) inline void* hak_alloc_at(size_t size, hak_callsite_t site) { #if HAKMEM_DEBUG_TIMING HKM_TIME_START(t0); // Profiling (build-time gated) #endif if (!g_initialized) hak_init(); // ======================================================================== // Phase 6-3: Tiny Fast Path (System tcache style, 3-4 instruction fast path) // ======================================================================== #ifdef HAKMEM_TINY_FAST_PATH if (size <= TINY_FAST_THRESHOLD) { // Ultra-simple TLS cache pop (bypasses Magazine/SuperSlab) extern void* tiny_fast_alloc(size_t); extern void tiny_fast_init(void); extern __thread int g_tiny_fast_initialized; if (__builtin_expect(!g_tiny_fast_initialized, 0)) { tiny_fast_init(); } void* ptr = tiny_fast_alloc(size); if (ptr) return ptr; // Fall through to slow path on failure } #endif // ======================================================================== uintptr_t site_id = (uintptr_t)site; // Phase 6.12: Tiny Pool fast-path (≤1KB allocations) // Priority: highest for tiny allocations (most frequent) if (__builtin_expect(size <= TINY_MAX_SIZE, 1)) { #if HAKMEM_DEBUG_TIMING HKM_TIME_START(t_tiny); #endif void* tiny_ptr = NULL; #ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR // Phase 6-1.7: Box Theory Refactoring (3-4 instruction fast path) tiny_ptr = hak_tiny_alloc_fast_wrapper(size); #elif defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE) // Phase 6-1.5: Ultra Simple (alignment guessing) tiny_ptr = hak_tiny_alloc_ultra_simple(size); #elif defined(HAKMEM_TINY_PHASE6_METADATA) // Phase 6-1.6: Metadata header tiny_ptr = hak_tiny_alloc_metadata(size); #else // Default: Standard Tiny path tiny_ptr = hak_tiny_alloc(size); #endif #if HAKMEM_DEBUG_TIMING HKM_TIME_END(HKM_CAT_TINY_ALLOC, t_tiny); #endif if (tiny_ptr) { // NEW Phase ACE: Track allocation for learning hkm_ace_track_alloc(); // Tiny Pool hit! Return immediately (no header needed) return tiny_ptr; } // DEBUG: Tiny Pool returned NULL - fallback to other paths static int log_count = 0; if (log_count < 3) { fprintf(stderr, "[DEBUG] tiny_alloc(%zu) returned NULL, falling back\n", size); log_count++; } // Tiny Pool miss: fallback to other paths below } // Record size histogram (sampling) — moved after Tiny fast-path to // keep hottest path minimal. Tiny hits skip histogram to reduce overhead. hkm_size_hist_record(size); // Phase Hybrid: Mid Range MT fast-path (8-32KB allocations) // Priority: second highest (after Tiny Pool) // Uses mimalloc-style per-thread segments for optimal MT performance if (__builtin_expect(mid_is_in_range(size), 0)) { #if HAKMEM_DEBUG_TIMING HKM_TIME_START(t_mid); #endif void* mid_ptr = mid_mt_alloc(size); #if HAKMEM_DEBUG_TIMING HKM_TIME_END(HKM_CAT_POOL_GET, t_mid); #endif if (mid_ptr) { // Mid MT hit! 
Return immediately (no header, lock-free) return mid_ptr; } // Mid MT miss: fallback to other paths below (should be rare) } // Phase 6.11.4 P0-1 & P0-2: Compile-time guard + cached strategy update // Phase 6.15 P0.3: Restored with environment variable control (default disabled) #if HAKMEM_FEATURE_EVOLUTION // Only sample if enabled via HAKMEM_EVO_SAMPLE environment variable if (g_evo_sample_mask > 0) { static _Atomic uint64_t tick_counter = 0; if ((atomic_fetch_add(&tick_counter, 1) & g_evo_sample_mask) == 0) { struct timespec now; clock_gettime(CLOCK_MONOTONIC, &now); uint64_t now_ns = now.tv_sec * 1000000000ULL + now.tv_nsec; // P0-2: Update cached strategy when window closes if (hak_evo_tick(now_ns)) { // Window closed, update cached strategy int new_strategy = hak_elo_select_strategy(); atomic_store(&g_cached_strategy_id, new_strategy); } } } #endif // Phase 6.11.4 P0-2: Always use cached strategy (LEARN/FROZEN/CANARY all use same path) size_t threshold; if (HAK_ENABLED_LEARNING(HAKMEM_FEATURE_ELO)) { // ELO enabled: use cached strategy (updated by hak_evo_tick) int strategy_id = atomic_load(&g_cached_strategy_id); threshold = hak_elo_get_threshold(strategy_id); } else { // ELO disabled: use default threshold (2MB - mimalloc's large threshold) threshold = 2097152; // 2MB } // Phase SACS-3: BigCache only for very large blocks (>= threshold) if (HAK_ENABLED_CACHE(HAKMEM_FEATURE_BIGCACHE) && size >= threshold) { void* cached_ptr = NULL; #if HAKMEM_DEBUG_TIMING HKM_TIME_START(t_bc); #endif if (hak_bigcache_try_get(size, site_id, &cached_ptr)) { #if HAKMEM_DEBUG_TIMING HKM_TIME_END(HKM_CAT_BIGCACHE_GET, t_bc); #endif // Cache hit! Return immediately return cached_ptr; } #if HAKMEM_DEBUG_TIMING HKM_TIME_END(HKM_CAT_BIGCACHE_GET, t_bc); #endif } // Phase SACS-3: No Site Rules in tier selection (size-only decision) // Phase 6.16 SACS-3: L1 via ACE unified path if (size > TINY_MAX_SIZE && size < threshold) { const FrozenPolicy* pol = hkm_policy_get(); #if HAKMEM_DEBUG_TIMING HKM_TIME_START(t_ace); #endif void* l1 = hkm_ace_alloc(size, site_id, pol); #if HAKMEM_DEBUG_TIMING HKM_TIME_END(HKM_CAT_POOL_GET, t_ace); #endif if (l1) return l1; } // Phase SACS-3: For < threshold, prefer malloc; for >= threshold prefer mmap void* ptr; if (size >= threshold) { // Large allocation (L2): use mmap (enables batch madvise) #if HAKMEM_DEBUG_TIMING HKM_TIME_START(t_mmap); #endif ptr = hak_alloc_mmap_impl(size); #if HAKMEM_DEBUG_TIMING HKM_TIME_END(HKM_CAT_SYSCALL_MMAP, t_mmap); #endif } else { // Small/medium allocation (L0/L1): use malloc (faster for <2MB) #if HAKMEM_DEBUG_TIMING HKM_TIME_START(t_malloc); #endif ptr = hak_alloc_malloc_impl(size); #if HAKMEM_DEBUG_TIMING HKM_TIME_END(HKM_CAT_FALLBACK_MALLOC, t_malloc); #endif } if (!ptr) return NULL; // NEW Phase 6.5: Record allocation size for distribution signature (gated) if (g_evo_sample_mask > 0) { hak_evo_record_size(size); } // NEW: Set alloc_site and class_bytes in header (for BigCache Phase 2) AllocHeader* hdr = (AllocHeader*)((char*)ptr - HEADER_SIZE); // Verify magic (fail-fast if header corrupted) if (hdr->magic != HAKMEM_MAGIC) { fprintf(stderr, "[hakmem] ERROR: Invalid magic in allocated header!\n"); return ptr; // Return anyway, but log error } // Set allocation site (for per-site cache reuse) hdr->alloc_site = site_id; // Set size class for caching (L2 only → threshold class) if (size >= threshold) { hdr->class_bytes = threshold; // cacheable at L2 threshold } else { hdr->class_bytes = 0; // Not cacheable } #if HAKMEM_DEBUG_TIMING 
HKM_TIME_END(HKM_CAT_HAK_ALLOC, t0); // Profiling (build-time gated) #endif return ptr; } // Phase 9.1: Force inline for performance (reduces call overhead by ~30-40%) // Phase 6-1.7: Disable inline for box refactor to avoid recursive inlining #ifndef HAKMEM_TINY_PHASE6_BOX_REFACTOR __attribute__((always_inline)) inline #endif void hak_free_at(void* ptr, size_t size, hak_callsite_t site) { #if HAKMEM_DEBUG_TIMING HKM_TIME_START(t0); // Profiling (build-time gated) #endif (void)site; // Not used yet (will be used in BigCache Phase 2) (void)size; // Size stored in header if (!ptr) { #if HAKMEM_DEBUG_TIMING #if HAKMEM_DEBUG_TIMING HKM_TIME_END(HKM_CAT_HAK_FREE, t0); #endif #endif return; } // OPTIMIZATION PHASE 2+1 (2025-11-01): Check Tiny Pool FIRST // Phase 2: Ultra-fast owner_slab with TLS range check (1-2 cycles negative lookup) // Phase 1: Reorder to avoid Mid MT mutex overhead for Tiny allocations (90% of mixed workload) // // Target: +12-13% improvement (16.24 → 18.4-18.6 M ops/sec) // - Tiny allocations (90%): Skip Mid MT mutex entirely → ~12% improvement // - Mid allocations (10%): Fast negative lookup from owner_slab → minimal overhead TinySlab* tiny_slab = hak_tiny_owner_slab(ptr); if (tiny_slab) { #ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR // Phase 6-1.7: Box Theory Refactoring (2-3 instruction fast path) // Box 6 handles both same-thread (fast) and cross-thread (remote) internally hak_tiny_free_fast_wrapper(ptr); return; #elif defined(HAKMEM_TINY_PHASE6_ULTRA_SIMPLE) // Phase 6-1.5: Only use ultra-simple free on same-thread pointers. // Cross-thread frees must go through the full tiny free path // to ensure proper remote-queue handling and slab reuse. pthread_t self_pt = pthread_self(); if (__builtin_expect(pthread_equal(tiny_slab->owner_tid, self_pt), 1)) { hak_tiny_free_ultra_simple(ptr); return; } #elif defined(HAKMEM_TINY_PHASE6_METADATA) // Phase 6-1.6: Metadata header hak_tiny_free_metadata(ptr); return; #endif // Fallback: full tiny free (handles cross-thread case correctly) hak_tiny_free(ptr); return; } // Phase Hybrid: Mid Range MT check (8-32KB, headerless) { size_t mid_block_size = 0; int mid_class_idx = 0; // First check if ptr is in current thread's segment (fast path) for (int i = 0; i < MID_NUM_CLASSES; i++) { MidThreadSegment* seg = &g_mid_segments[i]; if (seg->chunk_base && ptr >= seg->chunk_base && ptr < seg->end) { *(void**)ptr = seg->free_list; seg->free_list = ptr; seg->used_count--; return; } } // Not in current thread's segment - try registry (mutex + binary search) if (mid_registry_lookup(ptr, &mid_block_size, &mid_class_idx)) { mid_mt_free(ptr, mid_block_size); return; } } // DISABLED: SuperSlab Registry lookup causes false positives // Problem: L25 allocations aligned to 1MB boundary are misidentified as SuperSlabs // causing crashes when checking magic number on unmapped/invalid memory // TODO: Fix SuperSlab registry to avoid false positives (descriptor-based check?) #if 0 SuperSlab* ss = hak_super_lookup(ptr); if (ss) { hak_tiny_free(ptr); #if HAKMEM_DEBUG_TIMING HKM_TIME_END(HKM_CAT_HAK_FREE, t0); #endif return; } #endif // Mid Pool headerless fast route: use page descriptor before header read { extern int hak_pool_mid_lookup(void* ptr, size_t* out_size); extern void hak_pool_free_fast(void* ptr, uintptr_t site_id); size_t mid_sz = 0; if (hak_pool_mid_lookup(ptr, &mid_sz)) { // For Mid, header read is unnecessary; free directly via pool. 
hak_pool_free_fast(ptr, (uintptr_t)site); #if HAKMEM_DEBUG_TIMING HKM_TIME_END(HKM_CAT_HAK_FREE, t0); #endif return; } } // L2.5 headerless route: use page descriptor before header read { extern int hak_l25_lookup(void* ptr, size_t* out_size); extern void hak_l25_pool_free_fast(void* ptr, uintptr_t site_id); size_t l25_sz = 0; if (hak_l25_lookup(ptr, &l25_sz)) { // Stats (optional): count as large free hkm_ace_stat_large_free(); hak_l25_pool_free_fast(ptr, (uintptr_t)site); #if HAKMEM_DEBUG_TIMING HKM_TIME_END(HKM_CAT_HAK_FREE, t0); #endif return; } } // NEW Phase 6.5: Measure free latency (start timing) // Gate by EVO sampling mask to avoid per-op overhead when disabled int _do_evo = (g_evo_sample_mask > 0); struct timespec start_time, end_time; if (_do_evo) { clock_gettime(CLOCK_MONOTONIC, &start_time); } // Helper macro to record latency before returning (build-time gated timing) #if HAKMEM_DEBUG_TIMING #define RECORD_FREE_LATENCY() do { \ if (_do_evo) { \ clock_gettime(CLOCK_MONOTONIC, &end_time); \ uint64_t ns = (end_time.tv_sec - start_time.tv_sec) * 1000000000ULL + \ (end_time.tv_nsec - start_time.tv_nsec); \ hak_evo_record_latency((double)ns); \ if (hak_evo_is_canary()) { \ hak_evo_record_canary_result(0, (double)ns); \ } \ } \ HKM_TIME_END(HKM_CAT_HAK_FREE, t0); \ } while(0) #else #define RECORD_FREE_LATENCY() do { \ if (_do_evo) { \ clock_gettime(CLOCK_MONOTONIC, &end_time); \ uint64_t ns = (end_time.tv_sec - start_time.tv_sec) * 1000000000ULL + \ (end_time.tv_nsec - start_time.tv_nsec); \ hak_evo_record_latency((double)ns); \ if (hak_evo_is_canary()) { \ hak_evo_record_canary_result(0, (double)ns); \ } \ } \ } while(0) #endif // Get raw pointer (before header) void* raw = (char*)ptr - HEADER_SIZE; #ifdef __linux__ if (g_strict_free) { // Safety: ensure header address is mapped before touching it (optional) long _ps = sysconf(_SC_PAGESIZE); void* _pg = (void*)((uintptr_t)raw & ~((uintptr_t)_ps - 1)); unsigned char _vec; if (mincore(_pg, (size_t)_ps, &_vec) != 0) { // Not a valid mapped region → fallback directly to libc free extern void __libc_free(void*); __libc_free(ptr); RECORD_FREE_LATENCY(); return; } } #endif // Read header AllocHeader* hdr = (AllocHeader*)raw; // NEW: Verify magic (fail-fast if corrupted or not from hakmem) if (hdr->magic != HAKMEM_MAGIC) { if (g_invalid_free_log) { fprintf(stderr, "[hakmem] ERROR: Invalid magic 0x%X (expected 0x%X) - possible corruption or non-hakmem pointer\n", hdr->magic, HAKMEM_MAGIC); } // Phase 7.4: Use cached mode (eliminates 44% CPU overhead from getenv on hot path!) // OLD CODE (44% CPU time!): const char* inv = getenv("HAKMEM_INVALID_FREE"); // if (inv && strcmp(inv, "fallback") == 0) mode_skip = 0; int mode_skip = g_invalid_free_mode; // 1 = skip, 0 = fallback to libc if (mode_skip) { // Skip freeing unknown pointer to avoid abort (possible mmap region). Log only. RECORD_FREE_LATENCY(); return; } else { fprintf(stderr, "[hakmem] Attempting fallback to system free()...\n"); extern void __libc_free(void*); __libc_free(ptr); RECORD_FREE_LATENCY(); return; } } // Phase SACS-3: BigCache put only for L2 (class_bytes >= 2MB) if (HAK_ENABLED_CACHE(HAKMEM_FEATURE_BIGCACHE) && hdr->class_bytes >= 2097152) { // Pass actual allocated size (hdr->size), not class_bytes! 
        // This prevents buffer overflow when BigCache returns undersized blocks
        if (hak_bigcache_put(ptr, hdr->size, hdr->alloc_site)) {
            RECORD_FREE_LATENCY();
            return;  // Successfully cached, skip actual free
        }
    }

    // Phase 6.9.1: Pool allocations are now handled via header method
    // (no separate detection needed, just dispatch on method)

    // Dispatch to correct free function
    switch (hdr->method) {
        case ALLOC_METHOD_POOL:
            // Phase 6.9.1: Pool allocation - return to pool
            if (HAK_ENABLED_ALLOC(HAKMEM_FEATURE_POOL)) {
                // Stats: record free in ACE L1 Mid
                hkm_ace_stat_mid_free();
                hak_pool_free(ptr, hdr->size, hdr->alloc_site);
            } else {
                // Pool disabled, shouldn't happen (fail-fast)
                fprintf(stderr, "[hakmem] ERROR: POOL allocation but POOL feature disabled!\n");
            }
            RECORD_FREE_LATENCY();
            return;

        case ALLOC_METHOD_L25_POOL:
            // Phase 6.13: L2.5 Pool allocation - return to pool
            hkm_ace_stat_large_free();
            hak_l25_pool_free(ptr, hdr->size, hdr->alloc_site);
            RECORD_FREE_LATENCY();
            return;

        case ALLOC_METHOD_MALLOC:
            free(raw);
            break;

        case ALLOC_METHOD_MMAP:
            // Phase 6.4 P1: Apply free policy (Hot/Warm/Cold)
            if (g_hakem_config.free_policy == FREE_POLICY_KEEP) {
                // KEEP: do nothing (retain the VA mapping, no madvise either)
                RECORD_FREE_LATENCY();
                return;
            } else if (g_hakem_config.free_policy == FREE_POLICY_ADAPTIVE) {
                // ADAPTIVE: classify as Hot/Warm/Cold
                FreeThermal thermal = hak_classify_thermal(hdr->size);
                switch (thermal) {
                    case FREE_THERMAL_HOT:
                        // HOT (< 1MB): do nothing (will be reused soon)
                        RECORD_FREE_LATENCY();
                        return;
                    case FREE_THERMAL_WARM:
                        // WARM (1-2MB): MADV_FREE (no munmap, return only the physical pages)
#ifdef __linux__
                        madvise(raw, hdr->size, MADV_FREE);
#endif
                        RECORD_FREE_LATENCY();
                        return;
                    case FREE_THERMAL_COLD:
                        // COLD (>= 2MB): batch (Phase 6.8: feature-gated)
                        if (HAK_ENABLED_MEMORY(HAKMEM_FEATURE_BATCH_MADVISE) && hdr->size >= BATCH_MIN_SIZE) {
                            hak_batch_add(raw, hdr->size);
                            RECORD_FREE_LATENCY();
                            return;
                        }
                        // Small blocks: immediate munmap
#ifdef __linux__
                        // Phase 6.11.1: Try whale cache first
                        if (hkm_whale_put(raw, hdr->size) != 0) {
                            hkm_sys_munmap(raw, hdr->size);
                        }
#else
                        free(raw);
#endif
                        break;
                }
            } else {
                // BATCH (default): Phase 6.8 feature-gated
                // - Keep VA mapped for reuse (mimalloc strategy)
                // - Only MADV_FREE on batch flush (release physical pages)
                // - munmap happens on cold eviction only
                if (HAK_ENABLED_MEMORY(HAKMEM_FEATURE_BATCH_MADVISE) && hdr->size >= BATCH_MIN_SIZE) {
                    hak_batch_add(raw, hdr->size);
                    RECORD_FREE_LATENCY();
                    return;
                }
                // Small blocks: immediate munmap (not worth batching)
#ifdef __linux__
                // Phase 6.11.1: Try whale cache first
                if (hkm_whale_put(raw, hdr->size) != 0) {
                    hkm_sys_munmap(raw, hdr->size);
                }
#else
                free(raw);
#endif
            }
            break;

        default:
            fprintf(stderr, "[hakmem] ERROR: Unknown allocation method: %d\n", hdr->method);
            break;
    }

    // Record latency for all paths that reach here
    RECORD_FREE_LATENCY();
#undef RECORD_FREE_LATENCY
}

void hak_print_stats(void) {
    printf("\n========================================\n");
    printf("hakmem ELO-based Profiling Statistics\n");
    printf("========================================\n");
    printf("\nOptimization Stats:\n");
    printf(" malloc() calls: %llu\n", (unsigned long long)g_malloc_count);
    hak_elo_print_leaderboard();
    printf("========================================\n\n");
}

// ============================================================================
// Phase 6.15 P0: Standard C Library Wrappers (for LD_PRELOAD)
// ============================================================================

#ifdef HAKMEM_FORCE_LIBC_ALLOC_BUILD
// Sanitizer/diagnostic builds: bypass hakmem allocator completely.
void* malloc(size_t size) { extern void* __libc_malloc(size_t); return __libc_malloc(size); } void free(void* ptr) { if (!ptr) return; extern void __libc_free(void*); __libc_free(ptr); } void* calloc(size_t nmemb, size_t size) { extern void* __libc_calloc(size_t, size_t); return __libc_calloc(nmemb, size); } void* realloc(void* ptr, size_t size) { extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); } #else // malloc wrapper - intercepts system malloc() calls void* malloc(size_t size) { // ======================================================================== // Phase 6-4: ULTRA-FAST PATH (Option A optimization) // Priority: initialized + tiny size → direct to fast cache (2-3 branches) // Expected hit rate: 95%+ for tiny allocations // ======================================================================== #ifdef HAKMEM_TINY_FAST_PATH // Branch 1+2: initialized check + size check (combined for branch prediction) if (__builtin_expect(g_initialized && size <= TINY_FAST_THRESHOLD, 1)) { extern void* tiny_fast_alloc(size_t); extern void tiny_fast_init(void); extern __thread int g_tiny_fast_initialized; // Branch 3: init check (rarely taken) if (__builtin_expect(!g_tiny_fast_initialized, 0)) { tiny_fast_init(); } // Fast path: TLS cache pop (3-4 instructions inside tiny_fast_alloc) void* ptr = tiny_fast_alloc(size); if (__builtin_expect(ptr != NULL, 1)) { return ptr; // 🚀 FAST PATH HIT: 3 branches total! } // Fall through to slow path on cache miss } #endif // ======================================================================== // ======================================================================== // SLOW PATH: All guard checks (for non-tiny, uninitialized, or special cases) // ======================================================================== // Recursion guard: if we're inside the allocator already, fall back to libc if (g_hakmem_lock_depth > 0) { // Nested call detected - fallback to system malloc extern void* __libc_malloc(size_t); return __libc_malloc(size); } // Initialization guard: during hak_init() bootstrap, use libc directly if (__builtin_expect(g_initializing != 0, 0)) { extern void* __libc_malloc(size_t); return __libc_malloc(size); } if (__builtin_expect(hak_force_libc_alloc(), 0)) { extern void* __libc_malloc(size_t); return __libc_malloc(size); } // LD safe modes: 1=tiny-only, 2=pass-through // Determine LD_PRELOAD mode early (before hak_init) to avoid misrouting int ld_mode = hak_ld_env_mode(); if (ld_mode) { // Avoid mixing with jemalloc-managed programs (e.g., redis) if (hak_ld_block_jemalloc() && hak_jemalloc_loaded()) { extern void* __libc_malloc(size_t); return __libc_malloc(size); } // Before hakmem initialization completes, always delegate to libc if (!g_initialized || g_initializing) { extern void* __libc_malloc(size_t); return __libc_malloc(size); } const char* lds = getenv("HAKMEM_LD_SAFE"); int mode = (lds ? 
atoi(lds) : 1); if (mode >= 2 || size > TINY_MAX_SIZE) { extern void* __libc_malloc(size_t); return __libc_malloc(size); } } // First-level call: enter allocator (no global lock) g_hakmem_lock_depth++; void* ptr = hak_alloc_at(size, HAK_CALLSITE()); g_hakmem_lock_depth--; return ptr; } // free wrapper - intercepts system free() calls void free(void* ptr) { if (!ptr) return; // NULL check // ======================================================================== // Phase 6-4: ULTRA-FAST PATH (Option A optimization) // Priority: initialized → direct to fast free path (1-2 branches) // Expected hit rate: 95%+ for tiny allocations // ======================================================================== // Branch 1: initialized check (fast path for common case) if (__builtin_expect(g_initialized, 1)) { // Fast path: normal operation, no special handling needed // Phase 6 Fast Path variants (when enabled) #ifdef HAKMEM_TINY_PHASE6_ULTRA_SIMPLE g_hakmem_lock_depth++; hak_tiny_free_ultra_simple(ptr); g_hakmem_lock_depth--; return; #elif defined(HAKMEM_TINY_PHASE6_METADATA) g_hakmem_lock_depth++; hak_tiny_free_metadata(ptr); g_hakmem_lock_depth--; return; #else // Default fast path g_hakmem_lock_depth++; hak_free_at(ptr, 0, HAK_CALLSITE()); g_hakmem_lock_depth--; return; #endif } // ======================================================================== // SLOW PATH: All guard checks (for uninitialized or special cases) // ======================================================================== // Recursion guard: if we're inside the allocator already, fall back to libc if (g_hakmem_lock_depth > 0) { // Nested call detected - fallback to system free extern void __libc_free(void*); __libc_free(ptr); return; } if (__builtin_expect(g_initializing != 0, 0)) { extern void __libc_free(void*); __libc_free(ptr); return; } if (__builtin_expect(hak_force_libc_alloc(), 0)) { extern void __libc_free(void*); __libc_free(ptr); return; } // In LD_PRELOAD mode, before hakmem initialization completes, always delegate { if (hak_ld_env_mode()) { if (hak_ld_block_jemalloc() && hak_jemalloc_loaded()) { extern void __libc_free(void*); __libc_free(ptr); return; } if (!g_initialized || g_initializing) { extern void __libc_free(void*); __libc_free(ptr); return; } } } // Fallback (should not reach here in normal case) g_hakmem_lock_depth++; hak_free_at(ptr, 0, HAK_CALLSITE()); g_hakmem_lock_depth--; } // calloc wrapper - intercepts system calloc() calls void* calloc(size_t nmemb, size_t size) { // Recursion guard if (g_hakmem_lock_depth > 0) { // Nested call detected - fallback to system calloc extern void* __libc_calloc(size_t, size_t); return __libc_calloc(nmemb, size); } if (__builtin_expect(g_initializing != 0, 0)) { extern void* __libc_calloc(size_t, size_t); return __libc_calloc(nmemb, size); } // Overflow check before any multiplication if (size != 0 && nmemb > (SIZE_MAX / size)) { errno = ENOMEM; return NULL; } if (__builtin_expect(hak_force_libc_alloc(), 0)) { extern void* __libc_calloc(size_t, size_t); return __libc_calloc(nmemb, size); } // Determine LD_PRELOAD mode early (before hak_init) int ld_mode = hak_ld_env_mode(); if (ld_mode) { if (hak_ld_block_jemalloc() && hak_jemalloc_loaded()) { extern void* __libc_calloc(size_t, size_t); return __libc_calloc(nmemb, size); } if (!g_initialized || g_initializing) { extern void* __libc_calloc(size_t, size_t); return __libc_calloc(nmemb, size); } const char* lds = getenv("HAKMEM_LD_SAFE"); int mode = (lds ? 
atoi(lds) : 1); size_t total = nmemb * size; // safe: overflow checked above if (mode >= 2 || total > TINY_MAX_SIZE) { extern void* __libc_calloc(size_t, size_t); return __libc_calloc(nmemb, size); } } g_hakmem_lock_depth++; size_t total_size = nmemb * size; // safe: overflow checked above void* ptr = hak_alloc_at(total_size, HAK_CALLSITE()); if (ptr) { memset(ptr, 0, total_size); // calloc zeros memory } g_hakmem_lock_depth--; return ptr; } // realloc wrapper - intercepts system realloc() calls void* realloc(void* ptr, size_t size) { // Recursion guard if (g_hakmem_lock_depth > 0) { // Nested call detected - fallback to system realloc extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); } if (__builtin_expect(g_initializing != 0, 0)) { extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); } if (__builtin_expect(hak_force_libc_alloc(), 0)) { extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); } // Determine LD_PRELOAD mode early (before hak_init) int ld_mode = hak_ld_env_mode(); if (ld_mode) { if (hak_ld_block_jemalloc() && hak_jemalloc_loaded()) { extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); } if (!g_initialized || g_initializing) { extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); } const char* lds = getenv("HAKMEM_LD_SAFE"); int mode = (lds ? atoi(lds) : 1); // Pass-through mode, or resizing beyond Tiny range → route to libc if (mode >= 2 || size > TINY_MAX_SIZE) { extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); } // Tiny-only safe mode: if the existing pointer is NOT Tiny-managed, // do not touch it — delegate to libc to avoid header mismatches. if (ptr != NULL && !hak_tiny_is_managed(ptr)) { extern void* __libc_realloc(void*, size_t); return __libc_realloc(ptr, size); } } g_hakmem_lock_depth++; void* new_ptr = NULL; if (!ptr) { // realloc(NULL, size) = malloc(size) new_ptr = hak_alloc_at(size, HAK_CALLSITE()); } else if (size == 0) { // realloc(ptr, 0) = free(ptr) hak_free_at(ptr, 0, HAK_CALLSITE()); new_ptr = NULL; } else { // Allocate new block new_ptr = hak_alloc_at(size, HAK_CALLSITE()); if (new_ptr) { // Get old size from header void* raw = (char*)ptr - HEADER_SIZE; AllocHeader* hdr = (AllocHeader*)raw; if (hdr->magic == HAKMEM_MAGIC) { size_t old_size = hdr->size - HEADER_SIZE; // User-visible size size_t copy_size = (old_size < size) ? old_size : size; memcpy(new_ptr, ptr, copy_size); } else { // Invalid header, copy what we can (best effort) memcpy(new_ptr, ptr, size); } // Free old block hak_free_at(ptr, 0, HAK_CALLSITE()); } } g_hakmem_lock_depth--; return new_ptr; } #endif // HAKMEM_FORCE_LIBC_ALLOC_BUILD
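
// ----------------------------------------------------------------------------
// Illustrative usage sketch (kept in a comment, not compiled): how a client
// could drive the explicit call-site API used throughout this file when
// linking against hakmem directly instead of relying on the LD_PRELOAD
// wrappers above. It assumes hakmem.h declares hak_init(), hak_alloc_at(),
// hak_free_at(), hak_get_kpi(), hak_print_stats(), hak_shutdown(), the
// hak_kpi_t struct and the HAK_CALLSITE() macro exactly as they are used in
// this file; the buffer size and the printed KPI fields below are examples
// only, not part of the library contract.
/*
#include <stdio.h>
#include "hakmem.h"

int main(void) {
    hak_init();                                      // one-time init (pthread_once inside)

    void* buf = hak_alloc_at(4096, HAK_CALLSITE());  // call-site tagged allocation
    if (buf) {
        // ... use buf ...
        hak_free_at(buf, 4096, HAK_CALLSITE());      // size hint; header carries the real size
    }

    hak_kpi_t kpi;
    hak_get_kpi(&kpi);                               // latency / page-fault / RSS deltas
    printf("p50 alloc: %llu ns, soft faults: %llu\n",
           (unsigned long long)kpi.p50_alloc_ns,
           (unsigned long long)kpi.soft_page_faults);

    hak_print_stats();
    hak_shutdown();
    return 0;
}
*/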