// bench_fast_box.c - BenchFast Mode Implementation
// Purpose: Ultra-minimal Tiny alloc/free for structural ceiling measurement
// WARNING: Bypasses ALL safety mechanisms - benchmark only!

#include "bench_fast_box.h"
#include "../hakmem_tiny.h"
#include "../tiny_region_id.h"
#include "../box/tiny_next_ptr_box.h"
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <stdatomic.h>

// External Tiny infrastructure (defined in hakmem_tiny.c)
// Per-thread (__thread) array with one entry per tiny class; this file reads and
// writes each entry's .head and .count fields directly.
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
// Declared here but not referenced in the visible part of this file.
extern int g_tls_sll_enable;
// Maps a request size to a class index; callers in this file range-check the
// result against [0, TINY_NUM_CLASSES) before using it.
extern int hak_tiny_size_to_class(size_t size);
// Per-class block sizes, indexed by class index (used by bench_fast_init).
extern const size_t g_tiny_class_sizes[];
// Public API fallbacks (correct signatures from hakmem.h)
// This file calls hak_alloc_at() and hak_free_at() - presumably declared by this header.
#include "../hakmem.h"

// Guard: Disable BenchFast during initialization to avoid recursion
// Phase 8-TLS-Fix: Changed from __thread to atomic_int
// Root Cause: pthread_once() creates new threads with fresh TLS (= 0),
//             breaking the guard. Atomic variable works across ALL threads.
// NOTE(review): pthread_once() does not itself spawn threads; presumably the
//               init routine ran on a different thread than the one that set the
//               old TLS guard - confirm and reword the root cause accordingly.
// Box Contract: Guard must protect entire process, not just calling thread.
// Usage in this file: bench_fast_init() stores 1 before preallocating and stores 0
// when preallocation finishes; bench_fast_alloc() loads the flag and, while it is
// non-zero, forwards the request to hak_alloc_at() instead of popping the TLS SLL.
atomic_int g_bench_fast_init_in_progress = 0;

// BenchFast alloc - pop-only fast path (never refills)
// Steps:
//   1. While bench_fast_init() is running, defer to hak_alloc_at()
//   2. Map size -> class index and range-check it
//   3. Pop the head block of this thread's TLS SLL for that class
//   4. Stamp the 1-byte header and hand back the USER pointer (header builds),
//      or hand back the block itself (no-header builds)
// Returns: pointer to the allocated block, or NULL (after logging to stderr)
//          when the size maps to no valid class or the class list is empty.
// NOTE: The pool is only ever filled by bench_fast_init() / bench_fast_free().
void* bench_fast_alloc(size_t size) {
    // Init guard is process-wide (atomic): any thread allocating during init
    // takes the normal allocator path.
    if (__builtin_expect(atomic_load(&g_bench_fast_init_in_progress), 0)) {
        return hak_alloc_at(size, "bench_fast_alloc_init");
    }

    const int cls = hak_tiny_size_to_class(size);
    if (__builtin_expect(cls < 0 || cls >= TINY_NUM_CLASSES, 0)) {
        fprintf(stderr, "[BENCH_FAST] Invalid size %zu (class %d out of range)\n",
                size, cls);
        return NULL;
    }

    // Empty list means the preallocated pool ran dry: report and fail, no refill.
    void* block = g_tls_sll[cls].head;
    if (__builtin_expect(block == NULL, 0)) {
        fprintf(stderr, "[BENCH_FAST] Pool exhausted for C%d (size=%zu)\n",
                cls, size);
        fprintf(stderr, "[BENCH_FAST] Increase PREALLOC_COUNT or reduce iteration count\n");
        return NULL;
    }

    // Unlink the head: the successor link is read from the block before the
    // header byte below overwrites its first byte.
    void* successor = tiny_next_read(cls, block);
    g_tls_sll[cls].head = successor;
    g_tls_sll[cls].count--;

#ifdef HAKMEM_TINY_HEADER_CLASSIDX
    // Phase 8-P3-Fix: the header is written here unconditionally instead of via
    // tiny_region_id_write_header(), because bench_fast_free() routes on it.
    // Header byte = 0xa0 | (low 4 bits of the class index).
    uint8_t* header = (uint8_t*)block;
    *header = (uint8_t)(0xa0 | (cls & 0x0f));
    return (void*)(header + 1);  // USER pointer = BASE + 1
#else
    return block;  // No header mode - BASE is the user pointer
#endif
}

// BenchFast free - push-only fast path
// Header builds (HAKMEM_TINY_HEADER_CLASSIDX):
//   1. Read the class index via tiny_region_id_read_header(ptr)
//   2. If it is a valid class, step back one byte to BASE and push BASE onto this
//      thread's TLS SLL for that class (no capacity check)
//   3. Otherwise hand the pointer to hak_free_at()
// No-header builds: always hand the pointer to hak_free_at().
// A NULL ptr is ignored.
void bench_fast_free(void* ptr) {
    if (__builtin_expect(ptr == NULL, 0)) {
        return;
    }

#ifdef HAKMEM_TINY_HEADER_CLASSIDX
    const int cls = tiny_region_id_read_header(ptr);
    if (__builtin_expect(cls >= 0 && cls < TINY_NUM_CLASSES, 1)) {
        // BASE sits one byte before the USER pointer (the header byte).
        void* block = (void*)((char*)ptr - 1);

        // Link the block in front of the current head, then publish it as head.
        tiny_next_write(cls, block, g_tls_sll[cls].head);
        g_tls_sll[cls].head = block;
        g_tls_sll[cls].count++;
        return;
    }
#endif

    // Reached for an invalid header, or always in no-header builds.
    hak_free_at(ptr, 0, "bench_fast_free");
}

// BenchFast init - Preallocate the per-class pools that BenchFast pops from
// Strategy:
//   1. Called BEFORE the benchmark (normal allocator is used for every block)
//   2. Allocates ACTUAL_TLS_SLL_CAPACITY (128) blocks per class for C2-C7
//   3. Header builds: pushes each block directly onto the calling thread's TLS SLL
//      No-header builds: releases each block with free()
//   4. BenchFast mode then only pops from the pre-filled pool (no refill)
// Returns: Total blocks preallocated (a partial count if an allocation fails),
//          or 0 if BenchFast mode is disabled
// Guard:   g_bench_fast_init_in_progress is 1 for the duration of the preallocation
//          and is cleared on EVERY exit path after it was set. Previously the
//          allocation-failure path returned with the guard still set, which left
//          bench_fast_alloc() permanently on its normal-allocator fallback.
int bench_fast_init(void) {
    if (!bench_fast_enabled()) {
        fprintf(stderr, "[BENCH_FAST] HAKMEM_BENCH_FAST_MODE not set, skipping init\n");
        return 0;
    }

    // Set guard to prevent recursion during initialization (atomic for cross-thread safety)
    atomic_store(&g_bench_fast_init_in_progress, 1);

    // Phase 8 Root Cause Fix: REMOVED unified_cache_init() call
    // Reason: BenchFast uses TLS SLL directly, NOT Unified Cache
    // The prewarm was a design misunderstanding - BenchFast has its own allocation strategy
    // Calling unified_cache_init() created 16KB mmap allocations that crashed when freed
    // in BenchFast mode (header misclassification bug)

    fprintf(stderr, "[BENCH_FAST] Starting preallocation...\n");

    // Layer 0 Root Cause Fix: Limit prealloc to actual TLS SLL capacity
    // Problem: Old code preallocated 50,000 blocks/class, but TLS SLL capacity is 128 (adaptive sizing)
    //          The "lost" blocks (beyond capacity) caused heap corruption
    // Analysis: sll_cap_for_class() returns "desired" capacity (2048), but adaptive sizing
    //           limits actual capacity to 128 at runtime. We must use the actual limit.
    // Solution: Hard-code to 128 blocks/class (observed actual capacity from runtime output)
    extern const size_t g_tiny_class_sizes[];

    int total = 0;
    const uint32_t ACTUAL_TLS_SLL_CAPACITY = 128;  // Observed actual capacity (adaptive sizing limit)

    // Preallocate C2-C7 (32B-1024B, skip C0/C1 - too small, rarely used)
    for (int cls = 2; cls <= 7; cls++) {
        uint32_t capacity = ACTUAL_TLS_SLL_CAPACITY;

        fprintf(stderr, "[BENCH_FAST] Preallocating C%d (%zu bytes): %u blocks (actual TLS SLL capacity)...\n",
                cls, g_tiny_class_sizes[cls], capacity);

        for (int i = 0; i < (int)capacity; i++) {
            // Use normal allocator (hak_alloc_at) - recursion safe here
            size_t size = g_tiny_class_sizes[cls];
            #ifdef HAKMEM_TINY_HEADER_CLASSIDX
            // Adjust for header: if class size is N, we need N-1 bytes of user data
            size = size - 1;
            #endif

            void* ptr = hak_alloc_at(size, "bench_fast_init");

            if (!ptr) {
                fprintf(stderr, "[BENCH_FAST] Failed to preallocate C%d at %d/%u\n",
                        cls, i, capacity);
                fprintf(stderr, "[BENCH_FAST] Total preallocated: %d blocks\n", total);
                // Leave through the common exit so the init guard is cleared;
                // the blocks pushed so far stay in the pool.
                goto out;
            }

            #ifdef HAKMEM_TINY_HEADER_CLASSIDX
            // Convert USER -> BASE pointer
            void* base = (void*)((char*)ptr - 1);

            // Read and verify class from header
            int header_cls = tiny_region_id_read_header(ptr);
            if (header_cls != cls) {
                fprintf(stderr, "[BENCH_FAST] Header mismatch: expected C%d, got C%d\n",
                        cls, header_cls);
                // Free normally and continue (block is not counted in total)
                hak_free_at(ptr, size, "bench_fast_init_mismatch");
                continue;
            }

            // Push directly to TLS SLL (bypass drain logic)
            // This ensures blocks stay in TLS pool for BenchFast mode
            tiny_next_write(cls, base, g_tls_sll[cls].head);
            g_tls_sll[cls].head = base;
            g_tls_sll[cls].count++;
            #else
            // No header mode - use normal free
            free(ptr);
            #endif

            total++;

            // Progress indicator (only for large capacities; inactive while capacity is 128)
            if (capacity >= 500 && (i + 1) % 500 == 0) {
                fprintf(stderr, "[BENCH_FAST]   C%d: %d/%u blocks...\n",
                        cls, i + 1, capacity);
            }
        }

        fprintf(stderr, "[BENCH_FAST] C%d complete: %u blocks in TLS SLL\n",
                cls, g_tls_sll[cls].count);
    }

    fprintf(stderr, "[BENCH_FAST] Prealloc complete: %d total blocks\n", total);
    fprintf(stderr, "[BENCH_FAST] TLS SLL counts:\n");
    for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
        if (g_tls_sll[cls].count > 0) {
            fprintf(stderr, "[BENCH_FAST]   C%d: %u blocks\n", cls, g_tls_sll[cls].count);
        }
    }

out:
    // Clear guard - initialization finished (fully or partially), BenchFast mode
    // can now be used by bench_fast_alloc()
    atomic_store(&g_bench_fast_init_in_progress, 0);

    return total;
}

// BenchFast stats - Report what is left in this thread's TLS SLL per class
// Prints one stderr line for every class whose count is above zero; does nothing
// when BenchFast mode is disabled. Intended to be run after the benchmark to
// check that the pool was not drained.
void bench_fast_stats(void) {
    if (!bench_fast_enabled()) {
        return;
    }

    fprintf(stderr, "[BENCH_FAST] Final TLS SLL counts:\n");

    int cls;
    for (cls = 0; cls < TINY_NUM_CLASSES; ++cls) {
        // Skip empty classes to keep the report short.
        if (!(g_tls_sll[cls].count > 0)) {
            continue;
        }
        fprintf(stderr, "[BENCH_FAST]   C%d: %u blocks remaining\n",
                cls, g_tls_sll[cls].count);
    }
}