// tls_ss_hint_box.h - TLS SuperSlab Hint Cache for Headerless Mode
//
// BOX THEORY:
// -----------
// Mission: Cache recently-used SuperSlab references in TLS to accelerate
//          ptr→SuperSlab resolution in Headerless mode, avoiding expensive
//          hash table lookups on the critical free() path.
//
// Design: Provides O(1) lookup for hot SuperSlabs (L1 cache hit, 2-5 cycles)
//         Falls back to global registry on miss (fail-safe, no data loss)
//         No ownership, no remote queues, pure read-only cache
//         FIFO eviction policy with configurable cache size (4 slots)
//
// Invariants:
//         - hint.base <= ptr < hint.end implies hint.ss is valid
//         - Miss is always safe (triggers fallback to hak_super_lookup)
//         - TLS data survives only within thread lifetime
//         - Cache entries are invalidated implicitly by FIFO rotation
//         - Magic number check (SUPERSLAB_MAGIC) is the caller's job after a hit; this box never reads the SuperSlab
//
// Boundary:
//         - Input: raw user pointer (void* ptr) from free() path
//         - Output: SuperSlab* or NULL (miss triggers fallback)
//         - Does NOT determine class_idx (that's slab_index_for's job)
//         - Does NOT perform ownership validation (that's SuperSlab's job)
//
// Performance:
//         - Cache hit: 2-5 cycles (L1 cache hit, at most 2 pointer comparisons per slot)
//         - Cache miss: fallback to hak_super_lookup (10-50 cycles)
//         - Expected hit rate: 85-95% for single-threaded workloads
//         - Expected hit rate: 70-85% for multi-threaded workloads
//
// Thread Safety:
//         - TLS storage: no sharing, no synchronization required
//         - Read-only cache: never modifies SuperSlab state
//         - Stale entries: must be caught by the caller's magic number check (not performed here)

#ifndef TLS_SS_HINT_BOX_H
#define TLS_SS_HINT_BOX_H

#include <stdint.h>
#include <stdbool.h>
#include <stddef.h>
#include "hakmem_build_flags.h"

// Forward declaration
struct SuperSlab;

// Cache entry for a single SuperSlab hint.
// Describes the half-open address range [base, end) together with the
// SuperSlab pointer that tls_ss_hint_lookup() hands back for any pointer
// falling inside that range.
// Size: three pointers, i.e. 24 bytes on targets with 8-byte pointers.
typedef struct {
    void* base;              // SuperSlab base address (aligned to 1MB or 2MB)
    void* end;               // base + superslab_size, exclusive upper bound of the range check
    struct SuperSlab* ss;    // Cached SuperSlab pointer (stored and returned, never dereferenced in this header)
} TlsSsHintEntry;

// TLS hint cache configuration
// - 4 slots provide good hit rate without excessive overhead
// - Larger caches (8, 16) show diminishing returns in benchmarks
// - Smaller caches (2) may thrash on workloads with 3+ active SuperSlabs
// Used as the array length of TlsSsHintCache.entries, as the upper bound of
// TlsSsHintCache.count, and as the modulus for the FIFO cursor in
// tls_ss_hint_update(); lookup cost grows linearly with this value.
#define TLS_SS_HINT_SLOTS 4

// Thread-local SuperSlab hint cache
// Size with 8-byte pointers: 4 * 24 (entries) + 4 + 4 (count, next_slot)
// = 104 bytes per thread in release builds; the two 64-bit statistics
// counters add 16 bytes, giving 120 bytes in non-release builds.
typedef struct {
    TlsSsHintEntry entries[TLS_SS_HINT_SLOTS];  // Cache entries
    uint32_t count;          // Number of populated slots scanned by lookup (0 to TLS_SS_HINT_SLOTS)
    uint32_t next_slot;      // Next slot for FIFO rotation (wraps at TLS_SS_HINT_SLOTS)

    // Statistics (optional, for profiling builds)
    // Disabled in HAKMEM_BUILD_RELEASE to save 16 bytes per thread
    #if !HAKMEM_BUILD_RELEASE
    uint64_t hits;           // Cache hit count (incremented by tls_ss_hint_lookup on a hit)
    uint64_t misses;         // Cache miss count (incremented by tls_ss_hint_lookup on a miss)
    #endif
} TlsSsHintCache;

// Thread-local storage instance (one TlsSsHintCache per thread).
// This is only a declaration; the definition is not in this header and is
// expected to live in a single .c file — TODO confirm where.
// Initialized to zero by TLS semantics (assuming the definition carries no
// initializer), formal init in tls_ss_hint_init()
extern __thread TlsSsHintCache g_tls_ss_hint;

// ============================================================================
// API FUNCTIONS
// ============================================================================

/**
 * @brief Put the calling thread's hint cache into the empty state
 *
 * Nulls all TLS_SS_HINT_SLOTS entries, then zeroes the entry count, the
 * FIFO cursor and (in non-release builds) the hit/miss counters. Every
 * store writes a constant, so repeated calls leave the same state behind.
 *
 * Meant to be called once per thread from the thread-local setup path; it
 * does not depend on the instance already being zero-filled.
 *
 * Thread Safety: only the calling thread's g_tls_ss_hint is written
 */
static inline void tls_ss_hint_init(void) {
    // Wipe each slot; the three stores per slot are independent of each other
    int slot = 0;
    while (slot < TLS_SS_HINT_SLOTS) {
        g_tls_ss_hint.entries[slot].ss = NULL;
        g_tls_ss_hint.entries[slot].end = NULL;
        g_tls_ss_hint.entries[slot].base = NULL;
        slot++;
    }

    // Profiling counters exist only in non-release builds
    #if !HAKMEM_BUILD_RELEASE
    g_tls_ss_hint.misses = 0;
    g_tls_ss_hint.hits = 0;
    #endif

    // Empty cache: nothing to scan, next insertion goes to slot 0
    g_tls_ss_hint.next_slot = 0;
    g_tls_ss_hint.count = 0;
}

/**
 * @brief Update hint cache with a SuperSlab reference
 *
 * Called on paths where we know the SuperSlab for a given address range:
 * - After successful tiny_alloc (cache the allocated-from SuperSlab)
 * - After superslab refill (cache the newly bound SuperSlab)
 * - After unified cache refill (cache the refilled SuperSlab)
 *
 * Conflict handling: an existing entry conflicts with the new hint when it
 * holds the same SuperSlab pointer OR its [base, end) range overlaps the new
 * range. The first conflicting entry is overwritten in place with the new
 * (base, end, ss) triple, so repeated updates for the same SuperSlab do not
 * consume additional slots and a re-registered SuperSlab never keeps an
 * outdated range. Any further conflicting entries are emptied (NULL range),
 * because lookup returns the first matching slot and a stale overlapping
 * entry could otherwise shadow the fresh one.
 *
 * If no entry conflicts, the hint is written to the FIFO cursor slot, the
 * cursor advances (wrapping at TLS_SS_HINT_SLOTS) and count grows until the
 * cache is full.
 *
 * The call is a no-op when ss or base is NULL, when size is 0, or when
 * base + size wraps around the address space.
 *
 * @param ss    SuperSlab to cache (must be non-NULL, SUPERSLAB_MAGIC validated by caller)
 * @param base  SuperSlab base address (1MB or 2MB aligned)
 * @param size  SuperSlab size in bytes (1MB or 2MB)
 *
 * Thread Safety: only the calling thread's g_tls_ss_hint is written
 */
static inline void tls_ss_hint_update(struct SuperSlab* ss, void* base, size_t size) {
    // Sanity check: reject invalid inputs
    if (__builtin_expect(!ss || !base || size == 0, 0)) {
        return;
    }

    // Compare ranges as integer addresses: relational operators on pointers
    // into unrelated objects are undefined in C.
    const uintptr_t lo = (uintptr_t)base;
    const uintptr_t hi = lo + size;
    if (__builtin_expect(hi < lo, 0)) {
        return;  // base + size wrapped: not a usable half-open range
    }

    // Single pass over the populated slots: refresh the first conflicting
    // entry, empty any others.
    bool refreshed = false;
    for (uint32_t i = 0; i < g_tls_ss_hint.count; i++) {
        const uintptr_t cur_lo = (uintptr_t)g_tls_ss_hint.entries[i].base;
        const uintptr_t cur_hi = (uintptr_t)g_tls_ss_hint.entries[i].end;
        const bool same_ss = (g_tls_ss_hint.entries[i].ss == ss);
        // Half-open ranges overlap iff each one starts before the other ends.
        // An emptied slot has cur_hi == 0 and therefore never overlaps.
        const bool overlaps = (lo < cur_hi) && (cur_lo < hi);

        if (!same_ss && !overlaps) {
            continue;
        }

        if (!refreshed) {
            g_tls_ss_hint.entries[i].base = base;
            g_tls_ss_hint.entries[i].end = (char*)base + size;
            g_tls_ss_hint.entries[i].ss = ss;
            refreshed = true;
        } else {
            // NULL..NULL is an empty range, so lookup can never hit it;
            // the slot is reclaimed when the FIFO cursor reaches it.
            g_tls_ss_hint.entries[i].base = NULL;
            g_tls_ss_hint.entries[i].end = NULL;
            g_tls_ss_hint.entries[i].ss = NULL;
        }
    }
    if (refreshed) {
        return;  // Hint already has a slot; FIFO state is left untouched
    }

    // Add to next slot (FIFO rotation)
    uint32_t slot = g_tls_ss_hint.next_slot;
    g_tls_ss_hint.entries[slot].base = base;
    g_tls_ss_hint.entries[slot].end = (char*)base + size;
    g_tls_ss_hint.entries[slot].ss = ss;

    // Advance to next slot (wrap at TLS_SS_HINT_SLOTS)
    g_tls_ss_hint.next_slot = (slot + 1) % TLS_SS_HINT_SLOTS;

    // Increment count until cache is full
    if (g_tls_ss_hint.count < TLS_SS_HINT_SLOTS) {
        g_tls_ss_hint.count++;
    }
}

/**
 * @brief Lookup SuperSlab for given pointer (fast path)
 *
 * Called on free() entry, before falling back to hak_super_lookup().
 * Scans the populated slots in index order (at most TLS_SS_HINT_SLOTS
 * iterations) and reports the first entry whose half-open range
 * [base, end) contains ptr.
 *
 * Cache hit: Returns true, sets *out_ss to cached SuperSlab pointer
 * Cache miss: Returns false and leaves *out_ss untouched; caller must use
 *             hak_super_lookup()
 *
 * In non-release builds the per-thread hit or miss counter is incremented.
 *
 * @param ptr     User pointer to lookup (arbitrary alignment)
 * @param out_ss  Output: SuperSlab pointer if found (only valid if return true).
 *                Must be non-NULL; it is written without a check on a hit.
 * @return true if cache hit (out_ss is valid), false if miss
 *
 * Thread Safety: only the calling thread's g_tls_ss_hint is accessed
 *
 * NOTE: Caller MUST validate SUPERSLAB_MAGIC after successful lookup.
 *       This Box does not perform magic validation to keep fast path minimal.
 */
static inline bool tls_ss_hint_lookup(void* ptr, struct SuperSlab** out_ss) {
    // ptr is an arbitrary user pointer with no relation to the cached ranges.
    // Relational comparison of pointers into different objects is undefined
    // in C, so the range check is done on integer addresses instead.
    const uintptr_t addr = (uintptr_t)ptr;

    for (uint32_t i = 0; i < g_tls_ss_hint.count; i++) {
        const TlsSsHintEntry* e = &g_tls_ss_hint.entries[i];
        const uintptr_t lo = (uintptr_t)e->base;
        const uintptr_t hi = (uintptr_t)e->end;

        // Range check: lo <= addr < hi (end is exclusive, hence < not <=)
        if (addr >= lo && addr < hi) {
            // Cache hit!
            *out_ss = e->ss;

            #if !HAKMEM_BUILD_RELEASE
            g_tls_ss_hint.hits++;
            #endif

            return true;
        }
    }

    // Cache miss: caller must fall back to hak_super_lookup()
    #if !HAKMEM_BUILD_RELEASE
    g_tls_ss_hint.misses++;
    #endif

    return false;
}

/**
 * @brief Drop every cached hint for the calling thread
 *
 * Nulls all TLS_SS_HINT_SLOTS entries and resets the entry count and the
 * FIFO cursor to 0. The hit/miss counters of non-release builds are NOT
 * touched, so statistics keep accumulating across clears.
 *
 * Use cases:
 * - Unit tests: start each test case with an empty cache
 * - Debug: force a cold cache before profiling
 * - Thread teardown: optional cleanup
 *
 * Thread Safety: only the calling thread's g_tls_ss_hint is written
 */
static inline void tls_ss_hint_clear(void) {
    // Nulling the slots is not needed for lookup correctness (count == 0
    // already hides them) but leaves no old pointers behind.
    int slot = 0;
    while (slot < TLS_SS_HINT_SLOTS) {
        g_tls_ss_hint.entries[slot].ss = NULL;
        g_tls_ss_hint.entries[slot].end = NULL;
        g_tls_ss_hint.entries[slot].base = NULL;
        slot++;
    }

    // Empty cache: nothing to scan, next insertion goes to slot 0.
    // hits/misses are intentionally left as they are.
    g_tls_ss_hint.next_slot = 0;
    g_tls_ss_hint.count = 0;
}

/**
 * @brief Read the calling thread's hit/miss counters (profiling builds)
 *
 * Copies the current counter values into the supplied locations. Either
 * argument may be NULL, in which case that counter is simply not reported.
 * The function is compiled only when HAKMEM_BUILD_RELEASE is 0.
 *
 * @param hits    Output: Total cache hits (may be NULL)
 * @param misses  Output: Total cache misses (may be NULL)
 *
 * Thread Safety: only the calling thread's g_tls_ss_hint is read
 */
#if !HAKMEM_BUILD_RELEASE
static inline void tls_ss_hint_stats(uint64_t* hits, uint64_t* misses) {
    if (hits != NULL) {
        *hits = g_tls_ss_hint.hits;
    }

    if (misses != NULL) {
        *misses = g_tls_ss_hint.misses;
    }
}
#endif

#endif // TLS_SS_HINT_BOX_H