// ============================================================================
// Performance Measurement: TLS SLL Hit Rate (ENV-gated)
// ============================================================================
// Global atomic counters for TLS SLL performance measurement
// ENV: HAKMEM_MEASURE_UNIFIED_CACHE=1 to enable (default: OFF)
#include <stdatomic.h>
#include <stdint.h>
#include <pthread.h>
#include "box/tiny_heap_env_box.h"  // TinyHeap/C7 gate for TLS SLL skips

_Atomic uint64_t g_tls_sll_push_count_global = 0;
_Atomic uint64_t g_tls_sll_pop_count_global = 0;
_Atomic uint64_t g_tls_sll_pop_empty_count_global = 0;

// Hot-path cheap sampling counter to avoid rand() in the allocation path
// Phase 9.4: TLS singly-linked freelist (mimalloc-inspired) for the hottest classes (≤128B/≤256B)
int g_tls_sll_enable = 1;  // HAKMEM_TINY_TLS_SLL=0 to disable

// Phase 6-1.7: Export TLS variables for box refactor (Box 5/6 need access from hakmem.c)
// CRITICAL FIX: Explicit initializers prevent SEGV from uninitialized TLS in worker threads
// PRIORITY 3: TLS Canaries - Add canaries around TLS arrays to detect buffer overruns
#define TLS_CANARY_MAGIC 0xDEADBEEFDEADBEEFULL

// Phase 3d-B: Unified TLS SLL (head+count in same cache line for +12-18% cache hit rate)
#include "front/tiny_heap_v2.h"

__thread uint64_t g_tls_canary_before_sll = TLS_CANARY_MAGIC;
__thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES] = {0};
__thread uint64_t g_tls_canary_after_sll = TLS_CANARY_MAGIC;
__thread const char* g_tls_sll_last_writer[TINY_NUM_CLASSES] = {0};
__thread TinyHeapV2Mag g_tiny_heap_v2_mag[TINY_NUM_CLASSES] = {0};
__thread TinyHeapV2Stats g_tiny_heap_v2_stats[TINY_NUM_CLASSES] = {0};
__thread int g_tls_heap_v2_initialized = 0;

// Phase 1: TLS SuperSlab Hint Box for Headerless mode
// Size: 112 bytes per thread (4 slots * 24 bytes + 16 bytes overhead)
#if HAKMEM_TINY_SS_TLS_HINT
#include "box/tls_ss_hint_box.h"
__thread TlsSsHintCache g_tls_ss_hint = {0};
#endif

static int g_tiny_ultra = 0;      // HAKMEM_TINY_ULTRA=1 for SLL-only ultra mode
static int g_ultra_validate = 0;  // HAKMEM_TINY_ULTRA_VALIDATE=1 to enable per-pop validation

// Ultra debug counters
#if HAKMEM_DEBUG_COUNTERS
static __attribute__((unused)) uint64_t g_ultra_pop_hits[TINY_NUM_CLASSES] = {0};
static uint64_t g_ultra_refill_calls[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_ultra_resets[TINY_NUM_CLASSES] = {0};
#endif

// Path counters (normal-mode visibility): lightweight, for debugging/bench only
#if HAKMEM_DEBUG_COUNTERS
static __attribute__((unused)) uint64_t g_path_sll_pop[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_path_mag_pop[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_path_front_pop[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_path_superslab[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_path_refill_calls[TINY_NUM_CLASSES] = {0};
// New: slow/bitmap/bump/bin instrumentation
static __attribute__((unused)) uint64_t g_alloc_slow_calls[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_superslab_refill_calls_dbg[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_bitmap_scan_calls[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_bgbin_pops[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_bump_hits[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_bump_arms[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_spec_calls[TINY_NUM_CLASSES] = {0};
static __attribute__((unused)) uint64_t g_spec_hits[TINY_NUM_CLASSES] = {0};
#endif
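
// The canary pair bracketing g_tls_sll above turns a linear overrun of the
// array into a detectable sentinel corruption. The probe below is an
// illustrative sketch only (where the project actually checks the canaries,
// e.g. on free or at thread exit, is not shown in this file); it documents
// the intended invariant:
static inline __attribute__((unused)) int tls_sll_canaries_intact_sketch(void) {
    return g_tls_canary_before_sll == TLS_CANARY_MAGIC &&
           g_tls_canary_after_sll == TLS_CANARY_MAGIC;
}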
static int g_path_debug_enabled = 0;

// Spill hysteresis (getenv() removed from the free hot path)
static int g_spill_hyst = 32;  // default margin (configured at init; never getenv on hot path)

// Optional per-class refill batch overrides (0 = use global defaults)
static int g_refill_max_c[TINY_NUM_CLASSES] = {0};
static int g_refill_max_hot_c[TINY_NUM_CLASSES] = {0};

static inline __attribute__((always_inline)) int tiny_refill_max_for_class(int class_idx) {
    int v = g_refill_max_c[class_idx];
    if (v > 0) return v;
    if (class_idx <= 3) {
        int hv = g_refill_max_hot_c[class_idx];
        if (hv > 0) return hv;
        return g_tiny_refill_max_hot;
    }
    return g_tiny_refill_max;
}

// Phase 9.5: Frontend/Backend split - Tiny Front modules (QuickSlot / FastCache)
#include "front/quick_slot.h"
#include "front/fast_cache.h"
__thread TinyFastCache g_fast_cache[TINY_NUM_CLASSES];
static int g_frontend_enable = 0;  // HAKMEM_TINY_FRONTEND=1 (experimental ultra-fast frontend)

// SLL capacity multiplier for hot tiny classes (env: HAKMEM_SLL_MULTIPLIER)
int g_sll_multiplier = 2;

// Cached thread id (uint32) to avoid repeated pthread_self() in hot paths
static __thread uint32_t g_tls_tid32;
static __thread int g_tls_tid32_inited;

// Phase 6-1.7: Export for box refactor (Box 6 needs access from hakmem.c)
#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
inline __attribute__((always_inline)) uint32_t tiny_self_u32(void) {
#else
static inline __attribute__((always_inline)) uint32_t tiny_self_u32(void) {
#endif
    if (__builtin_expect(!g_tls_tid32_inited, 0)) {
        g_tls_tid32 = (uint32_t)(uintptr_t)pthread_self();
        g_tls_tid32_inited = 1;
    }
    return g_tls_tid32;
}

// Cached pthread_t as-is for APIs that require pthread_t comparison
static __thread pthread_t g_tls_pt_self;
static __thread int g_tls_pt_inited;

// Frontend FastCache hit/miss counters (Small diagnostics)
unsigned long long g_front_fc_hit[TINY_NUM_CLASSES] = {0};
unsigned long long g_front_fc_miss[TINY_NUM_CLASSES] = {0};
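
// Sketch (not an exported API): the g_front_fc_hit/g_front_fc_miss counters
// above reduce to a per-class hit rate as hits / (hits + misses). Assuming
// they are only read off the hot path, a diagnostics helper could look like:
static inline __attribute__((unused)) double front_fc_hit_rate_sketch(int class_idx) {
    unsigned long long h = g_front_fc_hit[class_idx];
    unsigned long long m = g_front_fc_miss[class_idx];
    return (h + m) ? (double)h / (double)(h + m) : 0.0;
}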

// TLS SLL class mask: bit i = 1 allows SLL for class i. Default: all 8 classes enabled.
int g_tls_sll_class_mask = 0xFF;

// Phase 6-1.7: Export for box refactor (Box 6 needs access from hakmem.c)
static inline __attribute__((always_inline)) pthread_t tiny_self_pt(void) {
    if (__builtin_expect(!g_tls_pt_inited, 0)) {
        g_tls_pt_self = pthread_self();
        g_tls_pt_inited = 1;
    }
    return g_tls_pt_self;
}

#include "tiny_refill.h"
// tiny_mmap_gate.h already included at top
#include "tiny_publish.h"

// Optional prefetch on SLL pop (guarded by env: HAKMEM_TINY_PREFETCH=1)
static int g_tiny_prefetch = 0;

// Small-class magazine pre-initialization (to avoid cap==0 checks on hot path)
// Hot-class small TLS magazine (storage and switches)
typedef struct {
    void* slots[128];
    uint16_t top;  // 0..128
    uint16_t cap;  // =128
} TinyHotMag;

static int g_hotmag_cap_default = 128;    // default capacity (fixed)
static int g_hotmag_refill_default = 32;  // default refill batch (fixed)
static int g_hotmag_enable = 0;           // OFF by default (ENV toggle removed)
static uint16_t g_hotmag_cap_current[TINY_NUM_CLASSES];
static uint8_t g_hotmag_cap_locked[TINY_NUM_CLASSES];
static uint16_t g_hotmag_refill_current[TINY_NUM_CLASSES];
static uint8_t g_hotmag_refill_locked[TINY_NUM_CLASSES];
static uint8_t g_hotmag_class_en[TINY_NUM_CLASSES];  // 0=disabled for class, 1=enabled
static __thread TinyHotMag g_tls_hot_mag[TINY_NUM_CLASSES];

// Inline helpers
#include "box/tls_sll_box.h"  // Box TLS-SLL: Safe SLL operations API (needed by hotmag)
#include "hakmem_tiny_hotmag.inc.h"

// Diagnostics: invalid TLS SLL pointers detected (range check failures)
_Atomic uint64_t g_tls_sll_invalid_head[TINY_NUM_CLASSES] = {0};
_Atomic uint64_t g_tls_sll_invalid_push[TINY_NUM_CLASSES] = {0};
_Atomic uint64_t g_tls_sll_pop_counter[TINY_NUM_CLASSES] = {0};

// Size-specialized tiny alloc (32B/64B) via function pointers (for A/B testing)
// TinyQuickSlot: 1 cache line per class (quick 6 items + small metadata)
// Opt-in via HAKMEM_TINY_QUICK=1
// NOTE: This type definition must come BEFORE the Phase 2D-1 includes below
int g_quick_enable = 0;  // HAKMEM_TINY_QUICK=1
__thread TinyQuickSlot g_tls_quick[TINY_NUM_CLASSES];  // compiled out via guards below

// Phase 2D-1: Hot-path inline function extractions (Front)
// NOTE: TinyFastCache/TinyQuickSlot are already defined in front/
#include "hakmem_tiny_hot_pop.inc.h"  // 4 functions: tiny_hot_pop_class{0..3}
#include "hakmem_tiny_refill.inc.h"   // 8 functions: refill operations
#if HAKMEM_TINY_P0_BATCH_REFILL
#include "hakmem_tiny_refill_p0.inc.h"  // P0 batch refill: refills FastCache directly
#endif

// Phase 7 Task 3: Pre-warm TLS cache at init
// Pre-allocate blocks to reduce the first-allocation miss penalty
#if HAKMEM_TINY_PREWARM_TLS
void hak_tiny_prewarm_tls_cache(void) {
    // Pre-warm each class with HAKMEM_TINY_PREWARM_COUNT blocks
    // This reduces the first-allocation miss penalty by populating the TLS cache
    // Phase E1-CORRECT: ALL classes (including C7) now use TLS SLL
    for (int class_idx = 0; class_idx < TINY_NUM_CLASSES; class_idx++) {
        // When the TinyHeap front is ON, do not stack routed classes onto the
        // TLS SLL; leave them to TinyHeapBox.
        if (tiny_heap_class_route_enabled(class_idx)) {
            continue;
        }
        int count = HAKMEM_TINY_PREWARM_COUNT;  // Default: 16 blocks per class
        // Trigger refill to populate the TLS cache
        // P0 Fix: Use the appropriate refill function based on P0 status
#if HAKMEM_TINY_P0_BATCH_REFILL
        sll_refill_batch_from_ss(class_idx, count);
#else
        sll_refill_small_from_ss(class_idx, count);
#endif
    }
}
#endif

// Ultra-Simple front - REMOVED (dead code cleanup 2025-11-27)

// HotMag helpers (for classes 0..3)
static inline int is_hot_class(int class_idx) {
    return class_idx >= 0 && class_idx <= 3;
}
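
// Illustrative sketch of the LIFO contract a TinyHotMag obeys (the real
// push/pop come from hakmem_tiny_hotmag.inc.h above; this copy exists only
// to document the top/cap discipline and is not called anywhere):
static inline __attribute__((unused)) int hotmag_push_sketch(TinyHotMag* mag, void* ptr) {
    if (mag->top >= mag->cap) return 0;  // full: caller falls back to TLS SLL / backend
    mag->slots[mag->top++] = ptr;
    return 1;
}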

// Optional front (HotMag) push helper: compiled out in release builds
static inline int tiny_optional_push(int class_idx, void* ptr) {
#if HAKMEM_BUILD_RELEASE
    (void)class_idx;
    (void)ptr;
    return 0;
#else
    if (__builtin_expect(is_hot_class(class_idx), 0)) {
        if (__builtin_expect(hotmag_push(class_idx, ptr), 0)) {
            return 1;
        }
    }
    return 0;
#endif
}

// Phase 9.6: Deferred Intelligence (event queue + background)
// Extended event for FLINT Intelligence (lightweight; recorded off hot path only)
// Observability, ACE, and intelligence helpers
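
// Sketch only: the ENV-gated counters declared at the top of this file
// (g_tls_sll_*_count_global) support a hit-rate report of the form
// pops / (pops + empty pops). A reporting helper is assumed to look roughly
// like this; it is not part of the allocator's public surface:
static __attribute__((unused)) double tls_sll_global_hit_rate_sketch(void) {
    uint64_t pops  = atomic_load_explicit(&g_tls_sll_pop_count_global, memory_order_relaxed);
    uint64_t empty = atomic_load_explicit(&g_tls_sll_pop_empty_count_global, memory_order_relaxed);
    return (pops + empty) ? (double)pops / (double)(pops + empty) : 0.0;
}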