// hakmem_debug.c - Debug Timing Implementation
//
// License: MIT
// Date: 2025-10-21

#include "hakmem_debug.h"
#include "hakmem_env_cache.h"  // Priority-2: ENV cache
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

#if HAKMEM_DEBUG_TIMING

// ============================================================================
// Global State
// ============================================================================

static int g_timing_enabled = 0;  // Runtime enable flag (set from HAKMEM_TIMING env)
static int g_initialized = 0;

// Thread-local statistics (lock-free, per-thread accumulation)
static _Thread_local HkmTimingStat g_tls_stats[HKM_CAT_MAX];

// Global statistics (accumulated from all threads at shutdown)
static HkmTimingStat g_global_stats[HKM_CAT_MAX];

// Category names for pretty printing (Phase 6.11.3: Updated for profiling)
static const char* g_category_names[HKM_CAT_MAX] = {
    // Syscalls and fallbacks
    "syscall_mmap",
    "syscall_munmap",
    "syscall_madvise",
    "fallback_malloc",

    // Whale Fast-Path
    "whale_get",
    "whale_put",

    // L2.5 Pool
    "l25_get",
    "l25_put",
    "l25_refill",

    // Tiny Pool
    "tiny_alloc",
    "tiny_free",
    "tiny_slab_search",

    // BigCache
    "bigcache_get",
    "bigcache_put",
    "bigcache_evict_scan",

    // Site Rules
    "site_rules_lookup",
    "site_rules_adopt",

    // ELO Learning
    "elo_select",
    "elo_update",

    // Top-level API
    "hak_alloc",
    "hak_free",

    // Legacy
    "region_get",
    "region_put",
    "hash_fnv1a",

    // Mid (L2 Pool) fine-grained
    "pool_get",
    "pool_lock",
    "pool_refill",
    "tc_drain",
    "tls_fast",
    "shard_steal",
    // Additional Mid (L2) fine-grained (NEW)
    "pool_tls_ring_pop",
    "pool_tls_lifo_pop",
    "pool_remote_push",
    "pool_alloc_tls_page",

    // L2.5 (LargePool) fine-grained (NEW)
    "l25_lock",
    "l25_tls_ring_pop",
    "l25_tls_lifo_pop",
    "l25_tc_drain",
    "l25_remote_push",
    "l25_alloc_tls_page",
    "l25_shard_steal",
};

// ============================================================================
// Internal Helpers
// ============================================================================

static void dump_stats_internal(void) {
    fprintf(stderr, "\n========================================\n");
    fprintf(stderr, "HAKMEM Debug Timing Statistics\n");
    fprintf(stderr, "========================================\n");

    uint64_t total_cycles = 0;
    for (int i = 0; i < HKM_CAT_MAX; i++) {
        total_cycles += g_global_stats[i].cycles;
    }

    for (int i = 0; i < HKM_CAT_MAX; i++) {
        HkmTimingStat* stat = &g_global_stats[i];
        if (stat->count == 0) continue;

        double avg_cycles = (double)stat->cycles / stat->count;
        double pct = total_cycles > 0 ? (double)stat->cycles / total_cycles * 100.0 : 0.0;

        fprintf(stderr, "%-24s: count=%8lu avg=%8.0f cycles total=%12lu cycles (%.1f%%)\n",
                g_category_names[i],
                (unsigned long)stat->count,
                avg_cycles,
                (unsigned long)stat->cycles,
                pct);
    }

    fprintf(stderr, "========================================\n");
    fprintf(stderr, "Total cycles: %lu\n", (unsigned long)total_cycles);
    fprintf(stderr, "========================================\n");
}

static void accumulate_tls_to_global(void) {
    // Accumulate thread-local stats to global (called at shutdown)
    for (int i = 0; i < HKM_CAT_MAX; i++) {
        g_global_stats[i].count += g_tls_stats[i].count;
        g_global_stats[i].cycles += g_tls_stats[i].cycles;
    }
}

static void atexit_handler(void) {
    if (!g_initialized) return;

    // Accumulate TLS to global
    accumulate_tls_to_global();

    // Dump stats
    dump_stats_internal();
}

// ============================================================================
// Public API Implementation
// ============================================================================

void hkm_timing_init(void) {
    if (g_initialized) return;

#if !HAKMEM_BUILD_RELEASE
    // Priority-2: Use cached ENV (gated behind !HAKMEM_BUILD_RELEASE)
    g_timing_enabled = HAK_ENV_TIMING_ENABLED();
#else
    g_timing_enabled = 0;  // Always disabled in release builds
#endif

    // Initialize global stats
    memset(g_global_stats, 0, sizeof(g_global_stats));

    // Initialize TLS stats
    memset(g_tls_stats, 0, sizeof(g_tls_stats));

    // Register atexit handler
    atexit(atexit_handler);

    g_initialized = 1;

    if (g_timing_enabled) {
        fprintf(stderr, "[HAKMEM Timing] Enabled (HAKMEM_TIMING=1)\n");
    }
}

void hkm_timing_shutdown(void) {
    if (!g_initialized) return;

    // Accumulate TLS to global
    accumulate_tls_to_global();

    // Dump stats
    dump_stats_internal();

    g_initialized = 0;
}

void hkm_count_inc(HkmTimingCategory cat) {
    if (!g_timing_enabled) return;
    if (cat >= HKM_CAT_MAX) return;

    g_tls_stats[cat].count++;
}

void hkm_cycles_add(HkmTimingCategory cat, uint64_t cycles) {
    if (!g_timing_enabled) return;
    if (cat >= HKM_CAT_MAX) return;

    g_tls_stats[cat].cycles += cycles;
}

void hkm_timing_dump(void) {
    if (!g_initialized) return;

    // Accumulate TLS to global
    accumulate_tls_to_global();

    // Dump stats
    dump_stats_internal();
}

#else

// Build-time disabled: empty implementations

void hkm_timing_init(void) {}
void hkm_timing_shutdown(void) {}
void hkm_count_inc(HkmTimingCategory cat) { (void)cat; }
void hkm_cycles_add(HkmTimingCategory cat, uint64_t cycles) { (void)cat; (void)cycles; }
void hkm_timing_dump(void) {}

#endif