// hakmem_bigcache.c - Big-Block Cache Implementation
// Purpose: Per-site ring cache for large allocations
//
// License: MIT
// Date: 2025-10-21

#include "hakmem_bigcache.h"
#include "hakmem_internal.h"  // Phase 6.15 P0.1: For HAKMEM_LOG macro
#include <stdlib.h>
#include <string.h>
#include <stdio.h>
#include <pthread.h>

// ============================================================================
// Data Structures (Box理論: 箱の内部構造)
// ============================================================================

// One cell of the cache table: at most one cached block.
// The aligned(64) attribute forces 64-byte alignment, so sizeof(BigCacheSlot)
// is rounded up to a multiple of 64 bytes.
typedef struct __attribute__((aligned(64))) {
    void*     ptr;           // Cached pointer (user pointer, not raw)
    size_t    actual_bytes;  // Actual allocated size; checked against the requested size on lookup and passed to the free callback on eviction
    size_t    class_bytes;   // Nominal size of the block's size class (see class_index_to_bytes); written on put, not read in the code visible in this file
    uintptr_t site;          // Allocation site
    int       valid;         // 1 if slot is valid
    uint16_t  freq;          // Phase 6.11 P0-BigCache-2: LFU frequency counter (0-65535)
} BigCacheSlot;

// Phase 6.4 P2: O(1) direct table indexed as [site][class].
// Memory footprint: BIGCACHE_MAX_SITES x BIGCACHE_NUM_CLASSES x sizeof(BigCacheSlot);
// each slot is padded to 64 bytes by the aligned(64) attribute on BigCacheSlot.
static BigCacheSlot g_cache[BIGCACHE_MAX_SITES][BIGCACHE_NUM_CLASSES];
// One mutex per site row; the get/put paths lock g_cache_locks[site_idx] while
// they work on g_cache[site_idx][*].
static pthread_mutex_t g_cache_locks[BIGCACHE_MAX_SITES];

// Statistics (for debugging/paper)
// Plain (non-atomic) counters shared by all sites; zeroed by hak_bigcache_init().
static struct {
    uint64_t hits;       // Lookups in hak_bigcache_try_get() that returned a cached block
    uint64_t misses;     // Cacheable-size lookups that found no usable slot
    uint64_t puts;       // Blocks stored by hak_bigcache_put()
    uint64_t evictions;  // Valid slots released through evict_slot()
    uint64_t rejects;    // hak_bigcache_put() calls that were refused (block not stored)
} g_stats;

// Set to 1 by hak_bigcache_init() and back to 0 by hak_bigcache_shutdown().
// hak_bigcache_try_get() and hak_bigcache_put() call hak_bigcache_init()
// lazily while this is 0.
static int g_initialized = 0;

// Phase 6.11 P0-BigCache-2: LFU Hybrid - Decay tracking
// g_put_count is incremented on every stored block and, unlike g_stats,
// is not reset by hak_bigcache_init().
static uint64_t g_put_count = 0;       // Total puts (for decay trigger)
#define LFU_DECAY_INTERVAL 1024        // Decay every 1024 puts (prevents overflow + adapts to workload changes)

// ============================================================================
// Helper Functions (Box内部実装)
// ============================================================================

// Phase 6.11 P0-BigCache-3: map an allocation-site key to a row of the table.
//
// Runs 32-bit FNV-1a (XOR the byte in, then multiply by the FNV prime) over
// the bytes of `site` in memory order and reduces the digest modulo
// BIGCACHE_MAX_SITES.
//
// Returns: row index in [0, BIGCACHE_MAX_SITES).
static inline int hash_site(uintptr_t site) {
    const uint32_t fnv_offset_basis = 2166136261u;
    const uint32_t fnv_prime = 16777619u;

    const uint8_t* cursor = (const uint8_t*)&site;
    const uint8_t* const end = cursor + sizeof site;

    uint32_t digest = fnv_offset_basis;
    while (cursor != end) {
        digest ^= *cursor++;
        digest *= fnv_prime;
    }

    return (int)(digest % BIGCACHE_MAX_SITES);
}

// Size filter for the cache.
// Returns 1 when `size` is at least BIGCACHE_MIN_SIZE, 0 otherwise.
static inline int is_cacheable(size_t size) {
    if (size < BIGCACHE_MIN_SIZE) {
        return 0;
    }
    return 1;
}

// Phase 6.11: finer-grained size-class selection (8 classes).
//
// Classes: 512KB, 1MB, 2MB, 3MB, 4MB, 6MB, 8MB, 16MB.
// A size belongs to the first class whose exclusive upper bound it is below;
// anything at or above BIGCACHE_CLASS_16MB lands in the last class, and
// anything below BIGCACHE_CLASS_1MB lands in class 0.
//
// Returns: class index 0-7, used as the column of the lookup table.
static inline int get_class_index(size_t size) {
    // Exclusive upper bound of classes 0..6; class 7 has no upper bound.
    static const size_t upper_bounds[] = {
        BIGCACHE_CLASS_1MB,   // class 0: up to 1MB
        BIGCACHE_CLASS_2MB,   // class 1: 1MB-2MB
        BIGCACHE_CLASS_3MB,   // class 2: 2MB-3MB
        BIGCACHE_CLASS_4MB,   // class 3: 3MB-4MB
        BIGCACHE_CLASS_6MB,   // class 4: 4MB-6MB
        BIGCACHE_CLASS_8MB,   // class 5: 6MB-8MB
        BIGCACHE_CLASS_16MB   // class 6: 8MB-16MB
    };
    const int bounded_classes = (int)(sizeof upper_bounds / sizeof upper_bounds[0]);

    int idx = 0;
    while (idx < bounded_classes && !(size < upper_bounds[idx])) {
        idx++;
    }
    return idx;  // == bounded_classes (7) for 16MB and larger
}

// Translate a class index (as produced by get_class_index) into the nominal
// byte size of that class.
//
// class_idx is used directly as an array index and is not range-checked here.
static inline size_t class_index_to_bytes(int class_idx) {
    static const size_t bytes_by_class[BIGCACHE_NUM_CLASSES] = {
        [0] = BIGCACHE_CLASS_512KB,  // Phase 6.11: 512KB-1MB class
        [1] = BIGCACHE_CLASS_1MB,
        [2] = BIGCACHE_CLASS_2MB,
        [3] = BIGCACHE_CLASS_3MB,    // Phase 6.11: e.g. 2.1MB maps to 3MB instead of 4MB
        [4] = BIGCACHE_CLASS_4MB,
        [5] = BIGCACHE_CLASS_6MB,    // Phase 6.11
        [6] = BIGCACHE_CLASS_8MB,
        [7] = BIGCACHE_CLASS_16MB    // Phase 6.11: very large allocations
    };

    const size_t nominal = bytes_by_class[class_idx];
    return nominal;
}

// Callback used to actually free a block when its slot is evicted.
// Installed through hak_bigcache_set_free_callback() (by hakmem.c, per the
// original note). While it is NULL, evict_slot() only marks the slot empty
// and does not free the block.
static void (*g_free_callback)(void* ptr, size_t size) = NULL;

// Release an occupied slot.
//
// If a free callback is registered, the cached block is handed to it together
// with its actual allocated size (not the class size). The slot is then marked
// empty, its LFU counter is cleared, and the eviction counter is bumped.
// A slot that is already empty is left untouched and is not counted.
static inline void evict_slot(BigCacheSlot* slot) {
    if (slot->valid) {
        void (*const release)(void*, size_t) = g_free_callback;
        if (release != NULL) {
            release(slot->ptr, slot->actual_bytes);
        }

        slot->freq = 0;   // Phase 6.11: frequency restarts after eviction
        slot->valid = 0;
        g_stats.evictions++;
    }
}

// Phase 6.11 P0-BigCache-2: LFU Hybrid - age every frequency counter.
//
// Halves the counter of each valid slot in the whole table so that old
// popularity fades and counters stay away from saturation.
// This function takes no lock itself; it walks the rows of every site.
static inline void decay_frequencies(void) {
    for (int s = 0; s < BIGCACHE_MAX_SITES; ++s) {
        BigCacheSlot* row = g_cache[s];
        for (int c = 0; c < BIGCACHE_NUM_CLASSES; ++c) {
            if (!row[c].valid) {
                continue;
            }
            row[c].freq >>= 1;  // halve
        }
    }
}

// ============================================================================
// Public API (Box Interface)
// ============================================================================

void hak_bigcache_init(void) {
    if (g_initialized) return;

    memset(g_cache, 0, sizeof(g_cache));
    memset(&g_stats, 0, sizeof(g_stats));

    for (int i = 0; i < BIGCACHE_MAX_SITES; i++) {
        pthread_mutex_init(&g_cache_locks[i], NULL);
    }

    g_initialized = 1;

    HAKMEM_LOG("[BigCache] Initialized (P2: O(1) direct table, sites=%d, classes=%d)\n",
           BIGCACHE_MAX_SITES, BIGCACHE_NUM_CLASSES);
    HAKMEM_LOG("[BigCache] Size classes: 1MB, 2MB, 4MB, 8MB (P3: branchless)\n");
}

// Tear the cache down: evict every occupied slot (all sites x all classes),
// print the statistics, then mark the cache as uninitialized.
// Does nothing if the cache is not initialized.
// No site lock is taken here.
// NOTE(review): presumably called once no other thread uses the cache - confirm.
void hak_bigcache_shutdown(void) {
    if (!g_initialized) {
        return;
    }

    for (int s = 0; s < BIGCACHE_MAX_SITES; ++s) {
        BigCacheSlot* row = g_cache[s];
        for (int c = 0; c < BIGCACHE_NUM_CLASSES; ++c) {
            if (row[c].valid) {
                evict_slot(&row[c]);
            }
        }
    }

    hak_bigcache_print_stats();

    g_initialized = 0;
}

// Phase 6.4 P2: O(1) get - try to satisfy an allocation from the cache.
//
// Looks at exactly one cell, g_cache[hash_site(site)][get_class_index(size)],
// under that site's mutex. The cell is a hit only if it is valid, was stored
// by the same site, and holds a block of at least `size` bytes.
//
// On a hit the block is written to *out_ptr, the cell is emptied, and its LFU
// counter is bumped (saturating at 65535).
//
// Returns: 1 on hit; 0 on miss or when `size` is not cacheable
//          (*out_ptr is left untouched in that case).
int hak_bigcache_try_get(size_t size, uintptr_t site, void** out_ptr) {
    if (!g_initialized) hak_bigcache_init();
    if (!is_cacheable(size)) return 0;

    const int row = hash_site(site);
    const int col = get_class_index(size);
    pthread_mutex_t* row_lock = &g_cache_locks[row];
    int found = 0;

    pthread_mutex_lock(row_lock);

    BigCacheSlot* entry = &g_cache[row][col];
    const int usable = entry->valid
                    && entry->site == site
                    && entry->actual_bytes >= size;  // never hand out an undersized block

    if (usable) {
        *out_ptr = entry->ptr;
        entry->valid = 0;
        if (entry->freq < 65535) {
            entry->freq++;
        }
        g_stats.hits++;
        found = 1;
    } else {
        // Empty cell, different site, or block too small.
        g_stats.misses++;
    }

    pthread_mutex_unlock(row_lock);
    return found;
}

// Phase 6.4 P2: O(1) put - offer a freed block to the cache.
//
// Parameters:
//   ptr          - block to cache; NULL is rejected
//   actual_bytes - real allocated size of the block (must be cacheable)
//   site         - allocation-site key; selects the table row via hash_site()
//
// Returns: 1 if the block was stored in the cache, 0 if it was rejected
//          (nothing is stored and the rejects counter is incremented).
//
// The target cell is g_cache[site_idx][get_class_index(actual_bytes)]. If that
// cell is occupied, the row is scanned for a replacement cell (Phase 6.11
// "LFU hybrid"): the first empty cell wins, otherwise the cell with the
// lowest frequency; the chosen cell is evicted and receives the new block.
// All of this happens under the site's mutex.
int hak_bigcache_put(void* ptr, size_t actual_bytes, uintptr_t site) {
    if (!g_initialized) hak_bigcache_init();

    // A NULL entry would later be returned by hak_bigcache_try_get() as a hit.
    if (ptr == NULL || !is_cacheable(actual_bytes)) {
        g_stats.rejects++;
        return 0;
    }

    // O(1) calculation: site_idx, class_idx
    int site_idx = hash_site(site);
    int class_idx = get_class_index(actual_bytes);

    // O(1) lookup: table[site_idx][class_idx]
    pthread_mutex_t* lock = &g_cache_locks[site_idx];
    pthread_mutex_lock(lock);
    BigCacheSlot* slot = &g_cache[site_idx][class_idx];

    // Phase 6.11 P0-BigCache-2: LFU Hybrid Eviction
    // Instead of evicting the target slot directly, find the coldest slot in
    // the same site row.
    // NOTE(review): a slot's freq is set to 0 when it is filled below and is
    // only incremented by a hit, which empties the slot at the same time, so
    // every valid slot has freq == 0 and the frequency comparison cannot
    // currently pick a different slot; only the empty-slot branch can.
    if (slot->valid) {
        BigCacheSlot* coldest = slot;
        uint16_t min_freq = slot->freq;

        for (int c = 0; c < BIGCACHE_NUM_CLASSES; c++) {
            BigCacheSlot* candidate = &g_cache[site_idx][c];
            if (!candidate->valid) {
                // Empty slot: reuse it instead of evicting anything.
                coldest = candidate;
                break;
            }
            if (candidate->freq < min_freq) {
                min_freq = candidate->freq;
                coldest = candidate;
            }
        }

        // No-op when the chosen slot is empty; otherwise frees the old block.
        evict_slot(coldest);

        // The chosen slot (possibly in another class column) receives the
        // block. class_idx still describes the block's own size class, which
        // is what class_bytes records below.
        slot = coldest;
    }

    // Store in cache (O(1) direct write)
    slot->ptr = ptr;
    slot->actual_bytes = actual_bytes;  // Actual size, checked on lookup
    slot->class_bytes = class_index_to_bytes(class_idx);  // For stats/debugging
    slot->site = site;
    slot->valid = 1;
    slot->freq = 0;  // Phase 6.11: frequency starts at 0

    g_stats.puts++;
    g_put_count++;

    // Phase 6.11 P0-BigCache-2: Periodic decay (every LFU_DECAY_INTERVAL puts)
    // NOTE(review): decay_frequencies() walks the rows of every site while
    // only this site's mutex is held.
    if (g_put_count % LFU_DECAY_INTERVAL == 0) {
        decay_frequencies();
    }

    pthread_mutex_unlock(lock);
    return 1;
}

// Print the cache counters (hits, misses, puts, evictions, rejects) and, when
// at least one lookup was counted, the hit rate in percent, to stdout.
// Does nothing if the cache is not initialized.
void hak_bigcache_print_stats(void) {
    if (!g_initialized) return;

    // The counters are uint64_t; print them through unsigned long long so
    // they are not truncated on targets where unsigned long is 32 bits wide.
    printf("\n========================================\n");
    printf("BigCache Statistics\n");
    printf("========================================\n");
    printf("Hits:      %llu\n", (unsigned long long)g_stats.hits);
    printf("Misses:    %llu\n", (unsigned long long)g_stats.misses);
    printf("Puts:      %llu\n", (unsigned long long)g_stats.puts);
    printf("Evictions: %llu\n", (unsigned long long)g_stats.evictions);
    printf("Rejects:   %llu\n", (unsigned long long)g_stats.rejects);

    // Avoid dividing by zero when no lookup has been counted.
    if (g_stats.hits + g_stats.misses > 0) {
        double hit_rate = (double)g_stats.hits / (g_stats.hits + g_stats.misses) * 100.0;
        printf("Hit Rate:  %.1f%%\n", hit_rate);
    }

    printf("========================================\n");
}

// Register the function evict_slot() uses to free an evicted block.
// The callback receives the cached pointer and its actual allocated size.
// Passing NULL removes the callback; evictions then only mark slots empty.
void hak_bigcache_set_free_callback(hak_bigcache_free_fn_t fn)
{
    g_free_callback = fn;
    return;
}