// hakmem_tiny_simple.c
// Phase 6-1: Ultra-Simple Tiny Allocator Implementation
//
// Design: "Simple Front + Smart Back"
// - Front: 3-4 instruction fast path (this file, Phase 1)
// - Back: Learning layer (Phase 2, to be added)
//
// Backend: Simple mmap-based chunk allocation (Phase 1)
//          Will integrate with SuperSlab in Phase 2

#include "hakmem_tiny_simple.h"
#include <stdio.h>
#include <string.h>
#include <pthread.h>
#include <sys/mman.h>
#include <unistd.h>

// ============================================================================
// Phase 1: Ultra-Simple Fast Path Data Structures
// ============================================================================

// TLS Free List - THE ONLY fast path data structure!
// One singly-linked list head per size class, private to each thread
// (__thread). A free block stores the pointer to the next free block in its
// own first bytes, so the list needs no separate node storage.
__thread void* g_tls_tiny_cache[TINY_NUM_CLASSES] = {NULL};

// TLS Stats (per class)
// Per-thread counters, one entry per size class. The code in this file
// updates the alloc_count, hit_count, miss_count and free_count fields.
__thread TinySimpleStats g_tls_tiny_stats[TINY_NUM_CLASSES] = {{0}};

// Size class metadata
// Block size in bytes for each class index; the slow path uses the entry as
// the stride when carving blocks out of a chunk.
static const size_t g_class_sizes[TINY_NUM_CLASSES] = {
    8, 16, 32, 64, 128, 256, 512, 1024
};

// Refill count (Phase 1: fixed, Phase 2: adaptive)
// Upper bound on the number of blocks the slow path carves and pushes onto a
// free list in a single refill.
#define REFILL_COUNT 64

// ============================================================================
// Backend: Simple Chunk Allocator (Phase 1)
// ============================================================================

// Chunk size: 1MB (will be tuned in Phase 2)
// Number of bytes requested from mmap() for every chunk. It is a multiple of
// each block size listed in g_class_sizes, so carving leaves no partial block
// at the end of a chunk.
#define CHUNK_SIZE (1024 * 1024)

// Per-class chunk state (TLS)
// Bump-pointer bookkeeping for the chunk a thread is currently carving blocks
// from, kept separately for each size class.
typedef struct {
    char* current_chunk;     // Base of the most recently mapped chunk (byte pointer for arithmetic)
    char* chunk_cursor;      // Next uncarved byte in the chunk; advanced by one block size per carve
    char* chunk_end;         // One past the last byte of the current chunk (base + CHUNK_SIZE)
    uint64_t chunks_allocated;  // Total chunks mapped so far for this class on this thread
} TinyChunkState;

// One state per size class per thread. As a static thread-local object it
// starts zeroed, so all three pointers are NULL until the first refill.
static __thread TinyChunkState g_chunk_state[TINY_NUM_CLASSES];

// Map one fresh chunk of CHUNK_SIZE bytes from the OS.
//
// The mapping is private, anonymous and readable/writable. Returns the base
// address of the mapping, or NULL when mmap() reports MAP_FAILED, so callers
// only ever need a NULL check.
static void* allocate_chunk(void) {
    const int prot = PROT_READ | PROT_WRITE;
    const int flags = MAP_PRIVATE | MAP_ANONYMOUS;

    void* mapping = mmap(NULL, CHUNK_SIZE, prot, flags, -1, 0);
    return (mapping == MAP_FAILED) ? NULL : mapping;
}

// ============================================================================
// Initialization
// ============================================================================

// Reset the calling thread's allocator state: chunk bookkeeping, per-class
// counters and free-list heads are all zero-filled. The three arrays are
// thread-local, so other threads are unaffected.
//
// NOTE(review): nothing here unmaps chunks obtained earlier; if this is
// called after allocations have been made, those chunks are simply forgotten.
// Confirm it is meant as one-time setup.
void hak_tiny_simple_init(void) {
    memset(g_chunk_state, 0, sizeof g_chunk_state);
    memset(g_tls_tiny_stats, 0, sizeof g_tls_tiny_stats);
    memset(g_tls_tiny_cache, 0, sizeof g_tls_tiny_cache);
}

// ============================================================================
// Ultra-Fast Allocation (3-4 instructions!)
// ============================================================================

// Allocate one block from the size class that
// hak_tiny_simple_size_to_class() selects for `size`.
//
// Returns NULL when that helper reports a negative class (the request is not
// served by this allocator). Otherwise the block at the head of the calling
// thread's free list is returned; when the list is empty the request is
// forwarded to hak_tiny_simple_alloc_slow(), whose result is returned as-is.
//
// Stats: a hit bumps alloc_count and hit_count; a miss bumps miss_count here
// and leaves the rest to the slow path.
void* hak_tiny_simple_alloc(size_t size) {
    const int class_idx = hak_tiny_simple_size_to_class(size);
    if (class_idx < 0) {
        return NULL;
    }

    void* block = g_tls_tiny_cache[class_idx];
    if (block == NULL) {
        // Empty list: record the miss and let the slow path refill.
        g_tls_tiny_stats[class_idx].miss_count++;
        return hak_tiny_simple_alloc_slow(size, class_idx);
    }

    // Hit: the block's first word holds the next free block; make it the head.
    g_tls_tiny_cache[class_idx] = *(void**)block;
    g_tls_tiny_stats[class_idx].alloc_count++;
    g_tls_tiny_stats[class_idx].hit_count++;
    return block;
}

// ============================================================================
// Fast Free
// ============================================================================

// Return a block to the calling thread's free list for the class that
// hak_tiny_simple_size_to_class() selects for `size`.
//
// A NULL `ptr` is ignored, as is a size for which the helper reports a
// negative class. Otherwise the first pointer-sized bytes of the block are
// overwritten with the current list head and the block becomes the new head;
// free_count for the class is incremented.
//
// NOTE(review): nothing here checks that `size` matches the class the block
// was originally carved for; the caller is trusted to pass a consistent size.
void hak_tiny_simple_free(void* ptr, size_t size) {
    if (!ptr) {
        return;
    }

    const int class_idx = hak_tiny_simple_size_to_class(size);
    if (class_idx >= 0) {
        // Push: link the block in front of the current head.
        *(void**)ptr = g_tls_tiny_cache[class_idx];
        g_tls_tiny_cache[class_idx] = ptr;

        g_tls_tiny_stats[class_idx].free_count++;
    }
}

// ============================================================================
// Slow Path: Batch Refill
// ============================================================================

void* hak_tiny_simple_alloc_slow(size_t size, int class_idx) {
    // Phase 1: Simple batch refill from chunk
    // Phase 2: Will add adaptive refill count based on miss rate

    size_t class_size = g_class_sizes[class_idx];
    void** head = &g_tls_tiny_cache[class_idx];
    TinyChunkState* cs = &g_chunk_state[class_idx];

    // Refill batch (Phase 1: fixed at 64 blocks)
    int refilled = 0;
    for (int i = 0; i < REFILL_COUNT; i++) {
        // Check if current chunk is exhausted
        if (cs->chunk_cursor + (ptrdiff_t)class_size > cs->chunk_end) {
            // Allocate new chunk
            void* new_chunk = allocate_chunk();
            if (new_chunk == NULL) {
                break;  // Out of memory
            }

            cs->current_chunk = (char*)new_chunk;
            cs->chunk_cursor = (char*)new_chunk;
            cs->chunk_end = (char*)new_chunk + CHUNK_SIZE;
            cs->chunks_allocated++;
        }

        // Carve out a block from chunk
        void* block = (void*)cs->chunk_cursor;
        cs->chunk_cursor = cs->chunk_cursor + (ptrdiff_t)class_size;

        // Add to free list
        *(void**)block = *head;
        *head = block;
        refilled++;
    }

    // Pop one block for the caller
    void* ptr = *head;
    if (ptr) {
        *head = *(void**)ptr;
        g_tls_tiny_stats[class_idx].alloc_count++;
        return ptr;
    }

    // Complete failure (out of memory)
    return NULL;
}

// ============================================================================
// Stats (for debugging and Phase 2 learning layer)
// ============================================================================

// Copy the calling thread's counters for one size class into `*stats`.
//
// Nothing is written when class_idx lies outside [0, TINY_NUM_CLASSES) or
// when `stats` is NULL; the function gives no indication of which case
// occurred.
void hak_tiny_simple_get_stats(int class_idx, TinySimpleStats* stats) {
    const int in_range = (class_idx >= 0) && (class_idx < TINY_NUM_CLASSES);

    if (in_range && stats != NULL) {
        *stats = g_tls_tiny_stats[class_idx];
    }
}

// Print a table of the calling thread's per-class counters to stdout.
//
// One row per size class: class index, block size, alloc/free/hit/miss
// counts, and the hit rate as a percentage of alloc_count (0.00 when no
// allocation has been counted for the class).
void hak_tiny_simple_print_stats(void) {
    printf("\n=== Tiny Simple Allocator Stats ===\n");
    printf("Class | Size  | Allocs    | Frees     | Hits      | Misses    | Hit Rate\n");
    printf("------|-------|-----------|-----------|-----------|-----------|----------\n");

    for (int i = 0; i < TINY_NUM_CLASSES; i++) {
        const TinySimpleStats* s = &g_tls_tiny_stats[i];
        double hit_rate = (s->alloc_count > 0)
            ? (100.0 * s->hit_count / s->alloc_count)
            : 0.0;

        // The counter field types are declared in the header, not here.
        // Casting to unsigned long long and printing with %llu keeps each
        // conversion matched to its argument whatever integer width the
        // fields have; the printed text is unchanged.
        printf("  %d   | %4zuB | %9llu | %9llu | %9llu | %9llu | %6.2f%%\n",
            i, g_class_sizes[i],
            (unsigned long long)s->alloc_count,
            (unsigned long long)s->free_count,
            (unsigned long long)s->hit_count,
            (unsigned long long)s->miss_count,
            hit_rate);
    }
    printf("\n");
}