/**
 * hakmem_mid_mt.c
 *
 * Mid Range Multi-threaded Allocator Implementation (8-32KB)
 * mimalloc-style per-thread segments for optimal MT performance
 *
 * Design:
 * - Per-thread segments (TLS) for lock-free allocation
 * - Header-based free() (Phase 6-B): a per-block header replaces the former global segment registry
 * - 64KB chunks with bump + free-list allocation
 * - Phase 1: Local free only (remote free = memory leak, acceptable for benchmarking)
 * - Phase 2: Will add an atomic remote free list
 */

#include "hakmem_mid_mt.h"

#include <stdio.h>
#include <string.h>
#include <stdbool.h>
#include <stdint.h>
#include <stddef.h>
#include <pthread.h>
#include <sys/mman.h>

// Use likely/unlikely hints for branch prediction
#ifndef likely
#define likely(x)   __builtin_expect(!!(x), 1)
#endif
#ifndef unlikely
#define unlikely(x) __builtin_expect(!!(x), 0)
#endif

// ============================================================================
// Global and TLS Variables
// ============================================================================

// TLS: Each thread has independent segments (lock-free!)
__thread MidThreadSegment g_mid_segments[MID_NUM_CLASSES] = {0};

// Phase 6-B: Registry removed (no longer needed with header-based free)

// Statistics (if enabled)
#if MID_ENABLE_STATS
MidStats g_mid_stats = {0};
#endif

// Initialization flag
static volatile int g_mid_initialized = 0;
static pthread_mutex_t g_init_lock = PTHREAD_MUTEX_INITIALIZER;

// ============================================================================
// Forward Declarations
// ============================================================================

static bool segment_refill(MidThreadSegment* seg, int class_idx);
static void* segment_alloc(MidThreadSegment* seg, int class_idx);
static void segment_free_local(MidThreadSegment* seg, void* ptr);
static void* chunk_allocate(size_t chunk_size);
static void chunk_deallocate(void* chunk, size_t chunk_size);

// Phase 6-B: Registry functions removed (header-based free instead)
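/*
 * Note (illustrative, not authoritative): segment_alloc() below writes a
 * MidMTHeader immediately before each user pointer, and mid_mt_free() reads it
 * back. The actual definition lives in hakmem_mid_mt.h; from the stores made in
 * this file it is assumed to look roughly like:
 *
 *     typedef struct {
 *         uint32_t block_size;   // block stride as carved from the chunk (header + user area)
 *         uint16_t class_idx;    // size-class index (0 .. MID_NUM_CLASSES-1)
 *         uint16_t magic;        // MID_MT_MAGIC, validated on free()
 *     } MidMTHeader;
 *
 * Field order and exact widths (beyond the (uint32_t)/(uint16_t) casts seen
 * here) are assumptions; this file only relies on sizeof(MidMTHeader) and the
 * three field names.
 */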
// ============================================================================
// Chunk Management (mmap/munmap wrappers)
// ============================================================================

/**
 * chunk_allocate - Allocate a new chunk via mmap
 *
 * @param chunk_size Size of chunk (typically 64KB)
 * @return Chunk base address, or NULL on failure
 */
static void* chunk_allocate(size_t chunk_size) {
    void* chunk = mmap(
        NULL,
        chunk_size,
        PROT_READ | PROT_WRITE,
        MAP_PRIVATE | MAP_ANONYMOUS,
        -1,
        0
    );

    if (chunk == MAP_FAILED) {
        MID_LOG("ERROR: mmap failed for chunk_size=%zu", chunk_size);
        return NULL;
    }

    MID_LOG("Chunk allocated: %p, size=%zu", chunk, chunk_size);
    return chunk;
}

/**
 * chunk_deallocate - Free chunk via munmap
 *
 * @param chunk      Chunk base address
 * @param chunk_size Size of chunk
 */
static void chunk_deallocate(void* chunk, size_t chunk_size) {
    if (!chunk) return;

    int ret = munmap(chunk, chunk_size);
    if (ret != 0) {
        MID_LOG("ERROR: munmap failed for chunk=%p, size=%zu", chunk, chunk_size);
    } else {
        MID_LOG("Chunk deallocated: %p, size=%zu", chunk, chunk_size);
    }
}

// ============================================================================
// Segment Operations
// ============================================================================

/**
 * segment_refill - Allocate a new chunk and set up the segment
 *
 * Called when the segment is exhausted (rare, ~0.1% of allocations)
 *
 * Phase 6-B: No longer registers chunks (header-based free instead)
 *
 * @return true on success, false on OOM
 */
static bool segment_refill(MidThreadSegment* seg, int class_idx) {
    size_t block_size = mid_class_to_size(class_idx);
    size_t chunk_size = MID_CHUNK_SIZE;

    // Allocate new chunk via mmap
    void* chunk = chunk_allocate(chunk_size);
    if (!chunk) {
        return false;
    }

    // Phase 6-B: No registry add (header-based free doesn't need a registry)

    // Set up segment (any previous chunk is simply abandoned in Phase 1)
    seg->chunk_base = chunk;
    seg->chunk_size = chunk_size;
    seg->block_size = block_size;
    seg->current    = chunk;
    seg->end        = (uint8_t*)chunk + chunk_size;
    seg->capacity   = chunk_size / block_size;
    seg->refill_count++;

    MID_LOG("Segment refill: class=%d, block_size=%zu, capacity=%u, chunk=%p",
            class_idx, block_size, seg->capacity, chunk);

    return true;
}

/**
 * segment_alloc - Allocate from segment (fast path)
 *
 * PERFORMANCE: Force inline for maximum speed
 *
 * Fast path priority:
 * 1. Free list (most common, ~90-95% hit rate)
 * 2. Bump allocation (when the free list is empty)
 * 3. Refill (when the segment is exhausted)
 *
 * Phase 6-B: Now writes a MidMTHeader for lock-free free()
 *
 * @return Allocated pointer (after header), or NULL on OOM
 */
static inline void* segment_alloc(MidThreadSegment* seg, int class_idx) __attribute__((always_inline));

static inline void* segment_alloc(MidThreadSegment* seg, int class_idx) {
    void* block;  // Block start (includes header space)
    size_t block_size = seg->block_size;

    // === Path 0: First allocation - need refill ===
    // CRITICAL FIX: TLS is zero-initialized, so chunk_base == NULL on first call
    if (unlikely(seg->chunk_base == NULL)) {
        if (!segment_refill(seg, class_idx)) {
            return NULL;  // OOM
        }
        block_size = seg->block_size;  // Update after refill
    }

    // === Path 1: Free list (fastest, ~4-5 instructions) ===
    // Note: The free list stores the next pointer at the block start
    // (it overwrites the header when the block is freed)
    block = seg->free_list;
    if (likely(block != NULL)) {
        seg->free_list = *(void**)block;  // Pop from free list
        seg->used_count++;
        seg->alloc_count++;

        // Phase 6-B: Write header before returning
        MidMTHeader* hdr = (MidMTHeader*)block;
        hdr->block_size = (uint32_t)block_size;
        hdr->class_idx  = (uint16_t)class_idx;
        hdr->magic      = MID_MT_MAGIC;

        return (uint8_t*)block + sizeof(MidMTHeader);  // Return user pointer after header
    }

    // === Path 2: Bump allocation (fast, ~6-8 instructions) ===
    block = seg->current;
    void* next = (uint8_t*)block + block_size;
    if (likely(next <= seg->end)) {
        seg->current = next;
        seg->used_count++;
        seg->alloc_count++;

        // Phase 6-B: Write header before returning
        MidMTHeader* hdr = (MidMTHeader*)block;
        hdr->block_size = (uint32_t)block_size;
        hdr->class_idx  = (uint16_t)class_idx;
        hdr->magic      = MID_MT_MAGIC;

        return (uint8_t*)block + sizeof(MidMTHeader);  // Return user pointer after header
    }

    // === Path 3: Refill (slow, called ~once per 64KB) ===
    if (!segment_refill(seg, class_idx)) {
        return NULL;  // OOM
    }

    // Retry after refill (a fresh chunk always has room for the first block)
    block = seg->current;
    block_size = seg->block_size;  // Update after refill
    seg->current = (uint8_t*)block + block_size;
    seg->used_count++;
    seg->alloc_count++;

    // Phase 6-B: Write header before returning
    MidMTHeader* hdr = (MidMTHeader*)block;
    hdr->block_size = (uint32_t)block_size;
    hdr->class_idx  = (uint16_t)class_idx;
    hdr->magic      = MID_MT_MAGIC;

    return (uint8_t*)block + sizeof(MidMTHeader);  // Return user pointer after header
}
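/*
 * Illustrative sketch (not part of the public API): the per-block header makes
 * size/class lookup an O(1) load. A hypothetical helper reading it back could
 * look like the function below; mid_mt_free() inlines the same logic directly.
 * The helper name is made up for illustration and is not referenced elsewhere
 * in the project.
 */
__attribute__((unused))
static inline uint32_t mid_mt_block_size_of(const void* user_ptr) {
    // Step back over the header written by segment_alloc()
    const MidMTHeader* hdr =
        (const MidMTHeader*)((const uint8_t*)user_ptr - sizeof(MidMTHeader));
    // Caller must pass a pointer obtained from mid_mt_alloc();
    // hdr->magic == MID_MT_MAGIC could be asserted here for debugging.
    return hdr->block_size;
}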
/**
 * segment_free_local - Free to local segment (same thread)
 *
 * @param seg Segment to free to
 * @param ptr Pointer to free (user pointer, after header)
 *
 * Phase 6-B: Adjusted for header-based allocation
 */
static inline void segment_free_local(MidThreadSegment* seg, void* ptr) {
    // Phase 6-B: Get block start (before header)
    void* block = (uint8_t*)ptr - sizeof(MidMTHeader);

    // Push to free list (lock-free, local operation)
    // Note: overwrites the header with the next pointer (the header is no longer needed after free)
    *(void**)block = seg->free_list;
    seg->free_list = block;
    seg->used_count--;
    seg->free_count++;

#if MID_ENABLE_STATS
    __sync_fetch_and_add(&g_mid_stats.local_frees, 1);
#endif
}

// ============================================================================
// Public API
// ============================================================================

/**
 * mid_mt_init - Initialize Mid Range MT allocator
 *
 * Thread-safe, idempotent
 *
 * Phase 6-B: Simplified (no registry initialization)
 */
void mid_mt_init(void) {
    if (g_mid_initialized) return;

    pthread_mutex_lock(&g_init_lock);
    if (!g_mid_initialized) {
        // Phase 6-B: No registry initialization (header-based free)

#if MID_ENABLE_STATS
        memset(&g_mid_stats, 0, sizeof(g_mid_stats));
#endif

        g_mid_initialized = 1;
        MID_LOG("Mid MT allocator initialized (Phase 6-B: header-based)");
    }
    pthread_mutex_unlock(&g_init_lock);
}

/**
 * mid_mt_alloc - Allocate memory from Mid Range pool (8-32KB)
 *
 * Thread-safe, lock-free (uses TLS)
 */
void* mid_mt_alloc(size_t size) {
    // Validate size range (Phase 16: dynamic min size based on Tiny's max)
    if (unlikely(size < mid_get_min_size() || size > MID_MAX_SIZE)) {
        return NULL;
    }

    // Initialize if needed (thread-safe)
    if (unlikely(!g_mid_initialized)) {
        mid_mt_init();
    }

    // Get size class
    int class_idx = mid_size_to_class(size);
    if (unlikely(class_idx < 0)) {
        return NULL;
    }

    // Get thread-local segment
    MidThreadSegment* seg = &g_mid_segments[class_idx];

    // Allocate from segment (fast path)
    void* p = segment_alloc(seg, class_idx);

#if MID_ENABLE_STATS
    if (p) {
        __sync_fetch_and_add(&g_mid_stats.total_allocs, 1);
    }
#endif

    return p;
}

/**
 * mid_mt_free - Free memory allocated by mid_mt_alloc
 *
 * Phase 6-B: Header-based free (lock-free, no registry lookup!)
 * - Reads MidMTHeader to get block metadata (O(1), ~2 cycles)
 * - Eliminates pthread_mutex_lock/unlock (13.98% CPU overhead)
 * - Expected: +17-27% throughput improvement
 *
 * Local free (same thread): Ultra-fast, lock-free
 * Remote free (cross-thread): NOT IMPLEMENTED (memory leak; Phase 2 will add an atomic remote free list)
 */
void mid_mt_free(void* ptr, size_t size) {
    if (unlikely(!ptr)) return;

    (void)size;  // Unused in Phase 6-B: metadata comes from the block header

#if MID_ENABLE_STATS
    __sync_fetch_and_add(&g_mid_stats.total_frees, 1);
#endif

    // Phase 6-B: Read header for O(1) metadata lookup (no mutex!)
    void* block = (uint8_t*)ptr - sizeof(MidMTHeader);
    MidMTHeader* hdr = (MidMTHeader*)block;

    // Validate header magic (sanity check)
    if (unlikely(hdr->magic != MID_MT_MAGIC)) {
        MID_LOG("ERROR: Invalid Mid MT magic 0x%X (expected 0x%X) for ptr %p",
                hdr->magic, MID_MT_MAGIC, ptr);
        return;
    }

    // Get metadata from header (no registry lookup!)
    int class_idx = hdr->class_idx;

    // Validate class_idx
    if (unlikely(class_idx < 0 || class_idx >= MID_NUM_CLASSES)) {
        MID_LOG("ERROR: Invalid class_idx %d in header for ptr %p", class_idx, ptr);
        return;
    }

    // Get thread-local segment for this size class
    MidThreadSegment* seg = &g_mid_segments[class_idx];

    // === Fast path: Check if the block belongs to the current segment ===
    // Note: Check block (not ptr), since the segment tracks block addresses
    if (likely(seg->chunk_base != NULL &&
               block >= seg->chunk_base &&
               block < seg->end)) {
        // Local free (same thread, lock-free)
        segment_free_local(seg, ptr);
        return;
    }

    // === Slow path: Remote free (cross-thread) ===
    // Phase 1: NOT IMPLEMENTED
    // We would need to find the owning segment and push to its remote free list.
    //
    // For Phase 1 (benchmarking), we accept this memory leak.
    // bench_mid_mt_gap uses a single-threaded workload, so remote frees never happen.
    MID_LOG("WARNING: Remote free not implemented, leaking %p (block_size=%u, class=%d)",
            ptr, hdr->block_size, class_idx);

#if MID_ENABLE_STATS
    __sync_fetch_and_add(&g_mid_stats.remote_frees, 1);
#endif

    // TODO Phase 2: Implement remote free
    // segment_free_remote(ptr, hdr->block_size, class_idx);
}
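#if 0
/*
 * Sketch only (Phase 2, not implemented): one way to realize the
 * segment_free_remote() referenced in the TODO above is a Treiber-stack push
 * onto a per-segment remote free list. Everything below is an assumption: it
 * presumes a `void* remote_free_list` field added to MidThreadSegment and a
 * way to locate the owning segment from the block (e.g. an owner pointer
 * stored alongside the header), neither of which exists in Phase 1. The
 * __atomic builtins match the __sync builtins already used in this file.
 */
static void segment_free_remote(MidThreadSegment* owner, void* ptr) {
    void* block = (uint8_t*)ptr - sizeof(MidMTHeader);

    // Lock-free push: link the block in front of the current remote list head.
    void* head = __atomic_load_n(&owner->remote_free_list, __ATOMIC_RELAXED);
    do {
        *(void**)block = head;
    } while (!__atomic_compare_exchange_n(&owner->remote_free_list, &head, block,
                                          /*weak=*/true,
                                          __ATOMIC_RELEASE, __ATOMIC_RELAXED));

    // The owning thread would splice remote_free_list into its local free list
    // on its next refill or at a periodic drain point.
}
#endif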
/**
 * mid_mt_thread_exit - Cleanup of thread-local segments
 *
 * Called on thread exit to release resources
 *
 * Phase 6-B: No registry cleanup needed (header-based free)
 */
void mid_mt_thread_exit(void) {
    MID_LOG("Thread exit cleanup");

    // Free all chunks from this thread's segments
    for (int class_idx = 0; class_idx < MID_NUM_CLASSES; class_idx++) {
        MidThreadSegment* seg = &g_mid_segments[class_idx];

        if (seg->chunk_base) {
            // Phase 6-B: No registry remove (no registry exists)

            // Deallocate chunk
            chunk_deallocate(seg->chunk_base, seg->chunk_size);

            // Clear segment
            memset(seg, 0, sizeof(MidThreadSegment));
        }
    }
}

// ============================================================================
// Statistics (Debug/Profiling)
// ============================================================================

#if MID_ENABLE_STATS
void mid_mt_print_stats(void) {
    printf("\n=== Mid Range MT Statistics ===\n");
    printf("Total allocations: %lu\n", g_mid_stats.total_allocs);
    printf("Total frees:       %lu\n", g_mid_stats.total_frees);
    printf("Local frees:       %lu (%.1f%%)\n", g_mid_stats.local_frees,
           100.0 * g_mid_stats.local_frees / (g_mid_stats.total_frees + 1));
    printf("Remote frees:      %lu (%.1f%%)\n", g_mid_stats.remote_frees,
           100.0 * g_mid_stats.remote_frees / (g_mid_stats.total_frees + 1));
    printf("Registry lookups:  %lu\n", g_mid_stats.registry_lookups);
    printf("\n");

    // Per-segment stats
    for (int class_idx = 0; class_idx < MID_NUM_CLASSES; class_idx++) {
        MidThreadSegment* seg = &g_mid_segments[class_idx];
        if (seg->alloc_count > 0) {
            printf("Class %d (%zu bytes):\n", class_idx, mid_class_to_size(class_idx));
            printf("  Allocations: %lu\n", seg->alloc_count);
            printf("  Frees:       %lu\n", seg->free_count);
            printf("  Refills:     %u\n", seg->refill_count);
            printf("  Used count:  %u / %u\n", seg->used_count, seg->capacity);
        }
    }
    printf("\n");
}
#endif  // MID_ENABLE_STATS
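/*
 * Usage sketch (illustrative only): how a caller is expected to drive the
 * public API. The MID_MT_DEMO_MAIN guard is a made-up macro for this example;
 * it is not defined anywhere in the project, so the block below is normally
 * compiled out.
 */
#ifdef MID_MT_DEMO_MAIN
int main(void) {
    mid_mt_init();  // Optional: mid_mt_alloc() also initializes lazily

    // Allocate a 16KB block (inside the 8-32KB mid range) and release it.
    void* p = mid_mt_alloc(16 * 1024);
    if (p) {
        memset(p, 0xAB, 16 * 1024);   // Touch the private anonymous pages
        mid_mt_free(p, 16 * 1024);    // Same-thread free: lock-free fast path
    }

#if MID_ENABLE_STATS
    mid_mt_print_stats();             // Print counters before segments are cleared
#endif

    mid_mt_thread_exit();             // Release this thread's chunks before exit
    return 0;
}
#endif  // MID_MT_DEMO_MAIN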