hakmem/core/box/ultra_slim_alloc_box.h

// ultra_slim_alloc_box.h - Box: Ultra SLIM Allocation (4-Layer Fast Path)
// Purpose: Minimal latency allocation with learning capability preserved
// Goal: 58M → 90-110M ops/s (mimalloc 90-110% target)
//
// Architecture (4 layers):
//   Layer 1: Init Safety      (1-2 cycles, cold path only)
//   Layer 2: Size-to-Class    (1-2 cycles, LUT lookup)
//   Layer 3: ACE Learning     (2-3 cycles, histogram update)
//   Layer 4: TLS SLL Direct   (3-5 cycles, freelist pop)
//   Total:   7-12 cycles      (~2-4ns on 3GHz CPU)
//
// Box Boundary:
//   - Input: size (bytes)
//   - Output: BASE pointer (HAK_RET_ALLOC converts to USER)
//   - Env Control: HAKMEM_TINY_ULTRA_SLIM=1
//   - Fallback: Returns NULL on miss, caller handles refill
//
// Invariants:
//   - ACE learning MUST execute on every allocation
//   - TLS SLL accessed directly (no FastCache/SFC/HeapV2 layers)
//   - Init checks preserved (SEGV safety)
//   - Lock-free (TLS only, no atomics)
//
// Deleted Layers (from standard 7-layer path):
//   ❌ HeapV2 (C0-C3 magazine)
//   ❌ FastCache (C0-C3 array stack)
//   ❌ SFC (Super Front Cache)
//   ❌ TLS List fallback
//   Savings: 11-15 cycles removed
//
// Design Philosophy:
//   "Simple Front + Smart Back" - Keep frontend minimal, push complexity to backend
//   Learning preserved for adaptive behavior (HAKMEM's differentiator vs mimalloc)
//
// Phase 19-2: Ultra SLIM Box
// Expected: Random Mixed 256B: 58M → 90-110M ops/s (+55-90%)

#pragma once
#include "hakmem_tiny.h"
#include "tiny_region_id.h"
#include "tls_sll_box.h"
#include "tiny_sizeclass_hist_box.h"
#include "hakmem_tiny_lazy_init.inc.h"
#include <stddef.h>
#include <stdio.h>
#include <pthread.h>

// Phase 7 Header constants (from tiny_region_id.h)
#ifndef HEADER_MAGIC
#define HEADER_MAGIC 0xA0
#endif
#ifndef HEADER_CLASS_MASK
#define HEADER_CLASS_MASK 0x0F
#endif

// Forward declarations
extern int hak_tiny_size_to_class(size_t size);
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
extern void* tiny_region_id_write_header(void* base, int class_idx);

// ========== Box: Ultra SLIM Allocation (4-Layer Fast Path) ==========

// ========== Statistics & Diagnostics ==========

// Ultra SLIM hit/miss counters (per-class, TLS)
static __thread uint64_t g_ultra_slim_hits[TINY_NUM_CLASSES] = {0};
static __thread uint64_t g_ultra_slim_misses[TINY_NUM_CLASSES] = {0};

static inline void ultra_slim_track_hit(int class_idx) {
    if (class_idx >= 0 && class_idx < TINY_NUM_CLASSES) {
        g_ultra_slim_hits[class_idx]++;
    }
}

static inline void ultra_slim_track_miss(int class_idx) {
    if (class_idx >= 0 && class_idx < TINY_NUM_CLASSES) {
        g_ultra_slim_misses[class_idx]++;
    }
}

// Ultra SLIM mode detection (TLS cached, checked once per thread)
static inline int ultra_slim_mode_enabled(void) {
    static __thread int g_ultra_slim_checked = 0;
    static __thread int g_ultra_slim = 0;

    if (__builtin_expect(!g_ultra_slim_checked, 0)) {
        const char* e = getenv("HAKMEM_TINY_ULTRA_SLIM");
        g_ultra_slim = (e && *e && *e != '0') ? 1 : 0;
        g_ultra_slim_checked = 1;

        // Log mode activation (once per thread)
        if (g_ultra_slim) {
            fprintf(stderr, "[ULTRA_SLIM] 4-layer fast path enabled (TID=%ld)\n",
                    (long)pthread_self());
        }
    }

    return g_ultra_slim;
}

// Ultra SLIM 4-layer allocation path (internal helper)
// Returns: BASE pointer on hit, NULL on miss
// Note: This is a helper that returns BASE pointer. Use ultra_slim_alloc_4layer_user() for USER pointer.
static inline void* ultra_slim_alloc_4layer_base(size_t size, int* out_class_idx) {
    // ========== Layer 1: Init Safety (1-2 cycles, cold path only) ==========
    lazy_init_global();

    // ========== Layer 2: Size-to-Class (1-2 cycles, LUT lookup) ==========
    int class_idx = hak_tiny_size_to_class(size);
    if (__builtin_expect(class_idx < 0, 0)) {
        return NULL;  // Size > 1KB, not Tiny
    }

    lazy_init_class(class_idx);

    // ========== Layer 3: ACE Learning (2-3 cycles, histogram update) ==========
    // CRITICAL: This preserves HAKMEM's learning capability (differentiator vs mimalloc)
    tiny_sizeclass_hist_hit(class_idx);

    // ========== Layer 4: TLS SLL Direct Pop (3-5 cycles, main allocation) ==========
    // Box Boundary: Use TLS SLL Box API (C7-safe, lock-free)
    void* base = NULL;
    if (tls_sll_pop(class_idx, &base)) {
        // HIT: Fast path success (total: 7-12 cycles)
        ultra_slim_track_hit(class_idx);
        *out_class_idx = class_idx;
        return base;  // Return BASE (caller converts to USER)
    }

    // MISS: Return NULL (caller handles refill)
    ultra_slim_track_miss(class_idx);
    return NULL;
}

// Ultra SLIM 4-layer allocation path (USER pointer version)
// Returns: USER pointer (ready to use) or NULL on miss
static inline void* ultra_slim_alloc_4layer(size_t size) {
    int class_idx = -1;
    void* base = ultra_slim_alloc_4layer_base(size, &class_idx);
    if (!base) return NULL;

    // Convert BASE → USER using HAK_RET_ALLOC logic
#if HAKMEM_TINY_HEADER_CLASSIDX && HAKMEM_BUILD_RELEASE
    // Write header and return USER pointer
    *(uint8_t*)base = HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK);
    return (void*)((uint8_t*)base + 1);
#else
    // Debug/Legacy: Use full validation
    return tiny_region_id_write_header(base, class_idx);
#endif
}

// Ultra SLIM allocation with refill (complete fast path)
// Returns: USER pointer (ready to use) or NULL on OOM
// This is the main entry point for Ultra SLIM mode
static inline void* ultra_slim_alloc_with_refill(size_t size) {
    // Debug: Track that Ultra SLIM path is being called
    static __thread uint64_t g_ultra_slim_call_count = 0;
    g_ultra_slim_call_count++;

    // Fast path: Try 4-layer direct allocation (returns USER pointer)
    void* user_ptr = ultra_slim_alloc_4layer(size);
    if (__builtin_expect(user_ptr != NULL, 1)) {
        // Fast path HIT: Already converted to USER pointer
        return user_ptr;
    }

    // Fast path MISS: Need refill
    // Note: tiny_alloc_fast_refill is declared static inline in tiny_alloc_fast.inc.h,
    // so we can't forward declare it here. Instead, we inline the refill logic.
    int class_idx = hak_tiny_size_to_class(size);
    if (class_idx < 0) return NULL;

    // Call backend refill (access via inline from tiny_alloc_fast.inc.h)
    // Note: We're included after tiny_alloc_fast.inc.h, so tiny_alloc_fast_refill is visible
    extern int sll_refill_batch_from_ss(int class_idx, int max_take);

    // Simple refill: Ask backend for 16 blocks
    int refilled = 0;
#if HAKMEM_TINY_P0_BATCH_REFILL
    refilled = sll_refill_batch_from_ss(class_idx, 16);
#else
    // Fallback: Use slow path if P0 disabled
    extern void* hak_tiny_alloc_slow(size_t size, int class_idx);
    void* slow_ptr = hak_tiny_alloc_slow(size, class_idx);
    if (slow_ptr) {
        // Slow path returns BASE pointer, convert to USER
#if HAKMEM_TINY_HEADER_CLASSIDX && HAKMEM_BUILD_RELEASE
        *(uint8_t*)slow_ptr = HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK);
        return (void*)((uint8_t*)slow_ptr + 1);
#else
        return tiny_region_id_write_header(slow_ptr, class_idx);
#endif
    }
    return NULL;
#endif

    if (refilled > 0) {
        // Retry after refill
        user_ptr = ultra_slim_alloc_4layer(size);
        if (user_ptr) {
            return user_ptr;
        }
    }

    // Slow path (OOM or new SuperSlab allocation)
    extern void* hak_tiny_alloc_slow(size_t size, int class_idx);
    void* slow_base = hak_tiny_alloc_slow(size, class_idx);
    if (slow_base) {
        // Slow path returns BASE pointer, convert to USER
#if HAKMEM_TINY_HEADER_CLASSIDX && HAKMEM_BUILD_RELEASE
        *(uint8_t*)slow_base = HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK);
        return (void*)((uint8_t*)slow_base + 1);
#else
        return tiny_region_id_write_header(slow_base, class_idx);
#endif
    }

    return NULL;  // OOM
}

// Print Ultra SLIM statistics (env: HAKMEM_ULTRA_SLIM_STATS=1 or HAKMEM_STATS=slim)
#include "../hakmem_stats_master.h"  // Phase 4d: Master stats control
static inline int ultra_slim_stats_enabled(void) {
    static int enabled = -1;
    if (__builtin_expect(enabled == -1, 0)) {
        enabled = hak_stats_check("HAKMEM_ULTRA_SLIM_STATS", "slim");
    }
    return enabled;
}

static void ultra_slim_print_stats(void) __attribute__((destructor));
static void ultra_slim_print_stats(void) {
    if (!ultra_slim_stats_enabled()) return;
    if (!ultra_slim_mode_enabled()) return;

    uint64_t total_hits = 0, total_misses = 0;
    for (int i = 0; i < TINY_NUM_CLASSES; i++) {
        total_hits += g_ultra_slim_hits[i];
        total_misses += g_ultra_slim_misses[i];
    }

    // Always print stats to debug if Ultra SLIM is actually being called
    fprintf(stderr, "\n========== Ultra SLIM 4-Layer Stats (DEBUG) ==========\n");
    fprintf(stderr, "Total Hits:   %lu\n", (unsigned long)total_hits);
    fprintf(stderr, "Total Misses: %lu\n", (unsigned long)total_misses);
    fprintf(stderr, "Total Calls:  %lu\n", (unsigned long)(total_hits + total_misses));

    if (total_hits + total_misses == 0) {
        fprintf(stderr, "⚠️  WARNING: Ultra SLIM mode enabled but no allocations tracked!\n");
        fprintf(stderr, "This suggests the Ultra SLIM path is not being called.\n");
        fprintf(stderr, "=====================================================\n\n");
        return;
    }

    fprintf(stderr, "\n========== Ultra SLIM 4-Layer Stats ==========\n");
    fprintf(stderr, "Total Hits:   %lu\n", (unsigned long)total_hits);
    fprintf(stderr, "Total Misses: %lu\n", (unsigned long)total_misses);
    fprintf(stderr, "Hit Rate:     %.1f%%\n",
            100.0 * total_hits / (total_hits + total_misses));

    fprintf(stderr, "\nPer-Class Breakdown:\n");
    fprintf(stderr, "Class | Hits      | Misses    | Hit Rate\n");
    fprintf(stderr, "------+-----------+-----------+---------\n");
    for (int i = 0; i < TINY_NUM_CLASSES; i++) {
        uint64_t h = g_ultra_slim_hits[i];
        uint64_t m = g_ultra_slim_misses[i];
        if (h + m == 0) continue;

        fprintf(stderr, "C%-4d | %9lu | %9lu | %5.1f%%\n",
                i, (unsigned long)h, (unsigned long)m,
                100.0 * h / (h + m));
    }
    fprintf(stderr, "=============================================\n\n");
}

// ========== Performance Notes ==========
//
// Expected Performance:
// - Fast path hit:  7-12 cycles (~2-4ns on 3GHz CPU)
// - Fast path miss: 50-100 cycles (refill overhead)
// - Target throughput: 90-110M ops/s (mimalloc parity)
//
// Comparison with Standard 7-Layer Path:
// - Standard: 31ns average (7 layers, 25-35 cycles)
// - Ultra SLIM: 10ns average (4 layers, 7-12 cycles)
// - Improvement: -68% latency, +210% throughput expected
//
// Deleted Layers (savings):
// - HeapV2:    3-5 cycles saved
// - FastCache: 5-7 cycles saved (C0-C3 only)
// - SFC:       6-8 cycles saved
// - Total:     14-20 cycles saved
//
// Preserved Capabilities:
// ✅ ACE learning (adaptive behavior)
// ✅ Init safety (no SEGV risk)
// ✅ Box Theory (clean boundaries)
// ✅ A/B testing (env gated)