// ultra_slim_alloc_box.h - Box: Ultra SLIM Allocation (4-Layer Fast Path)
// Purpose: Minimal latency allocation with learning capability preserved
// Goal: 58M → 90-110M ops/s (mimalloc 90-110% target)
//
// Architecture (4 layers):
//   Layer 1: Init Safety      (1-2 cycles, cold path only)
//   Layer 2: Size-to-Class    (1-2 cycles, LUT lookup)
//   Layer 3: ACE Learning     (2-3 cycles, histogram update)
//   Layer 4: TLS SLL Direct   (3-5 cycles, freelist pop)
//   Total:                    7-12 cycles (~2-4ns on 3GHz CPU)
//
// Box Boundary:
//   - Input: size (bytes)
//   - Output: BASE pointer (HAK_RET_ALLOC converts to USER)
//   - Env Control: HAKMEM_TINY_ULTRA_SLIM=1
//   - Fallback: Returns NULL on miss, caller handles refill
//
// Invariants:
//   - ACE learning MUST execute on every allocation
//   - TLS SLL accessed directly (no FastCache/SFC/HeapV2 layers)
//   - Init checks preserved (SEGV safety)
//   - Lock-free (TLS only, no atomics)
//
// Deleted Layers (from standard 7-layer path):
//   ❌ HeapV2 (C0-C3 magazine)
//   ❌ FastCache (C0-C3 array stack)
//   ❌ SFC (Super Front Cache)
//   ❌ TLS List fallback
//   Savings: 11-15 cycles removed
//
// Design Philosophy:
//   "Simple Front + Smart Back" - keep the frontend minimal, push complexity to the backend.
//   Learning is preserved for adaptive behavior (HAKMEM's differentiator vs mimalloc).
//
// Phase 19-2: Ultra SLIM Box
// Expected: Random Mixed 256B: 58M → 90-110M ops/s (+55-90%)

#pragma once

#include "hakmem_tiny.h"
#include "tiny_region_id.h"
#include "tls_sll_box.h"
#include "tiny_sizeclass_hist_box.h"
#include "hakmem_tiny_lazy_init.inc.h"

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <pthread.h>

// Phase 7 Header constants (from tiny_region_id.h)
#ifndef HEADER_MAGIC
#define HEADER_MAGIC 0xA0
#endif
#ifndef HEADER_CLASS_MASK
#define HEADER_CLASS_MASK 0x0F
#endif

// Forward declarations
extern int hak_tiny_size_to_class(size_t size);
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
extern void* tiny_region_id_write_header(void* base, int class_idx);

// ========== Box: Ultra SLIM Allocation (4-Layer Fast Path) ==========

// ========== Statistics & Diagnostics ==========

// Ultra SLIM hit/miss counters (per-class, TLS)
static __thread uint64_t g_ultra_slim_hits[TINY_NUM_CLASSES] = {0};
static __thread uint64_t g_ultra_slim_misses[TINY_NUM_CLASSES] = {0};

static inline void ultra_slim_track_hit(int class_idx) {
    if (class_idx >= 0 && class_idx < TINY_NUM_CLASSES) {
        g_ultra_slim_hits[class_idx]++;
    }
}

static inline void ultra_slim_track_miss(int class_idx) {
    if (class_idx >= 0 && class_idx < TINY_NUM_CLASSES) {
        g_ultra_slim_misses[class_idx]++;
    }
}

// Ultra SLIM mode detection (TLS cached, checked once per thread)
static inline int ultra_slim_mode_enabled(void) {
    static __thread int g_ultra_slim_checked = 0;
    static __thread int g_ultra_slim = 0;
    if (__builtin_expect(!g_ultra_slim_checked, 0)) {
        const char* e = getenv("HAKMEM_TINY_ULTRA_SLIM");
        g_ultra_slim = (e && *e && *e != '0') ? 1 : 0;
        g_ultra_slim_checked = 1;
        // Log mode activation (once per thread)
        if (g_ultra_slim) {
            fprintf(stderr, "[ULTRA_SLIM] 4-layer fast path enabled (TID=%ld)\n",
                    (long)pthread_self());
        }
    }
    return g_ultra_slim;
}

// Ultra SLIM 4-layer allocation path (internal helper)
// Returns: BASE pointer on hit, NULL on miss
// Note: This helper returns a BASE pointer. Use ultra_slim_alloc_4layer() for the USER pointer.
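//
// BASE vs USER pointer layout (release builds with HAKMEM_TINY_HEADER_CLASSIDX;
// a sketch derived from the BASE→USER conversion code below):
//
//   base[0]   = HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK)   // 1-byte header
//   base + 1  = USER pointer handed to the application
//
//   Example (class_idx = 5): base[0] = 0xA0 | 0x05 = 0xA5, USER = base + 1.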
static inline void* ultra_slim_alloc_4layer_base(size_t size, int* out_class_idx) {
    // ========== Layer 1: Init Safety (1-2 cycles, cold path only) ==========
    lazy_init_global();

    // ========== Layer 2: Size-to-Class (1-2 cycles, LUT lookup) ==========
    int class_idx = hak_tiny_size_to_class(size);
    if (__builtin_expect(class_idx < 0, 0)) {
        return NULL;  // Size > 1KB, not Tiny
    }
    lazy_init_class(class_idx);

    // ========== Layer 3: ACE Learning (2-3 cycles, histogram update) ==========
    // CRITICAL: This preserves HAKMEM's learning capability (differentiator vs mimalloc)
    tiny_sizeclass_hist_hit(class_idx);

    // ========== Layer 4: TLS SLL Direct Pop (3-5 cycles, main allocation) ==========
    // Box Boundary: Use TLS SLL Box API (C7-safe, lock-free)
    void* base = NULL;
    if (tls_sll_pop(class_idx, &base)) {
        // HIT: Fast path success (total: 7-12 cycles)
        ultra_slim_track_hit(class_idx);
        *out_class_idx = class_idx;
        return base;  // Return BASE (caller converts to USER)
    }

    // MISS: Return NULL (caller handles refill)
    ultra_slim_track_miss(class_idx);
    return NULL;
}

// Ultra SLIM 4-layer allocation path (USER pointer version)
// Returns: USER pointer (ready to use) or NULL on miss
static inline void* ultra_slim_alloc_4layer(size_t size) {
    int class_idx = -1;
    void* base = ultra_slim_alloc_4layer_base(size, &class_idx);
    if (!base) return NULL;

    // Convert BASE → USER using HAK_RET_ALLOC logic
#if HAKMEM_TINY_HEADER_CLASSIDX && HAKMEM_BUILD_RELEASE
    // Write header and return USER pointer
    *(uint8_t*)base = HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK);
    return (void*)((uint8_t*)base + 1);
#else
    // Debug/Legacy: Use full validation
    return tiny_region_id_write_header(base, class_idx);
#endif
}

// Ultra SLIM allocation with refill (complete fast path)
// Returns: USER pointer (ready to use) or NULL on OOM
// This is the main entry point for Ultra SLIM mode
static inline void* ultra_slim_alloc_with_refill(size_t size) {
    // Debug: Track that the Ultra SLIM path is being called
    static __thread uint64_t g_ultra_slim_call_count = 0;
    g_ultra_slim_call_count++;

    // Fast path: Try 4-layer direct allocation (returns USER pointer)
    void* user_ptr = ultra_slim_alloc_4layer(size);
    if (__builtin_expect(user_ptr != NULL, 1)) {
        // Fast path HIT: Already converted to USER pointer
        return user_ptr;
    }

    // Fast path MISS: Need refill
    // Note: tiny_alloc_fast_refill is declared static inline in tiny_alloc_fast.inc.h,
    // so we can't forward declare it here. Instead, we inline the refill logic.
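    //
    // Refill strategy on a miss (summary of the code below):
    //   1. Re-derive the size class (the base helper does not report it on a miss).
    //   2. Batch-refill the TLS SLL from the SuperSlab backend (16 blocks) when
    //      HAKMEM_TINY_P0_BATCH_REFILL is enabled; otherwise go straight to the slow path.
    //   3. Retry the 4-layer fast path once after a successful refill.
    //   4. Fall back to hak_tiny_alloc_slow() (new SuperSlab or OOM).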
    int class_idx = hak_tiny_size_to_class(size);
    if (class_idx < 0) return NULL;

    // Call the backend batch refill (sll_refill_batch_from_ss).
    // Note: This header is included after tiny_alloc_fast.inc.h; the extern
    // declaration below keeps the call independent of that include order.
    extern int sll_refill_batch_from_ss(int class_idx, int max_take);

    // Simple refill: Ask backend for 16 blocks
    int refilled = 0;
#if HAKMEM_TINY_P0_BATCH_REFILL
    refilled = sll_refill_batch_from_ss(class_idx, 16);
#else
    // Fallback: Use slow path if P0 disabled
    extern void* hak_tiny_alloc_slow(size_t size, int class_idx);
    void* slow_ptr = hak_tiny_alloc_slow(size, class_idx);
    if (slow_ptr) {
        // Slow path returns BASE pointer, convert to USER
#if HAKMEM_TINY_HEADER_CLASSIDX && HAKMEM_BUILD_RELEASE
        *(uint8_t*)slow_ptr = HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK);
        return (void*)((uint8_t*)slow_ptr + 1);
#else
        return tiny_region_id_write_header(slow_ptr, class_idx);
#endif
    }
    return NULL;
#endif

    if (refilled > 0) {
        // Retry after refill
        user_ptr = ultra_slim_alloc_4layer(size);
        if (user_ptr) {
            return user_ptr;
        }
    }

    // Slow path (OOM or new SuperSlab allocation)
    extern void* hak_tiny_alloc_slow(size_t size, int class_idx);
    void* slow_base = hak_tiny_alloc_slow(size, class_idx);
    if (slow_base) {
        // Slow path returns BASE pointer, convert to USER
#if HAKMEM_TINY_HEADER_CLASSIDX && HAKMEM_BUILD_RELEASE
        *(uint8_t*)slow_base = HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK);
        return (void*)((uint8_t*)slow_base + 1);
#else
        return tiny_region_id_write_header(slow_base, class_idx);
#endif
    }
    return NULL;  // OOM
}

// Print Ultra SLIM statistics (env: HAKMEM_ULTRA_SLIM_STATS=1)
static inline int ultra_slim_stats_enabled(void) {
    static int enabled = -1;
    if (__builtin_expect(enabled == -1, 0)) {
        const char* e = getenv("HAKMEM_ULTRA_SLIM_STATS");
        enabled = (e && *e && *e != '0') ? 1 : 0;
    }
    return enabled;
}
static void ultra_slim_print_stats(void) __attribute__((destructor));
static void ultra_slim_print_stats(void) {
    if (!ultra_slim_stats_enabled()) return;
    if (!ultra_slim_mode_enabled()) return;

    uint64_t total_hits = 0, total_misses = 0;
    for (int i = 0; i < TINY_NUM_CLASSES; i++) {
        total_hits += g_ultra_slim_hits[i];
        total_misses += g_ultra_slim_misses[i];
    }

    // Always print the totals first, so a run that never reached the Ultra SLIM path is visible
    fprintf(stderr, "\n========== Ultra SLIM 4-Layer Stats (DEBUG) ==========\n");
    fprintf(stderr, "Total Hits:   %lu\n", (unsigned long)total_hits);
    fprintf(stderr, "Total Misses: %lu\n", (unsigned long)total_misses);
    fprintf(stderr, "Total Calls:  %lu\n", (unsigned long)(total_hits + total_misses));

    if (total_hits + total_misses == 0) {
        fprintf(stderr, "⚠️ WARNING: Ultra SLIM mode enabled but no allocations tracked!\n");
        fprintf(stderr, "This suggests the Ultra SLIM path is not being called.\n");
        fprintf(stderr, "=====================================================\n\n");
        return;
    }

    fprintf(stderr, "\n========== Ultra SLIM 4-Layer Stats ==========\n");
    fprintf(stderr, "Total Hits:   %lu\n", (unsigned long)total_hits);
    fprintf(stderr, "Total Misses: %lu\n", (unsigned long)total_misses);
    fprintf(stderr, "Hit Rate:     %.1f%%\n", 100.0 * total_hits / (total_hits + total_misses));
    fprintf(stderr, "\nPer-Class Breakdown:\n");
    fprintf(stderr, "Class | Hits      | Misses    | Hit Rate\n");
    fprintf(stderr, "------+-----------+-----------+---------\n");
    for (int i = 0; i < TINY_NUM_CLASSES; i++) {
        uint64_t h = g_ultra_slim_hits[i];
        uint64_t m = g_ultra_slim_misses[i];
        if (h + m == 0) continue;
        fprintf(stderr, "C%-4d | %9lu | %9lu | %5.1f%%\n",
                i, (unsigned long)h, (unsigned long)m, 100.0 * h / (h + m));
    }
    fprintf(stderr, "=============================================\n\n");
}

// ========== Performance Notes ==========
//
// Expected Performance:
//   - Fast path hit:  7-12 cycles (~2-4ns on 3GHz CPU)
//   - Fast path miss: 50-100 cycles (refill overhead)
//   - Target throughput: 90-110M ops/s (mimalloc parity)
//
// Comparison with Standard 7-Layer Path:
//   - Standard:   31ns average (7 layers, 25-35 cycles)
//   - Ultra SLIM: 10ns average (4 layers, 7-12 cycles)
//   - Improvement: -68% latency, +210% throughput expected
//
// Deleted Layers (savings):
//   - HeapV2:    3-5 cycles saved
//   - FastCache: 5-7 cycles saved (C0-C3 only)
//   - SFC:       6-8 cycles saved
//   - Total:     14-20 cycles saved
//
// Preserved Capabilities:
//   ✅ ACE learning (adaptive behavior)
//   ✅ Init safety (no SEGV risk)
//   ✅ Box Theory (clean boundaries)
//   ✅ A/B testing (env gated)
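//
// ========== Usage Sketch (illustrative, not part of this Box's API) ==========
//
// A minimal sketch of how a malloc-style entry point could gate the Ultra SLIM
// path behind HAKMEM_TINY_ULTRA_SLIM and fall back to the standard 7-layer path.
// `hak_tiny_alloc_standard()` is a hypothetical name used only for illustration;
// substitute the existing standard-path entry point.
//
//   static inline void* hak_tiny_alloc_example(size_t size) {
//       if (ultra_slim_mode_enabled()) {
//           void* p = ultra_slim_alloc_with_refill(size);  // USER pointer, or NULL on OOM
//           if (p) return p;                               // hit, or successful refill/slow path
//       }
//       return hak_tiny_alloc_standard(size);              // hypothetical standard path
//   }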