// tiny_alloc_fast.inc.h - Box 5: Allocation Fast Path (3-4 instructions) // Purpose: Ultra-fast TLS freelist pop (inspired by System tcache & Mid-Large HAKX +171%) // Invariant: Hit rate > 95% → 3-4 instructions, Miss → refill from backend // Design: "Simple Front + Smart Back" - Front is dumb & fast, Back is smart #pragma once #include "tiny_atomic.h" #include "hakmem_tiny.h" // ========== Debug Counters (compile-time gated) ========== #if HAKMEM_DEBUG_COUNTERS // Refill-stage counters (defined in hakmem_tiny.c) extern unsigned long long g_rf_total_calls[]; extern unsigned long long g_rf_hit_bench[]; extern unsigned long long g_rf_hit_hot[]; extern unsigned long long g_rf_hit_mail[]; extern unsigned long long g_rf_hit_slab[]; extern unsigned long long g_rf_hit_ss[]; extern unsigned long long g_rf_hit_reg[]; extern unsigned long long g_rf_mmap_calls[]; // Publish hits (defined in hakmem_tiny.c) extern unsigned long long g_pub_mail_hits[]; extern unsigned long long g_pub_bench_hits[]; extern unsigned long long g_pub_hot_hits[]; // Free pipeline (defined in hakmem_tiny.c) extern unsigned long long g_free_via_tls_sll[]; #endif // ========== Box 5: Allocation Fast Path ========== // 箱理論の Fast Allocation 層。TLS freelist から直接 pop(3-4命令)。 // 不変条件: // - TLS freelist が非空なら即座に return (no lock, no sync) // - Miss なら Backend (Box 3: SuperSlab) に委譲 // - Cross-thread allocation は考慮しない(Backend が処理) // External TLS variables (defined in hakmem_tiny.c) extern __thread void* g_tls_sll_head[TINY_NUM_CLASSES]; extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES]; // External backend functions extern int sll_refill_small_from_ss(int class_idx, int max_take); extern void* hak_tiny_alloc_slow(size_t size, int class_idx); extern int hak_tiny_size_to_class(size_t size); // External macros #ifndef HAK_RET_ALLOC #define HAK_RET_ALLOC(cls, ptr) return (ptr) #endif // ========== RDTSC Profiling (lightweight) ========== #ifdef __x86_64__ static inline uint64_t tiny_fast_rdtsc(void) { unsigned int lo, hi; __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi)); return ((uint64_t)hi << 32) | lo; } #else static inline uint64_t tiny_fast_rdtsc(void) { return 0; } #endif // Per-thread profiling counters (enable with HAKMEM_TINY_PROFILE=1) static __thread uint64_t g_tiny_alloc_hits = 0; static __thread uint64_t g_tiny_alloc_cycles = 0; static __thread uint64_t g_tiny_refill_calls = 0; static __thread uint64_t g_tiny_refill_cycles = 0; static int g_tiny_profile_enabled = -1; // -1: uninitialized static inline int tiny_profile_enabled(void) { if (__builtin_expect(g_tiny_profile_enabled == -1, 0)) { const char* env = getenv("HAKMEM_TINY_PROFILE"); g_tiny_profile_enabled = (env && *env && *env != '0') ? 1 : 0; } return g_tiny_profile_enabled; } // Print profiling results at exit static void tiny_fast_print_profile(void) __attribute__((destructor)); static void tiny_fast_print_profile(void) { if (!tiny_profile_enabled()) return; if (g_tiny_alloc_hits == 0 && g_tiny_refill_calls == 0) return; fprintf(stderr, "\n========== Box Theory Fast Path Profile ==========\n"); if (g_tiny_alloc_hits > 0) { fprintf(stderr, "[ALLOC HIT] count=%lu, avg_cycles=%lu\n", (unsigned long)g_tiny_alloc_hits, (unsigned long)(g_tiny_alloc_cycles / g_tiny_alloc_hits)); } if (g_tiny_refill_calls > 0) { fprintf(stderr, "[REFILL] count=%lu, avg_cycles=%lu\n", (unsigned long)g_tiny_refill_calls, (unsigned long)(g_tiny_refill_cycles / g_tiny_refill_calls)); } fprintf(stderr, "===================================================\n\n"); } // ========== Fast Path: TLS Freelist Pop (3-4 instructions) ========== // Allocation fast path (inline for zero-cost) // Returns: pointer on success, NULL on miss (caller should try refill/slow) // // Assembly (x86-64, optimized): // mov rax, QWORD PTR g_tls_sll_head[class_idx] ; Load head // test rax, rax ; Check NULL // je .miss ; If empty, miss // mov rdx, QWORD PTR [rax] ; Load next // mov QWORD PTR g_tls_sll_head[class_idx], rdx ; Update head // ret ; Return ptr // .miss: // ; Fall through to refill // // Expected: 3-4 instructions on hit (1 load, 1 test, 1 load, 1 store) static inline void* tiny_alloc_fast_pop(int class_idx) { uint64_t start = tiny_profile_enabled() ? tiny_fast_rdtsc() : 0; // Box Boundary: TLS freelist の先頭を pop // Ownership: TLS なので所有権チェック不要(同一スレッド保証) void* head = g_tls_sll_head[class_idx]; if (__builtin_expect(head != NULL, 1)) { // Fast path hit: 3 instructions g_tls_sll_head[class_idx] = *(void**)head; // Pop: next = *head // Optional: update count (for stats, can be disabled) if (g_tls_sll_count[class_idx] > 0) { g_tls_sll_count[class_idx]--; } #if HAKMEM_DEBUG_COUNTERS // Track TLS freelist hits (compile-time gated, zero runtime cost when disabled) g_free_via_tls_sll[class_idx]++; #endif if (start) { g_tiny_alloc_cycles += (tiny_fast_rdtsc() - start); g_tiny_alloc_hits++; } return head; } // Fast path miss → NULL (caller should refill) return NULL; } // ========== Refill Path: Backend Integration ========== // Refill TLS freelist from backend (SuperSlab/ACE/Learning layer) // Returns: number of blocks refilled // // This integrates with existing HAKMEM infrastructure: // - SuperSlab provides memory chunks // - ACE provides adaptive capacity learning // - L25 provides mid-large integration // // Refill count is tunable via HAKMEM_TINY_REFILL_COUNT (default: 32) // - Smaller count (8-16): better for diverse workloads, faster warmup // - Larger count (64-128): better for homogeneous workloads, fewer refills static inline int tiny_alloc_fast_refill(int class_idx) { uint64_t start = tiny_profile_enabled() ? tiny_fast_rdtsc() : 0; // Tunable refill count (cached in TLS for performance) static __thread int s_refill_count = 0; if (__builtin_expect(s_refill_count == 0, 0)) { int def = 16; // Default: 16 (smaller = less overhead per refill) char* env = getenv("HAKMEM_TINY_REFILL_COUNT"); int v = (env ? atoi(env) : def); // Clamp to sane range (avoid pathological cases) if (v < 8) v = 8; // Minimum: avoid thrashing if (v > 256) v = 256; // Maximum: avoid excessive TLS memory s_refill_count = v; } #if HAKMEM_DEBUG_COUNTERS // Track refill calls (compile-time gated) g_rf_total_calls[class_idx]++; #endif // Box Boundary: Delegate to Backend (Box 3: SuperSlab) // This gives us ACE, Learning layer, L25 integration for free! // Note: g_rf_hit_slab counter is incremented inside sll_refill_small_from_ss() int refilled = sll_refill_small_from_ss(class_idx, s_refill_count); if (start) { g_tiny_refill_cycles += (tiny_fast_rdtsc() - start); g_tiny_refill_calls++; } return refilled; } // ========== Combined Fast Path (Alloc + Refill) ========== // Complete fast path allocation (inline for zero-cost) // Returns: pointer on success, NULL on failure (OOM or size too large) // // Flow: // 1. TLS freelist pop (3-4 instructions) - Hit rate ~95% // 2. Miss → Refill from backend (~5% cases) // 3. Refill success → Retry pop // 4. Refill failure → Slow path (OOM or new SuperSlab allocation) // // Example usage: // void* ptr = tiny_alloc_fast(64); // if (!ptr) { // // OOM handling // } static inline void* tiny_alloc_fast(size_t size) { // 1. Size → class index (inline, fast) int class_idx = hak_tiny_size_to_class(size); if (__builtin_expect(class_idx < 0, 0)) { return NULL; // Size > 1KB, not Tiny } // 2. Fast path: TLS freelist pop (3-4 instructions, 95% hit rate) void* ptr = tiny_alloc_fast_pop(class_idx); if (__builtin_expect(ptr != NULL, 1)) { HAK_RET_ALLOC(class_idx, ptr); } // 3. Miss: Refill from backend (Box 3: SuperSlab) int refilled = tiny_alloc_fast_refill(class_idx); if (__builtin_expect(refilled > 0, 1)) { // Refill success → retry pop ptr = tiny_alloc_fast_pop(class_idx); if (ptr) { HAK_RET_ALLOC(class_idx, ptr); } } // 4. Refill failure or still empty → slow path (OOM or new SuperSlab) // Box Boundary: Delegate to Slow Path (Box 3 backend) ptr = hak_tiny_alloc_slow(size, class_idx); if (ptr) { HAK_RET_ALLOC(class_idx, ptr); } return ptr; // NULL if OOM } // ========== Push to TLS Freelist (for free path) ========== // Push block to TLS freelist (used by free fast path) // This is a "helper" for Box 6 (Free Fast Path) // // Invariant: ptr must belong to current thread (no ownership check here) // Caller (Box 6) is responsible for ownership verification static inline void tiny_alloc_fast_push(int class_idx, void* ptr) { // Box Boundary: Push to TLS freelist *(void**)ptr = g_tls_sll_head[class_idx]; g_tls_sll_head[class_idx] = ptr; g_tls_sll_count[class_idx]++; } // ========== Statistics & Diagnostics ========== // Get TLS freelist stats (for debugging/profiling) typedef struct { int class_idx; void* head; uint32_t count; } TinyAllocFastStats; static inline TinyAllocFastStats tiny_alloc_fast_stats(int class_idx) { TinyAllocFastStats stats = { .class_idx = class_idx, .head = g_tls_sll_head[class_idx], .count = g_tls_sll_count[class_idx] }; return stats; } // Reset TLS freelist (for testing/benchmarking) // WARNING: This leaks memory! Only use in controlled test environments. static inline void tiny_alloc_fast_reset(int class_idx) { g_tls_sll_head[class_idx] = NULL; g_tls_sll_count[class_idx] = 0; } // ========== Performance Notes ========== // // Expected metrics (based on System tcache & HAKX +171% results): // - Fast path hit rate: 95%+ (workload dependent) // - Fast path latency: 3-4 instructions (1-2 cycles on modern CPUs) // - Miss penalty: ~20-50 instructions (refill from SuperSlab) // - Throughput improvement: +10-25% vs current multi-layer design // // Key optimizations: // 1. `__builtin_expect` for branch prediction (hot path first) // 2. `static inline` for zero-cost abstraction // 3. TLS variables (no atomic ops, no locks) // 4. Minimal work in fast path (defer stats/accounting to backend) // // Comparison with current design: // - Current: 20+ instructions (Magazine → SuperSlab → ACE → ...) // - New: 3-4 instructions (TLS freelist pop only) // - Reduction: -80% instructions in hot path // // Inspired by: // - System tcache (glibc malloc) - 3-4 instruction fast path // - HAKX Mid-Large (+171%) - "Simple Front + Smart Back" // - Box Theory - Clear boundaries, minimal coupling