// tiny_ultra_hot.h - Ultra-fast hot path for C2/C3/C4/C5 (16B-128B allocations) // Purpose: // - Minimize L1 dcache misses (30x → 3x target) by using 2 cache line TLS // - Minimize instructions (6.2x → 2x target) by ultra-simple straight-line path // - Minimize branches (7.1x → 2x target) by predict-likely hints // // Design (ChatGPT consultation Phase 14 + Phase 14-B): // - Phase 14: C2/C3 (16B/32B) - Coverage: 1.71% // - Phase 14-B: +C4/C5 (64B/128B) - Coverage: 11.14% (6.5x improvement!) // - TLS structure: 2 cache lines (128B) for 4 magazines with adaptive slot counts // - Path: 2-3 instructions per alloc/free (pop/push from magazine) // - Fallback: If magazine empty/full → existing TinyHeapV2/FastCache path // // Cache locality strategy: // - All state in 1 cache line (64B): 2x mag[8] + 2x top + padding // - No pointer chasing, no indirect access // - Touches only 1 struct per alloc/free // // Instruction reduction strategy: // - Size→class: 1 compare (size <= 16 ? C1 : C2) // - Magazine access: Direct array index (no loops) // - Fallback: Return NULL immediately (caller handles) // // Branch prediction strategy: // - __builtin_expect(hit, 1) - expect 95%+ hit rate // - No nested branches in hot path #ifndef HAK_FRONT_TINY_ULTRA_HOT_H #define HAK_FRONT_TINY_ULTRA_HOT_H #include #include #include #include "../box/tls_sll_box.h" // Phase 14-C: Borrowing design - refill from TLS SLL // Magazine capacity - adaptive sizing for cache locality (Phase 14-B) // Design principle: Balance capacity vs cache line usage // // Cache line 0 (64B): C2 + C3 magazines // C2 (16B): 4 slots × 8B ptr = 32B // C3 (32B): 4 slots × 8B ptr = 32B // Total: 64B (perfect fit!) // // Cache line 1 (64B): C4 + C5 magazines + counters // C4 (64B): 2 slots × 8B ptr = 16B // C5 (128B): 1 slot × 8B ptr = 8B // Counters: c1_top, c2_top, c4_top, c5_top = 4B // Padding: 36B // Total: 64B (fits!) // // Why fewer slots for larger classes? // - Maintain cache locality (2 cache lines = 128B total) // - Block size scales, so magazine memory scales proportionally // - Free path supplies blocks → even 1-2 slots maintain high hit rate // #ifndef ULTRA_HOT_MAG_CAP_C2 #define ULTRA_HOT_MAG_CAP_C2 4 // C2 (16B) - 4 slots #endif #ifndef ULTRA_HOT_MAG_CAP_C3 #define ULTRA_HOT_MAG_CAP_C3 4 // C3 (32B) - 4 slots #endif #ifndef ULTRA_HOT_MAG_CAP_C4 #define ULTRA_HOT_MAG_CAP_C4 2 // C4 (64B) - 2 slots (NEW Phase 14-B) #endif #ifndef ULTRA_HOT_MAG_CAP_C5 #define ULTRA_HOT_MAG_CAP_C5 1 // C5 (128B) - 1 slot (NEW Phase 14-B) #endif // TLS structure: 2 cache lines (128B) for hot path (Phase 14-B expanded) // Layout: // Cache line 0 (64B): C2_mag[4] (32B) + C3_mag[4] (32B) // Cache line 1 (64B): C4_mag[2] (16B) + C5_mag[1] (8B) + counters (4B) + pad (36B) // Cache line 2+: Statistics (cold path) // Total hot state: 128B (2 cache lines) typedef struct { // ===== Cache line 0 (64B): C2/C3 magazines ===== void* c1_mag[ULTRA_HOT_MAG_CAP_C2]; // C2 (16B) - 4 slots, 32B void* c2_mag[ULTRA_HOT_MAG_CAP_C3]; // C3 (32B) - 4 slots, 32B // ===== Cache line 1 (64B): C4/C5 magazines + counters ===== void* c4_mag[ULTRA_HOT_MAG_CAP_C4]; // C4 (64B) - 2 slots, 16B (NEW Phase 14-B) void* c5_mag[ULTRA_HOT_MAG_CAP_C5]; // C5 (128B) - 1 slot, 8B (NEW Phase 14-B) uint8_t c1_top; // C2 magazine top index uint8_t c2_top; // C3 magazine top index uint8_t c4_top; // C4 magazine top index (NEW Phase 14-B) uint8_t c5_top; // C5 magazine top index (NEW Phase 14-B) uint8_t pad[36]; // Padding to cache line boundary // ===== Statistics (cold path, cache line 2+) ===== uint64_t c1_alloc_calls; uint64_t c1_hits; uint64_t c1_misses; uint64_t c2_alloc_calls; uint64_t c2_hits; uint64_t c2_misses; uint64_t c4_alloc_calls; // NEW Phase 14-B uint64_t c4_hits; // NEW Phase 14-B uint64_t c4_misses; // NEW Phase 14-B uint64_t c5_alloc_calls; // NEW Phase 14-B uint64_t c5_hits; // NEW Phase 14-B uint64_t c5_misses; // NEW Phase 14-B uint64_t c1_free_calls; uint64_t c1_free_hits; uint64_t c2_free_calls; uint64_t c2_free_hits; uint64_t c4_free_calls; // NEW Phase 14-B uint64_t c4_free_hits; // NEW Phase 14-B uint64_t c5_free_calls; // NEW Phase 14-B uint64_t c5_free_hits; // NEW Phase 14-B } __attribute__((aligned(64))) TinyUltraHot; // External TLS variable (defined in hakmem_tiny.c) extern __thread TinyUltraHot g_ultra_hot; // Enable flag (cached) // ENV: HAKMEM_TINY_ULTRA_HOT // - 0: Disable (use existing TinyHeapV2/FastCache) // - 1 (default): Enable ultra-fast C1/C2 path static inline int ultra_hot_enabled(void) { static int g_enable = -1; if (__builtin_expect(g_enable == -1, 0)) { const char* e = getenv("HAKMEM_TINY_ULTRA_HOT"); if (e && *e) { g_enable = (*e != '0') ? 1 : 0; } else { g_enable = 1; // Default: ON (Phase 14 decision) } #if !HAKMEM_BUILD_RELEASE fprintf(stderr, "[UltraHot-INIT] ultra_hot_enabled() = %d\n", g_enable); fflush(stderr); #endif } return g_enable; } // Phase 14-C: Max size control (ENV: HAKMEM_TINY_ULTRA_HOT_MAX_SIZE) // Purpose: Control which size classes UltraHot handles // Default: 32 (C2/C3 only, safe for Random Mixed) // Fixed-size: 128 (C2-C5, optimal for fixed-size workloads) static inline size_t ultra_hot_max_size(void) { static size_t g_max_size = 0; if (__builtin_expect(g_max_size == 0, 0)) { const char* e = getenv("HAKMEM_TINY_ULTRA_HOT_MAX_SIZE"); if (e && *e) { g_max_size = (size_t)atoi(e); } else { g_max_size = 32; // Default: C2/C3 only (Phase 14 behavior) } #if !HAKMEM_BUILD_RELEASE fprintf(stderr, "[UltraHot-INIT] ultra_hot_max_size() = %zu\n", g_max_size); fflush(stderr); #endif } return g_max_size; } // Ultra-fast alloc (C2/C3/C4/C5 - Phase 14-B expanded) // Contract: // - Input: size (must be 9-128B for C2-C5) // - Output: BASE pointer (not USER pointer!) or NULL // - Caller converts BASE → USER via HAK_RET_ALLOC // // Hot path (expect 95% hit rate): // 1. size → class (cascading compares) // 2. magazine pop (1 load + 1 decrement + 1 store) // 3. return BASE // // Cold path (5% miss rate): // - return NULL → caller uses existing TinyHeapV2/FastCache // // Performance target: // - L1 dcache: 2 cache lines load (128B) - all 4 mags // - Instructions: 5-7 instructions total per hit // - Branches: 2 branches (size check + mag empty check) static inline void* ultra_hot_alloc(size_t size) { // Fast path: size → class (cascading compares for branch prediction) // C2 = 16B (9-16), C3 = 32B (17-32), C4 = 64B (33-64), C5 = 128B (65-128) if (__builtin_expect(size <= 16, 1)) { // C2 path (16B) g_ultra_hot.c1_alloc_calls++; if (__builtin_expect(g_ultra_hot.c1_top > 0, 1)) { // Magazine hit! (5 instructions: load top, dec, load mag, store top, ret) g_ultra_hot.c1_hits++; uint8_t idx = --g_ultra_hot.c1_top; void* base = g_ultra_hot.c1_mag[idx]; return base; // Return BASE (caller converts to USER) } else { // Magazine empty (cold path) g_ultra_hot.c1_misses++; return NULL; } } else if (__builtin_expect(size <= 32, 1)) { // C3 path (32B) g_ultra_hot.c2_alloc_calls++; if (__builtin_expect(g_ultra_hot.c2_top > 0, 1)) { // Magazine hit! g_ultra_hot.c2_hits++; uint8_t idx = --g_ultra_hot.c2_top; void* base = g_ultra_hot.c2_mag[idx]; return base; } else { // Magazine empty g_ultra_hot.c2_misses++; return NULL; } } else if (__builtin_expect(size <= 64 && ultra_hot_max_size() >= 64, 0)) { // C4 path (64B) - Phase 14-C: ENV gated g_ultra_hot.c4_alloc_calls++; if (__builtin_expect(g_ultra_hot.c4_top > 0, 1)) { // Magazine hit! g_ultra_hot.c4_hits++; uint8_t idx = --g_ultra_hot.c4_top; void* base = g_ultra_hot.c4_mag[idx]; return base; } else { // Magazine empty g_ultra_hot.c4_misses++; return NULL; } } else if (__builtin_expect(size <= 128 && ultra_hot_max_size() >= 128, 0)) { // C5 path (128B) - Phase 14-C: ENV gated g_ultra_hot.c5_alloc_calls++; if (__builtin_expect(g_ultra_hot.c5_top > 0, 1)) { // Magazine hit! g_ultra_hot.c5_hits++; uint8_t idx = --g_ultra_hot.c5_top; void* base = g_ultra_hot.c5_mag[idx]; return base; } else { // Magazine empty g_ultra_hot.c5_misses++; return NULL; } } else { // Size out of range (C6+ or C0) return NULL; } } // Ultra-fast free (C2/C3/C4/C5 - Phase 14-B expanded) // Contract: // - Input: base (BASE pointer), class_idx // - Output: 1 if handled, 0 if magazine full (fallback to existing path) // // Hot path (expect 95% hit rate): // 1. class check (1 compare) // 2. magazine push (1 load top + 1 store mag + 1 increment + 1 store top) // 3. return 1 // // Cold path (5% miss rate): // - return 0 → caller uses existing TinyHeapV2/TLS SLL path static inline int ultra_hot_free_by_class(void* base, int class_idx) { // Fast path: class → magazine // NOTE: HAKMEM class numbering: C0=8B, C1=?, C2=16B, C3=32B, C4=64B, C5=128B if (__builtin_expect(class_idx == 2, 1)) { // C2 path (16B) g_ultra_hot.c1_free_calls++; if (__builtin_expect(g_ultra_hot.c1_top < ULTRA_HOT_MAG_CAP_C2, 1)) { // Magazine has room! (5 instructions) g_ultra_hot.c1_free_hits++; uint8_t idx = g_ultra_hot.c1_top++; g_ultra_hot.c1_mag[idx] = base; return 1; // Success } else { // Magazine full → fallback return 0; } } else if (__builtin_expect(class_idx == 3, 1)) { // C3 path (32B) g_ultra_hot.c2_free_calls++; if (__builtin_expect(g_ultra_hot.c2_top < ULTRA_HOT_MAG_CAP_C3, 1)) { // Magazine has room! g_ultra_hot.c2_free_hits++; uint8_t idx = g_ultra_hot.c2_top++; g_ultra_hot.c2_mag[idx] = base; return 1; } else { // Magazine full return 0; } } else if (__builtin_expect(class_idx == 4, 0)) { // C4 path (64B) - NEW Phase 14-B g_ultra_hot.c4_free_calls++; if (__builtin_expect(g_ultra_hot.c4_top < ULTRA_HOT_MAG_CAP_C4, 1)) { // Magazine has room! g_ultra_hot.c4_free_hits++; uint8_t idx = g_ultra_hot.c4_top++; g_ultra_hot.c4_mag[idx] = base; return 1; } else { // Magazine full return 0; } } else if (__builtin_expect(class_idx == 5, 0)) { // C5 path (128B) - NEW Phase 14-B g_ultra_hot.c5_free_calls++; if (__builtin_expect(g_ultra_hot.c5_top < ULTRA_HOT_MAG_CAP_C5, 1)) { // Magazine has room! g_ultra_hot.c5_free_hits++; uint8_t idx = g_ultra_hot.c5_top++; g_ultra_hot.c5_mag[idx] = base; return 1; } else { // Magazine full return 0; } } else { // Class out of range (not C2-C5) return 0; } } // Magazine refill (called from existing front when it has spare blocks) // Strategy: TinyHeapV2 / FastCache can "donate" blocks to UltraHot // This is optional - UltraHot can work with just free path supply static inline void ultra_hot_try_refill_c1(void* base) { if (g_ultra_hot.c1_top < ULTRA_HOT_MAG_CAP_C2) { g_ultra_hot.c1_mag[g_ultra_hot.c1_top++] = base; } } static inline void ultra_hot_try_refill_c2(void* base) { if (g_ultra_hot.c2_top < ULTRA_HOT_MAG_CAP_C3) { g_ultra_hot.c2_mag[g_ultra_hot.c2_top++] = base; } } static inline void ultra_hot_try_refill_c4(void* base) { if (g_ultra_hot.c4_top < ULTRA_HOT_MAG_CAP_C4) { g_ultra_hot.c4_mag[g_ultra_hot.c4_top++] = base; } } static inline void ultra_hot_try_refill_c5(void* base) { if (g_ultra_hot.c5_top < ULTRA_HOT_MAG_CAP_C5) { g_ultra_hot.c5_mag[g_ultra_hot.c5_top++] = base; } } // Print statistics (called at program exit if HAKMEM_TINY_ULTRA_HOT_STATS=1) // Declaration only (implementation in hakmem_tiny.c for external linkage) void ultra_hot_print_stats(void); // Design notes: // // 1. Cache locality: // - All state fits in 2 cache lines (128B total) // - First line (64B): Both magazines (C1 + C2) // - Second line (64B): Counters + stats // - Expected L1 miss: ~1-2 per alloc/free (vs 30+ currently) // // 2. Instruction count: // - Alloc hit: ~7 instructions (size check + mag pop + return) // - Free hit: ~7 instructions (size check + mag push + return) // - Total: ~14 instructions per alloc/free pair (vs ~281M/500K = 562 currently) // - Reduction: 562 → 14 = 40x improvement // // 3. Branch prediction: // - Size check: __builtin_expect(size <= 16, 1) - predict C1 likely // - Magazine check: __builtin_expect(top > 0, 1) - predict hit likely // - Expected branch-miss: ~5% (vs 7.83% currently) // // 4. Integration with existing front: // - UltraHot is L0 (fastest) // - TinyHeapV2 is L1 (fast) // - FastCache is L2 (normal) // - If UltraHot misses → fallback to L1/L2 // - Free path supplies both UltraHot and TinyHeapV2 // // 5. Supply strategy: // - Free path: Always try UltraHot first, then TinyHeapV2, then TLS SLL // - Alloc path: Try UltraHot first, then TinyHeapV2, then FastCache // - No refill from backend (keeps UltraHot ultra-simple) // // 6. Expected performance: // - Current: 9.3M ops/s (Random Mixed 256B) // - Target: 40-60M ops/s (+330-545%) // - L1 miss: 2.9M → ~300K (-90%) // - Instructions: 281M → ~80M (-71%) // - Branches: 59M → ~15M (-75%) // // 7. Why C1/C2 only? // - C1 (16B) + C2 (32B) cover ~60% of tiny allocations // - Small magazine (4 slots) fits both in 1-2 cache lines // - Size check is trivial (size <= 16 / size <= 32) // - Larger classes (C3+) have different access patterns (less cache-sensitive) // // 8. Why not C0 (8B)? // - TinyHeapV2 showed -5% regression on C0 // - 8B allocations are rare in real workloads // - Magazine overhead too high for 8B blocks // // 9. Comparison with TinyHeapV2: // - TinyHeapV2: 16 slots per class, covers C1-C3 // - UltraHot: 4 slots per class, covers C1-C2 only // - UltraHot is "ultra-hot subset" of TinyHeapV2 // - Trade magazine capacity for cache locality // // 10. ENV flags: // - HAKMEM_TINY_ULTRA_HOT=0/1 - Enable/disable (default: 1) // - HAKMEM_TINY_ULTRA_HOT_STATS=0/1 - Print stats at exit (default: 0) // ============================================================================= // Phase 14-C: Borrowing Design - Refill from TLS SLL (正史から借りる) // ============================================================================= // Design: UltraHot は「TLS SLL の手前にあるビュー」として動作 // - Free: 正史(TLS SLL)に戻す(横取りしない) // - Alloc miss: TLS SLL から借りて magazine を refill // - 学習層(Superslab/drain)が正しい在庫を追跡できる // // Call this after ultra_hot_alloc() miss to refill magazine from TLS SLL static inline void ultra_hot_try_refill(int class_idx) { if (!ultra_hot_enabled()) return; if (class_idx < 2 || class_idx > 5) return; // C2-C5 のみ // Refill magazine to full capacity (borrow from TLS SLL = 正史) if (class_idx == 2) { // C2 (16B): 4 slots magazine while (g_ultra_hot.c1_top < ULTRA_HOT_MAG_CAP_C2) { void* ptr = NULL; if (!tls_sll_pop(class_idx, &ptr)) break; // TLS SLL から借りる g_ultra_hot.c1_mag[g_ultra_hot.c1_top++] = ptr; } } else if (class_idx == 3) { // C3 (32B): 4 slots magazine while (g_ultra_hot.c2_top < ULTRA_HOT_MAG_CAP_C3) { void* ptr = NULL; if (!tls_sll_pop(class_idx, &ptr)) break; g_ultra_hot.c2_mag[g_ultra_hot.c2_top++] = ptr; } } else if (class_idx == 4) { // C4 (64B): 2 slots magazine while (g_ultra_hot.c4_top < ULTRA_HOT_MAG_CAP_C4) { void* ptr = NULL; if (!tls_sll_pop(class_idx, &ptr)) break; g_ultra_hot.c4_mag[g_ultra_hot.c4_top++] = ptr; } } else if (class_idx == 5) { // C5 (128B): 1 slot magazine while (g_ultra_hot.c5_top < ULTRA_HOT_MAG_CAP_C5) { void* ptr = NULL; if (!tls_sll_pop(class_idx, &ptr)) break; g_ultra_hot.c5_mag[g_ultra_hot.c5_top++] = ptr; } } } #endif // HAK_FRONT_TINY_ULTRA_HOT_H