Phase 15: Box BenchMeta separation + ExternalGuard debug + investigation report

- Implement Box BenchMeta pattern in bench_random_mixed.c (BENCH_META_CALLOC/FREE)
- Add enhanced debug logging to external_guard_box.h (caller tracking, FG classification)
- Document investigation in PHASE15_BUG_ANALYSIS.md

Issue: Page-aligned MIDCAND pointer not in SuperSlab registry → ExternalGuard → crash
Hypothesis: May be pre-existing SuperSlab bug (not Phase 15-specific)
Next: Test in Phase 14-C to verify
2025-11-15 23:00:21 +09:00
parent cef99b311d
commit d378ee11a0
9 changed files with 785 additions and 40 deletions
--- a/core/front/tiny_ultra_hot.h
+++ b/core/front/tiny_ultra_hot.h
@@ -0,0 +1,458 @@
+// tiny_ultra_hot.h - Ultra-fast hot path for C2/C3/C4/C5 (16B-128B allocations)
+// Purpose:
+//   - Minimize L1 dcache misses (30x → 3x target) by using 2 cache line TLS
+//   - Minimize instructions (6.2x → 2x target) by ultra-simple straight-line path
+//   - Minimize branches (7.1x → 2x target) by predict-likely hints
+//
+// Design (ChatGPT consultation Phase 14 + Phase 14-B):
+//   - Phase 14:   C2/C3 (16B/32B) - Coverage: 1.71%
+//   - Phase 14-B: +C4/C5 (64B/128B) - Coverage: 11.14% (6.5x improvement!)
+//   - TLS structure: 2 cache lines (128B) for 4 magazines with adaptive slot counts
+//   - Path: 2-3 instructions per alloc/free (pop/push from magazine)
+//   - Fallback: If magazine empty/full → existing TinyHeapV2/FastCache path
+//
+// Cache locality strategy:
+//   - All hot state in 2 cache lines (128B): 4 magazines + 4 top counters + padding
+//   - No pointer chasing, no indirect access
+//   - Touches only 1 struct per alloc/free
+//
+// Instruction reduction strategy:
+//   - Size→class: cascading compares (size <= 16 → C2, <= 32 → C3, <= 64 → C4, <= 128 → C5)
+//   - Magazine access: Direct array index (no loops)
+//   - Fallback: Return NULL immediately (caller handles)
+//
+// Branch prediction strategy:
+//   - __builtin_expect(hit, 1) - expect 95%+ hit rate
+//   - No nested branches in hot path
+
+#ifndef HAK_FRONT_TINY_ULTRA_HOT_H
+#define HAK_FRONT_TINY_ULTRA_HOT_H
+
+#include <stdint.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include "../box/tls_sll_box.h"  // Phase 14-C: Borrowing design - refill from TLS SLL
+
+// Magazine capacity - adaptive sizing for cache locality (Phase 14-B)
+// Design principle: Balance capacity vs cache line usage
+//
+// Cache line 0 (64B): C2 + C3 magazines
+//   C2 (16B): 4 slots × 8B ptr = 32B
+//   C3 (32B): 4 slots × 8B ptr = 32B
+//   Total: 64B (perfect fit!)
+//
+// Cache line 1 (64B): C4 + C5 magazines + counters
+//   C4 (64B):  2 slots × 8B ptr = 16B
+//   C5 (128B): 1 slot  × 8B ptr = 8B
+//   Counters: c1_top, c2_top, c4_top, c5_top = 4B
+//   Padding: 36B
+//   Total: 64B (fits!)
+//
+// Why fewer slots for larger classes?
+//   - Maintain cache locality (2 cache lines = 128B total)
+//   - Block size scales, so magazine memory scales proportionally
+//   - Free path supplies blocks → even 1-2 slots maintain high hit rate
+//
+// Each capacity can be overridden at build time (the #ifndef guards keep any
+// earlier definition). NOTE: TinyUltraHot pads its hot region with a fixed
+// pad[36], so changing a capacity also changes where the statistics start;
+// the "2 cache lines" layout only holds for the defaults below.
+#ifndef ULTRA_HOT_MAG_CAP_C2
+#define ULTRA_HOT_MAG_CAP_C2 4  // Class 2 (16B blocks): 4 magazine slots
+#endif
+#ifndef ULTRA_HOT_MAG_CAP_C3
+#define ULTRA_HOT_MAG_CAP_C3 4  // Class 3 (32B blocks): 4 magazine slots
+#endif
+#ifndef ULTRA_HOT_MAG_CAP_C4
+#define ULTRA_HOT_MAG_CAP_C4 2  // Class 4 (64B blocks): 2 magazine slots (added in Phase 14-B)
+#endif
+#ifndef ULTRA_HOT_MAG_CAP_C5
+#define ULTRA_HOT_MAG_CAP_C5 1  // Class 5 (128B blocks): 1 magazine slot (added in Phase 14-B)
+#endif
+
+// Per-thread UltraHot state: four small LIFO magazines plus statistics.
+//
+// Naming: the c1_* / c2_* fields keep their Phase 14 names but hold blocks of
+// HAKMEM class 2 (16B) and class 3 (32B) respectively; c4_* / c5_* hold
+// classes 4 (64B) and 5 (128B).
+//
+// Layout with the default capacities (byte counts assume 8-byte pointers):
+//   Cache line 0 (64B): c1_mag[4] (32B) + c2_mag[4] (32B)
+//   Cache line 1 (64B): c4_mag[2] (16B) + c5_mag[1] (8B) + 4 top counters (4B) + pad (36B)
+//   After that:         statistics counters (cold path)
+// Total hot state: 128B (2 cache lines). The struct itself is 64-byte aligned.
+typedef struct {
+    // ===== Cache line 0 (64B): class 2 / class 3 magazines =====
+    void* c1_mag[ULTRA_HOT_MAG_CAP_C2];  // Class 2 (16B) magazine - 4 slots, 32B
+    void* c2_mag[ULTRA_HOT_MAG_CAP_C3];  // Class 3 (32B) magazine - 4 slots, 32B
+
+    // ===== Cache line 1 (64B): class 4 / class 5 magazines + counters =====
+    void* c4_mag[ULTRA_HOT_MAG_CAP_C4];  // Class 4 (64B) magazine - 2 slots, 16B (added in Phase 14-B)
+    void* c5_mag[ULTRA_HOT_MAG_CAP_C5];  // Class 5 (128B) magazine - 1 slot, 8B (added in Phase 14-B)
+
+    // Each *_top is the number of valid entries in its magazine; the next
+    // push writes mag[top], the next pop reads mag[top - 1].
+    uint8_t c1_top;    // Entry count of c1_mag (class 2)
+    uint8_t c2_top;    // Entry count of c2_mag (class 3)
+    uint8_t c4_top;    // Entry count of c4_mag (class 4, added in Phase 14-B)
+    uint8_t c5_top;    // Entry count of c5_mag (class 5, added in Phase 14-B)
+    uint8_t pad[36];   // Fills cache line 1 up to 64B (24B magazines + 4B counters + 36B)
+
+    // ===== Statistics (cold path, cache line 2+) =====
+    // Alloc side: calls = attempts for the class, hits = served from the
+    // magazine, misses = magazine was empty.
+    uint64_t c1_alloc_calls;
+    uint64_t c1_hits;
+    uint64_t c1_misses;
+    uint64_t c2_alloc_calls;
+    uint64_t c2_hits;
+    uint64_t c2_misses;
+    uint64_t c4_alloc_calls;  // Added in Phase 14-B
+    uint64_t c4_hits;         // Added in Phase 14-B
+    uint64_t c4_misses;       // Added in Phase 14-B
+    uint64_t c5_alloc_calls;  // Added in Phase 14-B
+    uint64_t c5_hits;         // Added in Phase 14-B
+    uint64_t c5_misses;       // Added in Phase 14-B
+
+    // Free side: calls = attempts for the class, hits = block accepted into
+    // the magazine (a full magazine is calls - hits).
+    uint64_t c1_free_calls;
+    uint64_t c1_free_hits;
+    uint64_t c2_free_calls;
+    uint64_t c2_free_hits;
+    uint64_t c4_free_calls;   // Added in Phase 14-B
+    uint64_t c4_free_hits;    // Added in Phase 14-B
+    uint64_t c5_free_calls;   // Added in Phase 14-B
+    uint64_t c5_free_hits;    // Added in Phase 14-B
+} __attribute__((aligned(64))) TinyUltraHot;
+
+// External TLS variable (defined in hakmem_tiny.c)
+extern __thread TinyUltraHot g_ultra_hot;
+
+// Reports whether the UltraHot front is enabled; the answer is computed once
+// and cached in a function-local static.
+// ENV: HAKMEM_TINY_ULTRA_HOT
+// - unset or empty: enabled (default, Phase 14 decision)
+// - first character '0': disabled (callers use the existing TinyHeapV2/FastCache path)
+// - anything else: enabled
+// Returns 1 when enabled, 0 when disabled.
+static inline int ultra_hot_enabled(void) {
+    static int g_enable = -1;  // -1 = environment not inspected yet
+
+    // Common case: already resolved on an earlier call.
+    if (__builtin_expect(g_enable != -1, 1)) {
+        return g_enable;
+    }
+
+    const char* env = getenv("HAKMEM_TINY_ULTRA_HOT");
+    int resolved = 1;  // Default: ON
+    if (env != NULL && env[0] != '\0') {
+        resolved = (env[0] == '0') ? 0 : 1;
+    }
+    g_enable = resolved;
+
+#if !HAKMEM_BUILD_RELEASE
+    // Non-release builds announce the decision once.
+    fprintf(stderr, "[UltraHot-INIT] ultra_hot_enabled() = %d\n", g_enable);
+    fflush(stderr);
+#endif
+    return g_enable;
+}
+
+// Phase 14-C: Max size control (ENV: HAKMEM_TINY_ULTRA_HOT_MAX_SIZE)
+// Purpose: Control which size classes UltraHot handles
+// Default: 32 (C2/C3 only, safe for Random Mixed)
+// Fixed-size: 128 (C2-C5, optimal for fixed-size workloads)
+//
+// Returns the cached limit in bytes. The environment is read once per
+// translation-unit copy of this function:
+//   - unset, empty, negative or non-numeric value → default 32
+//   - leading decimal digits are used (trailing characters are ignored)
+//   - "0" is a valid value and is cached like any other
+//   - values too large for the cache are clamped to SIZE_MAX - 1
+static inline size_t ultra_hot_max_size(void) {
+    // SIZE_MAX is the "not parsed yet" sentinel. Using 0 for this (as the
+    // original did) meant an env value of 0 was never cached and the
+    // environment was re-read on every call.
+    static size_t g_max_size = SIZE_MAX;
+    if (__builtin_expect(g_max_size == SIZE_MAX, 0)) {
+        size_t parsed = 32;  // Default: C2/C3 only (Phase 14 behavior)
+        const char* e = getenv("HAKMEM_TINY_ULTRA_HOT_MAX_SIZE");
+        if (e && *e) {
+            const char* p = e;
+            while (*p == ' ' || *p == '\t') {
+                p++;
+            }
+            // strtoul silently negates "-N"; reject a leading minus so a
+            // negative setting cannot turn into a huge limit.
+            if (*p != '-') {
+                char* end = NULL;
+                unsigned long v = strtoul(p, &end, 10);
+                if (end != p) {
+                    // Keep the parsed value distinct from the sentinel.
+                    parsed = (v >= SIZE_MAX) ? (SIZE_MAX - 1) : (size_t)v;
+                }
+            }
+        }
+        g_max_size = parsed;
+#if !HAKMEM_BUILD_RELEASE
+        fprintf(stderr, "[UltraHot-INIT] ultra_hot_max_size() = %zu\n", g_max_size);
+        fflush(stderr);
+#endif
+    }
+    return g_max_size;
+}
+
+// Ultra-fast alloc for classes C2/C3/C4/C5 (Phase 14-B expanded).
+//
+// Contract:
+//   - Input: size in bytes; callers are expected to pass 9-128 (C2-C5).
+//     Note that every size <= 16 (including 0-8) takes the C2 branch.
+//   - Output: BASE pointer (not USER pointer!) popped from the matching
+//     magazine, or NULL when that magazine is empty or the size is not served.
+//   - Per the original contract the caller converts BASE → USER (HAK_RET_ALLOC)
+//     and, on NULL, falls back to the existing TinyHeapV2/FastCache path.
+//
+// Size → class mapping:
+//   <= 16  → C2 (fields c1_*)      <= 64  → C4, only if ultra_hot_max_size() >= 64
+//   <= 32  → C3 (fields c2_*)      <= 128 → C5, only if ultra_hot_max_size() >= 128
+//
+// Each branch bumps the class's alloc_calls counter, then either hits
+// (hits++, pop the top entry) or misses (misses++, return NULL).
+static inline void* ultra_hot_alloc(size_t size) {
+    // ----- C2 (16B): predicted as the most likely class -----
+    if (__builtin_expect(size <= 16, 1)) {
+        g_ultra_hot.c1_alloc_calls++;
+        if (__builtin_expect(g_ultra_hot.c1_top == 0, 0)) {
+            g_ultra_hot.c1_misses++;  // Magazine empty (cold path)
+            return NULL;
+        }
+        g_ultra_hot.c1_hits++;
+        g_ultra_hot.c1_top--;
+        return g_ultra_hot.c1_mag[g_ultra_hot.c1_top];  // BASE pointer
+    }
+
+    // ----- C3 (32B) -----
+    if (__builtin_expect(size <= 32, 1)) {
+        g_ultra_hot.c2_alloc_calls++;
+        if (__builtin_expect(g_ultra_hot.c2_top == 0, 0)) {
+            g_ultra_hot.c2_misses++;
+            return NULL;
+        }
+        g_ultra_hot.c2_hits++;
+        g_ultra_hot.c2_top--;
+        return g_ultra_hot.c2_mag[g_ultra_hot.c2_top];
+    }
+
+    // ----- C4 (64B): Phase 14-C, gated by the max-size setting -----
+    if (__builtin_expect(size <= 64 && ultra_hot_max_size() >= 64, 0)) {
+        g_ultra_hot.c4_alloc_calls++;
+        if (__builtin_expect(g_ultra_hot.c4_top == 0, 0)) {
+            g_ultra_hot.c4_misses++;
+            return NULL;
+        }
+        g_ultra_hot.c4_hits++;
+        g_ultra_hot.c4_top--;
+        return g_ultra_hot.c4_mag[g_ultra_hot.c4_top];
+    }
+
+    // ----- C5 (128B): Phase 14-C, gated by the max-size setting -----
+    if (__builtin_expect(size <= 128 && ultra_hot_max_size() >= 128, 0)) {
+        g_ultra_hot.c5_alloc_calls++;
+        if (__builtin_expect(g_ultra_hot.c5_top == 0, 0)) {
+            g_ultra_hot.c5_misses++;
+            return NULL;
+        }
+        g_ultra_hot.c5_hits++;
+        g_ultra_hot.c5_top--;
+        return g_ultra_hot.c5_mag[g_ultra_hot.c5_top];
+    }
+
+    // Larger than 128B, or a C4/C5 size that the max-size setting excludes.
+    return NULL;
+}
+
+// Ultra-fast free for classes C2/C3/C4/C5 (Phase 14-B expanded).
+//
+// Contract:
+//   - Input: base (BASE pointer), class_idx (HAKMEM class: 2=16B, 3=32B,
+//     4=64B, 5=128B).
+//   - Output: 1 if the block was pushed onto the class's magazine,
+//     0 if the magazine is full or class_idx is outside 2-5; on 0 the caller
+//     is expected to use the existing TinyHeapV2/TLS SLL path.
+//
+// Each handled class bumps its free_calls counter; a successful push also
+// bumps free_hits.
+//
+// NOTE(review): the C4/C5 pushes here are not gated by ultra_hot_max_size(),
+// while ultra_hot_alloc() only pops C4/C5 when that limit allows it — confirm
+// callers do not route C4/C5 frees here when the limit is below 64/128.
+static inline int ultra_hot_free_by_class(void* base, int class_idx) {
+    switch (class_idx) {
+    case 2: {
+        // C2 (16B) → c1_mag
+        g_ultra_hot.c1_free_calls++;
+        const uint8_t slot = g_ultra_hot.c1_top;
+        if (__builtin_expect(slot >= ULTRA_HOT_MAG_CAP_C2, 0)) {
+            return 0;  // Magazine full → fallback
+        }
+        g_ultra_hot.c1_free_hits++;
+        g_ultra_hot.c1_top = (uint8_t)(slot + 1);
+        g_ultra_hot.c1_mag[slot] = base;
+        return 1;
+    }
+    case 3: {
+        // C3 (32B) → c2_mag
+        g_ultra_hot.c2_free_calls++;
+        const uint8_t slot = g_ultra_hot.c2_top;
+        if (__builtin_expect(slot >= ULTRA_HOT_MAG_CAP_C3, 0)) {
+            return 0;
+        }
+        g_ultra_hot.c2_free_hits++;
+        g_ultra_hot.c2_top = (uint8_t)(slot + 1);
+        g_ultra_hot.c2_mag[slot] = base;
+        return 1;
+    }
+    case 4: {
+        // C4 (64B) → c4_mag (Phase 14-B)
+        g_ultra_hot.c4_free_calls++;
+        const uint8_t slot = g_ultra_hot.c4_top;
+        if (__builtin_expect(slot >= ULTRA_HOT_MAG_CAP_C4, 0)) {
+            return 0;
+        }
+        g_ultra_hot.c4_free_hits++;
+        g_ultra_hot.c4_top = (uint8_t)(slot + 1);
+        g_ultra_hot.c4_mag[slot] = base;
+        return 1;
+    }
+    case 5: {
+        // C5 (128B) → c5_mag (Phase 14-B)
+        g_ultra_hot.c5_free_calls++;
+        const uint8_t slot = g_ultra_hot.c5_top;
+        if (__builtin_expect(slot >= ULTRA_HOT_MAG_CAP_C5, 0)) {
+            return 0;
+        }
+        g_ultra_hot.c5_free_hits++;
+        g_ultra_hot.c5_top = (uint8_t)(slot + 1);
+        g_ultra_hot.c5_mag[slot] = base;
+        return 1;
+    }
+    default:
+        // Class out of range (not C2-C5)
+        return 0;
+    }
+}
+
+// Magazine donation helpers: an existing front (TinyHeapV2 / FastCache) that
+// has a spare block may hand it to UltraHot. Optional — UltraHot also works
+// when only the free path supplies blocks.
+//
+// ultra_hot_try_refill_c1: offer `base` to the class-2 (16B) magazine.
+// The block is stored only if the magazine has a free slot; otherwise the
+// call does nothing and the caller keeps ownership. No statistics are touched.
+static inline void ultra_hot_try_refill_c1(void* base) {
+    const uint8_t slot = g_ultra_hot.c1_top;
+    if (slot >= ULTRA_HOT_MAG_CAP_C2) {
+        return;  // Magazine full
+    }
+    g_ultra_hot.c1_top = (uint8_t)(slot + 1);
+    g_ultra_hot.c1_mag[slot] = base;
+}
+
+// Offer `base` to the class-3 (32B) magazine (fields c2_*). Stored only if a
+// slot is free; a full magazine leaves the block with the caller.
+static inline void ultra_hot_try_refill_c2(void* base) {
+    const uint8_t slot = g_ultra_hot.c2_top;
+    if (slot >= ULTRA_HOT_MAG_CAP_C3) {
+        return;  // Magazine full
+    }
+    g_ultra_hot.c2_top = (uint8_t)(slot + 1);
+    g_ultra_hot.c2_mag[slot] = base;
+}
+
+// Offer `base` to the class-4 (64B) magazine. Stored only if a slot is free;
+// a full magazine leaves the block with the caller.
+static inline void ultra_hot_try_refill_c4(void* base) {
+    const uint8_t slot = g_ultra_hot.c4_top;
+    if (slot >= ULTRA_HOT_MAG_CAP_C4) {
+        return;  // Magazine full
+    }
+    g_ultra_hot.c4_top = (uint8_t)(slot + 1);
+    g_ultra_hot.c4_mag[slot] = base;
+}
+
+// Offer `base` to the class-5 (128B) magazine. Stored only if a slot is free;
+// a full magazine leaves the block with the caller.
+static inline void ultra_hot_try_refill_c5(void* base) {
+    const uint8_t slot = g_ultra_hot.c5_top;
+    if (slot >= ULTRA_HOT_MAG_CAP_C5) {
+        return;  // Magazine full
+    }
+    g_ultra_hot.c5_top = (uint8_t)(slot + 1);
+    g_ultra_hot.c5_mag[slot] = base;
+}
+
+// Print statistics (called at program exit if HAKMEM_TINY_ULTRA_HOT_STATS=1)
+// Declaration only (implementation in hakmem_tiny.c for external linkage)
+void ultra_hot_print_stats(void);
+
+// Design notes:
+//
+// 1. Cache locality:
+//    - All hot state fits in 2 cache lines (128B total)
+//    - First line (64B): C2 + C3 magazines (fields c1_mag / c2_mag)
+//    - Second line (64B): C4 + C5 magazines, the four top counters, padding
+//    - Statistics sit after the hot 128B (cold path)
+//    - Expected L1 miss: ~1-2 per alloc/free (vs 30+ currently)
+//
+// 2. Instruction count:
+//    - Alloc hit: ~7 instructions (size check + mag pop + return)
+//    - Free hit: ~7 instructions (size check + mag push + return)
+//    - Total: ~14 instructions per alloc/free pair (vs ~281M/500K = 562 currently)
+//    - Reduction: 562 → 14 = 40x improvement
+//
+// 3. Branch prediction:
+//    - Size check: __builtin_expect(size <= 16, 1) - predict C1 likely
+//    - Magazine check: __builtin_expect(top > 0, 1) - predict hit likely
+//    - Expected branch-miss: ~5% (vs 7.83% currently)
+//
+// 4. Integration with existing front:
+//    - UltraHot is L0 (fastest)
+//    - TinyHeapV2 is L1 (fast)
+//    - FastCache is L2 (normal)
+//    - If UltraHot misses → fallback to L1/L2
+//    - Free path supplies both UltraHot and TinyHeapV2
+//
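+// 5. Supply strategy (Phase 14 design):
+//    - Free path: Always try UltraHot first, then TinyHeapV2, then TLS SLL
+//    - Alloc path: Try UltraHot first, then TinyHeapV2, then FastCache
+//    - No refill from backend (keeps UltraHot ultra-simple)
+//    - Phase 14-C adds ultra_hot_try_refill(), which refills a magazine from
+//      the TLS SLL after an alloc miss (see the Phase 14-C section below)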
+//
+// 6. Expected performance:
+//    - Current: 9.3M ops/s (Random Mixed 256B)
+//    - Target: 40-60M ops/s (+330-545%)
+//    - L1 miss: 2.9M → ~300K (-90%)
+//    - Instructions: 281M → ~80M (-71%)
+//    - Branches: 59M → ~15M (-75%)
+//
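+// 7. Why C1/C2 only? (Phase 14 rationale; Phase 14-B later added C4/C5)
+//    - "C1"/"C2" here are the original slot names; in HAKMEM class numbering
+//      they are C2 (16B) and C3 (32B)
+//    - C1 (16B) + C2 (32B) cover ~60% of tiny allocations
+//    - Small magazine (4 slots) fits both in 1-2 cache lines
+//    - Size check is trivial (size <= 16 / size <= 32)
+//    - Larger classes (C3+) have different access patterns (less cache-sensitive)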
+//
+// 8. Why not C0 (8B)?
+//    - TinyHeapV2 showed -5% regression on C0
+//    - 8B allocations are rare in real workloads
+//    - Magazine overhead too high for 8B blocks
+//
+// 9. Comparison with TinyHeapV2:
+//    - TinyHeapV2: 16 slots per class, covers C1-C3
+//    - UltraHot: 4 slots per class, covers C1-C2 only
+//    - UltraHot is "ultra-hot subset" of TinyHeapV2
+//    - Trade magazine capacity for cache locality
+//
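+// 10. ENV flags:
+//     - HAKMEM_TINY_ULTRA_HOT=0/1 - Enable/disable (default: 1)
+//     - HAKMEM_TINY_ULTRA_HOT_MAX_SIZE=<bytes> - Largest size served (default: 32;
+//       64 enables C4, 128 enables C4 and C5)
+//     - HAKMEM_TINY_ULTRA_HOT_STATS=0/1 - Print stats at exit (default: 0)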
+
+// =============================================================================
+// Phase 14-C: Borrowing Design - Refill from TLS SLL (borrow from the canonical store)
+// =============================================================================
+// Design: UltraHot acts as a "view" sitting in front of the TLS SLL.
+//   - Free: blocks go back to the canonical store (TLS SLL); UltraHot does not
+//     intercept them
+//   - Alloc miss: borrow blocks from the TLS SLL to refill the magazine
+//   - This lets the learning layer (Superslab/drain) track the real inventory
+
+// Helper: pop blocks of `class_idx` from the TLS SLL into `mag` until the
+// magazine holds `cap` entries or the TLS SLL has nothing more to give.
+// `top` is the magazine's entry counter and is advanced for every block taken.
+static inline void ultra_hot_refill_mag_from_sll(int class_idx, void** mag,
+                                                 uint8_t* top, int cap) {
+    while (*top < cap) {
+        void* ptr = NULL;
+        if (!tls_sll_pop(class_idx, &ptr)) {
+            break;  // TLS SLL is empty for this class
+        }
+        mag[(*top)++] = ptr;
+    }
+}
+
+// Call this after an ultra_hot_alloc() miss to refill the magazine of
+// `class_idx` from the TLS SLL, up to the magazine's full capacity.
+//
+// Does nothing when:
+//   - UltraHot is disabled (ultra_hot_enabled() == 0),
+//   - class_idx is outside C2-C5,
+//   - class_idx is C4/C5 but ultra_hot_max_size() is below 64/128. In that
+//     case ultra_hot_alloc() never pops from those magazines, so borrowing
+//     would only park blocks away from the TLS SLL.
+static inline void ultra_hot_try_refill(int class_idx) {
+    if (!ultra_hot_enabled()) {
+        return;
+    }
+
+    switch (class_idx) {
+    case 2:
+        // C2 (16B): 4-slot magazine
+        ultra_hot_refill_mag_from_sll(class_idx, g_ultra_hot.c1_mag,
+                                      &g_ultra_hot.c1_top, ULTRA_HOT_MAG_CAP_C2);
+        break;
+    case 3:
+        // C3 (32B): 4-slot magazine
+        ultra_hot_refill_mag_from_sll(class_idx, g_ultra_hot.c2_mag,
+                                      &g_ultra_hot.c2_top, ULTRA_HOT_MAG_CAP_C3);
+        break;
+    case 4:
+        // C4 (64B): 2-slot magazine, same gate as the alloc path
+        if (ultra_hot_max_size() >= 64) {
+            ultra_hot_refill_mag_from_sll(class_idx, g_ultra_hot.c4_mag,
+                                          &g_ultra_hot.c4_top, ULTRA_HOT_MAG_CAP_C4);
+        }
+        break;
+    case 5:
+        // C5 (128B): 1-slot magazine, same gate as the alloc path
+        if (ultra_hot_max_size() >= 128) {
+            ultra_hot_refill_mag_from_sll(class_idx, g_ultra_hot.c5_mag,
+                                          &g_ultra_hot.c5_top, ULTRA_HOT_MAG_CAP_C5);
+        }
+        break;
+    default:
+        // Only C2-C5 have magazines
+        break;
+    }
+}
+
+#endif // HAK_FRONT_TINY_ULTRA_HOT_H