Files
hakmem/core/front/tiny_ultra_hot.h
Moe Charm (CI) d378ee11a0 Phase 15: Box BenchMeta separation + ExternalGuard debug + investigation report
- Implement Box BenchMeta pattern in bench_random_mixed.c (BENCH_META_CALLOC/FREE)
- Add enhanced debug logging to external_guard_box.h (caller tracking, FG classification)
- Document investigation in PHASE15_BUG_ANALYSIS.md

Issue: Page-aligned MIDCAND pointer not in SuperSlab registry → ExternalGuard → crash
Hypothesis: May be pre-existing SuperSlab bug (not Phase 15-specific)
Next: Test in Phase 14-C to verify
2025-11-15 23:00:21 +09:00

459 lines
17 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// tiny_ultra_hot.h - Ultra-fast hot path for C2/C3/C4/C5 (16B-128B allocations)
// Purpose:
// - Minimize L1 dcache misses (30x → 3x target) by using 2 cache line TLS
// - Minimize instructions (6.2x → 2x target) by ultra-simple straight-line path
// - Minimize branches (7.1x → 2x target) by predict-likely hints
//
// Design (ChatGPT consultation Phase 14 + Phase 14-B):
// - Phase 14: C2/C3 (16B/32B) - Coverage: 1.71%
// - Phase 14-B: +C4/C5 (64B/128B) - Coverage: 11.14% (6.5x improvement!)
// - TLS structure: 2 cache lines (128B) for 4 magazines with adaptive slot counts
// - Path: 2-3 instructions per alloc/free (pop/push from magazine)
// - Fallback: If magazine empty/full → existing TinyHeapV2/FastCache path
//
// Cache locality strategy:
// - All hot state in 2 cache lines (128B): 4 magazines (C2-C5) + 4 top counters + padding
// - No pointer chasing, no indirect access
// - Touches only 1 struct per alloc/free
//
// Instruction reduction strategy:
// - Size→class: cascading compares (size <= 16 → C2, <= 32 → C3, <= 64 → C4, <= 128 → C5)
// - Magazine access: Direct array index (no loops)
// - Fallback: Return NULL immediately (caller handles)
//
// Branch prediction strategy:
// - __builtin_expect(hit, 1) - expect 95%+ hit rate
// - No nested branches in hot path
#ifndef HAK_FRONT_TINY_ULTRA_HOT_H
#define HAK_FRONT_TINY_ULTRA_HOT_H
#include <stdint.h>
#include <stdlib.h>
#include <stdio.h>
#include "../box/tls_sll_box.h" // Phase 14-C: Borrowing design - refill from TLS SLL
// Magazine capacity - adaptive sizing for cache locality (Phase 14-B)
// Design principle: Balance capacity vs cache line usage
//
// Cache line 0 (64B): C2 + C3 magazines
// C2 (16B): 4 slots × 8B ptr = 32B
// C3 (32B): 4 slots × 8B ptr = 32B
// Total: 64B (perfect fit!)
//
// Cache line 1 (64B): C4 + C5 magazines + counters
// C4 (64B): 2 slots × 8B ptr = 16B
// C5 (128B): 1 slot × 8B ptr = 8B
// Counters: c1_top, c2_top, c4_top, c5_top = 4B
// Padding: 36B
// Total: 64B (fits!)
//
// Why fewer slots for larger classes?
// - Maintain cache locality (2 cache lines = 128B total)
// - Block size scales, so magazine memory scales proportionally
// - Free path supplies blocks → even 1-2 slots maintain high hit rate
//
// Per-class magazine capacities (number of pointer slots).
// Each value is guarded by #ifndef, so it can be overridden at build time.
// NOTE(review): TinyUltraHot hard-codes pad[36] and uses uint8_t top indices,
// both of which match these defaults; overriding a capacity changes the
// struct layout and the "2 cache lines" claim - confirm before changing.
#ifndef ULTRA_HOT_MAG_CAP_C2
#define ULTRA_HOT_MAG_CAP_C2 4 // C2 (16B) - 4 slots
#endif
#ifndef ULTRA_HOT_MAG_CAP_C3
#define ULTRA_HOT_MAG_CAP_C3 4 // C3 (32B) - 4 slots
#endif
#ifndef ULTRA_HOT_MAG_CAP_C4
#define ULTRA_HOT_MAG_CAP_C4 2 // C4 (64B) - 2 slots (NEW Phase 14-B)
#endif
#ifndef ULTRA_HOT_MAG_CAP_C5
#define ULTRA_HOT_MAG_CAP_C5 1 // C5 (128B) - 1 slot (NEW Phase 14-B)
#endif
// TLS structure: 2 cache lines (128B) for hot path (Phase 14-B expanded)
// Layout:
// Cache line 0 (64B): C2_mag[4] (32B) + C3_mag[4] (32B)
// Cache line 1 (64B): C4_mag[2] (16B) + C5_mag[1] (8B) + counters (4B) + pad (36B)
// Cache line 2+: Statistics (cold path)
// Total hot state: 128B (2 cache lines)
// Per-thread UltraHot state: four small LIFO pointer magazines, their top
// indices, and per-class statistics counters.
//
// Naming note: the c1_* and c2_* fields serve HAKMEM classes C2 (16B) and
// C3 (32B) respectively (ultra_hot_free_by_class maps class_idx 2 → c1_*,
// class_idx 3 → c2_*); c4_* and c5_* serve C4 (64B) and C5 (128B).
//
// Byte counts in the comments below assume 8-byte pointers and the default
// ULTRA_HOT_MAG_CAP_* values.
typedef struct {
// ===== Cache line 0 (64B): C2/C3 magazines =====
void* c1_mag[ULTRA_HOT_MAG_CAP_C2]; // C2 (16B) magazine - 4 slots, 32B
void* c2_mag[ULTRA_HOT_MAG_CAP_C3]; // C3 (32B) magazine - 4 slots, 32B
// ===== Cache line 1 (64B): C4/C5 magazines + counters =====
void* c4_mag[ULTRA_HOT_MAG_CAP_C4]; // C4 (64B) magazine - 2 slots, 16B (NEW Phase 14-B)
void* c5_mag[ULTRA_HOT_MAG_CAP_C5]; // C5 (128B) magazine - 1 slot, 8B (NEW Phase 14-B)
// Each *_top is the number of valid entries in the matching magazine;
// the next pop reads index top-1, the next push writes index top.
uint8_t c1_top; // C2 magazine top index
uint8_t c2_top; // C3 magazine top index
uint8_t c4_top; // C4 magazine top index (NEW Phase 14-B)
uint8_t c5_top; // C5 magazine top index (NEW Phase 14-B)
uint8_t pad[36]; // Padding: 16B + 8B + 4B + 36B = 64B with the default capacities
// ===== Statistics (cache line 2+) =====
// NOTE(review): labelled "cold path" in the original layout notes, but
// ultra_hot_alloc and ultra_hot_free_by_class increment these counters on
// every call, so they are touched on the hot path as well.
// Alloc-side counters: calls = hits + misses for each class.
uint64_t c1_alloc_calls;
uint64_t c1_hits;
uint64_t c1_misses;
uint64_t c2_alloc_calls;
uint64_t c2_hits;
uint64_t c2_misses;
uint64_t c4_alloc_calls; // NEW Phase 14-B
uint64_t c4_hits; // NEW Phase 14-B
uint64_t c4_misses; // NEW Phase 14-B
uint64_t c5_alloc_calls; // NEW Phase 14-B
uint64_t c5_hits; // NEW Phase 14-B
uint64_t c5_misses; // NEW Phase 14-B
// Free-side counters: free_hits counts frees that were stored in a magazine.
uint64_t c1_free_calls;
uint64_t c1_free_hits;
uint64_t c2_free_calls;
uint64_t c2_free_hits;
uint64_t c4_free_calls; // NEW Phase 14-B
uint64_t c4_free_hits; // NEW Phase 14-B
uint64_t c5_free_calls; // NEW Phase 14-B
uint64_t c5_free_hits; // NEW Phase 14-B
} __attribute__((aligned(64))) TinyUltraHot;
// Per-thread UltraHot state (__thread storage). Only declared here; per the
// original note, the single definition lives in hakmem_tiny.c.
extern __thread TinyUltraHot g_ultra_hot;
// Reports whether the UltraHot front is switched on (result is cached).
// ENV: HAKMEM_TINY_ULTRA_HOT
//   - unset or empty            : enabled (default, Phase 14 decision)
//   - value starting with '0'   : disabled (use existing TinyHeapV2/FastCache)
//   - any other value           : enabled
// Returns 1 when enabled, 0 when disabled.
// The environment is read only on the first call; in non-release builds the
// resolved value is logged to stderr once at that point.
static inline int ultra_hot_enabled(void) {
    static int cached_flag = -1;  // -1 = not resolved yet

    // Common case: already resolved, just hand back the cached answer.
    if (__builtin_expect(cached_flag != -1, 1)) {
        return cached_flag;
    }

    const char* env = getenv("HAKMEM_TINY_ULTRA_HOT");
    // Only a value whose first character is '0' disables the feature.
    cached_flag = (env != NULL && *env == '0') ? 0 : 1;

#if !HAKMEM_BUILD_RELEASE
    fprintf(stderr, "[UltraHot-INIT] ultra_hot_enabled() = %d\n", cached_flag);
    fflush(stderr);
#endif
    return cached_flag;
}
// Phase 14-C: Max size control (ENV: HAKMEM_TINY_ULTRA_HOT_MAX_SIZE)
// Purpose: Control which size classes UltraHot handles.
//   Default:    32  (C2/C3 only, safe for Random Mixed)
//   Fixed-size: 128 (C2-C5, optimal for fixed-size workloads)
//
// Returns the configured maximum request size in bytes. In this header the
// value is only compared against 64 and 128 to gate the C4 and C5 paths of
// ultra_hot_alloc.
//
// Parsing rules:
//   - unset / empty variable           → default (32)
//   - no leading decimal digits        → default (32)
//   - negative value                   → default (32)
//   - "0"                              → 0 (valid: C4/C5 stay disabled)
//   - out-of-range positive value      → LONG_MAX as returned by strtol
//
// The result is cached behind a dedicated "ready" flag. Using the value 0
// itself as the "not initialised" marker would make a configured 0 re-read
// the environment (and re-log in debug builds) on every call.
static inline size_t ultra_hot_max_size(void) {
    static size_t g_max_size = 0;
    static int g_max_size_ready = 0;  // separate flag: 0 is a legal cached value

    if (__builtin_expect(!g_max_size_ready, 0)) {
        size_t resolved = 32;  // Default: C2/C3 only (Phase 14 behavior)
        const char* e = getenv("HAKMEM_TINY_ULTRA_HOT_MAX_SIZE");
        if (e && *e) {
            char* end = NULL;
            // strtol has defined behaviour on overflow, unlike atoi.
            long parsed = strtol(e, &end, 10);
            // Accept only when at least one digit was consumed and the
            // value is non-negative; otherwise keep the default.
            if (end != e && parsed >= 0) {
                resolved = (size_t)parsed;
            }
        }
        g_max_size = resolved;
        g_max_size_ready = 1;
#if !HAKMEM_BUILD_RELEASE
        fprintf(stderr, "[UltraHot-INIT] ultra_hot_max_size() = %zu\n", g_max_size);
        fflush(stderr);
#endif
    }
    return g_max_size;
}
// Ultra-fast alloc (C2/C3/C4/C5 - Phase 14-B expanded)
//
// Pops one block from the thread-local magazine that matches `size`.
//
// Contract:
//   - Input:  size - per the original contract, callers pass 9-128B (C2-C5).
//             Any size <= 16 (including 0-8) takes the 16B path.
//   - Output: BASE pointer (not USER pointer!) on a magazine hit, or NULL when
//             the magazine is empty, the class is gated off, or size > 128.
//   - Per the original note, the caller converts BASE → USER via HAK_RET_ALLOC
//     and falls back to TinyHeapV2/FastCache on NULL.
//
// Size → magazine mapping:
//   size <= 16  → C2 (c1_* fields)
//   size <= 32  → C3 (c2_* fields)
//   size <= 64  → C4, only when ultra_hot_max_size() >= 64
//   size <= 128 → C5, only when ultra_hot_max_size() >= 128
//
// Every call that reaches a class bumps that class's alloc_calls counter and
// then either hits or misses.
// This function does not consult ultra_hot_enabled().
// NOTE(review): presumably the caller checks the enable flag - confirm.
// NOTE(review): the statistics counters sit after the two hot cache lines of
// TinyUltraHot, so each call touches more than the magazines themselves.
static inline void* ultra_hot_alloc(size_t size) {
    // ---- C2 (16B): predicted as the most likely class ----
    if (__builtin_expect(size <= 16, 1)) {
        g_ultra_hot.c1_alloc_calls++;
        if (__builtin_expect(g_ultra_hot.c1_top == 0, 0)) {
            g_ultra_hot.c1_misses++;  // magazine empty → caller falls back
            return NULL;
        }
        g_ultra_hot.c1_hits++;
        g_ultra_hot.c1_top--;
        return g_ultra_hot.c1_mag[g_ultra_hot.c1_top];  // BASE pointer
    }

    // ---- C3 (32B) ----
    if (__builtin_expect(size <= 32, 1)) {
        g_ultra_hot.c2_alloc_calls++;
        if (__builtin_expect(g_ultra_hot.c2_top == 0, 0)) {
            g_ultra_hot.c2_misses++;
            return NULL;
        }
        g_ultra_hot.c2_hits++;
        g_ultra_hot.c2_top--;
        return g_ultra_hot.c2_mag[g_ultra_hot.c2_top];
    }

    // ---- C4 (64B): Phase 14-C, gated by the configured max size ----
    if (__builtin_expect(size <= 64 && ultra_hot_max_size() >= 64, 0)) {
        g_ultra_hot.c4_alloc_calls++;
        if (__builtin_expect(g_ultra_hot.c4_top == 0, 0)) {
            g_ultra_hot.c4_misses++;
            return NULL;
        }
        g_ultra_hot.c4_hits++;
        g_ultra_hot.c4_top--;
        return g_ultra_hot.c4_mag[g_ultra_hot.c4_top];
    }

    // ---- C5 (128B): Phase 14-C, gated by the configured max size ----
    // Also reached by 33-64B requests when the C4 gate above is closed.
    if (__builtin_expect(size <= 128 && ultra_hot_max_size() >= 128, 0)) {
        g_ultra_hot.c5_alloc_calls++;
        if (__builtin_expect(g_ultra_hot.c5_top == 0, 0)) {
            g_ultra_hot.c5_misses++;
            return NULL;
        }
        g_ultra_hot.c5_hits++;
        g_ultra_hot.c5_top--;
        return g_ultra_hot.c5_mag[g_ultra_hot.c5_top];
    }

    // Size above 128B, or a gated-off class: not handled here.
    return NULL;
}
// Ultra-fast free (C2/C3/C4/C5 - Phase 14-B expanded)
//
// Pushes `base` onto the thread-local magazine for `class_idx` if it has room.
//
// Contract:
//   - Input:  base      - BASE pointer of the block being freed
//             class_idx - HAKMEM size class (2=16B, 3=32B, 4=64B, 5=128B)
//   - Output: 1 if the block was stored in a magazine,
//             0 if the magazine is full or class_idx is outside 2..5
//             (per the original note, the caller then uses the existing
//             TinyHeapV2/TLS SLL path).
//
// Each call for a handled class bumps <class>_free_calls; a successful push
// also bumps <class>_free_hits.
// NOTE(review): unlike ultra_hot_alloc, the C4/C5 paths here are not gated by
// ultra_hot_max_size(); with the default max size (32) blocks pushed here for
// C4/C5 are never popped by ultra_hot_alloc - confirm this is intended.
static inline int ultra_hot_free_by_class(void* base, int class_idx) {
    // ---- C2 (16B) → c1_* fields ----
    if (__builtin_expect(class_idx == 2, 1)) {
        g_ultra_hot.c1_free_calls++;
        if (__builtin_expect(g_ultra_hot.c1_top >= ULTRA_HOT_MAG_CAP_C2, 0)) {
            return 0;  // magazine full → fallback
        }
        g_ultra_hot.c1_free_hits++;
        g_ultra_hot.c1_mag[g_ultra_hot.c1_top] = base;
        g_ultra_hot.c1_top++;
        return 1;
    }

    // ---- C3 (32B) → c2_* fields ----
    if (__builtin_expect(class_idx == 3, 1)) {
        g_ultra_hot.c2_free_calls++;
        if (__builtin_expect(g_ultra_hot.c2_top >= ULTRA_HOT_MAG_CAP_C3, 0)) {
            return 0;
        }
        g_ultra_hot.c2_free_hits++;
        g_ultra_hot.c2_mag[g_ultra_hot.c2_top] = base;
        g_ultra_hot.c2_top++;
        return 1;
    }

    // ---- C4 (64B) - NEW Phase 14-B ----
    if (__builtin_expect(class_idx == 4, 0)) {
        g_ultra_hot.c4_free_calls++;
        if (__builtin_expect(g_ultra_hot.c4_top >= ULTRA_HOT_MAG_CAP_C4, 0)) {
            return 0;
        }
        g_ultra_hot.c4_free_hits++;
        g_ultra_hot.c4_mag[g_ultra_hot.c4_top] = base;
        g_ultra_hot.c4_top++;
        return 1;
    }

    // ---- C5 (128B) - NEW Phase 14-B ----
    if (__builtin_expect(class_idx == 5, 0)) {
        g_ultra_hot.c5_free_calls++;
        if (__builtin_expect(g_ultra_hot.c5_top >= ULTRA_HOT_MAG_CAP_C5, 0)) {
            return 0;
        }
        g_ultra_hot.c5_free_hits++;
        g_ultra_hot.c5_mag[g_ultra_hot.c5_top] = base;
        g_ultra_hot.c5_top++;
        return 1;
    }

    // Class out of range (not C2-C5).
    return 0;
}
// Magazine refill helpers (ultra_hot_try_refill_c1/_c2/_c4/_c5).
// Per the original note: called from the existing front when it has spare
// blocks, so TinyHeapV2 / FastCache can "donate" blocks to UltraHot. This is
// optional - UltraHot can work with just the free path as supply.
// These helpers do not touch the statistics counters.
//
// ultra_hot_try_refill_c1: offer `base` to the C2 (16B) magazine (c1_* fields).
// The block is stored only if the magazine has a free slot; otherwise the
// call is a no-op and the caller keeps ownership of `base`.
static inline void ultra_hot_try_refill_c1(void* base) {
    if (g_ultra_hot.c1_top >= ULTRA_HOT_MAG_CAP_C2) {
        return;  // full: silently decline the donation
    }
    g_ultra_hot.c1_mag[g_ultra_hot.c1_top] = base;
    g_ultra_hot.c1_top++;
}
// Offer `base` to the C3 (32B) magazine (c2_* fields).
// Stored only if a slot is free; otherwise a no-op.
static inline void ultra_hot_try_refill_c2(void* base) {
    if (g_ultra_hot.c2_top >= ULTRA_HOT_MAG_CAP_C3) {
        return;  // full: silently decline the donation
    }
    g_ultra_hot.c2_mag[g_ultra_hot.c2_top] = base;
    g_ultra_hot.c2_top++;
}
// Offer `base` to the C4 (64B) magazine.
// Stored only if a slot is free; otherwise a no-op.
static inline void ultra_hot_try_refill_c4(void* base) {
    if (g_ultra_hot.c4_top >= ULTRA_HOT_MAG_CAP_C4) {
        return;  // full: silently decline the donation
    }
    g_ultra_hot.c4_mag[g_ultra_hot.c4_top] = base;
    g_ultra_hot.c4_top++;
}
// Offer `base` to the C5 (128B) magazine.
// Stored only if a slot is free; otherwise a no-op.
static inline void ultra_hot_try_refill_c5(void* base) {
    if (g_ultra_hot.c5_top >= ULTRA_HOT_MAG_CAP_C5) {
        return;  // full: silently decline the donation
    }
    g_ultra_hot.c5_mag[g_ultra_hot.c5_top] = base;
    g_ultra_hot.c5_top++;
}
// Print UltraHot statistics.
// Per the original note: called at program exit if HAKMEM_TINY_ULTRA_HOT_STATS=1.
// Declaration only - the implementation lives in hakmem_tiny.c so that the
// function has external linkage (unlike the static inline helpers in this header).
void ultra_hot_print_stats(void);
// Design notes:
//
// 1. Cache locality:
// - All hot state fits in 2 cache lines (128B total)
// - First line (64B): C2 + C3 magazines
// - Second line (64B): C4 + C5 magazines + top counters + padding
// - Statistics counters follow the hot state (cache line 2+)
// - Expected L1 miss: ~1-2 per alloc/free (vs 30+ currently)
//
// 2. Instruction count:
// - Alloc hit: ~7 instructions (size check + mag pop + return)
// - Free hit: ~7 instructions (size check + mag push + return)
// - Total: ~14 instructions per alloc/free pair (vs ~281M/500K = 562 currently)
// - Reduction: 562 → 14 = 40x improvement
//
// 3. Branch prediction:
// - Size check: __builtin_expect(size <= 16, 1) - predict the 16B class (C2) likely
// - Magazine check: __builtin_expect(top > 0, 1) - predict hit likely
// - Expected branch-miss: ~5% (vs 7.83% currently)
//
// 4. Integration with existing front:
// - UltraHot is L0 (fastest)
// - TinyHeapV2 is L1 (fast)
// - FastCache is L2 (normal)
// - If UltraHot misses → fallback to L1/L2
// - Free path supplies both UltraHot and TinyHeapV2
//
// 5. Supply strategy:
// - Free path: Always try UltraHot first, then TinyHeapV2, then TLS SLL
// - Alloc path: Try UltraHot first, then TinyHeapV2, then FastCache
// - No refill from backend (keeps UltraHot ultra-simple)
//
// 6. Expected performance:
// - Current: 9.3M ops/s (Random Mixed 256B)
// - Target: 40-60M ops/s (+330-545%)
// - L1 miss: 2.9M → ~300K (-90%)
// - Instructions: 281M → ~80M (-71%)
// - Branches: 59M → ~15M (-75%)
//
// 7. Why C1/C2 only? (original Phase 14 rationale; Phase 14-B later added the 64B/128B magazines)
// - C1 (16B) + C2 (32B) cover ~60% of tiny allocations
// - Small magazine (4 slots) fits both in 1-2 cache lines
// - Size check is trivial (size <= 16 / size <= 32)
// - Larger classes (C3+) have different access patterns (less cache-sensitive)
//
// 8. Why not C0 (8B)?
// - TinyHeapV2 showed -5% regression on C0
// - 8B allocations are rare in real workloads
// - Magazine overhead too high for 8B blocks
//
// 9. Comparison with TinyHeapV2:
// - TinyHeapV2: 16 slots per class, covers C1-C3
// - UltraHot: 4 slots per class, covers C1-C2 only
// - UltraHot is "ultra-hot subset" of TinyHeapV2
// - Trade magazine capacity for cache locality
//
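// 10. ENV flags:
// - HAKMEM_TINY_ULTRA_HOT=0/1 - Enable/disable (default: 1)
// - HAKMEM_TINY_ULTRA_HOT_MAX_SIZE=N - Largest request size served (default: 32; 64 enables C4, 128 enables C4+C5)
// - HAKMEM_TINY_ULTRA_HOT_STATS=0/1 - Print stats at exit (default: 0)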
// =============================================================================
// Phase 14-C: Borrowing Design - Refill from TLS SLL (borrow from the canonical list)
// =============================================================================
// Design: UltraHot acts as a "view in front of the TLS SLL".
// - Free: blocks go back to the canonical TLS SLL (UltraHot does not intercept them)
// - Alloc miss: borrow blocks from the TLS SLL to refill the magazine
// - The learning layer (SuperSlab/drain) can therefore track the correct inventory
//
// Call this after an ultra_hot_alloc() miss to refill the magazine from the TLS SLL.
//
// Behaviour:
//   - No-op when ultra_hot_enabled() returns 0 or class_idx is outside 2..5.
//   - Otherwise pops blocks via tls_sll_pop(class_idx, &ptr) until the class's
//     magazine is full or tls_sll_pop returns 0.
//   - Statistics counters are not touched.
// NOTE(review): presumably tls_sll_pop returns non-zero on success and writes a
// BASE pointer through its out-parameter - confirm against tls_sll_box.h.
// NOTE(review): C4/C5 are refilled regardless of ultra_hot_max_size(), although
// ultra_hot_alloc only serves them when that limit allows - confirm intent.
static inline void ultra_hot_try_refill(int class_idx) {
    if (!ultra_hot_enabled()) return;

    // Select the magazine, its top index and its capacity for this class.
    void** mag;
    uint8_t* top;
    int cap;
    switch (class_idx) {
    case 2:  // C2 (16B): 4-slot magazine (c1_* fields)
        mag = g_ultra_hot.c1_mag;
        top = &g_ultra_hot.c1_top;
        cap = ULTRA_HOT_MAG_CAP_C2;
        break;
    case 3:  // C3 (32B): 4-slot magazine (c2_* fields)
        mag = g_ultra_hot.c2_mag;
        top = &g_ultra_hot.c2_top;
        cap = ULTRA_HOT_MAG_CAP_C3;
        break;
    case 4:  // C4 (64B): 2-slot magazine
        mag = g_ultra_hot.c4_mag;
        top = &g_ultra_hot.c4_top;
        cap = ULTRA_HOT_MAG_CAP_C4;
        break;
    case 5:  // C5 (128B): 1-slot magazine
        mag = g_ultra_hot.c5_mag;
        top = &g_ultra_hot.c5_top;
        cap = ULTRA_HOT_MAG_CAP_C5;
        break;
    default:  // C2-C5 only
        return;
    }

    // Fill to capacity, stopping early once the TLS SLL has nothing to lend.
    while (*top < cap) {
        void* borrowed = NULL;
        if (!tls_sll_pop(class_idx, &borrowed)) break;
        mag[*top] = borrowed;
        (*top)++;
    }
}
#endif // HAK_FRONT_TINY_ULTRA_HOT_H