hakmem/core/hakmem_tiny_alloc_new.inc
Moe Charm (CI) 1da8754d45 CRITICAL FIX: completely eliminate the 4T SEGV caused by uninitialized TLS
**Problem:**
- 100% SEGV in Larson 4T (1T completes at 2.09M ops/s)
- System/mimalloc run 4T normally at 33.52M ops/s
- SEGV at 4T even with SS OFF + Remote OFF

**Root cause (Task agent ultrathink findings):**
```
CRASH: mov (%r15),%r13
R15 = 0x6261  ← ASCII "ba" (garbage value, uninitialized TLS)
```

The worker threads' TLS variables were uninitialized:
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];`  ← no initializer
- threads spawned via pthread_create() did not start with these arrays zeroed
- the NULL check passed (0x6261 != NULL) → dereference → SEGV (minimal sketch below)
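
A minimal sketch of the failure mode described above, not the project's actual code: `TINY_NUM_CLASSES` is set to 8 purely for illustration, and `tls_sll_pop` is a hypothetical stand-in for the real free-list pop:

```c
#include <stddef.h>

#define TINY_NUM_CLASSES 8  /* illustrative value, not the project's real constant */

/* No initializer: if this TLS slot ever holds garbage (e.g. 0x6261),
 * the NULL check below is useless and the load faults. */
__thread void* g_tls_sll_head[TINY_NUM_CLASSES];

static void* tls_sll_pop(int class_idx) {       /* hypothetical helper */
    void* head = g_tls_sll_head[class_idx];
    if (head == NULL) return NULL;              /* 0x6261 != NULL → check passes */
    g_tls_sll_head[class_idx] = *(void**)head;  /* mov (%r15),%r13 → SEGV on garbage */
    return head;
}
```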

**Fix:**
Add an explicit `= {0}` initializer to every TLS array (the resulting declaration shape is sketched after this list):

1. **core/hakmem_tiny.c:**
   - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
   - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
   - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bend[TINY_NUM_CLASSES] = {0}`

2. **core/tiny_fastcache.c:**
   - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`

3. **core/hakmem_tiny_magazine.c:**
   - `g_tls_mags[TINY_NUM_CLASSES] = {0}`

4. **core/tiny_sticky.c:**
   - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`
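
For reference, a sketch of the resulting declaration shape under this fix; the element types and the value of `TINY_NUM_CLASSES` are assumptions for illustration, not the project's actual definitions:

```c
#include <stdint.h>

#define TINY_NUM_CLASSES 8  /* illustrative value */

/* Explicit zero initializers guarantee a clean TLS state in every new thread.
 * Element types are illustrative assumptions. */
__thread void*    g_tls_sll_head[TINY_NUM_CLASSES]  = {0};
__thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES] = {0};
__thread uint8_t* g_tls_bcur[TINY_NUM_CLASSES]      = {0};
__thread uint8_t* g_tls_bend[TINY_NUM_CLASSES]      = {0};
```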

**Effect:**
```
Before: 1T: 2.09M   |  4T: SEGV 💀
After:  1T: 2.41M   |  4T: 4.19M   (+15% 1T, SEGV eliminated)
```

**Tests:**
```bash
# 1 thread: completes
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s 

# 4 threads: completes (previously SEGV)
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s 
```

**Investigation credit:** flawless root-cause identification by the Task agent (ultrathink mode)

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-07 01:27:04 +09:00

// hakmem_tiny_alloc_new.inc
// New 3-layer Tiny Pool allocation (simplified)
//
// Purpose: Reduce from 6-7 layers to 3 layers
// Target: 100+ instructions/op → 20-30 instructions/op
//
// Part of 3-layer architecture simplification (2025-11-01)
// Based on ChatGPT Pro UltraThink recommendations
// === IMPORTANT: Disable old benchmark fastpath ===
// The old HAKMEM_TINY_BENCH_FASTPATH conflicts with new 3-layer architecture
// We must disable it to ensure our new code runs
#ifdef HAKMEM_TINY_BENCH_FASTPATH
#undef HAKMEM_TINY_BENCH_FASTPATH
#endif
// Debug counters (thread-local)
static __thread uint64_t g_3layer_bump_hits = 0;
static __thread uint64_t g_3layer_mag_hits = 0;
static __thread uint64_t g_3layer_slow_hits = 0;
static __thread uint64_t g_3layer_refill_count = 0;
static __thread uint64_t g_3layer_refill_items = 0;
static __thread uint64_t g_3layer_fallback_superslab_disabled = 0;
static __thread uint64_t g_3layer_fallback_no_ss = 0;
static __thread uint64_t g_3layer_fallback_no_meta = 0;
static __thread uint64_t g_3layer_batch_carve_count = 0;
// Active accounting helper (env toggle: HAKMEM_TINY_ACTIVE_FIX=0 to disable)
static inline int tiny_active_fix_enabled(void) {
    static int g_active_fix_en = -1;
    if (__builtin_expect(g_active_fix_en == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_ACTIVE_FIX");
        g_active_fix_en = (e && atoi(e) == 0) ? 0 : 1;
    }
    return g_active_fix_en;
}

static inline void tiny_active_account_alloc(void* ptr) {
    if (!ptr || !g_use_superslab) return;
    if (!tiny_active_fix_enabled()) return;
    SuperSlab* ss = hak_super_lookup(ptr);
    if (ss && ss->magic == SUPERSLAB_MAGIC) {
        ss_active_inc(ss);
    }
}
// Forward declaration for Layer 3
__attribute__((noinline, cold))
static void* tiny_alloc_slow_new(int class_idx);
// ============================================================================
// Main Allocation Function (3-layer architecture)
// ============================================================================
void* hak_tiny_alloc(size_t size) {
    // Initialization check (cold path, once per thread)
#if !HAKMEM_BUILD_RELEASE
    if (!g_tiny_initialized) hak_tiny_init();
#else
    if (__builtin_expect(!g_tiny_initialized, 0)) {
        hak_tiny_init();
    }
#endif

    // Wrapper guard (safety check, rare)
#if !HAKMEM_BUILD_RELEASE
# if HAKMEM_WRAPPER_TLS_GUARD
    if (!g_wrap_tiny_enabled && __builtin_expect(g_tls_in_wrapper != 0, 0)) return NULL;
# else
    extern int hak_in_wrapper(void);
    if (!g_wrap_tiny_enabled && __builtin_expect(hak_in_wrapper() != 0, 0)) return NULL;
# endif
#endif

    // Size to class index
    int class_idx = hak_tiny_size_to_class(size);
    if (class_idx < 0) return NULL; // > 1KB

    // Route fingerprint begin (debug-only; no-op unless HAKMEM_ROUTE=1)
    ROUTE_BEGIN(class_idx);

    // Initialize small magazine (once per thread)
    if (__builtin_expect(!g_tiny_small_mag_initialized, 0)) {
        tiny_small_mag_init();
    }

    // ========================================================================
    // === LAYER 1: TLS Bump Allocator (hot classes 0-2: 8B/16B/32B) ===
    // === Target: 2-3 instructions/op ===
    // ========================================================================
    if (likely(class_idx <= 2)) {
        void* p = tiny_bump_alloc(class_idx);
        if (likely(p)) {
            tiny_active_account_alloc(p);
            g_3layer_bump_hits++;
            // Mark: bump hit (reuses the hot_hit bit 8 for convenience)
            ROUTE_MARK(8); ROUTE_COMMIT(class_idx, 0x40);
            HAK_RET_ALLOC(class_idx, p);
        }
    }

    // ========================================================================
    // === LAYER 2: TLS Small Magazine (all classes, 128 items) ===
    // === Target: 5-10 instructions/op ===
    // ========================================================================
    void* p = small_mag_pop(class_idx);
    if (likely(p)) {
        extern unsigned long long g_front_mag_hit[];
        g_front_mag_hit[class_idx]++;
        tiny_active_account_alloc(p);
        g_3layer_mag_hits++;
        // Mark: small mag hit (reuses the bench_hit bit 10 for convenience)
        ROUTE_MARK(10); ROUTE_COMMIT(class_idx, 0x41);
        HAK_RET_ALLOC(class_idx, p);
    }

    // ========================================================================
    // === LAYER 3: Slow path (refill, slab allocation) ===
    // === Target: 50-100+ instructions/op (rare) ===
    // ========================================================================
    g_3layer_slow_hits++;
    return tiny_alloc_slow_new(class_idx);
}
// ============================================================================
// Layer 3: Slow Path (refill and slab management)
// ============================================================================
__attribute__((noinline, cold))
static void* tiny_alloc_slow_new(int class_idx) {
    // ReturnFirst Selector: try Ready/Mailbox/Sticky/Hot/Bench/Registry once
    do {
        static int g_return_first = -1; // env: HAKMEM_TINY_RETURN_FIRST (default ON)
        if (__builtin_expect(g_return_first == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_RETURN_FIRST");
            g_return_first = (e && *e == '0') ? 0 : 1;
        }
        if (__builtin_expect(g_return_first, 1)) {
            extern __thread TinyTLSSlab g_tls_slabs[];
            TinyTLSSlab* tls = &g_tls_slabs[class_idx];
            SuperSlab* rs = tiny_refill_try_fast(class_idx, tls);
            (void)rs; // On success, tls->ss is bound and Step 2 will carve
        }
    } while (0);

    // ========================================================================
    // Layer 3: Refill Small Magazine and/or Bump from existing infrastructure
    // ========================================================================
    // Step 1: Try to refill Small Magazine from existing TLS Magazine
    tiny_mag_init_if_needed(class_idx);
    TinyTLSMag* large_mag = &g_tls_mags[class_idx];
    if (large_mag->top > 0) {
        // Batch transfer from large magazine (2048) to small magazine
        int batch_size = 64; // Transfer in batches of 64
        if (batch_size > large_mag->top) batch_size = large_mag->top;
        void* items[64];
        for (int i = 0; i < batch_size; i++) {
            items[i] = large_mag->items[large_mag->top - 1 - i].ptr;
        }
        large_mag->top -= batch_size;

        // Push to Small Magazine
        int pushed = small_mag_batch_push(class_idx, items, batch_size);
        g_3layer_refill_count++;
        g_3layer_refill_items += pushed;

        // Try to pop one and return
        void* p = small_mag_pop(class_idx);
        if (p) {
            tiny_active_account_alloc(p);
            return p;
        }
    }

    // Step 2: Large Magazine empty - batch carve from SuperSlab directly
    // ChatGPT Pro P0: complete batching (based on tls_refill_from_tls_slab:115-126)
    if (!g_use_superslab) {
        g_3layer_fallback_superslab_disabled++;
        return hak_tiny_alloc_slow(0, class_idx);
    }

    TinyTLSSlab* tls_slab = &g_tls_slabs[class_idx];
    if (!tls_slab->ss) {
        if (superslab_refill(class_idx) == NULL) {
            g_3layer_fallback_no_ss++;
            // Optional one-shot debug
            static int g_alloc_dbg = -1;
            if (__builtin_expect(g_alloc_dbg == -1, 0)) {
                const char* e = getenv("HAKMEM_TINY_ALLOC_DEBUG");
                g_alloc_dbg = (e && atoi(e) != 0) ? 1 : 0;
            }
            if (g_alloc_dbg) {
                static _Atomic int printed_ss[8]; int exp = 0;
                if (atomic_compare_exchange_strong(&printed_ss[class_idx], &exp, 1)) {
                    fprintf(stderr, "[ALLOC3] refill returned NULL (no SS) class=%d\n", class_idx);
                }
            }
            return hak_tiny_alloc_slow(0, class_idx); // Fallback
        }
    }

    TinySlabMeta* meta = tls_slab->meta;
    if (!meta) {
        g_3layer_fallback_no_meta++;
        // Optional one-shot debug
        static int g_alloc_dbg2 = -1;
        if (__builtin_expect(g_alloc_dbg2 == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_ALLOC_DEBUG");
            g_alloc_dbg2 = (e && atoi(e) != 0) ? 1 : 0;
        }
        if (g_alloc_dbg2) {
            static _Atomic int printed_meta[8]; int exp = 0;
            if (atomic_compare_exchange_strong(&printed_meta[class_idx], &exp, 1)) {
                fprintf(stderr, "[ALLOC3] meta is NULL after refill class=%d\n", class_idx);
            }
        }
        return hak_tiny_alloc_slow(0, class_idx);
    }

    // Batch carve from SuperSlab (P0 optimization - no 64x function calls!)
    uint32_t want = 64; // Refill target
    void* items[64];
    int got = 0;

    // Try freelist first (small amount, usually 0)
    while (got < (int)want && meta->freelist) {
        void* node = meta->freelist;
        meta->freelist = *(void**)node;
        items[got++] = node;
        meta->used++;
    }

    // Then linear carve (KEY OPTIMIZATION - direct array fill!)
    if (got < (int)want && meta->used < meta->capacity) {
        uint32_t need = want - got;
        uint32_t available = meta->capacity - meta->used;
        if (need > available) need = available;
        size_t block_size = g_tiny_class_sizes[class_idx];
        uint8_t* slab_base = tls_slab->slab_base ? tls_slab->slab_base
                                                 : tiny_slab_base_for(tls_slab->ss, tls_slab->slab_idx);
        uint8_t* cursor = slab_base + ((size_t)meta->used * block_size);
        // Batch carve: directly fill items array (no linked list, no 64 function calls!)
        for (uint32_t i = 0; i < need; ++i) {
            items[got++] = (void*)cursor;
            cursor += block_size;
        }
        meta->used += need; // Reserve to TLS; not active until returned to user
    }

    if (got == 0) {
        // Slab exhausted, try refill and retry once
        if (superslab_refill(class_idx) != NULL) {
            return tiny_alloc_slow_new(class_idx); // Recursive retry
        }
        static int g_alloc_dbg3 = -1;
        if (__builtin_expect(g_alloc_dbg3 == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_ALLOC_DEBUG");
            g_alloc_dbg3 = (e && atoi(e) != 0) ? 1 : 0;
        }
        if (g_alloc_dbg3) {
            static _Atomic int printed_final[8]; int exp = 0;
            if (atomic_compare_exchange_strong(&printed_final[class_idx], &exp, 1)) {
                fprintf(stderr, "[ALLOC3] no items after retry (final fallback) class=%d\n", class_idx);
            }
        }
        return hak_tiny_alloc_slow(0, class_idx); // Ultimate fallback
    }

    // Take one for return, push rest to Small Magazine
    g_3layer_batch_carve_count++;
    void* result = items[0];
    if (got > 1) {
        int pushed = small_mag_batch_push(class_idx, &items[1], got - 1);
        g_3layer_refill_count++;
        g_3layer_refill_items += pushed;
    }
    tiny_active_account_alloc(result);
    // Route: slab carve direct (counted the same as a linear-path hit)
    ROUTE_MARK(11); ROUTE_COMMIT(class_idx, 0x60);
    return result;
}
// Debug function: print layer statistics
__attribute__((destructor))
static void print_3layer_stats(void) {
    uint64_t total = g_3layer_bump_hits + g_3layer_mag_hits + g_3layer_slow_hits;
    if (total > 0) {
        fprintf(stderr, "\n=== 3-Layer Architecture Stats ===\n");
        fprintf(stderr, "Bump hits: %10lu (%5.2f%%)\n",
                g_3layer_bump_hits, 100.0 * g_3layer_bump_hits / total);
        fprintf(stderr, "Mag hits: %10lu (%5.2f%%)\n",
                g_3layer_mag_hits, 100.0 * g_3layer_mag_hits / total);
        fprintf(stderr, "Slow hits: %10lu (%5.2f%%)\n",
                g_3layer_slow_hits, 100.0 * g_3layer_slow_hits / total);
        fprintf(stderr, "Total allocs: %10lu\n", total);
        fprintf(stderr, "Refill count: %10lu\n", g_3layer_refill_count);
        fprintf(stderr, "Refill items: %10lu (avg %.1f/refill)\n",
                g_3layer_refill_items,
                g_3layer_refill_count > 0 ? (double)g_3layer_refill_items / g_3layer_refill_count : 0.0);
        fprintf(stderr, "=== Fallback Paths ===\n");
        fprintf(stderr, "SuperSlab disabled: %lu\n", g_3layer_fallback_superslab_disabled);
        fprintf(stderr, "No SuperSlab: %lu\n", g_3layer_fallback_no_ss);
        fprintf(stderr, "No meta: %lu\n", g_3layer_fallback_no_meta);
        fprintf(stderr, "Batch carve count: %lu\n", g_3layer_batch_carve_count);
    }
}