**問題:**
- Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走)
- System/mimalloc は 4T で 33.52M ops/s 正常動作
- SS OFF + Remote OFF でも 4T で SEGV
**根本原因: (Task agent ultrathink 調査結果)**
```
CRASH: mov (%r15),%r13
R15 = 0x6261 ← ASCII "ba" (ゴミ値、未初期化TLS)
```
Worker スレッドの TLS 変数が未初期化:
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];` ← 初期化なし
- pthread_create() で生成されたスレッドでゼロ初期化されない
- NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV
**修正内容:**
全 TLS 配列に明示的初期化子 `= {0}` を追加:
1. **core/hakmem_tiny.c:**
- `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
- `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
- `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
- `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
- `g_tls_bend[TINY_NUM_CLASSES] = {0}`
2. **core/tiny_fastcache.c:**
- `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
- `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
- `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
- `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`
3. **core/hakmem_tiny_magazine.c:**
- `g_tls_mags[TINY_NUM_CLASSES] = {0}`
4. **core/tiny_sticky.c:**
- `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
- `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
- `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`
**効果:**
```
Before: 1T: 2.09M ✅ | 4T: SEGV 💀
After: 1T: 2.41M ✅ | 4T: 4.19M ✅ (+15% 1T, SEGV解消)
```
**テスト:**
```bash
# 1 thread: 完走
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s ✅
# 4 threads: 完走(以前は SEGV)
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s ✅
```
**調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
299 lines · 13 KiB · C++
// ============================================================================
|
||
// Step 3: Cold-path outline - Wrapper Context Handler
|
||
// ============================================================================
|
||
// Purpose: Handle allocations during wrapper calls (rare execution)
|
||
// Rationale: Avoid re-entrancy hazards with pthread locks during wrapper calls
|
||
// Step 3d: Force inline for readability without performance loss
|
||
__attribute__((always_inline))
|
||
static inline void* hak_tiny_alloc_wrapper(int class_idx) {
|
||
ROUTE_BEGIN(class_idx);
|
||
|
||
// Wrapper-context fast path: magazine-only (never take locks or refill)
|
||
tiny_small_mags_init_once();
|
||
if (__builtin_expect(class_idx > 3, 0)) tiny_mag_init_if_needed(class_idx);
|
||
TinyTLSMag* mag = &g_tls_mags[class_idx];
|
||
if (mag->top > 0) {
|
||
void* p = mag->items[--mag->top].ptr;
|
||
HAK_RET_ALLOC(class_idx, p);
|
||
}
|
||
|
||
// Try TLS active slabs (owner-only, lock-free)
|
||
TinySlab* tls = g_tls_active_slab_a[class_idx];
|
||
if (!(tls && tls->free_count > 0)) tls = g_tls_active_slab_b[class_idx];
|
||
if (tls && tls->free_count > 0) {
|
||
tiny_remote_drain_owner(tls);
|
||
if (tls->free_count > 0) {
|
||
int block_idx = hak_tiny_find_free_block(tls);
|
||
if (block_idx >= 0) {
|
||
hak_tiny_set_used(tls, block_idx);
|
||
tls->free_count--;
|
||
size_t bs = g_tiny_class_sizes[class_idx];
|
||
void* p = (char*)tls->base + (block_idx * bs);
|
||
HAK_RET_ALLOC(class_idx, p);
|
||
}
|
||
}
|
||
}
|
||
|
||
// Optional: attempt limited refill under trylock (no remote drain)
|
||
if (g_wrap_tiny_refill) {
|
||
pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m;
|
||
if (pthread_mutex_trylock(lock) == 0) {
|
||
TinySlab* slab = g_tiny_pool.free_slabs[class_idx];
|
||
if (slab && slab->free_count > 0) {
|
||
int room = mag->cap - mag->top;
|
||
if (room > 16) room = 16; // wrapper refill is small and quick
|
||
if (room > slab->free_count) room = slab->free_count;
|
||
if (room > 0) {
|
||
size_t bs = g_tiny_class_sizes[class_idx];
|
||
void* ret = NULL;
|
||
for (int i = 0; i < room; i++) {
|
||
int idx = hak_tiny_find_free_block(slab);
|
||
if (idx < 0) break;
|
||
hak_tiny_set_used(slab, idx);
|
||
slab->free_count--;
|
||
void* p = (char*)slab->base + (idx * bs);
|
||
if (i < room - 1) {
|
||
mag->items[mag->top].ptr = p;
|
||
mag->top++;
|
||
} else {
|
||
ret = p; // return one directly
|
||
}
|
||
}
|
||
if (slab->free_count == 0) {
|
||
move_to_full_list(class_idx, slab);
|
||
}
|
||
pthread_mutex_unlock(lock);
|
||
if (ret) { HAK_RET_ALLOC(class_idx, ret); }
|
||
} else {
|
||
pthread_mutex_unlock(lock);
|
||
}
|
||
} else {
|
||
pthread_mutex_unlock(lock);
|
||
}
|
||
}
|
||
}
|
||
return NULL; // empty → fallback to next allocator tier
|
||
}
|
||
|
||
|
||
void* hak_tiny_alloc(size_t size) {
|
||
#if !HAKMEM_BUILD_RELEASE
|
||
if (!g_tiny_initialized) hak_tiny_init();
|
||
#else
|
||
if (__builtin_expect(!g_tiny_initialized, 0)) {
|
||
hak_tiny_init();
|
||
}
|
||
#endif
|
||
// Default (safe): Avoid using Tiny during wrapper calls(TLSガード or 関数)
|
||
// If HAKMEM_WRAP_TINY=1, allow Tiny even when called from wrapper.
|
||
#if !HAKMEM_BUILD_RELEASE
|
||
# if HAKMEM_WRAPPER_TLS_GUARD
|
||
if (!g_wrap_tiny_enabled && __builtin_expect(g_tls_in_wrapper != 0, 0)) {
|
||
static int log1 = 0;
|
||
if (log1 < 2) { fprintf(stderr, "[DEBUG] Tiny blocked: in_wrapper\n"); log1++; }
|
||
return NULL;
|
||
}
|
||
# else
|
||
extern int hak_in_wrapper(void);
|
||
if (!g_wrap_tiny_enabled && __builtin_expect(hak_in_wrapper() != 0, 0)) {
|
||
static int log2 = 0;
|
||
if (log2 < 2) { fprintf(stderr, "[DEBUG] Tiny blocked: hak_in_wrapper\n"); log2++; }
|
||
return NULL;
|
||
}
|
||
# endif
|
||
#endif
|
||
|
||
// ========================================================================
|
||
// Cooperative stats polling (SIGUSR1 trigger safe point)
|
||
hak_tiny_stats_poll();
|
||
|
||
// ========================================================================
|
||
// Phase 6-1.5: Ultra-Simple Fast Path (when enabled)
|
||
// ========================================================================
|
||
// Design: "Simple Front + Smart Back" - inspired by Mid-Large HAKX +171%
|
||
// - 3-4 instruction fast path (Phase 6-1 style)
|
||
// - Existing SuperSlab + ACE + Learning backend
|
||
// Two variants:
|
||
// Phase 6-1.5: -DHAKMEM_TINY_PHASE6_ULTRA_SIMPLE=1 (alignment guessing)
|
||
// Phase 6-1.6: -DHAKMEM_TINY_PHASE6_METADATA=1 (metadata header)
|
||
#ifdef HAKMEM_TINY_PHASE6_ULTRA_SIMPLE
|
||
return hak_tiny_alloc_ultra_simple(size);
|
||
#elif defined(HAKMEM_TINY_PHASE6_METADATA)
|
||
return hak_tiny_alloc_metadata(size);
|
||
#endif
|
||
// ========================================================================
|
||
|
||
// 1. Size → class index
|
||
int class_idx = hak_tiny_size_to_class(size);
|
||
if (class_idx < 0) {
|
||
static int log3 = 0;
|
||
if (log3 < 2) { fprintf(stderr, "[DEBUG] Tiny blocked: class_idx < 0 for size %zu\n", size); log3++; }
|
||
return NULL; // >1KB
|
||
}
|
||
// Route fingerprint begin (debug-only; no-op unless HAKMEM_ROUTE=1)
|
||
ROUTE_BEGIN(class_idx);
|
||
do {
|
||
static int g_alloc_ring = -1;
|
||
if (__builtin_expect(g_alloc_ring == -1, 0)) {
|
||
const char* e = getenv("HAKMEM_TINY_ALLOC_RING");
|
||
g_alloc_ring = (e && *e && *e != '0') ? 1 : 0;
|
||
}
|
||
if (g_alloc_ring) {
|
||
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_ENTER, (uint16_t)class_idx, (void*)(uintptr_t)size, 0);
|
||
}
|
||
} while (0);
|
||
|
||
#if HAKMEM_TINY_MINIMAL_FRONT
|
||
// Minimal Front for hot tiny classes (bench-focused):
|
||
// SLL direct pop → minimal refill → pop, bypassing other layers.
|
||
if (__builtin_expect(class_idx <= 3, 1)) {
|
||
void* head = g_tls_sll_head[class_idx];
|
||
if (__builtin_expect(head != NULL, 1)) {
|
||
g_tls_sll_head[class_idx] = *(void**)head;
|
||
if (g_tls_sll_count[class_idx] > 0) g_tls_sll_count[class_idx]--;
|
||
HAK_RET_ALLOC(class_idx, head);
|
||
}
|
||
// Refill a small batch directly from TLS-cached SuperSlab
|
||
(void)sll_refill_small_from_ss(class_idx, 32);
|
||
head = g_tls_sll_head[class_idx];
|
||
if (__builtin_expect(head != NULL, 1)) {
|
||
g_tls_sll_head[class_idx] = *(void**)head;
|
||
if (g_tls_sll_count[class_idx] > 0) g_tls_sll_count[class_idx]--;
|
||
HAK_RET_ALLOC(class_idx, head);
|
||
}
|
||
// Fall through to slow path if still empty
|
||
}
|
||
#endif
|
||
|
||
// Ultra-Front: minimal per-class stack for hot tiny classes (opt-in)
|
||
// Try ultra_pop → (optional) ultra_refill_small → ultra_pop before other layers
|
||
if (__builtin_expect(g_ultra_simple && class_idx <= 3, 0)) {
|
||
void* up = ultra_pop(class_idx);
|
||
if (__builtin_expect(up == NULL, 0)) {
|
||
(void)ultra_refill_small(class_idx);
|
||
up = ultra_pop(class_idx);
|
||
}
|
||
if (__builtin_expect(up != NULL, 0)) {
|
||
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, up, 0xF0);
|
||
HAK_RET_ALLOC(class_idx, up);
|
||
}
|
||
}
|
||
|
||
if (__builtin_expect(!g_debug_fast0, 1)) {
|
||
#ifdef HAKMEM_TINY_BENCH_FASTPATH
|
||
if (__builtin_expect(class_idx <= HAKMEM_TINY_BENCH_TINY_CLASSES, 1)) {
|
||
if (__builtin_expect(class_idx <= 3, 1)) {
|
||
unsigned char* done = &g_tls_bench_warm_done[class_idx];
|
||
if (__builtin_expect(*done == 0, 0)) {
|
||
int warm = (class_idx == 0) ? HAKMEM_TINY_BENCH_WARMUP8 :
|
||
(class_idx == 1) ? HAKMEM_TINY_BENCH_WARMUP16 :
|
||
(class_idx == 2) ? HAKMEM_TINY_BENCH_WARMUP32 :
|
||
HAKMEM_TINY_BENCH_WARMUP64;
|
||
if (warm > 0) (void)sll_refill_small_from_ss(class_idx, warm);
|
||
*done = 1;
|
||
}
|
||
}
|
||
#ifndef HAKMEM_TINY_BENCH_SLL_ONLY
|
||
tiny_small_mags_init_once();
|
||
if (class_idx > 3) tiny_mag_init_if_needed(class_idx);
|
||
#endif
|
||
void* head = g_tls_sll_head[class_idx];
|
||
if (__builtin_expect(head != NULL, 1)) {
|
||
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, head, 0);
|
||
g_tls_sll_head[class_idx] = *(void**)head;
|
||
if (g_tls_sll_count[class_idx] > 0) g_tls_sll_count[class_idx]--;
|
||
HAK_RET_ALLOC(class_idx, head);
|
||
}
|
||
#ifndef HAKMEM_TINY_BENCH_SLL_ONLY
|
||
TinyTLSMag* mag = &g_tls_mags[class_idx];
|
||
int t = mag->top;
|
||
if (__builtin_expect(t > 0, 1)) {
|
||
void* p = mag->items[--t].ptr;
|
||
mag->top = t;
|
||
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, p, 1);
|
||
HAK_RET_ALLOC(class_idx, p);
|
||
}
|
||
#endif
|
||
int bench_refill = (class_idx == 0) ? HAKMEM_TINY_BENCH_REFILL8 :
|
||
(class_idx == 1) ? HAKMEM_TINY_BENCH_REFILL16 :
|
||
(class_idx == 2) ? HAKMEM_TINY_BENCH_REFILL32 :
|
||
HAKMEM_TINY_BENCH_REFILL64;
|
||
if (__builtin_expect(sll_refill_small_from_ss(class_idx, bench_refill) > 0, 0)) {
|
||
head = g_tls_sll_head[class_idx];
|
||
if (head) {
|
||
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, head, 2);
|
||
g_tls_sll_head[class_idx] = *(void**)head;
|
||
if (g_tls_sll_count[class_idx] > 0) g_tls_sll_count[class_idx]--;
|
||
HAK_RET_ALLOC(class_idx, head);
|
||
}
|
||
}
|
||
// fallthrough to slow path on miss
|
||
}
|
||
#endif
|
||
|
||
// TinyHotMag front: fast-tierが枯渇したとき、キャッシュを再補充してから利用する
|
||
if (__builtin_expect(g_hotmag_enable && class_idx <= 2 && g_fast_head[class_idx] == NULL, 0)) {
|
||
hotmag_init_if_needed(class_idx);
|
||
TinyHotMag* hm = &g_tls_hot_mag[class_idx];
|
||
void* hotmag_ptr = hotmag_pop(class_idx);
|
||
if (__builtin_expect(hotmag_ptr == NULL, 0)) {
|
||
if (hotmag_try_refill(class_idx, hm) > 0) {
|
||
hotmag_ptr = hotmag_pop(class_idx);
|
||
}
|
||
}
|
||
if (__builtin_expect(hotmag_ptr != NULL, 1)) {
|
||
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, hotmag_ptr, 3);
|
||
HAK_RET_ALLOC(class_idx, hotmag_ptr);
|
||
}
|
||
}
|
||
|
||
if (g_hot_alloc_fn[class_idx] != NULL) {
|
||
void* fast_hot = NULL;
|
||
switch (class_idx) {
|
||
case 0:
|
||
fast_hot = tiny_hot_pop_class0();
|
||
break;
|
||
case 1:
|
||
fast_hot = tiny_hot_pop_class1();
|
||
break;
|
||
case 2:
|
||
fast_hot = tiny_hot_pop_class2();
|
||
break;
|
||
case 3:
|
||
fast_hot = tiny_hot_pop_class3();
|
||
break;
|
||
default:
|
||
fast_hot = NULL;
|
||
break;
|
||
}
|
||
if (__builtin_expect(fast_hot != NULL, 1)) {
|
||
#if HAKMEM_BUILD_DEBUG
|
||
g_tls_hit_count[class_idx]++;
|
||
#endif
|
||
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, fast_hot, 4);
|
||
HAK_RET_ALLOC(class_idx, fast_hot);
|
||
}
|
||
}
|
||
|
||
void* fast = tiny_fast_pop(class_idx);
|
||
if (__builtin_expect(fast != NULL, 0)) {
|
||
#if HAKMEM_BUILD_DEBUG
|
||
g_tls_hit_count[class_idx]++;
|
||
#endif
|
||
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, fast, 5);
|
||
HAK_RET_ALLOC(class_idx, fast);
|
||
}
|
||
} else {
|
||
tiny_debug_ring_record(TINY_RING_EVENT_FRONT_BYPASS, (uint16_t)class_idx, NULL, 0);
|
||
}
|
||
|
||
void* slow_ptr = hak_tiny_alloc_slow(size, class_idx);
|
||
if (slow_ptr) {
|
||
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, slow_ptr, 6);
|
||
HAK_RET_ALLOC(class_idx, slow_ptr); // Increment stats for slow path success
|
||
}
|
||
tiny_alloc_dump_tls_state(class_idx, "fail", &g_tls_slabs[class_idx]);
|
||
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_NULL, (uint16_t)class_idx, NULL, 0);
|
||
return slow_ptr;
|
||
}
|