**問題:**
- Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走)
- System/mimalloc は 4T で 33.52M ops/s 正常動作
- SS OFF + Remote OFF でも 4T で SEGV
**根本原因: (Task agent ultrathink 調査結果)**
```
CRASH: mov (%r15),%r13
R15 = 0x6261 ← ASCII "ba" (ゴミ値、未初期化TLS)
```
Worker スレッドの TLS 変数が未初期化:
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];` ← 初期化なし
- pthread_create() で生成されたスレッドでゼロ初期化されない
- NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV
**修正内容:**
全 TLS 配列に明示的初期化子 `= {0}` を追加:
1. **core/hakmem_tiny.c:**
- `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
- `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
- `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
- `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
- `g_tls_bend[TINY_NUM_CLASSES] = {0}`
2. **core/tiny_fastcache.c:**
- `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
- `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
- `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
- `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`
3. **core/hakmem_tiny_magazine.c:**
- `g_tls_mags[TINY_NUM_CLASSES] = {0}`
4. **core/tiny_sticky.c:**
- `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
- `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
- `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`
**効果:**
```
Before: 1T: 2.09M ✅ | 4T: SEGV 💀
After: 1T: 2.41M ✅ | 4T: 4.19M ✅ (+15% 1T, SEGV解消)
```
**テスト:**
```bash
# 1 thread: 完走
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s ✅
# 4 threads: 完走(以前は SEGV)
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s ✅
```
**調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
104 lines
4.6 KiB
C
104 lines
4.6 KiB
C
// pool_tls_ring.inc.h — Box: L2 Pool TLS ring/shard helpers
|
||
#ifndef POOL_TLS_RING_INC_H
|
||
#define POOL_TLS_RING_INC_H
|
||
|
||
// Minimal header write for Mid allocations (fast-path friendly)
|
||
/*
 * mid_set_header - write the allocation header for a Mid (pool) block.
 *
 * hdr:      header to fill in; must be writable.
 * class_sz: stored in hdr->size.
 * site_id:  stored in hdr->alloc_site (full-header mode only).
 *
 * Does nothing when g_hdr_light_enabled is >= 1. Otherwise magic, method and
 * size are written; when g_hdr_light_enabled is exactly 0 the allocation site,
 * a zero class_bytes and the calling thread's id are recorded as well.
 */
static inline void mid_set_header(AllocHeader* hdr, size_t class_sz, uintptr_t site_id) {
    if (g_hdr_light_enabled < 1) {
        hdr->magic  = HAKMEM_MAGIC;
        hdr->method = ALLOC_METHOD_POOL;
        hdr->size   = class_sz;

        if (g_hdr_light_enabled == 0) {
            hdr->alloc_site  = site_id;
            hdr->class_bytes = 0;
            hdr->owner_tid   = (uintptr_t)pthread_self();
        }
    }
}
|
||
|
||
// Size → class lookup: exact match against g_class_sizes first, then a KiB-indexed LUT covering 0..52KB (Bridge classes supported)
|
||
static inline int hak_pool_get_class_index(size_t size) {
|
||
for (int i = 0; i < POOL_NUM_CLASSES; i++) {
|
||
size_t cs = g_class_sizes[i];
|
||
if (cs != 0 && size == cs) return i;
|
||
}
|
||
uint32_t kb = (uint32_t)((size + 1023) >> 10);
|
||
extern const uint8_t SIZE_TO_CLASS[53];
|
||
return (kb < 53) ? SIZE_TO_CLASS[kb] : -1;
|
||
}
|
||
|
||
// site_id→shard(64 shards)
|
||
static inline int hak_pool_get_shard_index(uintptr_t site_id) {
|
||
if (!g_shard_mix_enabled) {
|
||
return (int)((site_id >> 4) & (POOL_NUM_SHARDS - 1));
|
||
}
|
||
uint64_t x = (uint64_t)site_id;
|
||
uint64_t tid = (uint64_t)(uintptr_t)pthread_self();
|
||
x ^= (tid << 1);
|
||
x += 0x9e3779b97f4a7c15ULL;
|
||
x = (x ^ (x >> 30)) * 0xbf58476d1ce4e5b9ULL;
|
||
x = (x ^ (x >> 27)) * 0x94d049bb133111ebULL;
|
||
x = (x ^ (x >> 31));
|
||
return (int)((uint32_t)x & (POOL_NUM_SHARDS - 1));
|
||
}
|
||
|
||
// Bitmap helpers(O(1))
|
||
static inline void set_nonempty_bit(int class_idx, int shard_idx) {
|
||
atomic_fetch_or(&g_pool.nonempty_mask[class_idx], (uint64_t)(1ULL << shard_idx));
|
||
}
|
||
static inline void clear_nonempty_bit(int class_idx, int shard_idx) {
|
||
atomic_fetch_and(&g_pool.nonempty_mask[class_idx], ~(uint64_t)(1ULL << shard_idx));
|
||
}
|
||
static inline int is_shard_nonempty(int class_idx, int shard_idx) {
|
||
uint64_t mask = atomic_load(&g_pool.nonempty_mask[class_idx]);
|
||
return (mask & (1ULL << shard_idx)) != 0;
|
||
}
|
||
|
||
// Remote MPSC → freelist(ロック下)
|
||
static inline void drain_remote_locked(int class_idx, int shard_idx) {
|
||
uintptr_t head = atomic_exchange_explicit(&g_pool.remote_head[class_idx][shard_idx], (uintptr_t)0, memory_order_acq_rel);
|
||
unsigned drained = 0;
|
||
while (head) {
|
||
PoolBlock* b = (PoolBlock*)head; head = (uintptr_t)b->next;
|
||
b->next = g_pool.freelist[class_idx][shard_idx];
|
||
g_pool.freelist[class_idx][shard_idx] = b; drained++;
|
||
}
|
||
if (drained) {
|
||
atomic_fetch_sub_explicit(&g_pool.remote_count[class_idx][shard_idx], drained, memory_order_relaxed);
|
||
if (g_pool.freelist[class_idx][shard_idx]) set_nonempty_bit(class_idx, shard_idx);
|
||
}
|
||
}
|
||
|
||
// 近傍のnon-empty shardを選ぶ(無ければpreferred)
|
||
static inline int choose_nonempty_shard(int class_idx, int preferred) {
|
||
uint64_t mask = atomic_load_explicit(&g_pool.nonempty_mask[class_idx], memory_order_acquire);
|
||
if (!mask) return preferred;
|
||
int shift = preferred & 63; uint64_t rot = (mask >> shift) | (mask << (64 - shift));
|
||
if (!rot) return preferred; int off = __builtin_ctzll(rot);
|
||
return (preferred + off) & (POOL_NUM_SHARDS - 1);
|
||
}
|
||
|
||
// TLSアクティブページの確保(bump-run)
|
||
/*
 * alloc_tls_page - map a fresh page and install it as the active bump page.
 *
 * class_idx: pool class; selects the user size from g_class_sizes[].
 * ap:        active-page descriptor to fill in (page, bump, end, count).
 *
 * Returns 1 on success, 0 if no block of this class fits in a page or the
 * mapping fails. On success the page is registered via mid_desc_register()
 * with the calling thread's id and the g_pool statistics counters are bumped.
 *
 * mmap() signals failure with MAP_FAILED, not NULL; the previous `!page` test
 * never fired, so a failed mapping was installed in `ap`, registered, and
 * later dereferenced. The NULL test is kept as well so any caller-visible
 * behaviour of the old check is preserved.
 */
static inline int alloc_tls_page(int class_idx, PoolTLSPage* ap) {
    size_t user_size = g_class_sizes[class_idx];
    size_t block_size = HEADER_SIZE + user_size;

    int blocks_per_page = POOL_PAGE_SIZE / block_size;
    if (blocks_per_page <= 0) return 0;

    void* page = mmap(NULL, POOL_PAGE_SIZE, PROT_READ | PROT_WRITE,
                      MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
    if (page == MAP_FAILED || page == NULL) return 0;

    ap->page = page;
    ap->bump = (char*)page;
    ap->end = (char*)page + POOL_PAGE_SIZE;
    ap->count = blocks_per_page;

    mid_desc_register(page, class_idx, (uint64_t)(uintptr_t)pthread_self());

    g_pool.refills[class_idx]++;
    g_pool.total_pages_allocated++;
    g_pool.pages_by_class[class_idx]++;
    g_pool.total_bytes_allocated += POOL_PAGE_SIZE;
    return 1;
}
|
||
|
||
// TLS ring/LIFO への補充(リンク無し)
|
||
/*
 * refill_tls_from_active_page - carve up to `need` blocks off the active
 * bump page and hand them to the thread-local caches.
 *
 * Each carved block goes into `ring` while g_tls_ring_enabled is set and the
 * ring has room (top < POOL_L2_RING_CAP); otherwise it is pushed onto the
 * front of bin->lo_head and bin->lo_count is incremented. The page's bump
 * pointer advances by HEADER_SIZE + g_class_sizes[class_idx] per block and
 * its remaining count drops by one.
 *
 * Returns the number of blocks moved; 0 if `ap` is NULL, has no page, or is
 * already exhausted.
 */
static inline int refill_tls_from_active_page(int class_idx, PoolTLSRing* ring, PoolTLSBin* bin, PoolTLSPage* ap, int need) {
    if (ap == NULL || ap->page == NULL) return 0;
    if (ap->count <= 0 || ap->bump >= ap->end) return 0;

    const size_t stride = HEADER_SIZE + g_class_sizes[class_idx];
    int carved = 0;

    for (int remaining = need;
         remaining > 0 && ap->bump < ap->end && ap->count > 0;
         remaining--) {
        PoolBlock* blk = (PoolBlock*)(void*)ap->bump;

        if (!g_tls_ring_enabled || ring->top >= POOL_L2_RING_CAP) {
            /* Ring disabled or full: push onto the LIFO bin instead. */
            blk->next = bin->lo_head;
            bin->lo_head = blk;
            bin->lo_count++;
        } else {
            ring->items[ring->top++] = blk;
        }

        ap->bump += stride;
        ap->count--;
        carved++;
    }

    return carved;
}
|
||
|
||
#endif // POOL_TLS_RING_INC_H
|
||
|