Files
hakmem/core/tiny_free_fast.inc.h
Moe Charm (CI) 1da8754d45 CRITICAL FIX: TLS 未初期化による 4T SEGV を完全解消
**問題:**
- Larson 4T で 100% SEGV (1T は 2.09M ops/s で完走)
- System/mimalloc は 4T で 33.52M ops/s 正常動作
- SS OFF + Remote OFF でも 4T で SEGV

**根本原因: (Task agent ultrathink 調査結果)**
```
CRASH: mov (%r15),%r13
R15 = 0x6261  ← ASCII "ba" (ゴミ値、未初期化TLS)
```

Worker スレッドの TLS 変数が未初期化:
- `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];`  ← 初期化なし
- pthread_create() で生成されたスレッドでゼロ初期化されない
- NULL チェックが通過 (0x6261 != NULL) → dereference → SEGV

**修正内容:**
全 TLS 配列に明示的初期化子 `= {0}` を追加:

1. **core/hakmem_tiny.c:**
   - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
   - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
   - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
   - `g_tls_bend[TINY_NUM_CLASSES] = {0}`

2. **core/tiny_fastcache.c:**
   - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
   - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`

3. **core/hakmem_tiny_magazine.c:**
   - `g_tls_mags[TINY_NUM_CLASSES] = {0}`

4. **core/tiny_sticky.c:**
   - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
   - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`

**効果:**
```
Before: 1T: 2.09M   |  4T: SEGV 💀
After:  1T: 2.41M   |  4T: 4.19M   (+15% 1T, SEGV解消)
```

**テスト:**
```bash
# 1 thread: 完走
./larson_hakmem 2 8 128 1024 1 12345 1
→ Throughput = 2,407,597 ops/s 

# 4 threads: 完走(以前は SEGV)
./larson_hakmem 2 8 128 1024 1 12345 4
→ Throughput = 4,192,155 ops/s 
```

**調査協力:** Task agent (ultrathink mode) による完璧な根本原因特定

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-07 01:27:04 +09:00

308 lines
12 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// tiny_free_fast.inc.h - Box 6: Free Fast Path (same-thread free only)
// Purpose: Ultra-fast TLS freelist push (ownership check + 2-3 instructions)
// Invariant: owner_tid == my_tid → push to TLS, else → delegate to remote path
// Design: Clear boundary between same-thread (fast) and cross-thread (remote)
#pragma once
#include "tiny_atomic.h"
#include "hakmem_tiny.h"
#include "hakmem_tiny_superslab.h"
#include "slab_handle.h"
#include "tiny_alloc_fast_sfc.inc.h" // For sfc_free_push
// ========== Debug Counters (compile-time gated) ==========
#if HAKMEM_DEBUG_COUNTERS
// Free pipeline counters (defined in hakmem_tiny.c)
// Only declared when HAKMEM_DEBUG_COUNTERS is enabled at compile time.
// In this file both arrays are indexed by the SuperSlab's size_class:
// g_free_via_ss_local is bumped when the ownership check passes (same-thread),
// g_free_via_ss_remote when it fails (cross-thread).
// NOTE(review): the increments are plain (non-atomic) ++ on global arrays —
// presumably acceptable for debug-only statistics; confirm.
extern unsigned long long g_free_via_ss_local[];
extern unsigned long long g_free_via_ss_remote[];
#endif
// ========== Box 6: Free Fast Path ==========
// Fast Free layer of the Box design ("Box Theory"). Handles same-thread frees only (2-3 instructions + ownership check).
// Invariants:
// - owner_tid == my_tid → push to the TLS freelist (no lock, no sync)
// - owner_tid != my_tid → delegate to Box 2 (Remote Queue)
// - A cross-thread free must never be pushed onto the TLS freelist (prevents the A213 error)
// External functions (Backend)
// hak_tiny_free: general tiny free entry point. This file calls it when the
// fast path is disabled via the environment, or when the SuperSlab fast path
// returns 0.
extern void hak_tiny_free(void* ptr);
// hak_tiny_free_with_slab: called from this file when the legacy (TinySlab)
// fast path returns 0; receives the slab that was already looked up.
extern void hak_tiny_free_with_slab(void* ptr, TinySlab* slab);
// hak_free_at signature: (void* ptr, size_t hint_sz, hak_callsite_t site)
// where hak_callsite_t is const void*
// Called from this file with hint_sz = 0 and site = 0 when ptr matched neither
// a SuperSlab nor a TinySlab.
extern void hak_free_at(void* ptr, size_t hint_sz, const void* site);
// Lookup helpers: each result is compared against NULL by the caller in this
// file; the SuperSlab result is additionally validated against SUPERSLAB_MAGIC.
extern SuperSlab* hak_super_lookup(void* ptr);
extern TinySlab* hak_tiny_owner_slab(void* ptr);
// Non-zero enables the SuperSlab lookup in tiny_free_fast().
extern int g_use_superslab;
// External helpers
// Identity of the calling thread, as a 32-bit id (compared with a slab's
// owner_tid on the SuperSlab path) and as a pthread_t (compared with
// pthread_equal on the legacy path).
extern uint32_t tiny_self_u32(void);
extern pthread_t tiny_self_pt(void);
// External TLS variables (from Box 5)
// NOTE(review): neither array is referenced by the code visible in this file;
// presumably kept for other code that includes it — confirm before removing.
extern __thread void* g_tls_sll_head[TINY_NUM_CLASSES];
extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];
// Box 5 helper (TLS push)
// Used by the fast paths in this file when g_sfc_enabled is 0.
extern void tiny_alloc_fast_push(int class_idx, void* ptr);
// ========== Ownership Check ==========
// Ownership test for the SuperSlab path.
//
// Reads the owner_tid stored in ss->slabs[slab_idx] with a relaxed atomic load
// and compares it with my_tid.
//
// Parameters:
//   ss       - SuperSlab holding the slab; dereferenced unconditionally
//   slab_idx - index into ss->slabs; NOT range-checked here, so the caller
//              must pass a valid index
//   my_tid   - 32-bit id of the calling thread
// Returns: 1 if the recorded owner equals my_tid (same-thread free),
//          0 otherwise (cross-thread free; the caller must not touch the
//          thread-local cache and should use the remote path instead).
//
// NOTE(review): only the load itself is atomic. Nothing in this function keeps
// owner_tid stable after it has been read, so any "no TOCTOU" guarantee has to
// come from how ownership is assigned elsewhere — confirm.
static inline int tiny_free_is_same_thread_ss(SuperSlab* ss, int slab_idx, uint32_t my_tid) {
    const uint32_t recorded_owner =
        tiny_atomic_load_u32_relaxed(&ss->slabs[slab_idx].owner_tid);

    if (recorded_owner != my_tid) {
        return 0;  // owned by another thread (or unowned)
    }
    return 1;
}
// Ownership test for the legacy TinySlab path.
//
// Compares slab->owner_tid with the calling thread's pthread_t (obtained from
// tiny_self_pt()) using pthread_equal().
//
// Parameters:
//   slab - TinySlab to test; dereferenced unconditionally
// Returns: the result of pthread_equal(): non-zero if the slab is owned by the
//          calling thread, 0 if it is owned by a different thread. Callers in
//          this file only test the result for truth.
static inline int tiny_free_is_same_thread_legacy(TinySlab* slab) {
    const pthread_t caller = tiny_self_pt();
    int same_owner = pthread_equal(slab->owner_tid, caller);
    return same_owner;
}
// ========== Fast Path: Same-Thread Free (2-3 instructions) ==========
// Fast-path free for a SuperSlab-backed block.
//
// Parameters:
//   ss       - SuperSlab that contains ptr (dereferenced unconditionally)
//   slab_idx - index of the slab inside ss; validated against
//              ss_slabs_capacity(ss) before use
//   ptr      - block being freed
//   my_tid   - 32-bit id of the calling thread
// Returns: 1 if the block was pushed onto the caller's thread-local cache and
//          the SuperSlab's active count was decremented;
//          0 if nothing was done (invalid slab_idx, slab owned by another
//          thread, or the SFC refused the push). On 0 the caller still owns
//          the free and must route ptr through the slow/remote path.
//
// Intended shape of the same-thread path (original author's sketch, x86-64;
// the debug trace and counters below add to this):
//   mov eax, DWORD PTR [meta->owner_tid]     ; Load owner_tid
//   cmp eax, my_tid                          ; Compare with my_tid
//   jne .cross_thread                        ; If not equal, cross-thread
//   mov rax, QWORD PTR g_tls_sll_head[cls]   ; Load head
//   mov QWORD PTR [ptr], rax                 ; ptr->next = head
//   mov QWORD PTR g_tls_sll_head[cls], ptr   ; head = ptr
//   ret                                      ; Done
//   .cross_thread:
//   ; Delegate to remote path
//
// Debug trace: when the environment variable HAKMEM_SFC_DEBUG is set, the
// first 20 calls on each thread print a "[FREE_SS]" line to stderr. The
// variable is read once per thread and cached, so this hot path no longer
// calls getenv() on every free; changing the variable after a thread's first
// call has no effect on that thread.
static inline int tiny_free_fast_ss(SuperSlab* ss, int slab_idx, void* ptr, uint32_t my_tid) {
    extern int g_sfc_enabled;

    // Reject out-of-range indices before ss->slabs[slab_idx] is touched
    // (a negative index would otherwise read in front of the array).
    int cap = ss_slabs_capacity(ss);
    if (__builtin_expect(slab_idx < 0 || slab_idx >= cap, 0)) {
        return 0;  // Invalid index, reject
    }

    // Per-thread cache of the debug switch: -1 = not read yet, 0 = off, 1 = on.
    // Thread-local so that the lazy initialisation needs no synchronisation.
    static __thread int sfc_debug_on = -1;
    static __thread int free_ss_debug_count = 0;
    if (__builtin_expect(sfc_debug_on == -1, 0)) {
        sfc_debug_on = getenv("HAKMEM_SFC_DEBUG") ? 1 : 0;
    }
    if (__builtin_expect(sfc_debug_on, 0) && free_ss_debug_count < 20) {
        free_ss_debug_count++;
        int is_same = tiny_free_is_same_thread_ss(ss, slab_idx, my_tid);
        fprintf(stderr, "[FREE_SS] ptr=%p, cls=%d, same_thread=%d, sfc_enabled=%d\n",
                ptr, ss->size_class, is_same, g_sfc_enabled);
    }

    // Ownership boundary: a block whose slab belongs to another thread must
    // never enter this thread's cache.
    if (__builtin_expect(!tiny_free_is_same_thread_ss(ss, slab_idx, my_tid), 0)) {
#if HAKMEM_DEBUG_COUNTERS
        // Track cross-thread frees (compile-time gated)
        g_free_via_ss_remote[ss->size_class]++;
#endif
        return 0;  // Cross-thread: caller should delegate to the remote path
    }

    // Same-thread from here on.
    int class_idx = ss->size_class;
#if HAKMEM_DEBUG_COUNTERS
    // Track same-thread frees (compile-time gated). Counted before the push,
    // so a free that the SFC then rejects is still included.
    g_free_via_ss_local[class_idx]++;
#endif

    if (g_sfc_enabled) {
        // SFC cache. A zero return means the push was refused (per the
        // original note: cache full). Do NOT fall back to the SLL here: the
        // original author warns it has no capacity check and would grow
        // without bound. Return 0 so the caller takes the slow path.
        if (!sfc_free_push(class_idx, ptr)) {
            return 0;
        }
    } else {
        // Older SLL cache.
        tiny_alloc_fast_push(class_idx, ptr);
    }

    // The block is now cached, so update the SuperSlab's active accounting.
    // Reached only after a successful push.
    ss_active_dec_one(ss);
    return 1;  // Success
}
// Fast-path free for a block backed by a legacy TinySlab.
//
// Parameters:
//   slab - TinySlab that owns ptr (dereferenced unconditionally)
//   ptr  - block being freed
// Returns: 1 if the block was pushed onto the caller's thread-local cache;
//          0 if the slab belongs to another thread or the SFC refused the
//          push. On 0 nothing was done and the caller must use the precise
//          (slow) path.
static inline int tiny_free_fast_legacy(TinySlab* slab, void* ptr) {
    extern int g_sfc_enabled;

    // Ownership boundary: never cache a block owned by another thread.
    if (__builtin_expect(!tiny_free_is_same_thread_legacy(slab), 0)) {
        return 0;
    }

    const int cls = slab->class_idx;

    if (!g_sfc_enabled) {
        // Older SLL cache.
        tiny_alloc_fast_push(cls, ptr);
        return 1;
    }

    // SFC cache. A zero return means the push was refused; the SLL is
    // deliberately not used as a fallback (the original author notes it has
    // no capacity check and would grow without bound).
    return sfc_free_push(cls, ptr) ? 1 : 0;
}
// ========== Combined Fast Free (Lookup + Ownership + Push) ==========
// Complete fast free path: lookup + ownership check + thread-local push.
// Returns nothing; every pointer is handed to exactly one of the paths below.
//
// Flow:
//   0. Runtime gate (read from the environment once, on the first call):
//        HAKMEM_TINY_FREE_FAST   unset or empty -> fast path enabled (default)
//                                value starting with '0' -> disabled
//                                any other value -> enabled
//        HAKMEM_TINY_FREE_TO_SS  set to a value not starting with '0' ->
//                                fast path disabled, regardless of the above
//      When disabled, every call goes straight to hak_tiny_free().
//   1. If g_use_superslab is non-zero and hak_super_lookup() returns a
//      SuperSlab with a valid magic: try tiny_free_fast_ss(); if it returns 0
//      (cross-thread, bad index, or cache refused) delegate to hak_tiny_free().
//   2. Otherwise, if hak_tiny_owner_slab() finds a legacy TinySlab: try
//      tiny_free_fast_legacy(); on 0 delegate to hak_tiny_free_with_slab().
//   3. Otherwise the pointer is not a tiny allocation: delegate to
//      hak_free_at() with no size hint and no call site.
//
// Example usage:
//   tiny_free_fast(ptr);  // Always handled (delegates when the fast path fails)
//
// NOTE(review): s_free_fast_en is a plain static shared by all threads and is
// initialised lazily without synchronisation. Concurrent first calls should
// compute the same value as long as the environment is not modified — confirm
// this is acceptable.
static inline void tiny_free_fast(void* ptr) {
    static int s_free_fast_en = -1;  // -1 = not read yet, 0 = disabled, 1 = enabled
    if (__builtin_expect(s_free_fast_en == -1, 0)) {
        int v = 1;  // default ON
        const char* e = getenv("HAKMEM_TINY_FREE_FAST");
        if (e && *e) {
            // Honour an explicit setting. The previous expression evaluated to
            // 1 in both branches, so HAKMEM_TINY_FREE_FAST=0 was ignored.
            v = (*e != '0');
        }
        const char* to_ss = getenv("HAKMEM_TINY_FREE_TO_SS");
        if (to_ss && *to_ss && *to_ss != '0') {
            v = 0;  // FREE_TO_SS implies slow path
        }
        s_free_fast_en = v;
    }
    if (!s_free_fast_en) {
        // Delegate to precise slow path (handles same/remote + publish)
        hak_tiny_free(ptr);
        return;
    }

    // 1. SuperSlab-backed tiny pointer?
    if (__builtin_expect(g_use_superslab != 0, 1)) {
        SuperSlab* ss = hak_super_lookup(ptr);
        // NOTE(review): the hint below marks a SuperSlab hit as unlikely,
        // which looks inverted for the fast path; left as-is because it
        // affects code layout only, not behaviour.
        if (__builtin_expect(ss != NULL && ss->magic == SUPERSLAB_MAGIC, 0)) {
            int slab_idx = slab_index_for(ss, ptr);
            uint32_t self_tid = tiny_self_u32();
            // Try the same-thread fast path first.
            if (tiny_free_fast_ss(ss, slab_idx, ptr, self_tid)) {
                return;  // Success: pushed to the thread-local cache
            }
            // Fast path declined: let the full tiny free handle it
            // (including the remote push for cross-thread frees).
            hak_tiny_free(ptr);
            return;
        }
    }

    // 2. Legacy TinySlab-backed pointer?
    TinySlab* slab = hak_tiny_owner_slab(ptr);
    if (__builtin_expect(slab != NULL, 0)) {
        if (tiny_free_fast_legacy(slab, ptr)) {
            return;  // Success: pushed to the thread-local cache
        }
        // Fast path declined: precise path, reusing the slab found above.
        hak_tiny_free_with_slab(ptr, slab);
        return;
    }

    // 3. Not a tiny allocation: delegate to the backend (Mid/Large/Mmap).
    hak_free_at(ptr, 0, NULL);
}
// ========== Guard/Debug Variants ==========
// Debug/guard variant of tiny_free_fast() — currently a plain pass-through.
//
// Planned (per the original notes) but NOT implemented here:
//   - Sentinel checks (0xBADA55)
//   - Double-free detection
//   - Ownership validation
// The original notes mention enabling this with HAKMEM_SAFE_FREE=1; this
// function does not read that variable itself.
//
// Parameters:
//   ptr - block being freed; forwarded unchanged
static inline void tiny_free_fast_guarded(void* ptr)
{
    // TODO: add the guard checks listed above if they become necessary.
    tiny_free_fast(ptr);
}
// ========== Statistics & Diagnostics ==========
// Counters describing how frees were routed by the fast free path (profiling).
typedef struct {
    uint64_t same_thread_count;   // Same-thread frees (TLS push)
    uint64_t cross_thread_count;  // Cross-thread frees (remote queue)
    uint64_t non_tiny_count;      // Non-tiny frees (backend)
} TinyFreeFastStats;

// Per-thread statistics instance, zero-initialised.
// Because it is `static` in a header, every translation unit that includes
// this file gets its own separate per-thread copy.
// NOTE(review): no code visible in this file increments these counters —
// presumably not wired up yet; confirm.
static __thread TinyFreeFastStats g_tiny_free_fast_stats = {
    .same_thread_count = 0,
    .cross_thread_count = 0,
    .non_tiny_count = 0,
};

// Return a by-value snapshot of the calling thread's statistics.
static inline TinyFreeFastStats tiny_free_fast_stats_get(void) {
    TinyFreeFastStats snapshot = g_tiny_free_fast_stats;
    return snapshot;
}

// Zero all counters of the calling thread (for testing/benchmarking).
static inline void tiny_free_fast_stats_reset(void) {
    g_tiny_free_fast_stats = (TinyFreeFastStats){0};
}
// ========== Performance Notes ==========
//
// Expected metrics:
// - Same-thread hit rate: 80-90% (workload dependent)
// - Same-thread latency: 2-3 instructions (ownership check + push)
// - Cross-thread penalty: ~50-100 instructions (remote queue push)
// - Throughput improvement: +10-20% vs current multi-layer design
//
// Key optimizations:
// 1. Ownership check first (fail-fast on cross-thread)
// 2. `__builtin_expect` for branch prediction (same-thread is common)
// 3. `static inline` for zero-cost abstraction
// 4. TLS variables (no atomic ops in same-thread path)
//
// TOCTOU Race Prevention (Box 4 Boundary):
// - Ownership check is atomic (tiny_atomic_load_u32_relaxed)
// - No time window between check and push (single function)
// - Cross-thread frees are immediately delegated (no TLS touch)
//
// Comparison with current design:
// - Current: 38.43% free overhead (free.part.0 + mid_lookup + locks)
// - New: 2-3 instructions (ownership check + TLS push)
// - Reduction: -90% instructions in same-thread path
//
// Inspired by:
// - System tcache (glibc malloc) - fast same-thread free
// - Box Theory - Clear ownership boundaries
// - TOCTOU fix (Box 4) - Atomic ownership check