// Files: hakmem/core/hakmem_tiny_bump.inc.h (108 lines, 3.4 KiB, C)
// Web view links: Raw | Permalink | Normal View | History

// hakmem_tiny_bump.inc.h
// Layer 1: TLS Bump Allocator (Ultra-fast path)
//
// Purpose: 2-3 instruction allocation for hot classes (8B, 16B, 32B)
// Design: bcur += size; if (bcur <= bend) return old;
//
// Part of 3-layer architecture simplification (2025-11-01)
// Based on ChatGPT Pro UltraThink recommendations
#ifndef HAKMEM_TINY_BUMP_INC_H
#define HAKMEM_TINY_BUMP_INC_H
// Branch-prediction hint macros built on __builtin_expect.
// Each one is defined only when no earlier definition is visible.
// `!!(x)` collapses any non-zero value to 1 before it is compared with the
// expected value, so the macros also work on pointers and wide integers.
#if !defined(likely)
#  define likely(x) __builtin_expect(!!(x), 1)
#endif
#if !defined(unlikely)
#  define unlikely(x) __builtin_expect(!!(x), 0)
#endif
// ============================================================================
// Data Structure
// ============================================================================
// State of one bump region: blocks are handed out from bcur upward
// until bend is reached.
typedef struct {
    void* bcur;  // Next address to hand out
    void* bend;  // First address past the region (exclusive)
} TinyBump;

// Thread-local bump state, one slot per hot class
// (index 0, 1, 2 = 8B, 16B, 32B). Every slot starts out empty (NULL/NULL).
static __thread TinyBump g_tiny_bump[3] = {
    [0] = { .bcur = NULL, .bend = NULL },
    [1] = { .bcur = NULL, .bend = NULL },
    [2] = { .bcur = NULL, .bend = NULL },
};
// ============================================================================
// Layer 1: Ultra-fast bump allocation (2-3 instructions/op)
// ============================================================================
// Class 0: 8B
// Takes the next 8 bytes from this thread's class-0 bump region.
// Returns the start of the block on success. Returns NULL, leaving the
// cursor unchanged, when advancing by 8 bytes would move past bend.
__attribute__((always_inline))
static inline void* tiny_bump_alloc_8B(void) {
    char* const block = (char*)g_tiny_bump[0].bcur;
    char* const next = block + 8;
    if (!likely(next <= (char*)g_tiny_bump[0].bend)) {
        return NULL;  // Exhausted, fallback to Layer 2
    }
    g_tiny_bump[0].bcur = next;  // Commit the advance only on success
    return block;
}
// Class 1: 16B
// Takes the next 16 bytes from this thread's class-1 bump region.
// Returns the start of the block on success. Returns NULL, leaving the
// cursor unchanged, when advancing by 16 bytes would move past bend.
__attribute__((always_inline))
static inline void* tiny_bump_alloc_16B(void) {
    char* const block = (char*)g_tiny_bump[1].bcur;
    char* const next = block + 16;
    if (!likely(next <= (char*)g_tiny_bump[1].bend)) {
        return NULL;  // Exhausted, fallback to Layer 2
    }
    g_tiny_bump[1].bcur = next;  // Commit the advance only on success
    return block;
}
// Class 2: 32B
// Takes the next 32 bytes from this thread's class-2 bump region.
// Returns the start of the block on success. Returns NULL, leaving the
// cursor unchanged, when advancing by 32 bytes would move past bend.
__attribute__((always_inline))
static inline void* tiny_bump_alloc_32B(void) {
    char* const block = (char*)g_tiny_bump[2].bcur;
    char* const next = block + 32;
    if (!likely(next <= (char*)g_tiny_bump[2].bend)) {
        return NULL;  // Exhausted, fallback to Layer 2
    }
    g_tiny_bump[2].bcur = next;  // Commit the advance only on success
    return block;
}
// Generic bump alloc (for use in slow path).
// Dispatches on class_idx to the matching fixed-size allocator:
//   0 -> 8B, 1 -> 16B, 2 -> 32B.
// Returns whatever that allocator returns, or NULL for any other index.
__attribute__((always_inline))
static inline void* tiny_bump_alloc(int class_idx) {
    switch (class_idx) {
    case 0:
        return tiny_bump_alloc_8B();
    case 1:
        return tiny_bump_alloc_16B();
    case 2:
        return tiny_bump_alloc_32B();
    default:
        return NULL;  // Not a hot class
    }
}
// ============================================================================
// Bump refill (called from Layer 3: slow path)
// ============================================================================
// ----------------------------------------------------------------------------
// History note captured with this file (commit message, translated to English)
// Commit date: 2025-11-07 01:27:04 +09:00
//
// CRITICAL FIX: fully resolve the 4-thread SEGV caused by uninitialized TLS
//
// Problem:
//   - Larson with 4 threads hit SEGV 100% of the time
//     (1 thread ran to completion at 2.09M ops/s).
//   - System/mimalloc ran normally with 4 threads at 33.52M ops/s.
//   - SEGV with 4 threads even with SS OFF + Remote OFF.
//
// Root cause (result of the Task agent ultrathink investigation):
//   CRASH: mov (%r15),%r13
//   R15 = 0x6261  <- ASCII "ba" (garbage value, uninitialized TLS)
//   TLS variables in worker threads were uninitialized:
//   - `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];`  <- no initializer
//   - Not zero-initialized in threads created by pthread_create().
//   - The NULL check passed (0x6261 != NULL) -> dereference -> SEGV.
//
// Fix: add an explicit `= {0}` initializer to every TLS array:
//   1. core/hakmem_tiny.c:
//      - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
//      - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
//      - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
//      - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
//      - `g_tls_bend[TINY_NUM_CLASSES] = {0}`
//   2. core/tiny_fastcache.c:
//      - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
//      - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
//      - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
//      - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`
//   3. core/hakmem_tiny_magazine.c:
//      - `g_tls_mags[TINY_NUM_CLASSES] = {0}`
//   4. core/tiny_sticky.c:
//      - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
//      - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
//      - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`
//
// Effect:
//   Before: 1T: 2.09M OK | 4T: SEGV
//   After:  1T: 2.41M OK | 4T: 4.19M OK  (+15% at 1T, SEGV eliminated)
//
// Test:
//   # 1 thread: completes
//   ./larson_hakmem 2 8 128 1024 1 12345 1  -> Throughput = 2,407,597 ops/s
//   # 4 threads: completes (previously SEGV)
//   ./larson_hakmem 2 8 128 1024 1 12345 4  -> Throughput = 4,192,155 ops/s
//
// Investigation credit: root cause identified by Task agent (ultrathink mode).
// Generated with [Claude Code](https://claude.com/claude-code)
// Co-Authored-By: Claude <noreply@anthropic.com>
// ----------------------------------------------------------------------------
// Install a fresh region for one hot class in this thread's bump state.
//
//   class_idx  hot class index (0, 1 or 2); any other value is ignored
//   base       start of the region; NULL leaves the class empty
//   size       length of the region in bytes
//
// After the call the class hands out blocks from [base, base + size).
//
// A NULL base is stored as an empty region (bcur == bend == NULL) instead of
// computing NULL + size. Otherwise bend would become a small non-null address
// and the bump allocators, which only check `bcur + n <= bend`, would start
// returning bogus low addresses (8, 16, ...) after the first call.
__attribute__((noinline, unused))
static void tiny_bump_refill(int class_idx, void* base, size_t size) {
    if (class_idx < 0 || class_idx > 2) return;  // Only hot classes
    if (base == NULL) {
        // No backing memory: keep the class exhausted so callers fall back.
        g_tiny_bump[class_idx].bcur = NULL;
        g_tiny_bump[class_idx].bend = NULL;
        return;
    }
    g_tiny_bump[class_idx].bcur = base;
    g_tiny_bump[class_idx].bend = (char*)base + size;  // Exclusive end
}
// Reset bump allocator (e.g., thread shutdown).
// Clears both pointers of the given hot class (0, 1 or 2) to NULL in this
// thread's bump state. Any other index is ignored.
static void tiny_bump_reset(int class_idx) {
    if (class_idx >= 0 && class_idx <= 2) {
        g_tiny_bump[class_idx].bend = NULL;
        g_tiny_bump[class_idx].bcur = NULL;
    }
}
// Reset all bump allocators
// ----------------------------------------------------------------------------
// History note captured with this file (commit message, translated to English)
// Commit date: 2025-11-07 01:27:04 +09:00
//
// CRITICAL FIX: fully resolve the 4-thread SEGV caused by uninitialized TLS
//
// Problem:
//   - Larson with 4 threads hit SEGV 100% of the time
//     (1 thread ran to completion at 2.09M ops/s).
//   - System/mimalloc ran normally with 4 threads at 33.52M ops/s.
//   - SEGV with 4 threads even with SS OFF + Remote OFF.
//
// Root cause (result of the Task agent ultrathink investigation):
//   CRASH: mov (%r15),%r13
//   R15 = 0x6261  <- ASCII "ba" (garbage value, uninitialized TLS)
//   TLS variables in worker threads were uninitialized:
//   - `__thread void* g_tls_sll_head[TINY_NUM_CLASSES];`  <- no initializer
//   - Not zero-initialized in threads created by pthread_create().
//   - The NULL check passed (0x6261 != NULL) -> dereference -> SEGV.
//
// Fix: add an explicit `= {0}` initializer to every TLS array:
//   1. core/hakmem_tiny.c:
//      - `g_tls_sll_head[TINY_NUM_CLASSES] = {0}`
//      - `g_tls_sll_count[TINY_NUM_CLASSES] = {0}`
//      - `g_tls_live_ss[TINY_NUM_CLASSES] = {0}`
//      - `g_tls_bcur[TINY_NUM_CLASSES] = {0}`
//      - `g_tls_bend[TINY_NUM_CLASSES] = {0}`
//   2. core/tiny_fastcache.c:
//      - `g_tiny_fast_cache[TINY_FAST_CLASS_COUNT] = {0}`
//      - `g_tiny_fast_count[TINY_FAST_CLASS_COUNT] = {0}`
//      - `g_tiny_fast_free_head[TINY_FAST_CLASS_COUNT] = {0}`
//      - `g_tiny_fast_free_count[TINY_FAST_CLASS_COUNT] = {0}`
//   3. core/hakmem_tiny_magazine.c:
//      - `g_tls_mags[TINY_NUM_CLASSES] = {0}`
//   4. core/tiny_sticky.c:
//      - `g_tls_sticky_ss[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
//      - `g_tls_sticky_idx[TINY_NUM_CLASSES][TINY_STICKY_RING] = {0}`
//      - `g_tls_sticky_pos[TINY_NUM_CLASSES] = {0}`
//
// Effect:
//   Before: 1T: 2.09M OK | 4T: SEGV
//   After:  1T: 2.41M OK | 4T: 4.19M OK  (+15% at 1T, SEGV eliminated)
//
// Test:
//   # 1 thread: completes
//   ./larson_hakmem 2 8 128 1024 1 12345 1  -> Throughput = 2,407,597 ops/s
//   # 4 threads: completes (previously SEGV)
//   ./larson_hakmem 2 8 128 1024 1 12345 4  -> Throughput = 4,192,155 ops/s
//
// Investigation credit: root cause identified by Task agent (ultrathink mode).
// Generated with [Claude Code](https://claude.com/claude-code)
// Co-Authored-By: Claude <noreply@anthropic.com>
// ----------------------------------------------------------------------------
// Clears the bump state of every hot class (indices 0, 1 and 2) by calling
// tiny_bump_reset() on each in ascending order.
static __attribute__((unused)) void tiny_bump_reset_all(void) {
    int cls = 0;
    while (cls < 3) {
        tiny_bump_reset(cls);
        ++cls;
    }
}
#endif // HAKMEM_TINY_BUMP_INC_H