Files
hakmem/core/superslab/superslab_inline.h

150 lines
4.9 KiB
C
Raw Normal View History

#ifndef SUPERSLAB_INLINE_H
#define SUPERSLAB_INLINE_H
#include "superslab_types.h"
// Change note (2025-11-22 07:40:35 +09:00)
// Fix: workset=8192 SEGV - Unify SuperSlab geometry to Box3 (partial fix)
//
// Problem:
// - bench_random_mixed_hakmem with workset=8192 causes SEGV
// - workset=256 works fine
// - Root cause identified by ChatGPT analysis
//
// Root Cause:
// SuperSlab geometry double definition caused slab_base misalignment:
// - Old: tiny_slab_base_for() used SLAB0_OFFSET + idx * SLAB_SIZE
// - New: Box3 tiny_slab_base_for_geometry() uses offset only for idx=0
// - Result: slab_idx > 0 had +2048 byte offset error
// - Impact: Unified Cache carve stepped beyond slab boundary → SEGV
//
// Fix 1: core/superslab/superslab_inline.h
// ========================================
// Delegate SuperSlab base calculation to Box3:
//   static inline uint8_t* tiny_slab_base_for(SuperSlab* ss, int slab_idx) {
//       if (!ss || slab_idx < 0) return NULL;
//       return tiny_slab_base_for_geometry(ss, slab_idx); // ← Box3 unified
//   }
// Effect:
// - All tiny_slab_base_for() calls now use single Box3 implementation
// - TLS slab_base and Box3 calculations perfectly aligned
// - Eliminates geometry mismatch between layers
//
// Fix 2: core/front/tiny_unified_cache.c
// ========================================
// Enhanced fail-fast validation (debug builds only):
// - unified_refill_validate_base(): Use TLS as source of truth
// - Cross-check with registry lookup for safety
// - Validate: slab_base range, alignment, meta consistency
// - Box3 + TLS boundary consolidated to one place
//
// Fix 3: core/hakmem_tiny_superslab.h
// ========================================
// Added forward declaration:
// - SuperSlab* superslab_refill(int class_idx);
// - Required by tiny_unified_cache.c
//
// Test Results:
// =============
// workset=8192 SEGV threshold improved:
//   Before fix: ❌ Immediate SEGV at any iteration count
//   After fix:  ✅ 100K iterations: OK (9.8M ops/s)
//               ✅ 200K iterations: OK (15.5M ops/s)
//               ❌ 300K iterations: SEGV (different bug exposed)
//
// Conclusion:
// - Box3 geometry unification fixed primary SEGV
// - Stability improved: 0 → 200K iterations
// - Remaining issue: 300K+ iterations hit different bug
// - Likely causes: memory pressure, different corruption pattern
//
// Known Issues:
// - Debug warnings still present: FREE_FAST_HDR_META_MISMATCH, NXT_HDR_MISMATCH
// - These are separate header consistency issues (not related to geometry)
// - 300K+ SEGV requires further investigation
//
// Performance:
// - No performance regression observed in stable range
// - workset=256 unaffected: 60M+ ops/s maintained
//
// Credit: Root cause analysis and fix strategy by ChatGPT
// 🤖 Generated with [Claude Code](https://claude.com/claude-code)
// Co-Authored-By: Claude <noreply@anthropic.com>
#include "../tiny_box_geometry.h" // Box 3 geometry helpers (stride/base/capacity)
// Forward declaration for unsafe remote drain used by refill/handle paths
// Implemented in hakmem_tiny_superslab.c
// NOTE(review): the "_unsafe" suffix suggests the caller must already have
// exclusive access to the slab before calling — confirm against the
// implementation; nothing in this header enforces it.
void _ss_remote_drain_to_freelist_unsafe(SuperSlab* ss, int slab_idx, TinySlabMeta* meta);
// Optional debug counter (defined in hakmem_tiny_superslab.c)
// In this header it is bumped once per ss_active_dec_one() call that receives
// a non-NULL SuperSlab.
extern _Atomic uint64_t g_ss_active_dec_calls;
// Number of slabs this SuperSlab can hold, derived from its size:
// (1 << ss->lg_size) / SLAB_SIZE.
// A NULL SuperSlab holds no slabs, so 0 is returned.
static inline int ss_slabs_capacity(SuperSlab* ss)
{
    if (!ss) {
        return 0;
    }

    const size_t total_bytes = (size_t)1 << ss->lg_size;
    return (int)(total_bytes / SLAB_SIZE);
}
// Compute slab base pointer for given (ss, slab_idx).
Fix: workset=8192 SEGV - Unify SuperSlab geometry to Box3 (partial fix) Problem: - bench_random_mixed_hakmem with workset=8192 causes SEGV - workset=256 works fine - Root cause identified by ChatGPT analysis Root Cause: SuperSlab geometry double definition caused slab_base misalignment: - Old: tiny_slab_base_for() used SLAB0_OFFSET + idx * SLAB_SIZE - New: Box3 tiny_slab_base_for_geometry() uses offset only for idx=0 - Result: slab_idx > 0 had +2048 byte offset error - Impact: Unified Cache carve stepped beyond slab boundary → SEGV Fix 1: core/superslab/superslab_inline.h ======================================== Delegate SuperSlab base calculation to Box3: static inline uint8_t* tiny_slab_base_for(SuperSlab* ss, int slab_idx) { if (!ss || slab_idx < 0) return NULL; return tiny_slab_base_for_geometry(ss, slab_idx); // ← Box3 unified } Effect: - All tiny_slab_base_for() calls now use single Box3 implementation - TLS slab_base and Box3 calculations perfectly aligned - Eliminates geometry mismatch between layers Fix 2: core/front/tiny_unified_cache.c ======================================== Enhanced fail-fast validation (debug builds only): - unified_refill_validate_base(): Use TLS as source of truth - Cross-check with registry lookup for safety - Validate: slab_base range, alignment, meta consistency - Box3 + TLS boundary consolidated to one place Fix 3: core/hakmem_tiny_superslab.h ======================================== Added forward declaration: - SuperSlab* superslab_refill(int class_idx); - Required by tiny_unified_cache.c Test Results: ============= workset=8192 SEGV threshold improved: Before fix: ❌ Immediate SEGV at any iteration count After fix: ✅ 100K iterations: OK (9.8M ops/s) ✅ 200K iterations: OK (15.5M ops/s) ❌ 300K iterations: SEGV (different bug exposed) Conclusion: - Box3 geometry unification fixed primary SEGV - Stability improved: 0 → 200K iterations - Remaining issue: 300K+ iterations hit different bug - Likely causes: memory pressure, different 
corruption pattern Known Issues: - Debug warnings still present: FREE_FAST_HDR_META_MISMATCH, NXT_HDR_MISMATCH - These are separate header consistency issues (not related to geometry) - 300K+ SEGV requires further investigation Performance: - No performance regression observed in stable range - workset=256 unaffected: 60M+ ops/s maintained Credit: Root cause analysis and fix strategy by ChatGPT 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-22 07:40:35 +09:00
// Box 5 wrapper: delegate to Box 3 canonical geometry helper.
static inline uint8_t* tiny_slab_base_for(SuperSlab* ss, int slab_idx)
{
Fix: workset=8192 SEGV - Unify SuperSlab geometry to Box3 (partial fix) Problem: - bench_random_mixed_hakmem with workset=8192 causes SEGV - workset=256 works fine - Root cause identified by ChatGPT analysis Root Cause: SuperSlab geometry double definition caused slab_base misalignment: - Old: tiny_slab_base_for() used SLAB0_OFFSET + idx * SLAB_SIZE - New: Box3 tiny_slab_base_for_geometry() uses offset only for idx=0 - Result: slab_idx > 0 had +2048 byte offset error - Impact: Unified Cache carve stepped beyond slab boundary → SEGV Fix 1: core/superslab/superslab_inline.h ======================================== Delegate SuperSlab base calculation to Box3: static inline uint8_t* tiny_slab_base_for(SuperSlab* ss, int slab_idx) { if (!ss || slab_idx < 0) return NULL; return tiny_slab_base_for_geometry(ss, slab_idx); // ← Box3 unified } Effect: - All tiny_slab_base_for() calls now use single Box3 implementation - TLS slab_base and Box3 calculations perfectly aligned - Eliminates geometry mismatch between layers Fix 2: core/front/tiny_unified_cache.c ======================================== Enhanced fail-fast validation (debug builds only): - unified_refill_validate_base(): Use TLS as source of truth - Cross-check with registry lookup for safety - Validate: slab_base range, alignment, meta consistency - Box3 + TLS boundary consolidated to one place Fix 3: core/hakmem_tiny_superslab.h ======================================== Added forward declaration: - SuperSlab* superslab_refill(int class_idx); - Required by tiny_unified_cache.c Test Results: ============= workset=8192 SEGV threshold improved: Before fix: ❌ Immediate SEGV at any iteration count After fix: ✅ 100K iterations: OK (9.8M ops/s) ✅ 200K iterations: OK (15.5M ops/s) ❌ 300K iterations: SEGV (different bug exposed) Conclusion: - Box3 geometry unification fixed primary SEGV - Stability improved: 0 → 200K iterations - Remaining issue: 300K+ iterations hit different bug - Likely causes: memory pressure, different 
corruption pattern Known Issues: - Debug warnings still present: FREE_FAST_HDR_META_MISMATCH, NXT_HDR_MISMATCH - These are separate header consistency issues (not related to geometry) - 300K+ SEGV requires further investigation Performance: - No performance regression observed in stable range - workset=256 unaffected: 60M+ ops/s maintained Credit: Root cause analysis and fix strategy by ChatGPT 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-22 07:40:35 +09:00
if (!ss || slab_idx < 0) {
return NULL;
}
Fix: workset=8192 SEGV - Unify SuperSlab geometry to Box3 (partial fix) Problem: - bench_random_mixed_hakmem with workset=8192 causes SEGV - workset=256 works fine - Root cause identified by ChatGPT analysis Root Cause: SuperSlab geometry double definition caused slab_base misalignment: - Old: tiny_slab_base_for() used SLAB0_OFFSET + idx * SLAB_SIZE - New: Box3 tiny_slab_base_for_geometry() uses offset only for idx=0 - Result: slab_idx > 0 had +2048 byte offset error - Impact: Unified Cache carve stepped beyond slab boundary → SEGV Fix 1: core/superslab/superslab_inline.h ======================================== Delegate SuperSlab base calculation to Box3: static inline uint8_t* tiny_slab_base_for(SuperSlab* ss, int slab_idx) { if (!ss || slab_idx < 0) return NULL; return tiny_slab_base_for_geometry(ss, slab_idx); // ← Box3 unified } Effect: - All tiny_slab_base_for() calls now use single Box3 implementation - TLS slab_base and Box3 calculations perfectly aligned - Eliminates geometry mismatch between layers Fix 2: core/front/tiny_unified_cache.c ======================================== Enhanced fail-fast validation (debug builds only): - unified_refill_validate_base(): Use TLS as source of truth - Cross-check with registry lookup for safety - Validate: slab_base range, alignment, meta consistency - Box3 + TLS boundary consolidated to one place Fix 3: core/hakmem_tiny_superslab.h ======================================== Added forward declaration: - SuperSlab* superslab_refill(int class_idx); - Required by tiny_unified_cache.c Test Results: ============= workset=8192 SEGV threshold improved: Before fix: ❌ Immediate SEGV at any iteration count After fix: ✅ 100K iterations: OK (9.8M ops/s) ✅ 200K iterations: OK (15.5M ops/s) ❌ 300K iterations: SEGV (different bug exposed) Conclusion: - Box3 geometry unification fixed primary SEGV - Stability improved: 0 → 200K iterations - Remaining issue: 300K+ iterations hit different bug - Likely causes: memory pressure, different 
corruption pattern Known Issues: - Debug warnings still present: FREE_FAST_HDR_META_MISMATCH, NXT_HDR_MISMATCH - These are separate header consistency issues (not related to geometry) - 300K+ SEGV requires further investigation Performance: - No performance regression observed in stable range - workset=256 unaffected: 60M+ ops/s maintained Credit: Root cause analysis and fix strategy by ChatGPT 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-22 07:40:35 +09:00
return tiny_slab_base_for_geometry(ss, slab_idx);
}
// Compute slab index for a pointer inside ss.
//
// Returns the index of the slab that contains ptr, or -1 when:
//   - ss or ptr is NULL,
//   - ptr lies before ss + SUPERSLAB_SLAB0_DATA_OFFSET (SuperSlab header area),
//   - ptr lies at or beyond the end of the SuperSlab (ss + (1 << lg_size)),
//   - the computed index is not below SLABS_PER_SUPERSLAB_MAX.
//
// Geometry: this must be the inverse of tiny_slab_base_for(). Under the Box 3
// layout the slab-0 data offset applies to slab 0 only, and slab i (i > 0)
// starts at ss + i * SLAB_SIZE. The index is therefore taken from the offset
// relative to the SuperSlab base. Subtracting SUPERSLAB_SLAB0_DATA_OFFSET
// first (the previous formula) attributes the first OFFSET bytes of every
// slab i > 0 to slab i - 1.
// NOTE(review): confirm against tiny_slab_base_for_geometry() in
// tiny_box_geometry.h, which is not visible from this header.
static inline int slab_index_for(SuperSlab* ss, void* ptr)
{
    if (!ss || !ptr) {
        return -1;
    }

    uintptr_t base = (uintptr_t)ss;
    uintptr_t p = (uintptr_t)ptr;
    size_t ss_size = (size_t)1 << ss->lg_size;

    // Reject pointers into the header area and pointers past the SuperSlab.
    if (p < base + SUPERSLAB_SLAB0_DATA_OFFSET || p >= base + ss_size) {
        return -1;
    }

    // Slabs are SLAB_SIZE-strided from the SuperSlab base itself.
    size_t idx = (size_t)(p - base) / SLAB_SIZE;
    if (idx >= (size_t)SLABS_PER_SUPERSLAB_MAX) {
        return -1;
    }
    return (int)idx;
}
// Reference-count primitives used by lifecycle paths.
//
// superslab_ref_get: read the current refcount with acquire ordering.
// A NULL SuperSlab reads as 0.
static inline uint32_t superslab_ref_get(SuperSlab* ss)
{
    if (!ss) {
        return 0;
    }
    return atomic_load_explicit(&ss->refcount, memory_order_acquire);
}
// Add one reference to ss (acq_rel read-modify-write). No-op when ss is NULL.
static inline void superslab_ref_inc(SuperSlab* ss)
{
    if (!ss) {
        return;
    }
    (void)atomic_fetch_add_explicit(&ss->refcount, 1, memory_order_acq_rel);
}
// Drop one reference from ss (acq_rel read-modify-write). No-op when ss is NULL.
//
// This is only the decrement primitive: it never frees the SuperSlab and does
// not inspect the previous value, so the caller decides when to free and must
// not call it on a refcount that is already zero (the counter would wrap).
static inline void superslab_ref_dec(SuperSlab* ss)
{
    if (!ss) {
        return;
    }
    (void)atomic_fetch_sub_explicit(&ss->refcount, 1, memory_order_acq_rel);
}
// Ownership helpers (Box 3)
//
// ss_owner_try_acquire: try to claim slab metadata m for thread tid.
// The owner tag is the low 8 bits of tid; a stored tag of 0 means "unowned".
// Performs a single strong compare-exchange 0 -> tag (acq_rel on success,
// relaxed on failure).
// Returns 1 if the exchange succeeded, 0 if m is NULL or the slab already
// carries a non-zero tag.
// NOTE(review): when (tid & 0xFF) == 0 the exchange is 0 -> 0, so this reports
// success while the slab still looks unowned to every other thread.
static inline int ss_owner_try_acquire(TinySlabMeta* m, uint32_t tid)
{
    if (!m) {
        return 0;
    }

    const uint8_t tag = (uint8_t)(tid & 0xFFu);
    uint8_t unowned = 0;
    if (__atomic_compare_exchange_n(&m->owner_tid_low, &unowned, tag,
                                    false, __ATOMIC_ACQ_REL, __ATOMIC_RELAXED)) {
        return 1;
    }
    return 0;
}
// Release ownership of m if it is currently tagged with tid's low 8 bits.
// Performs a single strong compare-exchange tag -> 0 (release on success,
// relaxed on failure). The result is deliberately ignored: if the slab is
// tagged with a different value, or m is NULL, nothing changes.
static inline void ss_owner_release(TinySlabMeta* m, uint32_t tid)
{
    if (m) {
        uint8_t my_tag = (uint8_t)(tid & 0xFFu);
        (void)__atomic_compare_exchange_n(&m->owner_tid_low, &my_tag, 0u,
                                          false, __ATOMIC_RELEASE, __ATOMIC_RELAXED);
    }
}
// Report whether m is currently tagged as owned by tid.
//
// Compares the stored 8-bit owner tag (relaxed load) with the low 8 bits of
// tid. Returns 1 on a match, 0 on a mismatch or when m is NULL.
// Only 8 bits are compared, so thread ids that share the same low byte are
// indistinguishable here, and a tid whose low byte is 0 matches an unowned
// slab (tag 0).
static inline int ss_owner_is_mine(TinySlabMeta* m, uint32_t tid)
{
    if (!m) {
        return 0;
    }
    uint8_t cur = __atomic_load_n(&m->owner_tid_low, __ATOMIC_RELAXED);
    return cur == (uint8_t)(tid & 0xFFu);
}
// Active block accounting: decrement ss->total_active_blocks by one, never
// going below zero.
//
// Every call with a non-NULL ss first bumps the g_ss_active_dec_calls debug
// counter (relaxed). The decrement is a compare-exchange retry loop: it stops
// as soon as the counter is observed at 0 or one decrement has been applied.
static inline void ss_active_dec_one(SuperSlab* ss)
{
    if (!ss) {
        return;
    }

    atomic_fetch_add_explicit(&g_ss_active_dec_calls, 1, memory_order_relaxed);

    uint32_t seen = atomic_load_explicit(&ss->total_active_blocks, memory_order_relaxed);
    for (;;) {
        if (seen == 0) {
            break; // already at the floor: nothing to subtract
        }
        if (atomic_compare_exchange_weak_explicit(&ss->total_active_blocks,
                                                  &seen,
                                                  seen - 1u,
                                                  memory_order_acq_rel,
                                                  memory_order_relaxed)) {
            break; // our decrement landed
        }
        // A failed exchange refreshed `seen` with the current value; retry.
    }
}
// Remote push helper (Box 2):
// - Enqueue node to the per-slab MPSC stack ss->remote_heads[slab_idx]
// - Returns 1 if the stack went empty->nonempty, 0 if it was already nonempty
// - Returns -1 (nothing is modified) when ss or node is NULL, or slab_idx is
//   outside [0, SLABS_PER_SUPERSLAB_MAX)
// - Increments ss->remote_counts[slab_idx] once per push
// - Also decrements ss->total_active_blocks once (free completed)
// The first word of node is overwritten with the previous head.
static inline int ss_remote_push(SuperSlab* ss, int slab_idx, void* node)
{
    const int bad_args = !ss || slab_idx < 0 ||
                         slab_idx >= SLABS_PER_SUPERSLAB_MAX || !node;
    if (bad_args) {
        return -1;
    }

    _Atomic uintptr_t* stack_top = &ss->remote_heads[slab_idx];
    const uintptr_t self = (uintptr_t)node;
    uintptr_t prev_top;

    for (;;) {
        prev_top = atomic_load_explicit(stack_top, memory_order_acquire);
        // The next pointer is expected to be handled through tiny_next_ptr_box /
        // tiny_nextptr and friends, but here the node is simply pushed as a
        // plain singly linked list (the upper layer decodes it).
        *(uintptr_t*)node = prev_top;
        if (atomic_compare_exchange_weak_explicit(stack_top, &prev_top, self,
                                                  memory_order_release,
                                                  memory_order_relaxed)) {
            break;
        }
    }

    // prev_top is the head this node replaced; 0 means the stack was empty.
    const int became_nonempty = (prev_top == 0) ? 1 : 0;

    atomic_fetch_add_explicit(&ss->remote_counts[slab_idx], 1, memory_order_acq_rel);

    // account active block removal once per free
    ss_active_dec_one(ss);

    return became_nonempty;
}
#endif // SUPERSLAB_INLINE_H