## Phase 6-2.3: Fix 4T Larson crash (active counter bug) ✅

**Problem:** 4T Larson crashed with "free(): invalid pointer", OOM errors

**Root cause:** core/hakmem_tiny_refill_p0.inc.h:103
- P0 batch refill moved freelist blocks to TLS cache
- Active counter NOT incremented → double-decrement on free
- Counter underflows → SuperSlab appears full → OOM → crash

**Fix:** Added ss_active_add(tls->ss, from_freelist);

**Result:** 4T stable at 838K ops/s ✅

## Phase 6-2.4: Fix SEGV in random_mixed/mid_large_mt benchmarks ✅

**Problem:** bench_random_mixed_hakmem, bench_mid_large_mt_hakmem → immediate SEGV

**Root cause #1:** core/box/hak_free_api.inc.h:92-95
- "Guess loop" dereferenced unmapped memory when registry lookup failed

**Root cause #2:** core/box/hak_free_api.inc.h:115
- Header magic check dereferenced unmapped memory

**Fix:**
1. Removed dangerous guess loop (lines 92-95)
2. Added hak_is_memory_readable() check before dereferencing header (core/hakmem_internal.h:277-294 - uses mincore() syscall)

**Result:**
- random_mixed (2KB): SEGV → 2.22M ops/s ✅
- random_mixed (4KB): SEGV → 2.58M ops/s ✅
- Larson 4T: no regression (838K ops/s) ✅

## Phase 6-2.5: Performance investigation + SuperSlab fix (WIP) ⚠️

**Problem:** Severe performance gaps (19-26x slower than system malloc)

**Investigation:** Task agent identified root cause
- hak_is_memory_readable() syscall overhead (100-300 cycles per free)
- ALL frees hit unmapped_header_fallback path
- SuperSlab lookup NEVER called
- Why? g_use_superslab = 0 (disabled by diet mode)

**Root cause:** core/hakmem_tiny_init.inc:104-105
- Diet mode (default ON) disables SuperSlab
- SuperSlab defaults to 1 (hakmem_config.c:334)
- BUT diet mode overrides it to 0 during init

**Fix:** Separate SuperSlab from diet mode
- SuperSlab: Performance-critical (fast alloc/free)
- Diet mode: Memory efficiency (magazine capacity limits only)
- Both are independent features, should not interfere

**Status:** ⚠️ INCOMPLETE - New SEGV discovered after fix
- SuperSlab lookup now works (confirmed via debug output)
- But benchmark crashes (Exit 139) after ~20 lookups
- Needs further investigation

**Files modified:**
- core/hakmem_tiny_init.inc:99-109 - Removed diet mode override
- PERFORMANCE_INVESTIGATION_REPORT.md - Task agent analysis (303x instruction gap)

**Next steps:**
- Investigate new SEGV (likely SuperSlab free path bug)
- OR: Revert Phase 6-2.5 changes if blocking progress

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
333 lines
17 KiB
C
333 lines
17 KiB
C
// tiny_superslab_free.inc.h - SuperSlab Free Layer
// Purpose: Same-thread and cross-thread free handling
// Extracted from: hakmem_tiny_free.inc lines 1171-1475
// Box Theory: Box 6 (Free Fast Path) + Box 2 (Remote Queue) integration
//
// Public functions:
//   - hak_tiny_free_superslab(): Main SuperSlab free entry point

// Phase 6.22-B: SuperSlab fast free path
/*
 * hak_tiny_free_superslab - free one tiny block that lives inside a SuperSlab.
 *
 * ptr: the block being freed.
 * ss:  the SuperSlab the caller resolved for ptr.
 *
 * Flow, in order:
 *   1. Count the entry and resolve the slab index with slab_index_for().
 *      A negative index is recorded as a REMOTE_INVALID ring event and the
 *      free is dropped.
 *   2. Validate ss->size_class against TINY_NUM_CLASSES. This is done BEFORE
 *      the remote-watch diagnostics, because those index g_tls_slabs[] and
 *      g_tls_mags[] by size_class.
 *   3. When g_tiny_safe_free is set: check that ptr is block-aligned and in
 *      range for the slab, and scan up to 64 freelist nodes for a duplicate.
 *   4. Same-thread path (meta->owner_tid equals the calling thread id and
 *      g_tiny_force_remote is not set): optional hand-off to the remote box
 *      when the local-push guard refuses, optional MidTC push, otherwise
 *      tiny_free_local_box().
 *   5. Otherwise the cross-thread path: optional duplicate scan of the remote
 *      stack, then either ss_remote_push() or the legacy direct freelist push.
 *
 * Every rejected free records a TINY_RING_EVENT_REMOTE_INVALID event, raises
 * SIGUSR2 when g_tiny_safe_free_strict is set, and returns without touching
 * any list.
 */
static inline void hak_tiny_free_superslab(void* ptr, SuperSlab* ss) {
    // Route trace: count SuperSlab free entries (diagnostics only)
    extern _Atomic uint64_t g_free_ss_enter;
    atomic_fetch_add_explicit(&g_free_ss_enter, 1, memory_order_relaxed);
    ROUTE_MARK(16); // free_enter
    HAK_DBG_INC(g_superslab_free_count); // Phase 7.6: Track SuperSlab frees

    // Get slab index (supports 1MB/2MB SuperSlabs)
    int slab_idx = slab_index_for(ss, ptr);
    size_t ss_size = (size_t)1ULL << ss->lg_size;
    uintptr_t ss_base = (uintptr_t)ss;
    if (__builtin_expect(slab_idx < 0, 0)) {
        uintptr_t aux = tiny_remote_pack_diag(0xBAD1u, ss_base, ss_size, (uintptr_t)ptr);
        tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux);
        if (g_tiny_safe_free_strict) { raise(SIGUSR2); }
        return;
    }

    // BUGFIX: Validate size_class before using it as an array index (prevents OOB).
    // This must precede the remote-watch block below, which indexes
    // g_tls_slabs[] and g_tls_mags[] by size_class.
    if (__builtin_expect(ss->size_class < 0 || ss->size_class >= TINY_NUM_CLASSES, 0)) {
        tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, 0xF1, ptr, (uintptr_t)ss->size_class);
        if (g_tiny_safe_free_strict) { raise(SIGUSR2); }
        return;
    }

    TinySlabMeta* meta = &ss->slabs[slab_idx];

    // Diagnostics for pointers that are under remote watch.
    if (__builtin_expect(tiny_remote_watch_is(ptr), 0)) {
        tiny_remote_watch_note("free_enter", ss, slab_idx, ptr, 0xA240u, tiny_self_u32(), 0);
        extern __thread TinyTLSSlab g_tls_slabs[];
        tiny_alloc_dump_tls_state(ss->size_class, "watch_free_enter", &g_tls_slabs[ss->size_class]);
#if !HAKMEM_BUILD_RELEASE
        extern __thread TinyTLSMag g_tls_mags[];
        TinyTLSMag* watch_mag = &g_tls_mags[ss->size_class];
        fprintf(stderr,
                "[REMOTE_WATCH_MAG] cls=%u mag_top=%d cap=%d\n",
                ss->size_class,
                watch_mag->top,
                watch_mag->cap);
#endif
    }

    // Optional safe-free validation: alignment, range, and freelist duplicate.
    if (__builtin_expect(g_tiny_safe_free, 0)) {
        size_t blk = g_tiny_class_sizes[ss->size_class];
        uint8_t* base = tiny_slab_base_for(ss, slab_idx);
        uintptr_t delta = (uintptr_t)ptr - (uintptr_t)base;
        int cap_ok = (meta->capacity > 0) ? 1 : 0;
        int align_ok = (delta % blk) == 0;
        int range_ok = cap_ok && (delta / blk) < meta->capacity;
        if (!align_ok || !range_ok) {
            // Low bits of the code report which of the two checks passed.
            uint32_t code = 0xA100u;
            if (align_ok) code |= 0x2u;
            if (range_ok) code |= 0x1u;
            uintptr_t aux = tiny_remote_pack_diag(code, ss_base, ss_size, (uintptr_t)ptr);
            tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux);
            if (g_tiny_safe_free_strict) { raise(SIGUSR2); }
            return;
        }
        // Duplicate in freelist (best-effort scan up to 64)
        void* scan = meta->freelist;
        int scanned = 0;
        int dup = 0;
        while (scan && scanned < 64) {
            if (scan == ptr) { dup = 1; break; }
            scan = *(void**)scan;
            scanned++;
        }
        if (dup) {
            uintptr_t aux = tiny_remote_pack_diag(0xDFu, ss_base, ss_size, (uintptr_t)ptr);
            tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux);
            if (g_tiny_safe_free_strict) { raise(SIGUSR2); }
            return;
        }
    }

    // Phase 6.23: Same-thread check
    uint32_t my_tid = tiny_self_u32();
    const int debug_guard = g_debug_remote_guard;
    static __thread int g_debug_free_count = 0;
    // If owner is not set yet, claim ownership to avoid spurious remote path in 1T
    if (!g_tiny_force_remote && meta->owner_tid == 0) {
        meta->owner_tid = my_tid;
    }
    if (!g_tiny_force_remote && meta->owner_tid != 0 && meta->owner_tid == my_tid) {
        ROUTE_MARK(17); // free_same_thread
        // Fast path: Direct freelist push (same-thread)
        // Trace below is compiled out by the leading `0 &&`.
        if (0 && debug_guard && g_debug_free_count < 1) {
            fprintf(stderr, "[FREE_SS] SAME-THREAD: owner=%u my=%u\n",
                    meta->owner_tid, my_tid);
            g_debug_free_count++;
        }
        // Freeing into a slab with no blocks in use is reported as invalid.
        if (__builtin_expect(meta->used == 0, 0)) {
            uintptr_t aux = tiny_remote_pack_diag(0x00u, ss_base, ss_size, (uintptr_t)ptr);
            tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux);
            if (g_tiny_safe_free_strict) { raise(SIGUSR2); }
            return;
        }
        tiny_remote_track_expect_alloc(ss, slab_idx, ptr, "local_free_enter", my_tid);
        // When the guard refuses a local push, hand the block to the remote box instead.
        if (!tiny_remote_guard_allow_local_push(ss, slab_idx, meta, ptr, "local_free", my_tid)) {
#include "box/free_remote_box.h"
            int transitioned = tiny_free_remote_box(ss, slab_idx, meta, ptr, my_tid);
            if (transitioned) {
                extern unsigned long long g_remote_free_transitions[];
                g_remote_free_transitions[ss->size_class]++;
                // Free-side route: remote transition observed
                do {
                    // Env flag is read once and cached in a function-local static.
                    static int g_route_free = -1;
                    if (__builtin_expect(g_route_free == -1, 0)) {
                        const char* e = getenv("HAKMEM_TINY_ROUTE_FREE");
                        g_route_free = (e && *e && *e != '0') ? 1 : 0;
                    }
                    if (g_route_free) route_free_commit((int)ss->size_class, (1ull<<18), 0xE2);
                } while (0);
            }
            return;
        }
        // Optional: MidTC (TLS tcache for 128..1024B) — allow bypass via env HAKMEM_TINY_FREE_TO_SS=1
        do {
            static int g_free_to_ss = -1;
            if (__builtin_expect(g_free_to_ss == -1, 0)) {
                const char* e = getenv("HAKMEM_TINY_FREE_TO_SS");
                g_free_to_ss = (e && *e && *e != '0') ? 1 : 0; // default OFF
            }
            if (!g_free_to_ss) {
                int cls = (int)ss->size_class;
                if (midtc_enabled() && cls >= 4) {
                    if (midtc_push(cls, ptr)) {
                        // Treat as returned to TLS cache (not SS freelist)
                        meta->used--;
                        ss_active_dec_one(ss);
                        return;
                    }
                }
            }
        } while (0);

#include "box/free_local_box.h"
        // Perform freelist push (+first-free publish if applicable)
        void* prev_before = meta->freelist;
        tiny_free_local_box(ss, slab_idx, meta, ptr, my_tid);
        // The freelist was empty before this push: record the first-free transition.
        if (prev_before == NULL) {
            ROUTE_MARK(19); // first_free_transition
            extern unsigned long long g_first_free_transitions[];
            g_first_free_transitions[ss->size_class]++;
            ROUTE_MARK(20); // mailbox_publish
            // Free-side route commit (one-shot)
            do {
                static int g_route_free = -1;
                if (__builtin_expect(g_route_free == -1, 0)) {
                    const char* e = getenv("HAKMEM_TINY_ROUTE_FREE");
                    g_route_free = (e && *e && *e != '0') ? 1 : 0;
                }
                int cls = (int)ss->size_class;
                if (g_route_free) route_free_commit(cls, (1ull<<19) | (1ull<<20), 0xE1);
            } while (0);
        }

        if (__builtin_expect(debug_guard, 0)) {
            fprintf(stderr, "[REMOTE_LOCAL] cls=%u slab=%d owner=%u my=%u ptr=%p prev=%p used=%u\n",
                    ss->size_class, slab_idx, meta->owner_tid, my_tid, ptr, prev_before, (unsigned)meta->used);
        }

        // Empty-slab detection is handled elsewhere (kept off the hot path).
    } else {
        ROUTE_MARK(18); // free_remote_transition
        // NOTE(review): as written this fires only when my_tid is also 0; confirm
        // whether `meta->owner_tid == 0` alone was the intended condition.
        if (__builtin_expect(meta->owner_tid == my_tid && meta->owner_tid == 0, 0)) {
            uintptr_t aux = tiny_remote_pack_diag(0xA300u, ss_base, ss_size, (uintptr_t)ptr);
            tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux);
            if (debug_guard) {
                fprintf(stderr, "[REMOTE_OWNER_ZERO] cls=%u slab=%d ptr=%p my=%u used=%u\n",
                        ss->size_class, slab_idx, ptr, my_tid, (unsigned)meta->used);
            }
        }
        tiny_remote_track_expect_alloc(ss, slab_idx, ptr, "remote_free_enter", my_tid);
        // Slow path: Remote free (cross-thread)
        // Trace below is compiled out by the leading `0 &&`.
        if (0 && debug_guard && g_debug_free_count < 5) {
            fprintf(stderr, "[FREE_SS] CROSS-THREAD: owner=%u my=%u slab_idx=%d\n",
                    meta->owner_tid, my_tid, slab_idx);
            g_debug_free_count++;
        }
        if (__builtin_expect(g_tiny_safe_free, 0)) {
            // Best-effort duplicate scan in remote stack (up to 64 nodes)
            uintptr_t head = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire);
            uintptr_t base = ss_base;
            int scanned = 0;
            int dup = 0;
            uintptr_t cur = head;
            while (cur && scanned < 64) {
                // A node outside [base, base + ss_size) ends the scan.
                if ((cur < base) || (cur >= base + ss_size)) {
                    uintptr_t aux = tiny_remote_pack_diag(0xA200u, base, ss_size, cur);
                    tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)cur, aux);
                    if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; }
                    break;
                }
                if ((void*)cur == ptr) { dup = 1; break; }
                if (__builtin_expect(g_remote_side_enable, 0)) {
                    // Side-table mode: check the node's sentinel, then follow the side link.
                    if (!tiny_remote_sentinel_ok((void*)cur)) {
                        uintptr_t aux = tiny_remote_pack_diag(0xA202u, base, ss_size, cur);
                        tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)cur, aux);
                        uintptr_t observed = atomic_load_explicit((_Atomic uintptr_t*)(void*)cur, memory_order_relaxed);
                        tiny_remote_report_corruption("scan", (void*)cur, observed);
                        fprintf(stderr,
                                "[REMOTE_SENTINEL] cls=%u slab=%d cur=%p head=%p ptr=%p scanned=%d observed=0x%016" PRIxPTR " owner=%u used=%u freelist=%p remote_head=%p\n",
                                ss->size_class,
                                slab_idx,
                                (void*)cur,
                                (void*)head,
                                ptr,
                                scanned,
                                observed,
                                meta->owner_tid,
                                (unsigned)meta->used,
                                meta->freelist,
                                (void*)atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_relaxed));
                        if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; }
                        break;
                    }
                    cur = tiny_remote_side_get(ss, slab_idx, (void*)cur);
                } else {
                    // Inline-link mode: the next pointer is the node's first word,
                    // so the node must be pointer-aligned before it is dereferenced.
                    if ((cur & (uintptr_t)(sizeof(void*) - 1)) != 0) {
                        uintptr_t aux = tiny_remote_pack_diag(0xA201u, base, ss_size, cur);
                        tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, (void*)cur, aux);
                        if (g_tiny_safe_free_strict) { raise(SIGUSR2); return; }
                        break;
                    }
                    cur = (uintptr_t)(*(void**)(void*)cur);
                }
                scanned++;
            }
            if (dup) {
                uintptr_t aux = tiny_remote_pack_diag(0xD1u, ss_base, ss_size, (uintptr_t)ptr);
                tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux);
                if (g_tiny_safe_free_strict) { raise(SIGUSR2); }
                return;
            }
        }
        // Freeing into a slab with no blocks in use is reported as invalid.
        if (__builtin_expect(meta->used == 0, 0)) {
            uintptr_t aux = tiny_remote_pack_diag(0x01u, ss_base, ss_size, (uintptr_t)ptr);
            tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux);
            if (g_tiny_safe_free_strict) { raise(SIGUSR2); }
            return;
        }
        static int g_ss_adopt_en2 = -1; // env cached
        if (g_ss_adopt_en2 == -1) {
            char* e = getenv("HAKMEM_TINY_SS_ADOPT");
            // Default: use the remote queue (1). Overridden only when the env var is set.
            g_ss_adopt_en2 = (e == NULL) ? 1 : ((*e != '0') ? 1 : 0);
            if (__builtin_expect(debug_guard, 0)) {
                fprintf(stderr, "[FREE_SS] g_ss_adopt_en2=%d (env='%s')\n", g_ss_adopt_en2, e ? e : "(null)");
            }
        }
        // A/B gate: disable remote MPSC (use legacy freelist push)
        do {
            static int g_disable_remote = -1;
            if (__builtin_expect(g_disable_remote == -1, 0)) {
                const char* e = getenv("HAKMEM_TINY_DISABLE_REMOTE");
                g_disable_remote = (e && *e && *e != '0') ? 1 : 0;
            }
            if (__builtin_expect(g_disable_remote, 0)) {
                g_ss_adopt_en2 = 0;
            }
        } while (0);
        if (g_ss_adopt_en2) {
            // Use remote queue
            // Snapshot the block's first word before the push; it feeds the
            // duplicate and scribble checks below.
            uintptr_t head_word = __atomic_load_n((uintptr_t*)ptr, __ATOMIC_RELAXED);
            if (debug_guard) fprintf(stderr, "[REMOTE_PUSH_CALL] cls=%u slab=%d owner=%u my=%u ptr=%p used=%u remote_count=%u head=%p word=0x%016" PRIxPTR "\n",
                    ss->size_class,
                    slab_idx,
                    meta->owner_tid,
                    my_tid,
                    ptr,
                    (unsigned)meta->used,
                    atomic_load_explicit(&ss->remote_counts[slab_idx], memory_order_relaxed),
                    (void*)atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_relaxed),
                    head_word);
            int dup_remote = tiny_remote_queue_contains_guard(ss, slab_idx, ptr);
            if (!dup_remote && __builtin_expect(g_remote_side_enable, 0)) {
                dup_remote = (head_word == TINY_REMOTE_SENTINEL) || tiny_remote_side_contains(ss, slab_idx, ptr);
            }
            if (__builtin_expect(head_word == TINY_REMOTE_SENTINEL && !dup_remote && g_debug_remote_guard, 0)) {
                tiny_remote_watch_note("dup_scan_miss", ss, slab_idx, ptr, 0xA215u, my_tid, 0);
            }
            if (dup_remote) {
                uintptr_t aux = tiny_remote_pack_diag(0xA214u, ss_base, ss_size, (uintptr_t)ptr);
                tiny_remote_watch_mark(ptr, "dup_prevent", my_tid);
                tiny_remote_watch_note("dup_prevent", ss, slab_idx, ptr, 0xA214u, my_tid, 0);
                tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux);
                if (g_tiny_safe_free_strict) { raise(SIGUSR2); }
                return;
            }
            if (__builtin_expect(g_remote_side_enable && (head_word & 0xFFFFu) == 0x6261u, 0)) {
                // TLS guard scribble detected on the node's first word → same-pointer double free across routes
                uintptr_t aux = tiny_remote_pack_diag(0xA213u, ss_base, ss_size, (uintptr_t)ptr);
                tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux);
                tiny_remote_watch_mark(ptr, "pre_push", my_tid);
                tiny_remote_watch_note("pre_push", ss, slab_idx, ptr, 0xA231u, my_tid, 0);
                tiny_remote_report_corruption("pre_push", ptr, head_word);
                if (g_tiny_safe_free_strict) { raise(SIGUSR2); }
                return;
            }
            if (__builtin_expect(tiny_remote_watch_is(ptr), 0)) {
                tiny_remote_watch_note("free_remote", ss, slab_idx, ptr, 0xA232u, my_tid, 0);
            }
            int was_empty = ss_remote_push(ss, slab_idx, ptr); // ss_active_dec_one() called inside
            // NOTE(review): plain decrement performed on the remote (non-owner) path;
            // confirm this cannot race with the owner's updates to meta->used.
            meta->used--;
            // Do not call ss_active_dec_one(ss) here: per the note above it is
            // already done inside ss_remote_push().
            if (was_empty) {
                extern unsigned long long g_remote_free_transitions[];
                g_remote_free_transitions[ss->size_class]++;
                ss_partial_publish((int)ss->size_class, ss);
            }
        } else {
            // Fallback: direct freelist push (legacy)
            if (debug_guard) fprintf(stderr, "[FREE_SS] Using LEGACY freelist push (not remote queue)\n");
            void* prev = meta->freelist;
            *(void**)ptr = prev;
            meta->freelist = ptr;
            tiny_failfast_log("free_local_legacy", ss->size_class, ss, meta, ptr, prev);
            do {
                static int g_mask_en = -1;
                if (__builtin_expect(g_mask_en == -1, 0)) {
                    const char* e = getenv("HAKMEM_TINY_FREELIST_MASK");
                    g_mask_en = (e && *e && *e != '0') ? 1 : 0;
                }
                // The freelist was empty before this push: set this slab's bit in the mask.
                if (__builtin_expect(g_mask_en, 0) && prev == NULL) {
                    uint32_t bit = (1u << slab_idx);
                    atomic_fetch_or_explicit(&ss->freelist_mask, bit, memory_order_release);
                }
            } while (0);
            meta->used--;
            ss_active_dec_one(ss);
            if (prev == NULL) {
                ss_partial_publish((int)ss->size_class, ss);
            }
        }

        // Empty-slab detection is handled elsewhere (kept off the hot path).
    }
}
|