WIP: Add TLS SLL validation and SuperSlab registry fallback
ChatGPT's diagnostic changes to address TLS_SLL_HDR_RESET issue. Current status: Partial mitigation, but root cause remains. Changes Applied: 1. SuperSlab Registry Fallback (hakmem_super_registry.h) - Added legacy table probe when hash map lookup misses - Prevents NULL returns for valid SuperSlabs during initialization - Status: ✅ Works but may hide underlying registration issues 2. TLS SLL Push Validation (tls_sll_box.h) - Reject push if SuperSlab lookup returns NULL - Reject push if class_idx mismatch detected - Added [TLS_SLL_PUSH_NO_SS] diagnostic message - Status: ✅ Prevents list corruption (defensive) 3. SuperSlab Allocation Class Fix (superslab_allocate.c) - Pass actual class_idx to sp_internal_allocate_superslab - Prevents dummy class=8 causing OOB access - Status: ✅ Root cause fix for allocation path 4. Debug Output Additions - First 256 push/pop operations traced - First 4 mismatches logged with details - SuperSlab registration state logged - Status: ✅ Diagnostic tool (not a fix) 5. TLS Hint Box Removed - Deleted ss_tls_hint_box.{c,h} (Phase 1 optimization) - Simplified to focus on stability first - Status: ⏳ Can be re-added after root cause fixed Current Problem (REMAINS UNSOLVED): - [TLS_SLL_HDR_RESET] still occurs after ~60 seconds of sh8bench - Pointer is 16 bytes offset from expected (class 1 → class 2 boundary) - hak_super_lookup returns NULL for that pointer - Suggests: Use-After-Free, Double-Free, or pointer arithmetic error Root Cause Analysis: - Pattern: Pointer offset by +16 (one class 1 stride) - Timing: Cumulative problem (appears after 60s, not immediately) - Location: Header corruption detected during TLS SLL pop Remaining Issues: ⚠️ Registry fallback is defensive (may hide registration bugs) ⚠️ Push validation prevents symptoms but not root cause ⚠️ 16-byte pointer offset source unidentified Next Steps for Investigation: 1. Full pointer arithmetic audit (Magazine ⇔ TLS SLL paths) 2. Enhanced logging at HDR_RESET point: - Expected vs actual pointer value - Pointer provenance (where it came from) - Allocation trace for that block 3. Verify Headerless flag is OFF throughout build 4. Check for double-offset application in conversions Technical Assessment: - 60% root cause fixes (allocation class, validation) - 40% defensive mitigation (registry fallback, push rejection) Performance Impact: - Registry fallback: +10-30 cycles on cold path (negligible) - Push validation: +5-10 cycles per push (acceptable) - Overall: < 2% performance impact estimated Related Issues: - Phase 1 TLS Hint Box removed temporarily - Phase 2 Headerless blocked until stability achieved 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
@ -35,6 +35,7 @@
|
||||
#include "../ptr_trace.h"
|
||||
#include "../tiny_debug_ring.h"
|
||||
#include "../hakmem_super_registry.h"
|
||||
#include "ss_addr_map_box.h"
|
||||
#include "../superslab/superslab_inline.h"
|
||||
#include "tiny_next_ptr_box.h"
|
||||
#include "tiny_header_box.h" // Header Box: Single Source of Truth for header operations
|
||||
@ -250,6 +251,10 @@ static inline void tls_sll_diag_next(int class_idx, hak_base_ptr_t base, hak_bas
|
||||
|
||||
static inline bool tls_sll_push_impl(int class_idx, hak_base_ptr_t ptr, uint32_t capacity, const char* where)
|
||||
{
|
||||
static _Atomic int g_tls_push_trace = 0;
|
||||
if (atomic_fetch_add_explicit(&g_tls_push_trace, 1, memory_order_relaxed) < 256) {
|
||||
HAK_TRACE("[tls_sll_push_impl_enter]\n");
|
||||
}
|
||||
HAK_CHECK_CLASS_IDX(class_idx, "tls_sll_push");
|
||||
|
||||
// Class mask gate (narrow triage): if disallowed, reject push
|
||||
@ -275,6 +280,7 @@ static inline bool tls_sll_push_impl(int class_idx, hak_base_ptr_t ptr, uint32_t
|
||||
void* raw_ptr = HAK_BASE_TO_RAW(ptr);
|
||||
|
||||
// Detect meta/class mismatch on push (first few only).
|
||||
bool push_valid = true;
|
||||
do {
|
||||
static _Atomic uint32_t g_tls_sll_push_meta_mis = 0;
|
||||
struct SuperSlab* ss = hak_super_lookup(raw_ptr);
|
||||
@ -283,6 +289,7 @@ static inline bool tls_sll_push_impl(int class_idx, hak_base_ptr_t ptr, uint32_t
|
||||
if (sidx >= 0 && sidx < ss_slabs_capacity(ss)) {
|
||||
uint8_t meta_cls = ss->slabs[sidx].class_idx;
|
||||
if (meta_cls < TINY_NUM_CLASSES && meta_cls != (uint8_t)class_idx) {
|
||||
push_valid = false;
|
||||
uint32_t n = atomic_fetch_add_explicit(&g_tls_sll_push_meta_mis, 1, memory_order_relaxed);
|
||||
if (n < 4) {
|
||||
fprintf(stderr,
|
||||
@ -295,8 +302,27 @@ static inline bool tls_sll_push_impl(int class_idx, hak_base_ptr_t ptr, uint32_t
|
||||
fflush(stderr);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
push_valid = false;
|
||||
static _Atomic uint32_t g_tls_sll_push_no_ss = 0;
|
||||
uint32_t n = atomic_fetch_add_explicit(&g_tls_sll_push_no_ss, 1, memory_order_relaxed);
|
||||
if (n < 4) {
|
||||
extern int g_super_reg_initialized;
|
||||
extern SSAddrMap g_ss_addr_map;
|
||||
fprintf(stderr,
|
||||
"[TLS_SLL_PUSH_NO_SS] cls=%d base=%p from=%s reg_init=%d map_count=%zu\n",
|
||||
class_idx,
|
||||
raw_ptr,
|
||||
where ? where : "(null)",
|
||||
g_super_reg_initialized,
|
||||
g_ss_addr_map.count);
|
||||
fflush(stderr);
|
||||
}
|
||||
}
|
||||
} while (0);
|
||||
if (!push_valid) {
|
||||
return false; // Drop malformed pointer instead of corrupting TLS SLL
|
||||
}
|
||||
|
||||
#if HAKMEM_TINY_HEADER_CLASSIDX
|
||||
// Validate header on push - detect blocks pushed without header write
|
||||
@ -492,6 +518,10 @@ static inline bool tls_sll_push_impl(int class_idx, hak_base_ptr_t ptr, uint32_t
|
||||
|
||||
static inline bool tls_sll_pop_impl(int class_idx, hak_base_ptr_t* out, const char* where)
|
||||
{
|
||||
static _Atomic int g_tls_pop_trace = 0;
|
||||
if (atomic_fetch_add_explicit(&g_tls_pop_trace, 1, memory_order_relaxed) < 256) {
|
||||
HAK_TRACE("[tls_sll_pop_impl_enter]\n");
|
||||
}
|
||||
HAK_CHECK_CLASS_IDX(class_idx, "tls_sll_pop");
|
||||
// Class mask gate: if disallowed, behave as empty
|
||||
if (__builtin_expect(((g_tls_sll_class_mask & (1u << class_idx)) == 0), 0)) {
|
||||
@ -594,7 +624,33 @@ static inline bool tls_sll_pop_impl(int class_idx, hak_base_ptr_t* out, const ch
|
||||
// PERF DEBUG: Count header corruption resets
|
||||
static _Atomic uint64_t g_hdr_reset_count = 0;
|
||||
uint64_t cnt = atomic_fetch_add_explicit(&g_hdr_reset_count, 1, memory_order_relaxed);
|
||||
if (cnt % 10000 == 0) {
|
||||
// Narrow diagnostics for early shots to root-cause corruption.
|
||||
static _Atomic uint32_t g_hdr_reset_diag = 0;
|
||||
uint32_t shot = atomic_fetch_add_explicit(&g_hdr_reset_diag, 1, memory_order_relaxed);
|
||||
if (shot < 8) {
|
||||
SuperSlab* ss_diag = hak_super_lookup(raw_base);
|
||||
int slab_idx = ss_diag ? slab_index_for(ss_diag, raw_base) : -1;
|
||||
uint8_t meta_cls = 0xff;
|
||||
if (ss_diag && slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss_diag)) {
|
||||
meta_cls = ss_diag->slabs[slab_idx].class_idx;
|
||||
}
|
||||
void* raw_next_diag = NULL;
|
||||
PTR_NEXT_READ("tls_hdr_reset_diag", class_idx, raw_base, 0, raw_next_diag);
|
||||
fprintf(stderr,
|
||||
"[TLS_SLL_HDR_RESET] shot=%u cls=%d base=%p got=0x%02x expect=0x%02x "
|
||||
"next=%p meta_cls=%u slab_idx=%d last_writer=%s last_push=%p count=%llu\n",
|
||||
shot + 1,
|
||||
class_idx,
|
||||
raw_base,
|
||||
got,
|
||||
expect,
|
||||
raw_next_diag,
|
||||
(unsigned)meta_cls,
|
||||
slab_idx,
|
||||
g_tls_sll_last_writer[class_idx] ? g_tls_sll_last_writer[class_idx] : "(null)",
|
||||
HAK_BASE_TO_RAW(s_tls_sll_last_push[class_idx]),
|
||||
(unsigned long long)cnt);
|
||||
} else if (cnt % 10000 == 0) {
|
||||
fprintf(stderr, "[TLS_SLL_HDR_RESET] cls=%d base=%p got=0x%02x expect=0x%02x count=%llu\n",
|
||||
class_idx, raw_base, got, expect, (unsigned long long)cnt);
|
||||
}
|
||||
@ -811,9 +867,9 @@ static inline bool tls_sll_push_guarded(int class_idx, hak_base_ptr_t ptr, uint3
|
||||
tls_sll_pop_impl((cls), (out), __func__)
|
||||
#else
|
||||
# define tls_sll_push(cls, ptr, cap) \
|
||||
tls_sll_push_impl((cls), (ptr), (cap), NULL)
|
||||
tls_sll_push_impl((cls), (ptr), (cap), __func__)
|
||||
# define tls_sll_pop(cls, out) \
|
||||
tls_sll_pop_impl((cls), (out), NULL)
|
||||
tls_sll_pop_impl((cls), (out), __func__)
|
||||
#endif
|
||||
|
||||
#endif // TLS_SLL_BOX_H
|
||||
#endif // TLS_SLL_BOX_H
|
||||
|
||||
Reference in New Issue
Block a user