WIP: Add TLS SLL validation and SuperSlab registry fallback

Diagnostic changes (proposed by ChatGPT) to address the TLS_SLL_HDR_RESET issue.
Current status: partial mitigation; the root cause remains unidentified.

Changes Applied:
1. SuperSlab Registry Fallback (hakmem_super_registry.h)
   - Added legacy table probe when hash map lookup misses
   - Prevents NULL returns for valid SuperSlabs during initialization
   - Status: Works, but may hide underlying registration issues

2. TLS SLL Push Validation (tls_sll_box.h)
   - Reject push if SuperSlab lookup returns NULL
   - Reject push if class_idx mismatch detected
   - Added [TLS_SLL_PUSH_NO_SS] diagnostic message
   - Status: Prevents list corruption (defensive)

3. SuperSlab Allocation Class Fix (superslab_allocate.c)
   - Pass actual class_idx to sp_internal_allocate_superslab
   - Prevents dummy class=8 causing OOB access
   - Status: Root-cause fix for the allocation-path OOB access (not for TLS_SLL_HDR_RESET)

4. Debug Output Additions
   - First 256 push/pop operations traced
   - First 4 mismatches logged with details
   - SuperSlab registration state logged
   - Status: Diagnostic tool (not a fix)

5. TLS Hint Box Removed
   - Deleted ss_tls_hint_box.{c,h} (Phase 1 optimization)
   - Simplified to focus on stability first
   - Status: Can be re-added after the root cause is fixed

Current Problem (REMAINS UNSOLVED):
- [TLS_SLL_HDR_RESET] still occurs after ~60 seconds of sh8bench
- The pointer is offset 16 bytes from the expected address (class 1 → class 2 boundary)
- hak_super_lookup returns NULL for that pointer
- Suggests: use-after-free, double-free, or a pointer arithmetic error

Root Cause Analysis (observations so far; cause not yet identified):
- Pattern: Pointer offset by +16 (one class 1 stride)
- Timing: Cumulative problem (appears after 60s, not immediately)
- Location: Header corruption detected during TLS SLL pop

Remaining Issues:
⚠️ Registry fallback is defensive (may hide registration bugs)
⚠️ Push validation prevents symptoms but not root cause
⚠️ 16-byte pointer offset source unidentified

Next Steps for Investigation:
1. Full pointer arithmetic audit (Magazine ⇔ TLS SLL paths)
2. Enhanced logging at HDR_RESET point:
   - Expected vs actual pointer value
   - Pointer provenance (where it came from)
   - Allocation trace for that block
3. Verify Headerless flag is OFF throughout build
4. Check for double-offset application in conversions

Technical Assessment (rough estimate; none of these changes resolves TLS_SLL_HDR_RESET itself):
- 60% root cause fixes (allocation class, validation)
- 40% defensive mitigation (registry fallback, push rejection)

Performance Impact:
- Registry fallback: +10-30 cycles on cold path (negligible)
- Push validation: +5-10 cycles per push (acceptable)
- Overall: estimated performance impact < 2%

Related Issues:
- Phase 1 TLS Hint Box removed temporarily
- Phase 2 Headerless blocked until stability achieved

🤖 Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Moe Charm (CI)
2025-12-03 20:42:28 +09:00
parent 2624dcce62
commit 0546454168
27 changed files with 543 additions and 46 deletions

View File

@ -35,6 +35,7 @@
#include "../ptr_trace.h"
#include "../tiny_debug_ring.h"
#include "../hakmem_super_registry.h"
#include "ss_addr_map_box.h"
#include "../superslab/superslab_inline.h"
#include "tiny_next_ptr_box.h"
#include "tiny_header_box.h" // Header Box: Single Source of Truth for header operations
@ -250,6 +251,10 @@ static inline void tls_sll_diag_next(int class_idx, hak_base_ptr_t base, hak_bas
static inline bool tls_sll_push_impl(int class_idx, hak_base_ptr_t ptr, uint32_t capacity, const char* where)
{
static _Atomic int g_tls_push_trace = 0;
if (atomic_fetch_add_explicit(&g_tls_push_trace, 1, memory_order_relaxed) < 256) {
HAK_TRACE("[tls_sll_push_impl_enter]\n");
}
HAK_CHECK_CLASS_IDX(class_idx, "tls_sll_push");
// Class mask gate (narrow triage): if disallowed, reject push
@ -275,6 +280,7 @@ static inline bool tls_sll_push_impl(int class_idx, hak_base_ptr_t ptr, uint32_t
void* raw_ptr = HAK_BASE_TO_RAW(ptr);
// Detect meta/class mismatch on push (first few only).
bool push_valid = true;
do {
static _Atomic uint32_t g_tls_sll_push_meta_mis = 0;
struct SuperSlab* ss = hak_super_lookup(raw_ptr);
@ -283,6 +289,7 @@ static inline bool tls_sll_push_impl(int class_idx, hak_base_ptr_t ptr, uint32_t
if (sidx >= 0 && sidx < ss_slabs_capacity(ss)) {
uint8_t meta_cls = ss->slabs[sidx].class_idx;
if (meta_cls < TINY_NUM_CLASSES && meta_cls != (uint8_t)class_idx) {
push_valid = false;
uint32_t n = atomic_fetch_add_explicit(&g_tls_sll_push_meta_mis, 1, memory_order_relaxed);
if (n < 4) {
fprintf(stderr,
@ -295,8 +302,27 @@ static inline bool tls_sll_push_impl(int class_idx, hak_base_ptr_t ptr, uint32_t
fflush(stderr);
}
}
} else {
push_valid = false;
static _Atomic uint32_t g_tls_sll_push_no_ss = 0;
uint32_t n = atomic_fetch_add_explicit(&g_tls_sll_push_no_ss, 1, memory_order_relaxed);
if (n < 4) {
extern int g_super_reg_initialized;
extern SSAddrMap g_ss_addr_map;
fprintf(stderr,
"[TLS_SLL_PUSH_NO_SS] cls=%d base=%p from=%s reg_init=%d map_count=%zu\n",
class_idx,
raw_ptr,
where ? where : "(null)",
g_super_reg_initialized,
g_ss_addr_map.count);
fflush(stderr);
}
}
} while (0);
if (!push_valid) {
return false; // Drop malformed pointer instead of corrupting TLS SLL
}
#if HAKMEM_TINY_HEADER_CLASSIDX
// Validate header on push - detect blocks pushed without header write
@ -492,6 +518,10 @@ static inline bool tls_sll_push_impl(int class_idx, hak_base_ptr_t ptr, uint32_t
static inline bool tls_sll_pop_impl(int class_idx, hak_base_ptr_t* out, const char* where)
{
static _Atomic int g_tls_pop_trace = 0;
if (atomic_fetch_add_explicit(&g_tls_pop_trace, 1, memory_order_relaxed) < 256) {
HAK_TRACE("[tls_sll_pop_impl_enter]\n");
}
HAK_CHECK_CLASS_IDX(class_idx, "tls_sll_pop");
// Class mask gate: if disallowed, behave as empty
if (__builtin_expect(((g_tls_sll_class_mask & (1u << class_idx)) == 0), 0)) {
@ -594,7 +624,33 @@ static inline bool tls_sll_pop_impl(int class_idx, hak_base_ptr_t* out, const ch
// PERF DEBUG: Count header corruption resets
static _Atomic uint64_t g_hdr_reset_count = 0;
uint64_t cnt = atomic_fetch_add_explicit(&g_hdr_reset_count, 1, memory_order_relaxed);
if (cnt % 10000 == 0) {
// Narrow diagnostics for early shots to root-cause corruption.
static _Atomic uint32_t g_hdr_reset_diag = 0;
uint32_t shot = atomic_fetch_add_explicit(&g_hdr_reset_diag, 1, memory_order_relaxed);
if (shot < 8) {
SuperSlab* ss_diag = hak_super_lookup(raw_base);
int slab_idx = ss_diag ? slab_index_for(ss_diag, raw_base) : -1;
uint8_t meta_cls = 0xff;
if (ss_diag && slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss_diag)) {
meta_cls = ss_diag->slabs[slab_idx].class_idx;
}
void* raw_next_diag = NULL;
PTR_NEXT_READ("tls_hdr_reset_diag", class_idx, raw_base, 0, raw_next_diag);
fprintf(stderr,
"[TLS_SLL_HDR_RESET] shot=%u cls=%d base=%p got=0x%02x expect=0x%02x "
"next=%p meta_cls=%u slab_idx=%d last_writer=%s last_push=%p count=%llu\n",
shot + 1,
class_idx,
raw_base,
got,
expect,
raw_next_diag,
(unsigned)meta_cls,
slab_idx,
g_tls_sll_last_writer[class_idx] ? g_tls_sll_last_writer[class_idx] : "(null)",
HAK_BASE_TO_RAW(s_tls_sll_last_push[class_idx]),
(unsigned long long)cnt);
} else if (cnt % 10000 == 0) {
fprintf(stderr, "[TLS_SLL_HDR_RESET] cls=%d base=%p got=0x%02x expect=0x%02x count=%llu\n",
class_idx, raw_base, got, expect, (unsigned long long)cnt);
}
@ -811,9 +867,9 @@ static inline bool tls_sll_push_guarded(int class_idx, hak_base_ptr_t ptr, uint3
tls_sll_pop_impl((cls), (out), __func__)
#else
# define tls_sll_push(cls, ptr, cap) \
tls_sll_push_impl((cls), (ptr), (cap), NULL)
tls_sll_push_impl((cls), (ptr), (cap), __func__)
# define tls_sll_pop(cls, out) \
tls_sll_pop_impl((cls), (out), NULL)
tls_sll_pop_impl((cls), (out), __func__)
#endif
#endif // TLS_SLL_BOX_H
#endif // TLS_SLL_BOX_H