From 4d2784c52f1e6ce9403831ca82f6da5739e4a6c3 Mon Sep 17 00:00:00 2001 From: "Moe Charm (CI)" Date: Wed, 3 Dec 2025 21:01:25 +0900 Subject: [PATCH] Enhance TLS SLL diagnostic logging to detect head corruption source MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Critical discovery: TLS SLL head itself is getting corrupted with invalid pointers, not a next-pointer offset issue. Added defensive sanitization and detailed logging. Changes: 1. tls_sll_sanitize_head() - New defensive function - Validates TLS head against SuperSlab metadata - Checks header magic byte consistency (only when HAKMEM_TINY_HEADER_CLASSIDX is enabled) - Resets corrupted list immediately on detection - Called at push_enter and pop_enter (defensive walls) 2. Enhanced HDR_RESET diagnostics - Dump both next pointers (offset 0 and tiny_next_off()) - Show first 8 bytes of block (raw dump) - Include next_off value and pointer values - Better correlation with SuperSlab metadata Key Findings from Diagnostic Run (/tmp/sh8_short.log): - TLS head becomes an unregistered garbage value at pop_enter - Example: head=0x749fe96c0990 meta_cls=255 idx=-1 ss=(nil) - Sanitize detects and resets the list - SuperSlab registration is SUCCESSFUL (map_count=4) - But head gets corrupted AFTER registration Root Cause Analysis: ✅ NOT a next-pointer offset issue (would be consistent) ❌ TLS head is being OVERWRITTEN by external code - Candidates: TLS variable collision, memset overflow, stray write Corruption Pattern: 1. SuperSlab initialized successfully (verified by map_count) 2. TLS head is initially correct 3. Between registration and pop_enter: head gets corrupted 4. Corruption value is garbage (unregistered pointer) 5. Lower bytes damaged (0xe1/0x31 patterns) Next Steps: - Check TLS layout and variable boundaries (stack overflow?) 
- Audit all writes to g_tls_sll array - Look for memset/memcpy operating on wrong range - Consider thread-local storage fragmentation Technical Impact: - Sanitize prevents list propagation (defensive) - But underlying corruption source remains - May be in TLS initialization, variable layout, or external overwrite Performance: Negligible (sanitize runs once per push and once per pop_enter) 🤖 Generated with Claude Code (https://claude.com/claude-code) Co-Authored-By: Claude --- core/box/tls_sll_box.h | 69 ++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 67 insertions(+), 2 deletions(-) diff --git a/core/box/tls_sll_box.h b/core/box/tls_sll_box.h index 34731723..364cc391 100644 --- a/core/box/tls_sll_box.h +++ b/core/box/tls_sll_box.h @@ -176,6 +176,50 @@ static inline int tls_sll_head_valid(hak_base_ptr_t head) return (a >= 4096 && a <= 0x00007fffffffffffULL); } +// Defensive: validate current TLS head before using it. +// If invalid, drop the list to avoid propagating corruption. +static inline void tls_sll_sanitize_head(int class_idx, const char* stage) +{ + if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) { + return; + } + hak_base_ptr_t head = g_tls_sll[class_idx].head; + if (hak_base_is_null(head)) return; + + void* raw = HAK_BASE_TO_RAW(head); + SuperSlab* ss = hak_super_lookup(raw); + int cap = ss ? ss_slabs_capacity(ss) : 0; + int idx = (ss && ss->magic == SUPERSLAB_MAGIC) ? slab_index_for(ss, raw) : -1; + uint8_t meta_cls = (idx >= 0 && idx < cap) ? ss->slabs[idx].class_idx : 0xff; + + int reset = 0; + if (!ss || ss->magic != SUPERSLAB_MAGIC || idx < 0 || idx >= cap || meta_cls != (uint8_t)class_idx) { + reset = 1; + } +#if HAKMEM_TINY_HEADER_CLASSIDX + if (!reset) { + uint8_t hdr = *(uint8_t*)raw; + uint8_t expect = HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK); + if (hdr != expect) { + reset = 1; + } + } +#endif + if (reset) { + fprintf(stderr, + "[TLS_SLL_SANITIZE] stage=%s cls=%d head=%p meta_cls=%u idx=%d ss=%p\n", + stage ? 
stage : "(null)", + class_idx, + raw, + (unsigned)meta_cls, + idx, + (void*)ss); + g_tls_sll[class_idx].head = HAK_BASE_FROM_RAW(NULL); + g_tls_sll[class_idx].count = 0; + tls_sll_record_writer(class_idx, "sanitize"); + } +} + static inline void tls_sll_log_hdr_mismatch(int class_idx, hak_base_ptr_t base, uint8_t got, uint8_t expect, const char* stage) { static _Atomic uint32_t g_hdr_mismatch_log = 0; @@ -262,6 +306,9 @@ static inline bool tls_sll_push_impl(int class_idx, hak_base_ptr_t ptr, uint32_t return false; } + // Defensive: ensure current head is sane before linking new node. + tls_sll_sanitize_head(class_idx, "push"); + // Capacity semantics: // - capacity == 0 → disabled (reject) // - capacity > 1<<20 → treat as "unbounded" sentinel (no limit) @@ -529,6 +576,9 @@ static inline bool tls_sll_pop_impl(int class_idx, hak_base_ptr_t* out, const ch } atomic_fetch_add(&g_integrity_check_class_bounds, 1); + // Defensive: ensure current head is sane before accessing it. + tls_sll_sanitize_head(class_idx, "pop_enter"); + hak_base_ptr_t base = g_tls_sll[class_idx].head; if (hak_base_is_null(base)) { return false; @@ -628,6 +678,15 @@ static inline bool tls_sll_pop_impl(int class_idx, hak_base_ptr_t* out, const ch static _Atomic uint32_t g_hdr_reset_diag = 0; uint32_t shot = atomic_fetch_add_explicit(&g_hdr_reset_diag, 1, memory_order_relaxed); if (shot < 8) { + // Extra diagnostics: dump raw next pointers at offsets 0 and tiny_next_off() + uintptr_t next_raw_off0 = 0; + uintptr_t next_raw_off1 = 0; + size_t next_off = tiny_next_off(class_idx); + memcpy(&next_raw_off0, raw_base, sizeof(next_raw_off0)); + memcpy(&next_raw_off1, (uint8_t*)raw_base + next_off, sizeof(next_raw_off1)); + uint8_t dump8[8] = {0}; + memcpy(dump8, raw_base, sizeof(dump8)); + SuperSlab* ss_diag = hak_super_lookup(raw_base); int slab_idx = ss_diag ? 
slab_index_for(ss_diag, raw_base) : -1; uint8_t meta_cls = 0xff; @@ -638,7 +697,8 @@ static inline bool tls_sll_pop_impl(int class_idx, hak_base_ptr_t* out, const ch PTR_NEXT_READ("tls_hdr_reset_diag", class_idx, raw_base, 0, raw_next_diag); fprintf(stderr, "[TLS_SLL_HDR_RESET] shot=%u cls=%d base=%p got=0x%02x expect=0x%02x " - "next=%p meta_cls=%u slab_idx=%d last_writer=%s last_push=%p count=%llu\n", + "next=%p meta_cls=%u slab_idx=%d last_writer=%s last_push=%p count=%llu " + "next_off=%zu next_raw0=%p next_raw1=%p bytes=%02x%02x%02x%02x%02x%02x%02x%02x\n", shot + 1, class_idx, raw_base, @@ -649,7 +709,12 @@ static inline bool tls_sll_pop_impl(int class_idx, hak_base_ptr_t* out, const ch slab_idx, g_tls_sll_last_writer[class_idx] ? g_tls_sll_last_writer[class_idx] : "(null)", HAK_BASE_TO_RAW(s_tls_sll_last_push[class_idx]), - (unsigned long long)cnt); + (unsigned long long)cnt, + next_off, + (void*)next_raw_off0, + (void*)next_raw_off1, + dump8[0], dump8[1], dump8[2], dump8[3], + dump8[4], dump8[5], dump8[6], dump8[7]); } else if (cnt % 10000 == 0) { fprintf(stderr, "[TLS_SLL_HDR_RESET] cls=%d base=%p got=0x%02x expect=0x%02x count=%llu\n", class_idx, raw_base, got, expect, (unsigned long long)cnt);