From d5302e9c874412cc4ab81c76050f1b0ebdcd9c5d Mon Sep 17 00:00:00 2001
From: "Moe Charm (CI)"
Date: Mon, 10 Nov 2025 18:21:32 +0900
Subject: [PATCH] Phase 7 follow-up: header-aware next links in BG spill, TLS
 drain, and aggressive inline macros
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- bg_spill: link and traverse the next pointer at base+1 for C0–C6, at base
  for C7 (see the layout sketch below)
- lifecycle: drain the TLS SLL and fast caches, reading next pointers with
  header-aware offsets
- tiny_alloc_fast_inline: make the POP/PUSH macros header-aware to match the
  tls_sll_box rules
- add an optional FREE_WRAP_ENTER trace (HAKMEM_FREE_WRAP_TRACE) for early
  triage
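
For reference, a minimal sketch of the free-node layout these rules assume
(tiny_next_slot is an illustrative helper, not an actual tls_sll_box API):
with HAKMEM_TINY_HEADER_CLASSIDX enabled, C0-C6 keep a 1-byte class-index
header at the block base while the block sits on a free list, so the next
pointer lives at base+1; C7 is headerless, so next stays at base.

    #include <stdint.h>

    /* Illustrative only: where the freelist "next" slot lives. */
    static inline void** tiny_next_slot(void* base, int class_idx) {
    #if HAKMEM_TINY_HEADER_CLASSIDX
        /* C0-C6: skip the 1-byte class header; C7 has none. */
        return (void**)((uint8_t*)base + ((class_idx == 7) ? 0 : 1));
    #else
        return (void**)base; /* headerless build: next at base */
    #endif
    }

Note that the base+1 slot is only byte-aligned, so these next-pointer loads
and stores are unaligned; the bg_spill and TLS-drain changes below perform
exactly such unaligned accesses.
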
Result: the bogus free logs for 0xa0/…0099 pointers are gone; the remaining
SIGBUS now surfaces early in the free path.
Next: instrument the early libc fallback, or guard invalid pointers during
init, to pinpoint the source.
---
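Triage note: with the trace enabled, the free wrapper logs the first 8
free() calls to stderr as "[FREE_WRAP_ENTER] ptr=0x...". A typical
invocation (the binary name here is a placeholder):

    HAKMEM_FREE_WRAP_TRACE=1 ./your_test_binary 2>&1 | grep FREE_WRAP_ENTER
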
 core/box/hak_free_api.inc.h    | 12 ++++++++++++
 core/hakmem_tiny_bg_spill.c    | 17 ++++++++++++-----
 core/hakmem_tiny_bg_spill.h    | 16 ++++++++++++++--
 core/hakmem_tiny_lifecycle.inc | 14 ++++++++++++--
 core/tiny_alloc_fast_inline.h  | 14 ++++++++++++--
 5 files changed, 62 insertions(+), 11 deletions(-)

diff --git a/core/box/hak_free_api.inc.h b/core/box/hak_free_api.inc.h
index 0887c1ca..25c85a5d 100644
--- a/core/box/hak_free_api.inc.h
+++ b/core/box/hak_free_api.inc.h
@@ -72,6 +72,18 @@ void hak_free_at(void* ptr, size_t size, hak_callsite_t site) {
     HKM_TIME_START(t0);
 #endif
     (void)site; (void)size;
+    // Optional lightweight trace of early free calls (first few only)
+    static int free_trace_en = -1; static _Atomic int free_trace_count = 0;
+    if (__builtin_expect(free_trace_en == -1, 0)) {
+        const char* e = getenv("HAKMEM_FREE_WRAP_TRACE");
+        free_trace_en = (e && *e && *e != '0') ? 1 : 0;
+    }
+    if (free_trace_en) {
+        int n = atomic_fetch_add(&free_trace_count, 1);
+        if (n < 8) {
+            fprintf(stderr, "[FREE_WRAP_ENTER] ptr=%p\n", ptr);
+        }
+    }
     if (!ptr) {
 #if HAKMEM_DEBUG_TIMING
         HKM_TIME_END(HKM_CAT_HAK_FREE, t0);
diff --git a/core/hakmem_tiny_bg_spill.c b/core/hakmem_tiny_bg_spill.c
index 46132a45..f983f97d 100644
--- a/core/hakmem_tiny_bg_spill.c
+++ b/core/hakmem_tiny_bg_spill.c
@@ -45,19 +45,25 @@ void bg_spill_drain_class(int class_idx, pthread_mutex_t* lock) {
     void* rest = NULL;
     void* cur = (void*)chain;
     void* prev = NULL;
+    // Phase 7: header-aware next pointer (C0-C6: base+1, C7: base)
+#if HAKMEM_TINY_HEADER_CLASSIDX
+    const size_t next_off = (class_idx == 7) ? 0 : 1;
+#else
+    const size_t next_off = 0;
+#endif
     while (cur && processed < g_bg_spill_max_batch) {
         prev = cur;
-        cur = *(void**)cur;
+        cur = *(void**)((uint8_t*)cur + next_off);
         processed++;
     }
-    if (cur != NULL) { rest = cur; *(void**)prev = NULL; }
+    if (cur != NULL) { rest = cur; *(void**)((uint8_t*)prev + next_off) = NULL; }
 
     // Return processed nodes to SS freelists
     pthread_mutex_lock(lock);
     uint32_t self_tid = tiny_self_u32_guard();
     void* node = (void*)chain;
     while (node) {
-        void* next = *(void**)node;
+        void* next = *(void**)((uint8_t*)node + next_off);
         SuperSlab* owner_ss = hak_super_lookup(node);
         if (owner_ss && owner_ss->magic == SUPERSLAB_MAGIC) {
             int slab_idx = slab_index_for(owner_ss, node);
@@ -69,6 +75,7 @@ void bg_spill_drain_class(int class_idx, pthread_mutex_t* lock) {
                 continue;
             }
             void* prev = meta->freelist;
+            // SuperSlab freelist uses base offset (no header while free)
             *(void**)node = prev;
             meta->freelist = node;
             tiny_failfast_log("bg_spill", owner_ss->size_class, owner_ss, meta, node, prev);
@@ -87,10 +94,10 @@ void bg_spill_drain_class(int class_idx, pthread_mutex_t* lock) {
         // Prepend remainder back to head
         uintptr_t old_head;
         void* tail = rest;
-        while (*(void**)tail) tail = *(void**)tail;
+        while (*(void**)((uint8_t*)tail + next_off)) tail = *(void**)((uint8_t*)tail + next_off);
         do {
             old_head = atomic_load_explicit(&g_bg_spill_head[class_idx], memory_order_acquire);
-            *(void**)tail = (void*)old_head;
+            *(void**)((uint8_t*)tail + next_off) = (void*)old_head;
         } while (!atomic_compare_exchange_weak_explicit(&g_bg_spill_head[class_idx],
                                                         &old_head, (uintptr_t)rest,
                                                         memory_order_release, memory_order_relaxed));
diff --git a/core/hakmem_tiny_bg_spill.h b/core/hakmem_tiny_bg_spill.h
index 4434ab3d..a378c09e 100644
--- a/core/hakmem_tiny_bg_spill.h
+++ b/core/hakmem_tiny_bg_spill.h
@@ -24,7 +24,13 @@ static inline void bg_spill_push_one(int class_idx, void* p) {
     uintptr_t old_head;
     do {
         old_head = atomic_load_explicit(&g_bg_spill_head[class_idx], memory_order_acquire);
-        *(void**)p = (void*)old_head;
+        // Phase 7: header-aware next placement (C0-C6: base+1, C7: base)
+#if HAKMEM_TINY_HEADER_CLASSIDX
+        const size_t next_off = (class_idx == 7) ? 0 : 1;
+#else
+        const size_t next_off = 0;
+#endif
+        *(void**)((uint8_t*)p + next_off) = (void*)old_head;
     } while (!atomic_compare_exchange_weak_explicit(&g_bg_spill_head[class_idx],
                                                     &old_head, (uintptr_t)p,
                                                     memory_order_release, memory_order_relaxed));
@@ -36,7 +42,13 @@ static inline void bg_spill_push_chain(int class_idx, void* head, void* tail, in
     uintptr_t old_head;
     do {
         old_head = atomic_load_explicit(&g_bg_spill_head[class_idx], memory_order_acquire);
-        *(void**)tail = (void*)old_head;
+        // Phase 7: header-aware next placement for tail link
+#if HAKMEM_TINY_HEADER_CLASSIDX
+        const size_t next_off = (class_idx == 7) ? 0 : 1;
+#else
+        const size_t next_off = 0;
+#endif
+        *(void**)((uint8_t*)tail + next_off) = (void*)old_head;
     } while (!atomic_compare_exchange_weak_explicit(&g_bg_spill_head[class_idx],
                                                     &old_head, (uintptr_t)head,
                                                     memory_order_release, memory_order_relaxed));
diff --git a/core/hakmem_tiny_lifecycle.inc b/core/hakmem_tiny_lifecycle.inc
index 094b807b..2015162d 100644
--- a/core/hakmem_tiny_lifecycle.inc
+++ b/core/hakmem_tiny_lifecycle.inc
@@ -149,7 +149,12 @@ static void tiny_tls_cache_drain(int class_idx) {
     g_tls_sll_head[class_idx] = NULL;
     g_tls_sll_count[class_idx] = 0;
     while (sll) {
-        void* next = *(void**)sll;
+#if HAKMEM_TINY_HEADER_CLASSIDX
+        const size_t next_off_sll = (class_idx == 7) ? 0 : 1;
+#else
+        const size_t next_off_sll = 0;
+#endif
+        void* next = *(void**)((uint8_t*)sll + next_off_sll);
         tiny_tls_list_guard_push(class_idx, tls, sll);
         tls_list_push(tls, sll);
         sll = next;
@@ -160,7 +165,12 @@ static void tiny_tls_cache_drain(int class_idx) {
     g_fast_head[class_idx] = NULL;
     g_fast_count[class_idx] = 0;
     while (fast) {
-        void* next = *(void**)fast;
+#if HAKMEM_TINY_HEADER_CLASSIDX
+        const size_t next_off_fast = (class_idx == 7) ? 0 : 1;
+#else
+        const size_t next_off_fast = 0;
+#endif
+        void* next = *(void**)((uint8_t*)fast + next_off_fast);
         tiny_tls_list_guard_push(class_idx, tls, fast);
         tls_list_push(tls, fast);
         fast = next;
diff --git a/core/tiny_alloc_fast_inline.h b/core/tiny_alloc_fast_inline.h
index 0197f5cd..53932a8a 100644
--- a/core/tiny_alloc_fast_inline.h
+++ b/core/tiny_alloc_fast_inline.h
@@ -49,7 +49,12 @@ extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];
         if (g_tls_sll_count[(class_idx)] > 0) g_tls_sll_count[(class_idx)]--; \
         (ptr_out) = NULL; \
     } else { \
-        void* _next = *(void**)_head; \
+        /* Phase 7: header-aware next (C0-C6: base+1, C7: base). */        \
+        /* A #if cannot appear inside a macro body, so this relies on */   \
+        /* HAKMEM_TINY_HEADER_CLASSIDX being defined to 0 or 1 by the */   \
+        /* configuration headers. */                                       \
+        const size_t _off = (HAKMEM_TINY_HEADER_CLASSIDX && (class_idx) != 7) ? 1 : 0; \
+        void* _next = *(void**)((uint8_t*)_head + _off); \
         g_tls_sll_head[(class_idx)] = _next; \
         if (g_tls_sll_count[(class_idx)] > 0) { \
             g_tls_sll_count[(class_idx)]--; \
@@ -81,7 +86,12 @@
 // mov %rsi, g_tls_sll_head(%rdi)
 //
 #define TINY_ALLOC_FAST_PUSH_INLINE(class_idx, ptr) do { \
-    *(void**)(ptr) = g_tls_sll_head[(class_idx)]; \
+    /* Phase 7: header-aware next (C0-C6: base+1, C7: base). */        \
+    /* Same constraint as POP above: no #if inside a macro body, */    \
+    /* so HAKMEM_TINY_HEADER_CLASSIDX must be defined to 0 or 1. */    \
+    /* Matches the tls_sll_box linking rules. */                       \
+    const size_t _off = (HAKMEM_TINY_HEADER_CLASSIDX && (class_idx) != 7) ? 1 : 0; \
+    *(void**)((uint8_t*)(ptr) + _off) = g_tls_sll_head[(class_idx)]; \
     g_tls_sll_head[(class_idx)] = (ptr); \
     g_tls_sll_count[(class_idx)]++; \
 } while(0)