Superslab free-path base normalization: use the block base for C0–C6 in tiny_free_fast_ss, tiny_free_fast_legacy, the same-thread freelist push, the midtc push, and the remote-queue push/dup checks; this ensures next-pointer writes never hit the user header. Addresses a residual SEGV beyond the TLS-SLL box.

Moe Charm (CI)
2025-11-10 17:02:25 +09:00
parent b09ba4d40d
commit d739ea7769
3 changed files with 53 additions and 25 deletions
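The commit's core is a single normalization applied at the top of every free path. A minimal sketch of it in isolation, following the Phase 7 layout described in the comments below (tiny_block_base is a hypothetical name; the diff open-codes this expression at each call site):

#include <stdint.h>

// Phase 7 layout for header classes C0-C6: [1B header][user data]
//   base          = start of the block (the header byte lives here)
//   ptr = base+1  = what the user sees and what free() receives
// C7 (1KB) is headerless, so ptr == base.
static inline void* tiny_block_base(int class_idx, void* ptr) {
    return (class_idx == 7) ? ptr : (void*)((uint8_t*)ptr - 1);
}

Every free list touched here (TLS SLL, SFC, midtc, slab freelist, remote queue) links blocks by writing an 8-byte next pointer at the stored address, and the carve/alloc paths deal exclusively in base pointers. Pushing the raw user ptr therefore mixes conventions: a later pop treats ptr as a base, hands ptr+1 to the user, and every subsequent link and header write lands one byte off.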


@@ -36,12 +36,18 @@ extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];
// Push pointer to TLS SLL
// Returns: true on success, false if C7 or capacity exceeded
//
// CRITICAL Phase 7 Header Design:
// - C0-C6 (header classes): [1B header][user data]
// ^base ^ptr (caller passes this)
// - SLL stores "base" (ptr-1) to avoid overwriting header
// - C7 (headerless): ptr == base (no offset)
//
// Safety:
// - C7 always rejected (headerless, first 8 bytes = user data)
// - Capacity check prevents overflow
// - Caller must handle fallback (e.g., meta->freelist)
// - Header protection: stores base (ptr-1) for C0-C6
//
-// Performance: 2-3 cycles (C0-C6), < 1 cycle (C7 fast rejection)
+// Performance: 3-4 cycles (C0-C6), < 1 cycle (C7 fast rejection)
static inline bool tls_sll_push(int class_idx, void* ptr, uint32_t capacity) {
// CRITICAL: C7 (1KB) is headerless - MUST NOT use TLS SLL
// Reason: SLL stores next pointer in first 8 bytes (user data for C7)
@@ -54,7 +60,11 @@ static inline bool tls_sll_push(int class_idx, void* ptr, uint32_t capacity) {
return false; // SLL full
}
-// Push to SLL (standard linked list push)
+// CRITICAL: Caller must pass "base" pointer (NOT user ptr)
+// Phase 7 carve operations return base (stride includes header)
+// SLL stores base to avoid overwriting header with next pointer
+// Push to SLL (standard linked list push using base)
*(void**)ptr = g_tls_sll_head[class_idx];
g_tls_sll_head[class_idx] = ptr;
g_tls_sll_count[class_idx]++;
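Caller side of that contract, as a sketch (tls_sll_push and its capacity argument are from the diff; the surrounding function is illustrative):

// Illustrative free fast path honoring the "push base, not user ptr" rule.
static inline int example_free_fast(int class_idx, void* user_ptr, uint32_t cap) {
    // C0-C6: step back over the 1-byte header; C7: the user ptr is the base.
    void* base = (class_idx == 7) ? user_ptr : (void*)((uint8_t*)user_ptr - 1);
    if (!tls_sll_push(class_idx, base, cap)) {
        return 0; // C7 rejected or SLL full: caller must take the slow path
    }
    return 1;
}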
@@ -65,21 +75,27 @@ static inline bool tls_sll_push(int class_idx, void* ptr, uint32_t capacity) {
// ========== Pop ==========
// Pop pointer from TLS SLL
-// Returns: true on success (writes to *out), false if empty
+// Returns: true on success (writes user ptr to *out), false if empty
//
// CRITICAL Phase 7 Header Design:
// - SLL stores "base" (ptr-1) for C0-C6
// - Must return "ptr" (base+1) to user
// - C7: base == ptr (no offset)
//
// Safety:
// - C7 protection: clears first 8 bytes on pop (prevents next pointer leak)
// - Header protection: returns ptr (base+1) for C0-C6
// - NULL check before deref
//
-// Performance: 3-4 cycles
+// Performance: 4-5 cycles
static inline bool tls_sll_pop(int class_idx, void** out) {
-void* head = g_tls_sll_head[class_idx];
-if (!head) {
+void* base = g_tls_sll_head[class_idx];
+if (!base) {
return false; // SLL empty
}
-// Pop from SLL
-void* next = *(void**)head;
+// Pop from SLL (reads next from base)
+void* next = *(void**)base;
g_tls_sll_head[class_idx] = next;
if (g_tls_sll_count[class_idx] > 0) {
g_tls_sll_count[class_idx]--;
@@ -91,11 +107,12 @@ static inline bool tls_sll_pop(int class_idx, void** out) {
// Cost: 1 store instruction (~1 cycle), only for C7 (~1% of allocations)
//
// Note: C0-C6 have 1-byte header, so first 8 bytes are safe (header hides next)
// Caller responsibility: Convert base → ptr (base+1) for C0-C6 before returning to user
if (__builtin_expect(class_idx == 7, 0)) {
-*(void**)head = NULL;
+*(void**)base = NULL;
}
-*out = head;
+*out = base; // Return base (caller converts to ptr if needed)
return true;
}
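On the allocation side the conversion runs in reverse. As the inline comments note, the pop hands back the stored base and leaves the +1 to the caller; a sketch of such a caller (the function is illustrative, tls_sll_pop is the real one above):

// Illustrative alloc fast path: pop a base, return base+1 to the user for C0-C6.
static inline void* example_alloc_fast(int class_idx) {
    void* base;
    if (!tls_sll_pop(class_idx, &base)) {
        return NULL; // SLL empty: fall through to the slow path
    }
    // C7 is headerless, so the base is already the user pointer.
    return (class_idx == 7) ? base : (void*)((uint8_t*)base + 1);
}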
@@ -104,10 +121,16 @@ static inline bool tls_sll_pop(int class_idx, void** out) {
// Splice chain of pointers to TLS SLL (batch push)
// Returns: actual count moved (0 for C7 or if capacity exceeded)
//
// CRITICAL Phase 7 Header Design:
// - Caller MUST pass chain of "base" pointers (ptr-1 for C0-C6)
// - Chain links are stored at base (*(void**)base = next_base)
// - SLL head stores base pointers
//
// Safety:
// - C7 always returns 0 (no splice)
// - Capacity check limits splice size
// - Chain traversal with safety (breaks on NULL)
// - Assumes chain is already linked using base pointers
//
// Performance: ~5 cycles + O(count) for chain traversal
static inline uint32_t tls_sll_splice(int class_idx, void* chain_head, uint32_t count, uint32_t capacity) {
@@ -127,6 +150,7 @@ static inline uint32_t tls_sll_splice(int class_idx, void* chain_head, uint32_t
uint32_t to_move = (count < available) ? count : available;
// Find chain tail (traverse to_move - 1 nodes)
// NOTE: Chain MUST be linked using base pointers (caller responsibility)
void* tail = chain_head;
for (uint32_t i = 1; i < to_move; i++) {
void* next = *(void**)tail;


@@ -117,6 +117,7 @@ static inline int tiny_free_fast_ss(SuperSlab* ss, int slab_idx, void* ptr, uint
// Fast path: Same-thread free (2-3 instructions)
int class_idx = ss->size_class;
void* base = (class_idx == 7) ? ptr : (void*)((uint8_t*)ptr - 1);
#if HAKMEM_DEBUG_COUNTERS
// Track same-thread frees (compile-time gated)
@@ -127,14 +128,14 @@ static inline int tiny_free_fast_ss(SuperSlab* ss, int slab_idx, void* ptr, uint
extern int g_sfc_enabled;
if (g_sfc_enabled) {
// Box 5-NEW: Try SFC (128 slots)
-if (!sfc_free_push(class_idx, ptr)) {
+if (!sfc_free_push(class_idx, base)) {
// SFC full → skip caching, use slow path (return 0)
// Do NOT fall back to SLL - it has no capacity check and would grow unbounded!
return 0;
}
} else {
// Box 5-OLD: Use SLL (16 slots)
-tiny_alloc_fast_push(class_idx, ptr);
+tiny_alloc_fast_push(class_idx, base);
}
// Active accounting (Box 3: SuperSlab)
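Condensed, the caching policy both fast-path variants implement is: a bounded cache either accepts the block or the free takes the slow path, and a full bounded cache never cascades into an unbounded one. A sketch reusing the diff's functions (return 0 means the caller goes slow):

// Bounded-cache free policy shared by tiny_free_fast_ss and tiny_free_fast_legacy.
static inline int example_cached_free(int class_idx, void* base, int sfc_enabled) {
    if (sfc_enabled) {
        // Box 5-NEW: 128-slot SFC; a full SFC means slow path, NOT the SLL,
        // because the legacy SLL push has no capacity check of its own here.
        return sfc_free_push(class_idx, base) ? 1 : 0;
    }
    tiny_alloc_fast_push(class_idx, base); // Box 5-OLD: 16-slot SLL
    return 1;
}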
@@ -154,19 +155,20 @@ static inline int tiny_free_fast_legacy(TinySlab* slab, void* ptr) {
// Fast path: Same-thread free
int class_idx = slab->class_idx;
void* base = (class_idx == 7) ? ptr : (void*)((uint8_t*)ptr - 1);
// Box 5-NEW/5-OLD integration: Push to TLS freelist (SFC or SLL)
extern int g_sfc_enabled;
if (g_sfc_enabled) {
// Box 5-NEW: Try SFC (128 slots)
-if (!sfc_free_push(class_idx, ptr)) {
+if (!sfc_free_push(class_idx, base)) {
// SFC full → skip caching, use slow path (return 0)
// Do NOT fall back to SLL - it has no capacity check and would grow unbounded!
return 0;
}
} else {
// Box 5-OLD: Use SLL (16 slots)
-tiny_alloc_fast_push(class_idx, ptr);
+tiny_alloc_fast_push(class_idx, base);
}
return 1; // Success


@@ -24,6 +24,8 @@ static inline void hak_tiny_free_superslab(void* ptr, SuperSlab* ss) {
return;
}
TinySlabMeta* meta = &ss->slabs[slab_idx];
// Normalize to block base for header classes (C0-C6)
void* base = (ss->size_class == 7) ? ptr : (void*)((uint8_t*)ptr - 1);
if (__builtin_expect(tiny_remote_watch_is(ptr), 0)) {
tiny_remote_watch_note("free_enter", ss, slab_idx, ptr, 0xA240u, tiny_self_u32(), 0);
extern __thread TinyTLSSlab g_tls_slabs[];
@@ -67,7 +69,7 @@ static inline void hak_tiny_free_superslab(void* ptr, SuperSlab* ss) {
// Duplicate in freelist (best-effort scan up to 64)
// NOTE: This O(n) scan is VERY expensive (can scan 64 pointers per free!)
void* scan = meta->freelist; int scanned = 0; int dup = 0;
-while (scan && scanned < 64) { if (scan == ptr) { dup = 1; break; } scan = *(void**)scan; scanned++; }
+while (scan && scanned < 64) { if (scan == base) { dup = 1; break; } scan = *(void**)scan; scanned++; }
if (dup) {
uintptr_t aux = tiny_remote_pack_diag(0xDFu, ss_base, ss_size, (uintptr_t)ptr);
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)ss->size_class, ptr, aux);
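The duplicate scan has to compare like with like: the freelist stores bases, so the probe must be the base too (pre-fix it compared the user ptr and would miss real duplicates). The same scan as a standalone sketch:

// Best-effort double-free probe: walk at most 64 links looking for this base.
static inline int example_freelist_contains(void* freelist_head, void* base) {
    void* scan = freelist_head;
    for (int scanned = 0; scan && scanned < 64; scanned++) {
        if (scan == base) {
            return 1;         // already on the freelist: double free
        }
        scan = *(void**)scan; // links live at the block base
    }
    return 0; // not seen in the window; best-effort, may miss deep duplicates
}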
@@ -119,7 +121,7 @@ static inline void hak_tiny_free_superslab(void* ptr, SuperSlab* ss) {
tiny_remote_track_expect_alloc(ss, slab_idx, ptr, "local_free_enter", my_tid);
if (!tiny_remote_guard_allow_local_push(ss, slab_idx, meta, ptr, "local_free", my_tid)) {
#include "box/free_remote_box.h"
-int transitioned = tiny_free_remote_box(ss, slab_idx, meta, ptr, my_tid);
+int transitioned = tiny_free_remote_box(ss, slab_idx, meta, base, my_tid);
if (transitioned) {
extern unsigned long long g_remote_free_transitions[];
g_remote_free_transitions[ss->size_class]++;
@@ -143,7 +145,7 @@ static inline void hak_tiny_free_superslab(void* ptr, SuperSlab* ss) {
if (!g_free_to_ss) {
int cls = (int)ss->size_class;
if (midtc_enabled() && cls >= 4) {
-if (midtc_push(cls, ptr)) {
+if (midtc_push(cls, base)) {
// Treat as returned to TLS cache (not SS freelist)
meta->used--;
ss_active_dec_one(ss);
@@ -156,7 +158,7 @@ static inline void hak_tiny_free_superslab(void* ptr, SuperSlab* ss) {
#include "box/free_local_box.h"
// Perform freelist push (+first-free publish if applicable)
void* prev_before = meta->freelist;
-tiny_free_local_box(ss, slab_idx, meta, ptr, my_tid);
+tiny_free_local_box(ss, slab_idx, meta, base, my_tid);
if (prev_before == NULL) {
ROUTE_MARK(19); // first_free_transition
extern unsigned long long g_first_free_transitions[];
@@ -280,7 +282,7 @@ static inline void hak_tiny_free_superslab(void* ptr, SuperSlab* ss) {
} while (0);
if (g_ss_adopt_en2) {
// Use remote queue
-uintptr_t head_word = __atomic_load_n((uintptr_t*)ptr, __ATOMIC_RELAXED);
+uintptr_t head_word = __atomic_load_n((uintptr_t*)base, __ATOMIC_RELAXED);
if (debug_guard) fprintf(stderr, "[REMOTE_PUSH_CALL] cls=%u slab=%d owner=%u my=%u ptr=%p used=%u remote_count=%u head=%p word=0x%016" PRIxPTR "\n",
ss->size_class,
slab_idx,
@@ -291,9 +293,9 @@ static inline void hak_tiny_free_superslab(void* ptr, SuperSlab* ss) {
atomic_load_explicit(&ss->remote_counts[slab_idx], memory_order_relaxed),
(void*)atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_relaxed),
head_word);
-int dup_remote = tiny_remote_queue_contains_guard(ss, slab_idx, ptr);
+int dup_remote = tiny_remote_queue_contains_guard(ss, slab_idx, base);
if (!dup_remote && __builtin_expect(g_remote_side_enable, 0)) {
-dup_remote = (head_word == TINY_REMOTE_SENTINEL) || tiny_remote_side_contains(ss, slab_idx, ptr);
+dup_remote = (head_word == TINY_REMOTE_SENTINEL) || tiny_remote_side_contains(ss, slab_idx, base);
}
if (__builtin_expect(head_word == TINY_REMOTE_SENTINEL && !dup_remote && g_debug_remote_guard, 0)) {
tiny_remote_watch_note("dup_scan_miss", ss, slab_idx, ptr, 0xA215u, my_tid, 0);
@@ -317,7 +319,7 @@ static inline void hak_tiny_free_superslab(void* ptr, SuperSlab* ss) {
if (__builtin_expect(tiny_remote_watch_is(ptr), 0)) {
tiny_remote_watch_note("free_remote", ss, slab_idx, ptr, 0xA232u, my_tid, 0);
}
-int was_empty = ss_remote_push(ss, slab_idx, ptr); // ss_active_dec_one() called inside
+int was_empty = ss_remote_push(ss, slab_idx, base); // ss_active_dec_one() called inside
meta->used--;
// ss_active_dec_one(ss); // REMOVED: Already called inside ss_remote_push()
if (was_empty) {
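ss_remote_push itself is not shown in this diff; a typical shape for such a push is a lock-free LIFO where, as everywhere else after this commit, the link is written at the block base (illustrative sketch, not the project's implementation):

#include <stdatomic.h>
#include <stdint.h>

// Illustrative remote-free push. Returns 1 if the queue was empty beforehand,
// which is what the caller above uses to trigger its transition bookkeeping.
static inline int example_remote_push(_Atomic(uintptr_t)* head, void* base) {
    uintptr_t old_head = atomic_load_explicit(head, memory_order_relaxed);
    do {
        *(void**)base = (void*)old_head; // next link at base, never at base+1
    } while (!atomic_compare_exchange_weak_explicit(
                 head, &old_head, (uintptr_t)base,
                 memory_order_release, memory_order_relaxed));
    return old_head == 0;
}

The dup checks above run before this publish step because once the base is CAS-visible, a second push of the same block would splice a cycle into the list.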
@@ -329,8 +331,8 @@ static inline void hak_tiny_free_superslab(void* ptr, SuperSlab* ss) {
// Fallback: direct freelist push (legacy)
if (debug_guard) fprintf(stderr, "[FREE_SS] Using LEGACY freelist push (not remote queue)\n");
void* prev = meta->freelist;
-*(void**)ptr = prev;
-meta->freelist = ptr;
+*(void**)base = prev;
+meta->freelist = base;
tiny_failfast_log("free_local_legacy", ss->size_class, ss, meta, ptr, prev);
do {
static int g_mask_en = -1;