// tiny_refill_opt.h - Inline helpers to batch and splice refill chains
// Box: Refill Boundary optimization helpers (kept header-only)
#pragma once

#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <stdatomic.h>

#include "hakmem_build_flags.h"        // Ensure flags are overridden
#include "tiny_region_id.h"            // For HEADER_MAGIC, HEADER_CLASS_MASK (Fix #6)
#include "ptr_track.h"                 // Pointer tracking for debugging header corruption
#include "box/tiny_next_ptr_box.h"     // Box API: Next pointer read/write
#include "box/slab_freelist_atomic.h"  // Phase 1: Atomic freelist accessor
#include "hakmem_debug_master.h"       // For unified debug level control

#ifndef HAKMEM_TINY_REFILL_OPT
#define HAKMEM_TINY_REFILL_OPT 1
#endif

// Local chain structure (head/tail pointers)
typedef struct TinyRefillChain {
    void*    head;
    void*    tail;
    uint32_t count;
} TinyRefillChain;

static inline void trc_init(TinyRefillChain* c) {
    c->head  = NULL;
    c->tail  = NULL;
    c->count = 0;
}

static inline void refill_opt_dbg(const char* stage, int class_idx, uint32_t n) {
#if HAKMEM_TINY_REFILL_OPT && !HAKMEM_BUILD_RELEASE
    static int en = -1;
    static _Atomic int printed = 0;
    if (__builtin_expect(en == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_REFILL_OPT_DEBUG");
        en = (e && *e && *e != '0') ? 1 : 0;
    }
    if (!en) return;
    int exp = 0;
    if (atomic_compare_exchange_strong(&printed, &exp, 1)) {
        fprintf(stderr, "[REFILL_OPT] stage=%s cls=%d n=%u\n",
                stage ? stage : "(null)", class_idx, (unsigned)n);
        fflush(stderr);
    }
#else
    (void)stage; (void)class_idx; (void)n;
#endif
}

// Phase 7 header-aware push_front: link using base+1 for C0-C6 (C7 not used here)
static inline void trc_push_front(TinyRefillChain* c, void* node, int class_idx) {
    if (c->head == NULL) {
        c->head = node;
        c->tail = node;
        tiny_next_write(class_idx, node, NULL);
        c->count = 1;
    } else {
        tiny_next_write(class_idx, node, c->head);
        c->head = node;
        c->count++;
    }
}
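// Usage sketch (illustrative only, not part of the allocator's API): building a local
// chain before splicing. The block pointers 'b0' and 'b1' are hypothetical placeholders
// for blocks obtained from a slab.
//
//     TinyRefillChain c;
//     trc_init(&c);
//     trc_push_front(&c, b0, class_idx);  // head == tail == b0, b0->next == NULL
//     trc_push_front(&c, b1, class_idx);  // head == b1, b1->next == b0, tail == b0
//
// Pushes are LIFO: the most recently pushed block becomes the head, and the next
// pointer is written via tiny_next_write() at the class-appropriate offset.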
// Forward declaration of guard function
static inline int trc_refill_guard_enabled(void);

// Box TLS-SLL API (tls_sll_splice)
#include "box/tls_sll_box.h"

// Splice local chain into TLS SLL using Box TLS-SLL API (C7-safe)
static inline void trc_splice_to_sll(int class_idx, TinyRefillChain* c,
                                     void** sll_head, uint32_t* sll_count) {
    if (!c || c->head == NULL) return;

    // CORRUPTION DEBUG: Log chain splice (alignment check removed - false positive)
    // NOTE: Blocks are stride-aligned from the slab base, not absolutely aligned.
    // A slab at 0x1000 with 513B blocks is valid: 0x1000, 0x1201, 0x1402, etc.
    if (__builtin_expect(trc_refill_guard_enabled(), 0)) {
        fprintf(stderr, "[SPLICE_TO_SLL] cls=%d head=%p tail=%p count=%u\n",
                class_idx, c->head, c->tail, c->count);
    }

    // DEBUG: Validate chain is properly NULL-terminated BEFORE splicing
    // Phase 34A: Compile-out splice debug counter (default OFF)
#if HAKMEM_SPLICE_DEBUG_COMPILED
    static _Atomic uint64_t g_splice_count = 0;
    uint64_t splice_num = atomic_fetch_add(&g_splice_count, 1);
    if (splice_num > 40 && splice_num < 80 && class_idx == 0) {
        fprintf(stderr, "[SPLICE_DEBUG] splice=%lu cls=%d head=%p tail=%p count=%u\n",
                splice_num, class_idx, c->head, c->tail, c->count);
        // Walk chain to verify NULL termination
        void* cursor = c->head;
        uint32_t walked = 0;
        while (cursor && walked < c->count + 5) {
            void* next = tiny_next_read(class_idx, cursor);
            fprintf(stderr, "[SPLICE_WALK] node=%p next=%p walked=%u/%u\n",
                    cursor, next, walked, c->count);
            if (walked == c->count - 1 && next != NULL) {
                fprintf(stderr, "[SPLICE_ERROR] Tail not NULL-terminated! tail=%p next=%p\n",
                        cursor, next);
                abort();
            }
            cursor = next;
            walked++;
        }
        fflush(stderr);
    }
#else
    (void)0; // No-op when compiled out
#endif

    // 🐛 DEBUG: Log splice call BEFORE calling tls_sll_splice()
#if !HAKMEM_BUILD_RELEASE
    {
        static _Atomic uint64_t g_splice_call_count = 0;
        uint64_t call_num = atomic_fetch_add(&g_splice_call_count, 1);
        if (call_num < 10) { // Log first 10 calls
            fprintf(stderr, "[TRC_SPLICE #%lu] BEFORE: cls=%d count=%u sll_count_before=%u\n",
                    call_num, class_idx, c->count, g_tls_sll[class_idx].count);
            fflush(stderr);
        }
    }
#endif

    // CRITICAL: Use Box TLS-SLL API for splice (C7-safe, no race)
    // Note: tls_sll_splice() requires a capacity parameter (use a large value for refill)
    uint32_t moved = tls_sll_splice(class_idx, c->head, c->count, 4096);

    // 🐛 DEBUG: Log splice result AFTER calling tls_sll_splice()
#if !HAKMEM_BUILD_RELEASE
    {
        static _Atomic uint64_t g_splice_result_count = 0;
        uint64_t result_num = atomic_fetch_add(&g_splice_result_count, 1);
        if (result_num < 10) { // Log first 10 results
            fprintf(stderr, "[TRC_SPLICE #%lu] AFTER: cls=%d moved=%u/%u sll_count_after=%u\n",
                    result_num, class_idx, moved, c->count, g_tls_sll[class_idx].count);
            fflush(stderr);
        }
    }
#endif

    // Update sll_count if provided (Box API already updated g_tls_sll internally)
    // Note: sll_count parameter is typically &g_tls_sll[class_idx].count, already updated
    (void)sll_count; // Suppress unused warning
    (void)sll_head;  // Suppress unused warning

    // If the splice was partial, warn about orphaned nodes.
    // Note: Orphaned nodes remain in the chain but are not in the SLL.
    // The caller must handle cleanup if needed (typically by returning them to the freelist).
    if (__builtin_expect(moved < c->count, 0)) {
        uint32_t orphaned = c->count - moved;
        fprintf(stderr, "[SPLICE_PARTIAL] CRITICAL: Only moved %u/%u blocks (cls=%d)\n",
                moved, c->count, class_idx);
        fprintf(stderr, "[SPLICE_PARTIAL] %u blocks orphaned - potential memory leak!\n",
                orphaned);
        fprintf(stderr, "[SPLICE_PARTIAL] Orphan chain starts at node %u in original chain\n",
                moved);
        // Log orphan chain head for debugging
        void* scan = c->head;
        for (uint32_t i = 0; i < moved && scan; i++) {
            void* next = tiny_next_read(class_idx, scan);
            if (i == moved - 1) {
                void* orphan_head = tiny_next_read(class_idx, scan);
                fprintf(stderr, "[SPLICE_PARTIAL] Orphan chain head: %p (after node %u)\n",
                        orphan_head, i);
                break;
            }
            scan = next;
        }
        fflush(stderr);
        // In debug builds, abort on orphaned nodes
#if !HAKMEM_BUILD_RELEASE
        fprintf(stderr, "[SPLICE_PARTIAL] Aborting to prevent memory corruption\n");
        abort();
#endif
    }
}

static inline int trc_refill_guard_enabled(void) {
    // FIX: Allow runtime override even in release builds for debugging
    // Enabled via HAKMEM_DEBUG_LEVEL >= 4 (DEBUG level)
    // Legacy: HAKMEM_TINY_REFILL_FAILFAST still works for compatibility
    static int g_trc_guard = -1;
    if (__builtin_expect(g_trc_guard == -1, 0)) {
#if HAKMEM_BUILD_RELEASE
        // Release: Use unified debug level
        g_trc_guard = hak_debug_check_level("HAKMEM_TINY_REFILL_FAILFAST", 4);
#else
        // Debug: Default ON
        g_trc_guard = 1;
#endif
#if !HAKMEM_BUILD_RELEASE
        fprintf(stderr, "[TRC_GUARD] failfast=%d mode=%s\n",
                g_trc_guard, HAKMEM_BUILD_RELEASE ? "release" : "debug");
        fflush(stderr);
#endif
    }
    return g_trc_guard;
}

static inline int trc_ptr_is_valid(uintptr_t base, uintptr_t limit, size_t blk, const void* node) {
    if (!node || limit <= base) return 1;
    uintptr_t addr = (uintptr_t)node;
    if (addr < base || addr >= limit) return 0;
    if (blk == 0) return 1;
    return ((addr - base) % blk) == 0;
}

static inline void trc_failfast_abort(const char* stage, int class_idx,
                                      uintptr_t base, uintptr_t limit, const void* node) {
    fprintf(stderr, "[TRC_FAILFAST] stage=%s cls=%d node=%p base=%p limit=%p\n",
            stage ? stage : "(null)", class_idx, node, (void*)base, (void*)limit);
    fflush(stderr);
    abort();
}
// Pop up to 'want' nodes from freelist into local chain
static inline uint32_t trc_pop_from_freelist(struct TinySlabMeta* meta, int class_idx,
                                             uintptr_t ss_base, uintptr_t ss_limit,
                                             size_t block_size, uint32_t want,
                                             TinyRefillChain* out) {
    if (!out || want == 0) return 0;
    trc_init(out);
    uint32_t taken = 0;

    // Phase 1: Use lock-free atomic POP (MT-safe)
    while (taken < want) {
        void* p = slab_freelist_pop_lockfree(meta, class_idx);
        if (!p) break; // Freelist empty or CAS race lost

        if (__builtin_expect(trc_refill_guard_enabled() &&
                             !trc_ptr_is_valid(ss_base, ss_limit, block_size, p), 0)) {
            fprintf(stderr, "[FREELIST_CORRUPT] Reading freelist head: p=%p (ss_base=%p ss_limit=%p blk=%zu)\n",
                    p, (void*)ss_base, (void*)ss_limit, block_size);
            fprintf(stderr, "[FREELIST_CORRUPT] Head pointer is corrupted (invalid range/alignment)\n");
            trc_failfast_abort("freelist_head", class_idx, ss_base, ss_limit, p);
        }

        // Phase 1: slab_freelist_pop_lockfree() already unlinked the node internally
        // No need to manually update meta->freelist (already done atomically)

        // Phase E1-CORRECT: Restore header BEFORE trc_push_front
        // ROOT CAUSE: Freelist stores next at base (offset 0), overwriting header.
        // trc_push_front() uses offset=1 for ALL classes, expecting header at base.
        // Without restoration, offset=1 contains garbage → chain corruption → SEGV!
        //
        // SOLUTION: Restore header AFTER reading freelist next, BEFORE chain push.
        // Cost: 1 byte write per freelist block (~1-2 cycles, negligible).
        // ALL classes (C0-C7) need header restoration!
#if HAKMEM_TINY_HEADER_CLASSIDX
        // DEBUG: Log header restoration for class 2
        uint8_t before = *(uint8_t*)p;
        PTR_TRACK_FREELIST_POP(p, class_idx);
        tiny_header_write_if_preserved(p, class_idx); // Box API
        PTR_TRACK_HEADER_WRITE(p, HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));
        static _Atomic uint64_t g_freelist_count_c2 = 0;
        if (class_idx == 2) {
            uint64_t fl_num = atomic_fetch_add(&g_freelist_count_c2, 1);
            if (fl_num < 100) { // Log first 100 freelist pops
                extern _Atomic uint64_t malloc_count;
                uint64_t call_num = atomic_load(&malloc_count);
                fprintf(stderr, "[FREELIST_HEADER_RESTORE] fl#%lu call=%lu cls=%d ptr=%p before=0x%02x after=0x%02x\n",
                        fl_num, call_num, class_idx, p, before,
                        HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));
                fflush(stderr);
            }
        }
#endif

        trc_push_front(out, p, class_idx);
        taken++;
    }

    // DEBUG REMOVED: refill_opt_dbg causes -26% regression (atomic CAS overhead)
    return taken;
}
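// Block layout assumed by these helpers (summarizing the Phase 7 and Phase E1-CORRECT
// comments in this file; pointer width is platform-dependent):
//
//     offset 0     : 1-byte header = HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK)
//     offset 1 ... : SLL next pointer while the block sits on a local chain / TLS SLL
//                    (headered classes; C7 stores its next pointer at base instead)
//
// The slab freelist, by contrast, stores its next pointer at offset 0, clobbering the
// header byte. That is why trc_pop_from_freelist() restores the header before pushing a
// block, and why trc_linear_carve() below writes headers before linking carved blocks.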
// Carve a contiguous batch of size 'batch' from linear area, return as chain
// Phase 7 header-aware carve: link chain using header-safe next location
// class_idx is required to decide headerless (C7) vs headered (C0-C6)
static inline uint32_t trc_linear_carve(uint8_t* base, size_t bs, struct TinySlabMeta* meta,
                                        uint32_t batch, int class_idx, TinyRefillChain* out) {
    if (!out || batch == 0) return 0;
    trc_init(out);

    // FIX: Use carved (monotonic) instead of used (decrements on free)
    // CORRUPTION DEBUG: Validate capacity before carving
    if (__builtin_expect(trc_refill_guard_enabled(), 0)) {
        if (meta->carved + batch > meta->capacity) {
            fprintf(stderr, "[LINEAR_CARVE_CORRUPT] Carving beyond capacity!\n");
            fprintf(stderr, "[LINEAR_CARVE_CORRUPT] carved=%u batch=%u capacity=%u (would be %u)\n",
                    meta->carved, batch, meta->capacity, meta->carved + batch);
            fprintf(stderr, "[LINEAR_CARVE_CORRUPT] base=%p bs=%zu\n", (void*)base, bs);
            abort();
        }
    }

    // FIX: Use carved counter (monotonic) instead of used (which decrements on free)
    // Caller passes bs as the effective stride already (includes header when enabled)
    size_t stride = bs;
    uint8_t* cursor = base + ((size_t)meta->carved * stride);
    void* head = (void*)cursor;

    // CORRUPTION DEBUG: Log carve operation
    if (__builtin_expect(trc_refill_guard_enabled(), 0)) {
        fprintf(stderr, "[LINEAR_CARVE] base=%p carved=%u batch=%u cursor=%p\n",
                (void*)base, meta->carved, batch, (void*)cursor);
    }

    // Phase E1-CORRECT: Write headers to carved blocks BEFORE linking
    // ALL classes (C0-C7) have 1-byte headers now
    // ROOT CAUSE: tls_sll_splice() checks byte 0 for header magic to determine
    // next_offset. Without headers, it finds 0x00 and uses next_offset=0 (WRONG!),
    // reading garbage pointers from the wrong offset, causing SEGV.
    // SOLUTION: Write headers to ALL carved blocks (including C7) so splice detection works correctly.
#if HAKMEM_TINY_HEADER_CLASSIDX
    // Write headers to all batch blocks (ALL classes C0-C7)
#if HAKMEM_BUILD_RELEASE
    static _Atomic uint64_t g_carve_count __attribute__((unused)) = 0;
#else
    static _Atomic uint64_t g_carve_count = 0;
#endif
    for (uint32_t i = 0; i < batch; i++) {
        uint8_t* block = cursor + (i * stride);
        PTR_TRACK_CARVE((void*)block, class_idx);
        tiny_header_write_if_preserved((void*)block, class_idx); // Box API
        PTR_TRACK_HEADER_WRITE((void*)block, HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));
#if !HAKMEM_BUILD_RELEASE
        // ✅ Option C: Class 2 inline logs - CARVE operation (DEBUG ONLY)
        if (class_idx == 2) {
            uint64_t carve_id = atomic_fetch_add(&g_carve_count, 1);
            extern _Atomic uint64_t malloc_count;
            uint64_t call = atomic_load(&malloc_count);
            fprintf(stderr, "[C2_CARVE] ptr=%p header=0xa2 batch_idx=%u/%u carve_id=%lu call=%lu\n",
                    (void*)block, i + 1, batch, carve_id, call);
            fflush(stderr);
        }
#endif
    }
#endif

    // CRITICAL FIX (Phase 7): header-aware next pointer placement
    // For header classes (C0-C6), the first byte at base is the 1-byte header.
    // Store the SLL next pointer at base+1 to avoid clobbering the header.
    // For C7 (headerless), store at base.
    for (uint32_t i = 1; i < batch; i++) {
        uint8_t* next = cursor + stride;
        tiny_next_write(class_idx, (void*)cursor, (void*)next);
        cursor = next;
    }
    void* tail = (void*)cursor;

    // ✅ FIX #2: NULL-terminate the tail to prevent garbage pointer traversal
    // ROOT CAUSE: Without this, tail's next pointer contains GARBAGE from a previous
    // allocation, causing SEGV when the TLS SLL is traversed (crash at iteration 38,985).
    // The loop above only links blocks 0→1, 1→2, ..., (batch-2)→(batch-1).
    // It does NOT write to tail's next pointer, leaving stale data!
    tiny_next_write(class_idx, tail, NULL);

    // Debug: validate first link
#if !HAKMEM_BUILD_RELEASE
    if (batch >= 2) {
        void* first_next = tiny_next_read(class_idx, head);
        fprintf(stderr, "[LINEAR_LINK] cls=%d head=%p next=%p tail=%p\n",
                class_idx, head, first_next, tail);
    } else {
        fprintf(stderr, "[LINEAR_LINK] cls=%d head=%p next=%p tail=%p\n",
                class_idx, head, (void*)0, tail);
    }
#endif

    // FIX: Update both carved (monotonic) and used (active count)
    meta->carved += batch;
    // Phase 1: Atomic increment for MT safety
    atomic_fetch_add_explicit(&meta->used, batch, memory_order_relaxed);

    out->head  = head;
    out->tail  = tail;
    out->count = batch;

    // DEBUG REMOVED: refill_opt_dbg causes -26% regression (atomic CAS overhead)
    return batch;
}
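// End-to-end refill sketch (illustrative only; the driver function, its parameters, and
// the policy of draining the freelist before carving are assumptions about the caller,
// not part of this header's contract):
//
//     static inline uint32_t tiny_refill_example(struct TinySlabMeta* meta, int class_idx,
//                                                uint8_t* slab_base, uintptr_t ss_base,
//                                                uintptr_t ss_limit, size_t stride,
//                                                uint32_t want) {
//         TinyRefillChain chain;
//         // 1) Reclaim freed blocks first (lock-free pops; headers restored per block).
//         uint32_t got = trc_pop_from_freelist(meta, class_idx, ss_base, ss_limit,
//                                              stride, want, &chain);
//         // 2) Top up from the linear (never-carved) region if the freelist ran dry.
//         if (got < want && meta->carved < meta->capacity) {
//             uint32_t room  = meta->capacity - meta->carved;
//             uint32_t batch = (want - got < room) ? (want - got) : room;
//             TinyRefillChain carved;
//             if (trc_linear_carve(slab_base, stride, meta, batch, class_idx, &carved) > 0) {
//                 trc_splice_to_sll(class_idx, &carved, /*sll_head*/ NULL, /*sll_count*/ NULL);
//                 got += carved.count;
//             }
//         }
//         // 3) Publish the freelist chain as well. The two out-parameters are currently
//         //    ignored (tls_sll_splice() updates g_tls_sll[class_idx] internally), so NULL
//         //    is passed here; callers may also pass &g_tls_sll[class_idx].count as noted
//         //    in trc_splice_to_sll().
//         if (chain.count > 0) trc_splice_to_sll(class_idx, &chain, NULL, NULL);
//         return got;
//     }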