hakmem/core/tiny_refill_opt.h

// tiny_refill_opt.h - Inline helpers to batch and splice refill chains
// Box: Refill Boundary optimization helpers (kept header-only)
#pragma once

#include <stdint.h>
#include <stdio.h>
#include <stdatomic.h>
#include <stdlib.h>
#include "tiny_region_id.h"  // For HEADER_MAGIC, HEADER_CLASS_MASK (Fix #6)
#include "ptr_track.h"        // Pointer tracking for debugging header corruption
#include "box/tiny_next_ptr_box.h"  // Box API: Next pointer read/write
#include "box/slab_freelist_atomic.h" // Phase 1: Atomic freelist accessor
#include "hakmem_debug_master.h"  // For unified debug level control

#ifndef HAKMEM_TINY_REFILL_OPT
#define HAKMEM_TINY_REFILL_OPT 1
#endif

// Local chain structure (head/tail pointers)
typedef struct TinyRefillChain {
    void* head;
    void* tail;
    uint32_t count;
} TinyRefillChain;

static inline void trc_init(TinyRefillChain* c) {
    c->head = NULL; c->tail = NULL; c->count = 0;
}

static inline void refill_opt_dbg(const char* stage, int class_idx, uint32_t n) {
#if HAKMEM_TINY_REFILL_OPT && !HAKMEM_BUILD_RELEASE
    static int en = -1;
    static _Atomic int printed = 0;
    if (__builtin_expect(en == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_REFILL_OPT_DEBUG");
        en = (e && *e && *e != '0') ? 1 : 0;
    }
    if (!en) return;
    int exp = 0;
    if (atomic_compare_exchange_strong(&printed, &exp, 1)) {
        fprintf(stderr, "[REFILL_OPT] stage=%s cls=%d n=%u\n", stage ? stage : "(null)", class_idx, (unsigned)n);
        fflush(stderr);
    }
#else
    (void)stage; (void)class_idx; (void)n;
#endif
}

// Phase 7 header-aware push_front: link using base+1 for C0-C6 (C7 not used here)
static inline void trc_push_front(TinyRefillChain* c, void* node, int class_idx) {
    if (c->head == NULL) {
        c->head = node; c->tail = node; tiny_next_write(class_idx, node, NULL); c->count = 1;
    } else {
        tiny_next_write(class_idx, node, c->head); c->head = node; c->count++;
    }
}

// Forward declaration of guard function
static inline int trc_refill_guard_enabled(void);

// Forward declare Box TLS-SLL API
#include "box/tls_sll_box.h"

// Splice local chain into TLS SLL using Box TLS-SLL API (C7-safe)
static inline void trc_splice_to_sll(int class_idx, TinyRefillChain* c,
                                     void** sll_head, uint32_t* sll_count) {
    if (!c || c->head == NULL) return;

    // CORRUPTION DEBUG: Log chain splice (alignment check removed - false positive)
    // NOTE: Blocks are stride-aligned from slab base, not absolutely aligned
    // A slab at 0x1000 with 513B blocks is valid: 0x1000, 0x1201, 0x1402, etc.
    if (__builtin_expect(trc_refill_guard_enabled(), 0)) {
        fprintf(stderr, "[SPLICE_TO_SLL] cls=%d head=%p tail=%p count=%u\n",
                class_idx, c->head, c->tail, c->count);
    }

    // DEBUG: Validate chain is properly NULL-terminated BEFORE splicing
    static _Atomic uint64_t g_splice_count = 0;
    uint64_t splice_num = atomic_fetch_add(&g_splice_count, 1);
    if (splice_num > 40 && splice_num < 80 && class_idx == 0) {
        fprintf(stderr, "[SPLICE_DEBUG] splice=%lu cls=%d head=%p tail=%p count=%u\n",
                splice_num, class_idx, c->head, c->tail, c->count);
        // Walk chain to verify NULL termination
        void* cursor = c->head;
        uint32_t walked = 0;
        while (cursor && walked < c->count + 5) {
            void* next = tiny_next_read(class_idx, cursor);
            fprintf(stderr, "[SPLICE_WALK] node=%p next=%p walked=%u/%u\n",
                    cursor, next, walked, c->count);
            if (walked == c->count - 1 && next != NULL) {
                fprintf(stderr, "[SPLICE_ERROR] Tail not NULL-terminated! tail=%p next=%p\n",
                        cursor, next);
                abort();
            }
            cursor = next;
            walked++;
        }
        fflush(stderr);
    }

    // 🐛 DEBUG: Log splice call BEFORE calling tls_sll_splice()
    #if !HAKMEM_BUILD_RELEASE
    {
        static _Atomic uint64_t g_splice_call_count = 0;
        uint64_t call_num = atomic_fetch_add(&g_splice_call_count, 1);
        if (call_num < 10) {  // Log first 10 calls
            fprintf(stderr, "[TRC_SPLICE #%lu] BEFORE: cls=%d count=%u sll_count_before=%u\n",
                    call_num, class_idx, c->count, g_tls_sll[class_idx].count);
            fflush(stderr);
        }
    }
    #endif

    // CRITICAL: Use Box TLS-SLL API for splice (C7-safe, no race)
    // Note: tls_sll_splice() requires capacity parameter (use large value for refill)
    uint32_t moved = tls_sll_splice(class_idx, c->head, c->count, 4096);

    // 🐛 DEBUG: Log splice result AFTER calling tls_sll_splice()
    #if !HAKMEM_BUILD_RELEASE
    {
        static _Atomic uint64_t g_splice_result_count = 0;
        uint64_t result_num = atomic_fetch_add(&g_splice_result_count, 1);
        if (result_num < 10) {  // Log first 10 results
            fprintf(stderr, "[TRC_SPLICE #%lu] AFTER: cls=%d moved=%u/%u sll_count_after=%u\n",
                    result_num, class_idx, moved, c->count, g_tls_sll[class_idx].count);
            fflush(stderr);
        }
    }
    #endif

    // Update sll_count if provided (Box API already updated g_tls_sll internally)
    // Note: sll_count parameter is typically &g_tls_sll[class_idx].count, already updated
    (void)sll_count;  // Suppress unused warning
    (void)sll_head;   // Suppress unused warning

    // If splice was partial, warn about orphaned nodes
    // Note: Orphaned nodes remain in chain but are not in SLL.
    // Caller must handle cleanup if needed (typically by returning to freelist).
    if (__builtin_expect(moved < c->count, 0)) {
        uint32_t orphaned = c->count - moved;
        fprintf(stderr, "[SPLICE_PARTIAL] CRITICAL: Only moved %u/%u blocks (cls=%d)\n",
                moved, c->count, class_idx);
        fprintf(stderr, "[SPLICE_PARTIAL] %u blocks orphaned - potential memory leak!\n",
                orphaned);
        fprintf(stderr, "[SPLICE_PARTIAL] Orphan chain starts at node %u in original chain\n",
                moved);

        // Log orphan chain head for debugging
        void* scan = c->head;
        for (uint32_t i = 0; i < moved && scan; i++) {
            void* next = tiny_next_read(class_idx, scan);
            if (i == moved - 1) {
                void* orphan_head = tiny_next_read(class_idx, scan);
                fprintf(stderr, "[SPLICE_PARTIAL] Orphan chain head: %p (after node %u)\n",
                        orphan_head, i);
                break;
            }
            scan = next;
        }
        fflush(stderr);

        // In debug builds, consider aborting on orphaned nodes
        #if !HAKMEM_BUILD_RELEASE
        fprintf(stderr, "[SPLICE_PARTIAL] Aborting to prevent memory corruption\n");
        abort();
        #endif
    }
}

static inline int trc_refill_guard_enabled(void) {
    // FIX: Allow runtime override even in release builds for debugging
    // Enabled via HAKMEM_DEBUG_LEVEL >= 4 (DEBUG level)
    // Legacy: HAKMEM_TINY_REFILL_FAILFAST still works for compatibility
    static int g_trc_guard = -1;
    if (__builtin_expect(g_trc_guard == -1, 0)) {
#if HAKMEM_BUILD_RELEASE
        // Release: Use unified debug level
        g_trc_guard = hak_debug_check_level("HAKMEM_TINY_REFILL_FAILFAST", 4);
#else
        // Debug: Default ON
        g_trc_guard = 1;
#endif
        #if !HAKMEM_BUILD_RELEASE
        fprintf(stderr, "[TRC_GUARD] failfast=%d mode=%s\n",
                g_trc_guard, HAKMEM_BUILD_RELEASE ? "release" : "debug");
        fflush(stderr);
        #endif
    }
    return g_trc_guard;
}

static inline int trc_ptr_is_valid(uintptr_t base, uintptr_t limit, size_t blk, const void* node) {
    if (!node || limit <= base) return 1;
    uintptr_t addr = (uintptr_t)node;
    if (addr < base || addr >= limit) return 0;
    if (blk == 0) return 1;
    return ((addr - base) % blk) == 0;
}

static inline void trc_failfast_abort(const char* stage,
                                      int class_idx,
                                      uintptr_t base,
                                      uintptr_t limit,
                                      const void* node) {
    fprintf(stderr,
            "[TRC_FAILFAST] stage=%s cls=%d node=%p base=%p limit=%p\n",
            stage ? stage : "(null)",
            class_idx,
            node,
            (void*)base,
            (void*)limit);
    fflush(stderr);
    abort();
}

// Pop up to 'want' nodes from freelist into local chain
static inline uint32_t trc_pop_from_freelist(struct TinySlabMeta* meta,
                                             int class_idx,
                                             uintptr_t ss_base,
                                             uintptr_t ss_limit,
                                             size_t block_size,
                                             uint32_t want,
                                             TinyRefillChain* out) {
    if (!out || want == 0) return 0;
    trc_init(out);
    uint32_t taken = 0;
    // Phase 1: Use lock-free atomic POP (MT-safe)
    while (taken < want) {
        void* p = slab_freelist_pop_lockfree(meta, class_idx);
        if (!p) break;  // Freelist empty or CAS race lost
        if (__builtin_expect(trc_refill_guard_enabled() &&
                                 !trc_ptr_is_valid(ss_base, ss_limit, block_size, p),
                             0)) {
            fprintf(stderr, "[FREELIST_CORRUPT] Reading freelist head: p=%p (ss_base=%p ss_limit=%p blk=%zu)\n",
                    p, (void*)ss_base, (void*)ss_limit, block_size);
            fprintf(stderr, "[FREELIST_CORRUPT] Head pointer is corrupted (invalid range/alignment)\n");
            trc_failfast_abort("freelist_head", class_idx, ss_base, ss_limit, p);
        }
        // Phase 1: slab_freelist_pop_lockfree() already unlinked the node internally
        // No need to manually update meta->freelist (already done atomically)

        // Phase E1-CORRECT: Restore header BEFORE trc_push_front
        // ROOT CAUSE: Freelist stores next at base (offset 0), overwriting header.
        // trc_push_front() uses offset=1 for ALL classes, expecting header at base.
        // Without restoration, offset=1 contains garbage → chain corruption → SEGV!
        //
        // SOLUTION: Restore header AFTER reading freelist next, BEFORE chain push.
        // Cost: 1 byte write per freelist block (~1-2 cycles, negligible).
        // ALL classes (C0-C7) need header restoration!
#if HAKMEM_TINY_HEADER_CLASSIDX
        // DEBUG: Log header restoration for class 2
        uint8_t before = *(uint8_t*)p;
        PTR_TRACK_FREELIST_POP(p, class_idx);
        *(uint8_t*)p = HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK);
        PTR_TRACK_HEADER_WRITE(p, HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));
        static _Atomic uint64_t g_freelist_count_c2 = 0;
        if (class_idx == 2) {
            uint64_t fl_num = atomic_fetch_add(&g_freelist_count_c2, 1);
            if (fl_num < 100) {  // Log first 100 freelist pops
                extern _Atomic uint64_t malloc_count;
                uint64_t call_num = atomic_load(&malloc_count);
                fprintf(stderr, "[FREELIST_HEADER_RESTORE] fl#%lu call=%lu cls=%d ptr=%p before=0x%02x after=0x%02x\n",
                        fl_num, call_num, class_idx, p, before, HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));
                fflush(stderr);
            }
        }
#endif

        trc_push_front(out, p, class_idx);
        taken++;
    }
    // DEBUG REMOVED: refill_opt_dbg causes -26% regression (atomic CAS overhead)
    return taken;
}

// Carve a contiguous batch of size 'batch' from linear area, return as chain
// Phase 7 header-aware carve: link chain using header-safe next location
// class_idx is required to decide headerless (C7) vs headered (C0-C6)
static inline uint32_t trc_linear_carve(uint8_t* base, size_t bs,
                                        struct TinySlabMeta* meta,
                                        uint32_t batch,
                                        int class_idx,
                                        TinyRefillChain* out) {
    if (!out || batch == 0) return 0;
    trc_init(out);

    // FIX: Use carved (monotonic) instead of used (decrements on free)
    // CORRUPTION DEBUG: Validate capacity before carving
    if (__builtin_expect(trc_refill_guard_enabled(), 0)) {
        if (meta->carved + batch > meta->capacity) {
            fprintf(stderr, "[LINEAR_CARVE_CORRUPT] Carving beyond capacity!\n");
            fprintf(stderr, "[LINEAR_CARVE_CORRUPT] carved=%u batch=%u capacity=%u (would be %u)\n",
                    meta->carved, batch, meta->capacity, meta->carved + batch);
            fprintf(stderr, "[LINEAR_CARVE_CORRUPT] base=%p bs=%zu\n", (void*)base, bs);
            abort();
        }
    }

    // FIX: Use carved counter (monotonic) instead of used (which decrements on free)
    // Caller passes bs as the effective stride already (includes header when enabled)
    size_t stride = bs;
    uint8_t* cursor = base + ((size_t)meta->carved * stride);
    void* head = (void*)cursor;

    // CORRUPTION DEBUG: Log carve operation
    if (__builtin_expect(trc_refill_guard_enabled(), 0)) {
        fprintf(stderr, "[LINEAR_CARVE] base=%p carved=%u batch=%u cursor=%p\n",
                (void*)base, meta->carved, batch, (void*)cursor);
    }

    // Phase E1-CORRECT: Write headers to carved blocks BEFORE linking
    // ALL classes (C0-C7) have 1-byte headers now
    // ROOT CAUSE: tls_sll_splice() checks byte 0 for header magic to determine
    // next_offset. Without headers, it finds 0x00 and uses next_offset=0 (WRONG!),
    // reading garbage pointers from wrong offset, causing SEGV.
    // SOLUTION: Write headers to ALL carved blocks (including C7) so splice detection works correctly.
#if HAKMEM_TINY_HEADER_CLASSIDX
    // Write headers to all batch blocks (ALL classes C0-C7)
    static _Atomic uint64_t g_carve_count = 0;
    for (uint32_t i = 0; i < batch; i++) {
        uint8_t* block = cursor + (i * stride);
        PTR_TRACK_CARVE((void*)block, class_idx);
        *block = HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK);
        PTR_TRACK_HEADER_WRITE((void*)block, HEADER_MAGIC | (class_idx & HEADER_CLASS_MASK));

#if !HAKMEM_BUILD_RELEASE
        // ✅ Option C: Class 2 inline logs - CARVE operation (DEBUG ONLY)
        if (class_idx == 2) {
            uint64_t carve_id = atomic_fetch_add(&g_carve_count, 1);
            extern _Atomic uint64_t malloc_count;
            uint64_t call = atomic_load(&malloc_count);
            fprintf(stderr, "[C2_CARVE] ptr=%p header=0xa2 batch_idx=%u/%u carve_id=%lu call=%lu\n",
                    (void*)block, i+1, batch, carve_id, call);
            fflush(stderr);
        }
#endif
    }
#endif

    // CRITICAL FIX (Phase 7): header-aware next pointer placement
    // For header classes (C0-C6), the first byte at base is the 1-byte header.
    // Store the SLL next pointer at base+1 to avoid clobbering the header.
    // For C7 (headerless), store at base.
    for (uint32_t i = 1; i < batch; i++) {
        uint8_t* next = cursor + stride;
        tiny_next_write(class_idx, (void*)cursor, (void*)next);
        cursor = next;
    }
    void* tail = (void*)cursor;

    // ✅ FIX #2: NULL-terminate the tail to prevent garbage pointer traversal
    // ROOT CAUSE: Without this, tail's next pointer contains GARBAGE from previous
    // allocation, causing SEGV when TLS SLL is traversed (crash at iteration 38,985).
    // The loop above only links blocks 0→1, 1→2, ..., (batch-2)→(batch-1).
    // It does NOT write to tail's next pointer, leaving stale data!
    tiny_next_write(class_idx, tail, NULL);

    // Debug: validate first link
#if !HAKMEM_BUILD_RELEASE
    if (batch >= 2) {
        void* first_next = tiny_next_read(class_idx, head);
        fprintf(stderr, "[LINEAR_LINK] cls=%d head=%p next=%p tail=%p\n",
                class_idx, head, first_next, tail);
    } else {
        fprintf(stderr, "[LINEAR_LINK] cls=%d head=%p next=%p tail=%p\n",
                class_idx, head, (void*)0, tail);
    }
#endif
    // FIX: Update both carved (monotonic) and used (active count)
    meta->carved += batch;
    // Phase 1: Atomic increment for MT safety
    atomic_fetch_add_explicit(&meta->used, batch, memory_order_relaxed);
    out->head = head;
    out->tail = tail;
    out->count = batch;
    // DEBUG REMOVED: refill_opt_dbg causes -26% regression (atomic CAS overhead)
    return batch;
}