hakmem/core/hakmem_tiny_refill_p0.inc.h

// hakmem_tiny_refill_p0.inc.h
// ChatGPT Pro P0: Complete Batch Refill (for the TLS SLL)
//
// Purpose: Optimize sll_refill_small_from_ss with batch carving
// Based on: tls_refill_from_tls_slab (hakmem_tiny_tls_ops.h:115-126)
//
// Key optimization: ss_active_inc × 64 → ss_active_add × 1
//
// Maintains: Existing g_tls_sll_head fast path (no changes to hot path!)
//
// Compile-time switch for the P0 batch refill path.
// If the build does not define it, it falls back to 0 here; pass
// -DHAKMEM_TINY_P0_BATCH_REFILL=1 (or define it before including this file) to set it to 1.
// NOTE(review): this file only provides the fallback value and never tests the macro
// itself - confirm with the including code that 0 (off) is the intended default.
#ifndef HAKMEM_TINY_P0_BATCH_REFILL
#define HAKMEM_TINY_P0_BATCH_REFILL 0
#endif

#ifndef HAKMEM_TINY_REFILL_P0_INC_H
#define HAKMEM_TINY_REFILL_P0_INC_H

#include "tiny_box_geometry.h"  // Box 3: Geometry & Capacity Calculator

// Debug counters (compile-time gated by HAKMEM_DEBUG_COUNTERS), indexed by class_idx.
// All of them are defined elsewhere and only incremented by sll_refill_batch_from_ss().
#if HAKMEM_DEBUG_COUNTERS
// Bumped once per call that gets past the early returns and leaves the carve loop
extern unsigned long long g_rf_hit_slab[];
// Diagnostic counters for refill early returns
extern unsigned long long g_rf_early_no_ss[];      // early return: !g_use_superslab
extern unsigned long long g_rf_early_no_meta[];    // early return: tls->meta is NULL
extern unsigned long long g_rf_early_no_room[];    // early return: SLL room <= 0
extern unsigned long long g_rf_early_want_zero[];  // early return: want == 0
#endif

// Refill TLS SLL from SuperSlab with batch carving (P0 optimization)
#include "tiny_refill_opt.h"
#include "tiny_fc_api.h"
#include "superslab/superslab_inline.h"  // For _ss_remote_drain_to_freelist_unsafe()
// Tells callers whether optional P0 diagnostic logging is switched on.
// The HAKMEM_TINY_P0_LOG environment variable is consulted on the first call only
// and the verdict is cached in a function-local static afterwards.
// Returns 1 when the variable is set, non-empty and does not begin with '0'; else 0.
static inline int p0_should_log(void) {
    static int cached = -1;  // -1 means "not resolved yet"
    if (__builtin_expect(cached != -1, 1)) {
        return cached;
    }
    const char* val = getenv("HAKMEM_TINY_P0_LOG");
    if (val == NULL || val[0] == '\0' || val[0] == '0') {
        cached = 0;
    } else {
        cached = 1;
    }
    return cached;
}

// Refill the thread-local free list of `class_idx` from the thread's current
// SuperSlab slab, taking blocks in batches instead of one at a time.
//
// Parameters:
//   class_idx - tiny size class to refill. Class 7 is rejected immediately.
//   max_take  - requested number of blocks; may be replaced by an env override
//               and is always clamped to the free room of the destination list.
//
// Returns the number of blocks moved into the destination list (TLS SLL, or the
// fast cache on the Direct-FC path); 0 when nothing was taken or the path is disabled.
//
// Flow:
//   1. Early outs: class 7, HAKMEM_TINY_P0_DISABLE, !g_use_superslab, max_take <= 0,
//      no SuperSlab obtainable, no slab metadata.
//   2. Optional Direct-FC path (class 5 by default): gathers up to tiny_fc_room()
//      block pointers without writing into the blocks and bulk-pushes them.
//   3. Regular path: per iteration drain the remote queue, splice freelist blocks,
//      then linear-carve fresh blocks, rebinding via superslab_refill() when the
//      current slab is exhausted.
//
// Environment variables read (each cached in a function-local static on first use):
//   HAKMEM_TINY_P0_DISABLE, HAKMEM_TINY_P0_DIRECT_FC, HAKMEM_TINY_P0_DIRECT_FC_C7,
//   HAKMEM_TINY_P0_DRAIN_THRESH, HAKMEM_TINY_P0_NO_DRAIN,
//   HAKMEM_TINY_REFILL_COUNT_HOT, HAKMEM_TINY_REFILL_COUNT_MID, HAKMEM_TINY_P0_LOG.
static inline int sll_refill_batch_from_ss(int class_idx, int max_take) {
    // CRITICAL: C7 (1KB) is headerless - incompatible with TLS SLL refill
    // Reason: TLS SLL stores next pointer in first 8 bytes (user data for C7)
    // Solution: Skip refill for C7, force slow path allocation
    // NOTE: because of this return, every later `class_idx == 7` branch in this
    // function (C7 debug logs, Direct-FC C7 opt-in) is currently unreachable.
    if (__builtin_expect(class_idx == 7, 0)) {
        return 0;  // C7 uses slow path exclusively
    }

    // Runtime A/B kill switch (defensive). Set HAKMEM_TINY_P0_DISABLE=1 to bypass P0 path.
    do {
        static int g_p0_disable = -1;
        if (__builtin_expect(g_p0_disable == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_P0_DISABLE");
            g_p0_disable = (e && *e && *e != '0') ? 1 : 0;
        }
        if (__builtin_expect(g_p0_disable, 0)) {
            return 0;
        }
    } while (0);
    if (!g_use_superslab || max_take <= 0) {
#if HAKMEM_DEBUG_COUNTERS
        if (!g_use_superslab) g_rf_early_no_ss[class_idx]++;
#endif
        return 0;
    }

    TinyTLSSlab* tls = &g_tls_slabs[class_idx];
    // Snapshot of the active-block counter, used only by the diagnostic at the end.
    // Stays 0 when no SuperSlab is bound yet.
    uint32_t active_before = 0;
    if (tls->ss) {
        active_before = atomic_load_explicit(&tls->ss->total_active_blocks, memory_order_relaxed);
    }

    // CRITICAL DEBUG: Log class 7 pre-warm (unreachable while the C7 early return is in place)
    if (__builtin_expect(class_idx == 7 && p0_should_log(), 0)) {
        fprintf(stderr, "[P0_DEBUG_C7] Entry: tls->ss=%p tls->meta=%p max_take=%d\n",
                (void*)tls->ss, (void*)tls->meta, max_take);
    }

    if (!tls->ss) {
        // Try to obtain a SuperSlab for this class
        if (superslab_refill(class_idx) == NULL) {
            if (__builtin_expect(class_idx == 7 && p0_should_log(), 0)) {
                fprintf(stderr, "[P0_DEBUG_C7] superslab_refill() returned NULL\n");
            }
            return 0;
        }
        if (__builtin_expect(class_idx == 7 && p0_should_log(), 0)) {
            fprintf(stderr, "[P0_DEBUG_C7] After superslab_refill(): tls->ss=%p tls->meta=%p\n",
                    (void*)tls->ss, (void*)tls->meta);
        }
    }
    TinySlabMeta* meta = tls->meta;
    if (!meta) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_no_meta[class_idx]++;
#endif
        if (__builtin_expect(class_idx == 7 && p0_should_log(), 0)) {
            fprintf(stderr, "[P0_DEBUG_C7] meta is NULL after superslab_refill, returning 0\n");
        }
        return 0;
    }

    // Optional: Direct-FC fast path for class 5 (256B) / class 7 (1024B)
    // env:
    //  - HAKMEM_TINY_P0_DIRECT_FC (default ON for class5)
    //  - HAKMEM_TINY_P0_DIRECT_FC_C7 (default OFF for class7)
    do {
        static int g_direct_fc = -1;
        static int g_direct_fc_c7 = -1;
        if (__builtin_expect(g_direct_fc == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_P0_DIRECT_FC");
            // Default ON when unset
            g_direct_fc = (e && *e && *e == '0') ? 0 : 1;
        }
        if (__builtin_expect(g_direct_fc_c7 == -1, 0)) {
            const char* e7 = getenv("HAKMEM_TINY_P0_DIRECT_FC_C7");
            // Default OFF for class7 (1KB) until stability is fully verified; opt-in via env
            g_direct_fc_c7 = (e7 && *e7) ? ((*e7 == '0') ? 0 : 1) : 0;
        }
        if (__builtin_expect((g_direct_fc && class_idx == 5) || (g_direct_fc_c7 && class_idx == 7), 0)) {
            int room = tiny_fc_room(class_idx);
            if (room <= 0) return 0;
            // Drain only if above threshold
            uint32_t rmt = atomic_load_explicit(&tls->ss->remote_counts[tls->slab_idx], memory_order_relaxed);
            static int g_drain_th = -1;
            if (__builtin_expect(g_drain_th == -1, 0)) {
                const char* e = getenv("HAKMEM_TINY_P0_DRAIN_THRESH");
                g_drain_th = (e && *e) ? atoi(e) : 64;
                if (g_drain_th < 0) g_drain_th = 0;
            }
            if (rmt >= (uint32_t)g_drain_th) {
                static int no_drain = -1;
                if (__builtin_expect(no_drain == -1, 0)) {
                    const char* e = getenv("HAKMEM_TINY_P0_NO_DRAIN");
                    no_drain = (e && *e && *e != '0') ? 1 : 0;
                }
                if (!no_drain) {
                    _ss_remote_drain_to_freelist_unsafe(tls->ss, tls->slab_idx, tls->meta);
                }
            }
            // Gather pointers without writing into objects
            void* out[128];
            // FIX: never gather more pointers than out[] can hold, whatever
            // tiny_fc_room() reports (previously an unchecked stack write).
            const int out_cap = (int)(sizeof(out) / sizeof(out[0]));
            const int take_limit = (room < out_cap) ? room : out_cap;
            int produced = 0;
            // Blocks taken from the currently bound SuperSlab that have not yet
            // been added to its active counter.
            uint32_t pending = 0;
            TinySlabMeta* m = tls->meta;
            // Box 3: Get stride (block size + header, except C7 which is headerless)
            size_t bs = tiny_stride_for_class(class_idx);
            // NOTE(review): the initial base uses tiny_slab_base_for_geometry() while the
            // rebind below uses tiny_slab_base_for(); kept as-is - confirm they agree.
            uint8_t* base = tls->slab_base ? tls->slab_base : tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
            while (produced < take_limit) {
                if (__builtin_expect(m->freelist != NULL, 0)) {
                    // Pop the freelist head; the next link lives in the block's first word
                    void* p = m->freelist; m->freelist = *(void**)p; m->used++;
                    out[produced++] = p;
                    pending++;
                    continue;
                }
                if (__builtin_expect(m->carved < m->capacity, 1)) {
                    void* p = (void*)(base + ((size_t)m->carved * bs));
                    m->carved++; m->used++;
                    out[produced++] = p;
                    pending++;
                    continue;
                }
                // Need to move to another slab with space.
                // FIX: charge the blocks gathered so far to the SuperSlab they came
                // from before superslab_refill() can rebind tls->ss (mirrors the
                // per-batch accounting of the regular path below).
                if (pending > 0) {
                    ss_active_add(tls->ss, pending);
                    pending = 0;
                }
                if (__builtin_expect(superslab_refill(class_idx) == NULL, 0)) break;
                // Rebind
                tls = &g_tls_slabs[class_idx];
                m = tls->meta;
                // FIX: same NULL guard the regular path applies after a rebind
                if (__builtin_expect(m == NULL, 0)) break;
                base = tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx);
            }
            if (pending > 0) {
                ss_active_add(tls->ss, pending);
            }
            if (produced > 0) {
                int pushed = tiny_fc_push_bulk(class_idx, out, produced);
                (void)pushed; // gather count was sized to the FC room, so this should match
                if (p0_should_log()) {
                    static _Atomic int g_logged = 0;
                    int exp = 0;
                    if (atomic_compare_exchange_strong(&g_logged, &exp, 1)) {
                        fprintf(stderr, "[P0_DIRECT_FC_TAKE] cls=%d take=%d room=%d drain_th=%d remote_cnt=%u\n",
                                class_idx, produced, room, g_drain_th, rmt);
                    }
                }
                return produced;
            }
            // fallthrough to regular path
            // FIX: the loop above may have rebound the TLS slab; refresh the cached
            // metadata pointer so the regular path does not work on a stale one.
            meta = tls->meta;
            if (!meta) return 0;
        }
    } while (0);

    // Compute how many we can actually push into SLL without overflow
    uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP);
    int room = (int)sll_cap - (int)g_tls_sll_count[class_idx];
    if (room <= 0) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_no_room[class_idx]++;
#endif
        return 0;
    }

    // For hot tiny classes (0..3), allow an env override to increase batch size
    uint32_t want = (uint32_t)max_take;
    if (class_idx <= 3) {
        static int g_hot_override = -2; // -2 = uninitialized, -1 = no override, >0 = value
        if (__builtin_expect(g_hot_override == -2, 0)) {
            const char* e = getenv("HAKMEM_TINY_REFILL_COUNT_HOT");
            int v = (e && *e) ? atoi(e) : -1;
            // clamp to [-1, 256]
            if (v < 0) {
                v = -1;
            } else if (v > 256) {
                v = 256;
            }
            g_hot_override = v;
        }
        if (g_hot_override > 0) want = (uint32_t)g_hot_override;
    } else {
        // Mid classes (>=4): optional override for batch size
        static int g_mid_override = -2; // -2 = uninitialized, -1 = no override, >0 = value
        if (__builtin_expect(g_mid_override == -2, 0)) {
            const char* e = getenv("HAKMEM_TINY_REFILL_COUNT_MID");
            int v = (e && *e) ? atoi(e) : -1;
            // clamp to [-1, 256]
            if (v < 0) {
                v = -1;
            } else if (v > 256) {
                v = 256;
            }
            g_mid_override = v;
        }
        if (g_mid_override > 0) want = (uint32_t)g_mid_override;
    }
    if (want > (uint32_t)room) want = (uint32_t)room;
    if (want == 0) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_want_zero[class_idx]++;
#endif
        return 0;
    }

    // Box 3: Get stride (block size + header, except C7 which is headerless)
    size_t bs = tiny_stride_for_class(class_idx);
    int total_taken = 0;

    // === P0 Batch Carving Loop ===
    while (want > 0) {
        // Calculate slab base for validation (accounts for 2048 offset in slab 0)
        uintptr_t ss_base = 0;
        uintptr_t ss_limit = 0;
        if (tls->ss && tls->slab_idx >= 0) {
            // Box 3: Get slab base (handles Slab 0 offset)
            uint8_t* slab_base = tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
            ss_base = (uintptr_t)slab_base;
            // Box 3: Get usable bytes for limit calculation
            ss_limit = ss_base + tiny_usable_bytes_for_slab(tls->slab_idx);
        }

        // CRITICAL FIX: Drain remote queue BEFORE popping from freelist
        // Without this, blocks in both freelist and remote queue can be double-allocated
        // (Thread A pops from freelist, Thread B adds to remote queue, Thread A drains remote -> overwrites user data)
        // OPTIMIZATION: Only drain if remote queue is non-empty (check atomic counter)
        if (tls->ss && tls->slab_idx >= 0) {
            uint32_t remote_count = atomic_load_explicit(&tls->ss->remote_counts[tls->slab_idx], memory_order_relaxed);
            if (remote_count > 0) {
                // Runtime A/B: allow skipping the remote drain to help isolate faults
                static int no_drain = -1;
                if (__builtin_expect(no_drain == -1, 0)) {
                    const char* e = getenv("HAKMEM_TINY_P0_NO_DRAIN");
                    no_drain = (e && *e && *e != '0') ? 1 : 0;
                }
                if (!no_drain) {
                    _ss_remote_drain_to_freelist_unsafe(tls->ss, tls->slab_idx, meta);
                }
            }
        }

        // Handle freelist items first (usually 0)
        TinyRefillChain chain;
        uint32_t from_freelist = trc_pop_from_freelist(
            meta, class_idx, ss_base, ss_limit, bs, want, &chain);
        if (from_freelist > 0) {
            trc_splice_to_sll(class_idx, &chain, &g_tls_sll_head[class_idx], &g_tls_sll_count[class_idx]);
            // FIX: Blocks from freelist were decremented when freed, must increment when allocated
            ss_active_add(tls->ss, from_freelist);
            // FIX: Keep TinySlabMeta::used consistent with non-P0 path
            meta->used = (uint16_t)((uint32_t)meta->used + from_freelist);
            extern unsigned long long g_rf_freelist_items[];
            g_rf_freelist_items[class_idx] += from_freelist;
            total_taken += from_freelist;
            want -= from_freelist;
            if (want == 0) break;
        }

        // === Linear Carve (P0 Key Optimization!) ===
        // Use monotonic 'carved' to track linear progression (used can decrement on free)
        if (meta->carved >= meta->capacity) {
            // Slab exhausted, try to get another
            if (superslab_refill(class_idx) == NULL) break;
            // CRITICAL FIX: Reload tls pointer after superslab_refill() binds new slab
            tls = &g_tls_slabs[class_idx];
            meta = tls->meta;
            if (!meta) break;  // NOTE: leaves the loop with meta == NULL
            continue;
        }

        uint32_t available = meta->capacity - meta->carved;
        uint32_t batch = want;
        if (batch > available) batch = available;
        if (batch == 0) break;

        // Get slab base
        uint8_t* slab_base = tls->slab_base ? tls->slab_base
                                            : tiny_slab_base_for(tls->ss, tls->slab_idx);

        // Diagnostic log (one-shot)
        #if !HAKMEM_BUILD_RELEASE
        static _Atomic int g_carve_log_printed = 0;
        if (atomic_load(&g_carve_log_printed) == 0 &&
            atomic_exchange(&g_carve_log_printed, 1) == 0) {
            // Arguments are cast so that each one matches its %u conversion
            fprintf(stderr, "[BATCH_CARVE] cls=%u slab=%d used=%u cap=%u batch=%u base=%p bs=%zu\n",
                    (unsigned)class_idx, tls->slab_idx, (unsigned)meta->used, (unsigned)meta->capacity, batch,
                    (void*)slab_base, bs);
            fflush(stderr);
        }
        #endif

        TinyRefillChain carve;
        trc_linear_carve(slab_base, bs, meta, batch, class_idx, &carve);

        // One-shot sanity: validate first few nodes are within the slab and stride-aligned
#if !HAKMEM_BUILD_RELEASE
        do {
            static _Atomic int g_once = 0;
            int exp = 0;
            if (atomic_compare_exchange_strong(&g_once, &exp, 1)) {
                uintptr_t base_chk = (uintptr_t)(tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx));
                uintptr_t limit_chk = base_chk + tiny_usable_bytes_for_slab(tls->slab_idx);
                void* node = carve.head;
                for (int i = 0; i < 3 && node; i++) {
                    uintptr_t a = (uintptr_t)node;
                    if (!(a >= base_chk && a < limit_chk)) {
                        fprintf(stderr, "[P0_SANITY_FAIL] out_of_range cls=%d node=%p base=%p limit=%p bs=%zu\n",
                                class_idx, node, (void*)base_chk, (void*)limit_chk, bs);
                        abort();
                    }
                    size_t off = (size_t)(a - base_chk);
                    if ((off % bs) != 0) {
                        fprintf(stderr, "[P0_SANITY_FAIL] misaligned cls=%d node=%p off=%zu bs=%zu base=%p\n",
                                class_idx, node, off, bs, (void*)base_chk);
                        abort();
                    }
                    // Offset of the next link inside a block: 1 byte in when a class-index
                    // header is compiled in (except C7), otherwise at the block start
#if HAKMEM_TINY_HEADER_CLASSIDX
                    const size_t next_off = (class_idx == 7) ? 0 : 1;
#else
                    const size_t next_off = 0;
#endif
                    node = *(void**)((uint8_t*)node + next_off);
                }
            }
        } while (0);
#endif
        trc_splice_to_sll(class_idx, &carve, &g_tls_sll_head[class_idx], &g_tls_sll_count[class_idx]);
        // FIX: Update SuperSlab active counter (was missing!)
        ss_active_add(tls->ss, batch);
        extern unsigned long long g_rf_carve_items[];
        g_rf_carve_items[class_idx] += batch;

        total_taken += batch;
        want -= batch;
    }

#if HAKMEM_DEBUG_COUNTERS
    // Track successful SLL refills from SuperSlab (compile-time gated)
    // NOTE: Increment unconditionally to verify counter is working
    g_rf_hit_slab[class_idx]++;
#endif

    // Optional diagnostic: compare blocks taken with the change of the active counter.
    // The comparison is only meaningful when the same SuperSlab was bound throughout.
    if (tls->ss && p0_should_log()) {
        uint32_t active_after = atomic_load_explicit(&tls->ss->total_active_blocks, memory_order_relaxed);
        int32_t delta = (int32_t)active_after - (int32_t)active_before;
        if ((int32_t)total_taken != delta) {
            // FIX: meta is NULL when the loop above bailed out right after a rebind;
            // report zeros instead of dereferencing it.
            unsigned used_now   = meta ? (unsigned)meta->used : 0u;
            unsigned carved_now = meta ? (unsigned)meta->carved : 0u;
            unsigned cap_now    = meta ? (unsigned)meta->capacity : 0u;
            void*    fl_now     = meta ? (void*)meta->freelist : NULL;
            fprintf(stderr,
                    "[P0_COUNTER_MISMATCH] cls=%d slab=%d taken=%d active_delta=%d used=%u carved=%u cap=%u freelist=%p\n",
                    class_idx, tls->slab_idx, total_taken, delta,
                    used_now, carved_now, cap_now,
                    fl_now);
        } else {
            fprintf(stderr,
                    "[P0_COUNTER_OK] cls=%d slab=%d taken=%d active_delta=%d\n",
                    class_idx, tls->slab_idx, total_taken, delta);
        }
    }
    return total_taken;
}

#endif // HAKMEM_TINY_REFILL_P0_INC_H