// hakmem_tiny_refill.inc.h
// Phase 2D-1: Hot-path inline functions - Refill operations
//
// This file contains hot-path refill functions for the various allocation tiers.
// These functions were extracted from hakmem_tiny.c to improve maintainability
// and reduce the main file size by approximately 280 lines.
//
// Functions handled here (line references are to the pre-extraction hakmem_tiny.c):
// - tiny_fast_refill_and_take: Fast cache refill (lines 584-622, 39 lines)
// - quick_refill_from_sll: Quick slot refill from SLL (lines 918-936, 19 lines)
// - quick_refill_from_mag: Quick slot refill from magazine (lines 938-949, 12 lines)
// - sll_refill_small_from_ss: SLL refill from superslab (lines 952-996, 45 lines)
// - superslab_tls_bump_fast: TLS bump allocation (lines 1016-1060, 45 lines)
// - frontend_refill_fc: Frontend fast cache refill (lines 1063-1106, 44 lines)
// - bulk_mag_to_sll_if_room: Magazine to SLL bulk transfer (lines 1133-1154, 22 lines)
// - ultra_refill_sll: Ultra-mode SLL refill (lines 1178-1233, 56 lines)
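//
// Refill tier overview (a sketch inferred from the functions below; the exact
// promotion policy lives with the callers in hakmem_tiny.c):
//
//   FastCache / QuickSlot  <-  TLS SLL  <-  TLS magazine  <-  SuperSlab / TinySlab
//
// Each refill routine moves a small batch from a slower tier into a faster one,
// so the per-allocation hot path normally touches only the leftmost tier.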
#ifndef HAKMEM_TINY_REFILL_INC_H
#define HAKMEM_TINY_REFILL_INC_H

#include "hakmem_tiny.h"
#include "hakmem_tiny_superslab.h"
#include "hakmem_tiny_magazine.h"
#include "hakmem_tiny_tls_list.h"
#include <stdint.h>
#include <stdatomic.h>  // _Atomic / atomic_load_explicit() used below
#include <pthread.h>

// External declarations for TLS variables and globals
extern int g_fast_enable;
extern uint16_t g_fast_cap[TINY_NUM_CLASSES];
extern __thread void* g_fast_head[TINY_NUM_CLASSES];
extern __thread uint16_t g_fast_count[TINY_NUM_CLASSES];

extern int g_tls_list_enable;
extern int g_tls_sll_enable;
extern __thread void* g_tls_sll_head[TINY_NUM_CLASSES];
extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];

extern int g_use_superslab;
extern int g_ultra_bump_shadow;
extern int g_bump_chunk;
extern __thread uint8_t* g_tls_bcur[TINY_NUM_CLASSES];
extern __thread uint8_t* g_tls_bend[TINY_NUM_CLASSES];

extern int g_fastcache_enable;
extern int g_quick_enable;

// External variable declarations
// Note: TinyTLSSlab, TinyFastCache, and TinyQuickSlot types must be defined
// before this file is included.
extern __thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES];
extern TinyPool g_tiny_pool;
extern PaddedLock g_tiny_class_locks[TINY_NUM_CLASSES];
extern __thread TinyFastCache g_fast_cache[TINY_NUM_CLASSES];
extern __thread TinyQuickSlot g_tls_quick[TINY_NUM_CLASSES];

// Frontend fill target
extern _Atomic uint32_t g_frontend_fill_target[TINY_NUM_CLASSES];

// Debug counters
#if HAKMEM_DEBUG_COUNTERS
extern uint64_t g_bump_hits[TINY_NUM_CLASSES];
extern uint64_t g_bump_arms[TINY_NUM_CLASSES];
extern uint64_t g_path_refill_calls[TINY_NUM_CLASSES];
extern uint64_t g_ultra_refill_calls[TINY_NUM_CLASSES];
extern int g_path_debug_enabled;
#define HAK_PATHDBG_INC(arr, idx) do { if (g_path_debug_enabled) { (arr)[(idx)]++; } } while(0)
#define HAK_ULTRADBG_INC(arr, idx) do { (arr)[(idx)]++; } while(0)
#else
#define HAK_PATHDBG_INC(arr, idx) do { (void)(idx); } while(0)
#define HAK_ULTRADBG_INC(arr, idx) do { (void)(idx); } while(0)
#endif

// Tracepoint macros
#ifndef HAK_TP1
#define HAK_TP1(name, idx) do { (void)(idx); } while(0)
#endif

// Forward declarations for functions used in this file
static inline void* tiny_fast_pop(int class_idx);
static inline int tiny_fast_push(int class_idx, void* ptr);
static inline int tls_refill_from_tls_slab(int class_idx, TinyTLSList* tls, uint32_t want);
static inline uint32_t sll_cap_for_class(int class_idx, uint32_t mag_cap);
static SuperSlab* superslab_refill(int class_idx);
static void* slab_data_start(SuperSlab* ss, int slab_idx);
static inline uint8_t* tiny_slab_base_for(SuperSlab* ss, int slab_idx);
static inline void ss_active_add(SuperSlab* ss, uint32_t n);
static inline void ss_active_inc(SuperSlab* ss);
static TinySlab* allocate_new_slab(int class_idx);
static void move_to_full_list(int class_idx, struct TinySlab* target_slab);
static int hak_tiny_find_free_block(TinySlab* slab);
static void hak_tiny_set_used(TinySlab* slab, int block_idx);
static inline int ultra_batch_for_class(int class_idx);
static inline int ultra_sll_cap_for_class(int class_idx);
// Note: tiny_small_mags_init_once and tiny_mag_init_if_needed are declared in hakmem_tiny_magazine.h
static void eventq_push(int class_idx, uint32_t size);

// Fast cache refill and take operation
static inline void* tiny_fast_refill_and_take(int class_idx, TinyTLSList* tls) {
    void* direct = tiny_fast_pop(class_idx);
    if (direct) return direct;
    uint16_t cap = g_fast_cap[class_idx];
    if (cap == 0) return NULL;
    uint16_t count = g_fast_count[class_idx];
    uint16_t need = cap > count ? (uint16_t)(cap - count) : 0;
    if (need == 0) return NULL;
    uint32_t have = tls->count;
    if (have < need) {
        uint32_t want = need - have;
        uint32_t thresh = tls_list_refill_threshold(tls);
        if (want < thresh) want = thresh;
        tls_refill_from_tls_slab(class_idx, tls, want);
    }
    void* batch_head = NULL;
    void* batch_tail = NULL;
    uint32_t taken = tls_list_bulk_take(tls, need, &batch_head, &batch_tail);
    if (taken == 0u || batch_head == NULL) {
        return NULL;
    }

    void* ret = batch_head;
    void* node = *(void**)ret;
    uint32_t remaining = taken - 1u; // taken >= 1 is guaranteed by the check above

    while (node && remaining > 0u) {
        void* next = *(void**)node;
        if (tiny_fast_push(class_idx, node)) {
            node = next;
            remaining--;
        } else {
            // Push failed: return the untouched tail (node..batch_tail) to the TLS list
            tls_list_bulk_put(tls, node, batch_tail, remaining);
            return ret;
        }
    }
    return ret;
}

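// Usage sketch for the function above. The caller shape, the g_tls_lists
// accessor, and tiny_alloc_slow are hypothetical; the real call site lives in
// hakmem_tiny.c:
//
//   void* p = tiny_fast_pop(class_idx);                  // tier 1: FastCache hit
//   if (!p) p = tiny_fast_refill_and_take(class_idx,     // miss: refill + take
//                                         &g_tls_lists[class_idx]);
//   if (!p) p = tiny_alloc_slow(class_idx);              // hypothetical slow path
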
// Quick slot refill from SLL
static inline int quick_refill_from_sll(int class_idx) {
    if (!g_tls_sll_enable) return 0;
    TinyQuickSlot* qs = &g_tls_quick[class_idx];
    int room = (int)(QUICK_CAP - qs->top);
    if (room <= 0) return 0;
    // Limit burst to a tiny constant to reduce loop/branch overhead
    if (room > 2) room = 2;
    int filled = 0;
    while (room > 0) {
        void* head = g_tls_sll_head[class_idx];
        if (!head) break;
        g_tls_sll_head[class_idx] = *(void**)head;
        if (g_tls_sll_count[class_idx] > 0) g_tls_sll_count[class_idx]--;
        qs->items[qs->top++] = head;
        room--; filled++;
    }
    if (filled > 0) HAK_TP1(quick_refill_sll, class_idx);
    return filled;
}

// Quick slot refill from magazine
static inline int quick_refill_from_mag(int class_idx) {
    TinyTLSMag* mag = &g_tls_mags[class_idx];
    if (mag->top <= 0) return 0;
    TinyQuickSlot* qs = &g_tls_quick[class_idx];
    int room = (int)(QUICK_CAP - qs->top);
    if (room <= 0) return 0;
    // Only a single transfer from the magazine to minimize overhead
    // (mag->top > 0 and room > 0 were both checked above)
    qs->items[qs->top++] = mag->items[--mag->top].ptr;
    HAK_TP1(quick_refill_mag, class_idx);
    return 1;
}

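// Note on tiering: quick_refill_from_sll() moves at most 2 nodes and
// quick_refill_from_mag() exactly 1, so a plausible (hypothetical) caller tries
// them in that order and falls back when both return 0:
//
//   if (!quick_refill_from_sll(ci) && !quick_refill_from_mag(ci))
//       ultra_refill_sll(ci);  // replenish the SLL, then retry
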
// P0 optimization: batch refill (enabled by default; set HAKMEM_TINY_P0_BATCH_REFILL=0 to disable)
#ifndef HAKMEM_TINY_P0_BATCH_REFILL
#define HAKMEM_TINY_P0_BATCH_REFILL 1 // Enable P0 by default (verified +5.16% improvement)
#endif

#if HAKMEM_TINY_P0_BATCH_REFILL
#include "hakmem_tiny_refill_p0.inc.h"
// Alias for compatibility
#define sll_refill_small_from_ss sll_refill_batch_from_ss
#endif

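// Build-time toggle sketch (the macro name comes from the guard above; the
// compiler invocations are illustrative only):
//
//   cc -DHAKMEM_TINY_P0_BATCH_REFILL=0 ...  // single-node refill defined below
//   cc ...                                  // default: batched refill from the P0 include
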
// Refill a few nodes directly into the TLS SLL from the TLS-cached SuperSlab
// (owner-thread only).
// Note: If HAKMEM_TINY_P0_BATCH_REFILL is enabled, sll_refill_batch_from_ss is
// used instead (see the alias above).
#if !HAKMEM_TINY_P0_BATCH_REFILL
// Phase 6-1.7: Export for box refactor (Box 5 needs access from hakmem.c)
// Note: Force non-inline to provide a linkable definition for LTO
#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
__attribute__((noinline)) int sll_refill_small_from_ss(int class_idx, int max_take) {
#else
static inline int sll_refill_small_from_ss(int class_idx, int max_take) {
#endif
    if (!g_use_superslab || max_take <= 0) return 0;
    TinyTLSSlab* tls = &g_tls_slabs[class_idx];
    if (!tls->ss) {
        // Try to obtain a SuperSlab for this class
        if (superslab_refill(class_idx) == NULL) return 0;
    }
    TinySlabMeta* meta = tls->meta;
    if (!meta) return 0;

    // Compute how many we can actually push into the SLL without overflow
    uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP);
    int room = (int)sll_cap - (int)g_tls_sll_count[class_idx];
    if (room <= 0) return 0;
    int take = max_take < room ? max_take : room;

    int taken = 0;
    size_t bs = g_tiny_class_sizes[class_idx];
    while (taken < take) {
        void* p = NULL;
        if (meta->freelist) {
            p = meta->freelist; meta->freelist = *(void**)p; meta->used++;
            // Track active blocks reserved into the TLS SLL
            ss_active_inc(tls->ss);
        } else if (meta->used < meta->capacity) {
            void* slab_start = slab_data_start(tls->ss, tls->slab_idx);
            // Slab 0: skip the first 1024 bytes (apparently reserved for the
            // SuperSlab header area)
            if (tls->slab_idx == 0) slab_start = (char*)slab_start + 1024;
            p = (char*)slab_start + ((size_t)meta->used * bs);
            meta->used++;
            // Track active blocks reserved into the TLS SLL
            ss_active_inc(tls->ss);
        } else {
            // Move to another slab with space
            if (superslab_refill(class_idx) == NULL) break;
            meta = tls->meta; // refresh after refill
            continue;
        }
        if (!p) break;
        *(void**)p = g_tls_sll_head[class_idx];
        g_tls_sll_head[class_idx] = p;
        g_tls_sll_count[class_idx]++;
        taken++;
    }
    return taken;
}
#endif // !HAKMEM_TINY_P0_BATCH_REFILL

// Ultra-Bump TLS shadow try: returns a pointer when a TLS bump window is armed,
// or can be armed by reserving a small chunk from the current SuperSlab meta.
static inline void* superslab_tls_bump_fast(int class_idx) {
    if (!g_ultra_bump_shadow || !g_use_superslab) return NULL;
    // Serve from the armed TLS window if present
    uint8_t* cur = g_tls_bcur[class_idx];
    if (__builtin_expect(cur != NULL, 0)) {
        uint8_t* end = g_tls_bend[class_idx];
        size_t bs = g_tiny_class_sizes[class_idx];
        if (__builtin_expect(cur <= end - bs, 1)) {
            g_tls_bcur[class_idx] = cur + bs;
#if HAKMEM_DEBUG_COUNTERS
            g_bump_hits[class_idx]++;
#endif
            HAK_TP1(bump_hit, class_idx);
            return (void*)cur;
        }
        // Window exhausted
        g_tls_bcur[class_idx] = NULL;
        g_tls_bend[class_idx] = NULL;
    }
    // Arm a new window from the TLS-cached SuperSlab meta (linear mode only)
    TinyTLSSlab* tls = &g_tls_slabs[class_idx];
    TinySlabMeta* meta = tls->meta;
    if (!meta || meta->freelist != NULL) return NULL; // linear mode only
    uint16_t used = meta->used;
    uint16_t cap = meta->capacity;
    if (used >= cap) return NULL;
    uint32_t avail = (uint32_t)cap - (uint32_t)used;
    uint32_t chunk = (g_bump_chunk > 0 ? (uint32_t)g_bump_chunk : 1u);
    if (chunk > avail) chunk = avail;
    size_t bs = g_tiny_class_sizes[tls->ss->size_class];
    // Base of this slab's data area; tls->slab_base (when cached) and
    // tiny_slab_base_for() are assumed to already account for any reserved
    // header area at the start of slab 0.
    uint8_t* base = tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx);
    uint8_t* start = base + ((size_t)used * bs);
    // Reserve the chunk once in the header (keeps remote-free accounting valid)
    meta->used = (uint16_t)(used + (uint16_t)chunk);
    // Account all reserved blocks as active in the SuperSlab
    ss_active_add(tls->ss, chunk);
#if HAKMEM_DEBUG_COUNTERS
    g_bump_arms[class_idx]++;
#endif
    g_tls_bcur[class_idx] = start + bs;
    // Window covers the reserved blocks [used, used + chunk), so the end is
    // relative to 'start', not to the slab base
    g_tls_bend[class_idx] = start + (size_t)chunk * bs;
    return (void*)start;
}

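// Worked example for the arming path above (illustrative numbers only):
// bs = 64, used = 10, cap = 64, g_bump_chunk = 8:
//   avail = 54, chunk = 8, start = base + 10*64
//   meta->used jumps to 18 up front and ss_active_add() accounts all 8 blocks;
//   the caller receives 'start', and the window [start+64, start+8*64) serves
//   the next 7 allocations with one pointer increment each.
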
// Frontend: refill the FastCache directly from the TLS active slab (owner-only)
// or adopt a slab
static inline int frontend_refill_fc(int class_idx) {
    TinyFastCache* fc = &g_fast_cache[class_idx];
    int room = TINY_FASTCACHE_CAP - fc->top;
    if (room <= 0) return 0;
    // Target refill (conservative for safety)
    int need = ultra_batch_for_class(class_idx);
    int tgt = (int)atomic_load_explicit(&g_frontend_fill_target[class_idx], memory_order_relaxed);
    if (tgt > 0 && tgt < need) need = tgt;
    if (need > room) need = room;
    if (need <= 0) return 0;

    int filled = 0;

    // Step A: First bulk transfer from the TLS SLL to the FastCache (lock-free, O(1) per node)
    if (g_tls_sll_enable) {
        while (need > 0 && g_tls_sll_head[class_idx] != NULL) {
            void* h = g_tls_sll_head[class_idx];
            g_tls_sll_head[class_idx] = *(void**)h;
            if (g_tls_sll_count[class_idx] > 0) g_tls_sll_count[class_idx]--; // underflow prevention
            fc->items[fc->top++] = h;
            need--; filled++;
            if (fc->top >= TINY_FASTCACHE_CAP) break;
        }
    }

    // Step B: If still not enough, transfer from the TLS magazine (lock-free, O(1) per node)
    if (need > 0) {
        tiny_small_mags_init_once();
        // Larger classes use lazily initialized magazines; the small ones are
        // covered by tiny_small_mags_init_once() above
        if (class_idx > 3) tiny_mag_init_if_needed(class_idx);
        TinyTLSMag* mag = &g_tls_mags[class_idx];
        while (need > 0 && mag->top > 0 && fc->top < TINY_FASTCACHE_CAP) {
            void* p = mag->items[--mag->top].ptr;
            fc->items[fc->top++] = p;
            need--; filled++;
        }
    }

    if (filled > 0) {
        eventq_push(class_idx, (uint32_t)g_tiny_class_sizes[class_idx]);
        HAK_PATHDBG_INC(g_path_refill_calls, class_idx);
        return 1;
    }
    return 0;
}

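// Tuning sketch: g_frontend_fill_target caps the per-refill batch size. A
// policy thread could shrink it like this (the call site is hypothetical; the
// atomic API is standard C11):
//
//   atomic_store_explicit(&g_frontend_fill_target[ci], 4u, memory_order_relaxed);
//
// A value of 0 leaves the class at its ultra_batch_for_class() default.
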
// Move up to 'n' items from the TLS magazine to the SLL if the SLL has room (lock-free).
static inline int bulk_mag_to_sll_if_room(int class_idx, TinyTLSMag* mag, int n) {
    if (g_tls_list_enable) return 0; // not used when the unified TLS list is enabled
    if (!g_tls_sll_enable || n <= 0) return 0;
    uint32_t cap = sll_cap_for_class(class_idx, (uint32_t)mag->cap);
    uint32_t have = g_tls_sll_count[class_idx];
    if (have >= cap) return 0;
    int room = (int)(cap - have);
    int avail = mag->top;
    // Hysteresis: avoid frequent tiny moves; take at least 8 when possible
    int take = (n < room ? n : room);
    if (take < 8 && avail >= 8 && room >= 8) take = 8;
    if (take > avail) take = avail;
    if (take <= 0) return 0;
    for (int i = 0; i < take; i++) {
        void* p = mag->items[--mag->top].ptr;
        *(void**)p = g_tls_sll_head[class_idx];
        g_tls_sll_head[class_idx] = p;
        g_tls_sll_count[class_idx]++;
    }
    HAK_PATHDBG_INC(g_path_refill_calls, class_idx);
    return take;
}

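// Hypothetical free-path call site ('ci' and the half-full trigger are
// illustrative, not from the original sources):
//
//   TinyTLSMag* mag = &g_tls_mags[ci];
//   if (mag->top > mag->cap / 2)
//       (void)bulk_mag_to_sll_if_room(ci, mag, 16);
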
// Ultra-mode (SLL-only) refill operation
static inline void ultra_refill_sll(int class_idx) {
    int need = ultra_batch_for_class(class_idx);
    HAK_ULTRADBG_INC(g_ultra_refill_calls, class_idx);
    int sll_cap = ultra_sll_cap_for_class(class_idx);
    pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m;
    pthread_mutex_lock(lock);
    TinySlab* slab = g_tiny_pool.free_slabs[class_idx];
    if (!slab) {
        slab = allocate_new_slab(class_idx);
        if (slab) {
            slab->next = g_tiny_pool.free_slabs[class_idx];
            g_tiny_pool.free_slabs[class_idx] = slab;
        }
    }
    if (slab) {
        size_t bs = g_tiny_class_sizes[class_idx];
        int remaining = need;
        while (remaining > 0 && slab->free_count > 0) {
            if ((int)g_tls_sll_count[class_idx] >= sll_cap) break;
            int first = hak_tiny_find_free_block(slab);
            if (first < 0) break;
            // Allocate the first found block
            hak_tiny_set_used(slab, first);
            slab->free_count--;
            void* p0 = (char*)slab->base + ((size_t)first * bs);
            *(void**)p0 = g_tls_sll_head[class_idx];
            g_tls_sll_head[class_idx] = p0;
            g_tls_sll_count[class_idx]++;
            remaining--;
            // Try to allocate more from the same bitmap word to amortize scanning
            int word_idx = first / 64;
            uint64_t used = slab->bitmap[word_idx];
            uint64_t free_bits = ~used;
            while (remaining > 0 && free_bits && slab->free_count > 0) {
                if ((int)g_tls_sll_count[class_idx] >= sll_cap) break;
                int bit_idx = __builtin_ctzll(free_bits);
                int block_idx = word_idx * 64 + bit_idx;
                hak_tiny_set_used(slab, block_idx);
                slab->free_count--;
                void* p = (char*)slab->base + ((size_t)block_idx * bs);
                *(void**)p = g_tls_sll_head[class_idx];
                g_tls_sll_head[class_idx] = p;
                g_tls_sll_count[class_idx]++;
                remaining--;
                // Re-read the word so the next ctz sees the bit we just set
                used = slab->bitmap[word_idx];
                free_bits = ~used;
            }
            if (slab->free_count == 0) {
                move_to_full_list(class_idx, slab);
                break;
            }
        }
    }
    pthread_mutex_unlock(lock);
}

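// Debug sketch (HAKMEM_DEBUG_COUNTERS builds only; the reporting loop is
// illustrative and assumes <stdio.h>):
//
//   for (int ci = 0; ci < TINY_NUM_CLASSES; ci++)
//       fprintf(stderr, "class %d: ultra refills=%llu\n", ci,
//               (unsigned long long)g_ultra_refill_calls[ci]);
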
#endif // HAKMEM_TINY_REFILL_INC_H