// hakmem_tiny_refill.inc.h
// Phase 2D-1: Hot-path inline functions - Refill operations
//
// This file contains hot-path refill functions for various allocation tiers.
// These functions are extracted from hakmem_tiny.c to improve maintainability and
// reduce the main file size by approximately 280 lines.
//
// Functions handle:
// - tiny_fast_refill_and_take: Fast cache refill (lines 584-622, 39 lines)
// - quick_refill_from_sll: Quick slot refill from SLL (lines 918-936, 19 lines)
// - quick_refill_from_mag: Quick slot refill from magazine (lines 938-949, 12 lines)
// - sll_refill_small_from_ss: SLL refill from superslab (lines 952-996, 45 lines)
// - superslab_tls_bump_fast: TLS bump allocation (lines 1016-1060, 45 lines)
// - frontend_refill_fc: Frontend fast cache refill (lines 1063-1106, 44 lines)
// - bulk_mag_to_sll_if_room: Magazine to SLL bulk transfer (lines 1133-1154, 22 lines)
// - ultra_refill_sll: Ultra-mode SLL refill (lines 1178-1233, 56 lines)

#ifndef HAKMEM_TINY_REFILL_INC_H
#define HAKMEM_TINY_REFILL_INC_H

#include "hakmem_tiny.h"
#include "hakmem_tiny_superslab.h"
#include "hakmem_tiny_magazine.h"
#include "hakmem_tiny_tls_list.h"

#include <pthread.h>    // pthread_mutex_lock / pthread_mutex_unlock
#include <stdatomic.h>  // _Atomic, atomic_load_explicit
#include <stdint.h>     // uint8_t, uint16_t, uint32_t, uint64_t
#include <stdlib.h>     // getenv

// External declarations for TLS variables and globals
extern int g_fast_enable;
extern uint16_t g_fast_cap[TINY_NUM_CLASSES];
extern __thread void* g_fast_head[TINY_NUM_CLASSES];
extern __thread uint16_t g_fast_count[TINY_NUM_CLASSES];
extern int g_tls_list_enable;
extern int g_tls_sll_enable;
extern __thread void* g_tls_sll_head[TINY_NUM_CLASSES];
extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];
extern int g_use_superslab;
extern int g_ultra_bump_shadow;
extern int g_bump_chunk;
extern __thread uint8_t* g_tls_bcur[TINY_NUM_CLASSES];
extern __thread uint8_t* g_tls_bend[TINY_NUM_CLASSES];
extern int g_fastcache_enable;
extern int g_quick_enable;

// External variable declarations
// Note: TinyTLSSlab, TinyFastCache, and TinyQuickSlot types must be defined before including this file
extern __thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES]; extern TinyPool g_tiny_pool; extern PaddedLock g_tiny_class_locks[TINY_NUM_CLASSES]; extern __thread TinyFastCache g_fast_cache[TINY_NUM_CLASSES]; extern __thread TinyQuickSlot g_tls_quick[TINY_NUM_CLASSES]; // Frontend fill target extern _Atomic uint32_t g_frontend_fill_target[TINY_NUM_CLASSES]; // Debug counters #if HAKMEM_DEBUG_COUNTERS extern uint64_t g_bump_hits[TINY_NUM_CLASSES]; extern uint64_t g_bump_arms[TINY_NUM_CLASSES]; extern uint64_t g_path_refill_calls[TINY_NUM_CLASSES]; extern uint64_t g_ultra_refill_calls[TINY_NUM_CLASSES]; #define HAK_PATHDBG_INC(arr, idx) do { if (g_path_debug_enabled) { (arr)[(idx)]++; } } while(0) #define HAK_ULTRADBG_INC(arr, idx) do { (arr)[(idx)]++; } while(0) extern int g_path_debug_enabled; #else #define HAK_PATHDBG_INC(arr, idx) do { (void)(idx); } while(0) #define HAK_ULTRADBG_INC(arr, idx) do { (void)(idx); } while(0) #endif // Tracepoint macros #ifndef HAK_TP1 #define HAK_TP1(name, idx) do { (void)(idx); } while(0) #endif // Forward declarations for functions used in this file static inline void* tiny_fast_pop(int class_idx); static inline int tiny_fast_push(int class_idx, void* ptr); static inline int tls_refill_from_tls_slab(int class_idx, TinyTLSList* tls, uint32_t want); static inline uint32_t sll_cap_for_class(int class_idx, uint32_t mag_cap); static SuperSlab* superslab_refill(int class_idx); static void* slab_data_start(SuperSlab* ss, int slab_idx); static inline uint8_t* tiny_slab_base_for(SuperSlab* ss, int slab_idx); static inline void ss_active_add(SuperSlab* ss, uint32_t n); static inline void ss_active_inc(SuperSlab* ss); static TinySlab* allocate_new_slab(int class_idx); static void move_to_full_list(int class_idx, struct TinySlab* target_slab); static int hak_tiny_find_free_block(TinySlab* slab); static void hak_tiny_set_used(TinySlab* slab, int block_idx); static inline int ultra_batch_for_class(int class_idx); static inline int 
ultra_sll_cap_for_class(int class_idx); // Note: tiny_small_mags_init_once and tiny_mag_init_if_needed are declared in hakmem_tiny_magazine.h static void eventq_push(int class_idx, uint32_t size); // Fast cache refill and take operation static inline void* tiny_fast_refill_and_take(int class_idx, TinyTLSList* tls) { void* direct = tiny_fast_pop(class_idx); if (direct) return direct; uint16_t cap = g_fast_cap[class_idx]; if (cap == 0) return NULL; uint16_t count = g_fast_count[class_idx]; uint16_t need = cap > count ? (uint16_t)(cap - count) : 0; if (need == 0) return NULL; uint32_t have = tls->count; if (have < need) { uint32_t want = need - have; uint32_t thresh = tls_list_refill_threshold(tls); if (want < thresh) want = thresh; tls_refill_from_tls_slab(class_idx, tls, want); } void* batch_head = NULL; void* batch_tail = NULL; uint32_t taken = tls_list_bulk_take(tls, need, &batch_head, &batch_tail); if (taken == 0u || batch_head == NULL) { return NULL; } void* ret = batch_head; void* node = *(void**)ret; uint32_t remaining = (taken > 0u) ? 
(taken - 1u) : 0u; while (node && remaining > 0u) { void* next = *(void**)node; if (tiny_fast_push(class_idx, node)) { node = next; remaining--; } else { // Push failed, return remaining to TLS tls_list_bulk_put(tls, node, batch_tail, remaining); return ret; } } return ret; } // Quick slot refill from SLL static inline int quick_refill_from_sll(int class_idx) { if (!g_tls_sll_enable) return 0; TinyQuickSlot* qs = &g_tls_quick[class_idx]; int room = (int)(QUICK_CAP - qs->top); if (room <= 0) return 0; // Limit burst to a tiny constant to reduce loop/branches if (room > 2) room = 2; int filled = 0; while (room > 0) { void* head = g_tls_sll_head[class_idx]; if (!head) break; g_tls_sll_head[class_idx] = *(void**)head; if (g_tls_sll_count[class_idx] > 0) g_tls_sll_count[class_idx]--; qs->items[qs->top++] = head; room--; filled++; } if (filled > 0) HAK_TP1(quick_refill_sll, class_idx); if (filled > 0) { extern unsigned long long g_front_quick_hit[]; g_front_quick_hit[class_idx]++; } return filled; } // Quick slot refill from magazine static inline int quick_refill_from_mag(int class_idx) { TinyTLSMag* mag = &g_tls_mags[class_idx]; if (mag->top <= 0) return 0; TinyQuickSlot* qs = &g_tls_quick[class_idx]; int room = (int)(QUICK_CAP - qs->top); if (room <= 0) return 0; // Only a single transfer from magazine to minimize overhead int take = (mag->top > 0 && room > 0) ? 
1 : 0; for (int i = 0; i < take; i++) { qs->items[qs->top++] = mag->items[--mag->top].ptr; } if (take > 0) HAK_TP1(quick_refill_mag, class_idx); return take; } // P0 optimization: Batch refill (enabled by default, set HAKMEM_TINY_P0_BATCH_REFILL=0 to disable) #ifndef HAKMEM_TINY_P0_BATCH_REFILL #define HAKMEM_TINY_P0_BATCH_REFILL 1 // Enable P0 by default (verified +5.16% improvement) #endif #if HAKMEM_TINY_P0_BATCH_REFILL #include "hakmem_tiny_refill_p0.inc.h" // Alias for compatibility #define sll_refill_small_from_ss sll_refill_batch_from_ss #endif // Refill a few nodes directly into TLS SLL from TLS-cached SuperSlab (owner-thread only) // Note: If HAKMEM_TINY_P0_BATCH_REFILL is enabled, sll_refill_batch_from_ss is used instead #if !HAKMEM_TINY_P0_BATCH_REFILL // Phase 6-1.7: Export for box refactor (Box 5 needs access from hakmem.c) // Note: Force non-inline to provide linkable definition for LTO #ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR __attribute__((noinline)) int sll_refill_small_from_ss(int class_idx, int max_take) { #else static inline int sll_refill_small_from_ss(int class_idx, int max_take) { #endif if (!g_use_superslab || max_take <= 0) return 0; TinyTLSSlab* tls = &g_tls_slabs[class_idx]; if (!tls->ss) { // Try to obtain a SuperSlab for this class if (superslab_refill(class_idx) == NULL) return 0; } TinySlabMeta* meta = tls->meta; if (!meta) return 0; // Class 4/5/6/7 special-case: simple batch refill (favor linear carve, minimal branching) // Optional gate for class3 via env: HAKMEM_TINY_SIMPLE_REFILL_C3=1 static int g_simple_c3 = -1; if (__builtin_expect(g_simple_c3 == -1, 0)) { const char* e = getenv("HAKMEM_TINY_SIMPLE_REFILL_C3"); g_simple_c3 = (e && *e && *e != '0') ? 1 : 0; } if (__builtin_expect(class_idx >= 4 || (class_idx == 3 && g_simple_c3), 0)) { uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP); int room = (int)sll_cap - (int)g_tls_sll_count[class_idx]; if (room <= 0) return 0; int take = max_take < room ? 
max_take : room; int taken = 0; size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0); for (; taken < take;) { // Linear first (LIKELY for class7) if (__builtin_expect(meta->freelist == NULL && meta->used < meta->capacity, 1)) { uint8_t* base = tiny_slab_base_for(tls->ss, tls->slab_idx); void* p = (void*)(base + ((size_t)meta->used * bs)); meta->used++; *(void**)p = g_tls_sll_head[class_idx]; g_tls_sll_head[class_idx] = p; g_tls_sll_count[class_idx]++; ss_active_inc(tls->ss); taken++; continue; } // Freelist fallback if (__builtin_expect(meta->freelist != NULL, 0)) { void* p = meta->freelist; meta->freelist = *(void**)p; meta->used++; *(void**)p = g_tls_sll_head[class_idx]; g_tls_sll_head[class_idx] = p; g_tls_sll_count[class_idx]++; ss_active_inc(tls->ss); taken++; continue; } // Need another slab with space if (__builtin_expect(superslab_refill(class_idx) == NULL, 0)) break; meta = tls->meta; // refresh after refill } return taken; } // Compute how many we can actually push into SLL without overflow uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP); int room = (int)sll_cap - (int)g_tls_sll_count[class_idx]; if (room <= 0) return 0; int take = max_take < room ? max_take : room; int taken = 0; size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 
1 : 0); while (taken < take) { void* p = NULL; if (__builtin_expect(meta->freelist != NULL, 0)) { p = meta->freelist; meta->freelist = *(void**)p; meta->used++; // Track active blocks reserved into TLS SLL ss_active_inc(tls->ss); } else if (__builtin_expect(meta->used < meta->capacity, 1)) { void* slab_start = tiny_slab_base_for(tls->ss, tls->slab_idx); p = (char*)slab_start + ((size_t)meta->used * bs); meta->used++; // Track active blocks reserved into TLS SLL ss_active_inc(tls->ss); } else { // Move to another slab with space if (superslab_refill(class_idx) == NULL) break; meta = tls->meta; // refresh after refill continue; } if (!p) break; *(void**)p = g_tls_sll_head[class_idx]; g_tls_sll_head[class_idx] = p; g_tls_sll_count[class_idx]++; taken++; } return taken; } #endif // !HAKMEM_TINY_P0_BATCH_REFILL // Ultra-Bump TLS shadow try: returns pointer when a TLS bump window is armed // or can be armed by reserving a small chunk from the current SuperSlab meta. static inline void* superslab_tls_bump_fast(int class_idx) { if (!g_ultra_bump_shadow || !g_use_superslab) return NULL; // Serve from armed TLS window if present uint8_t* cur = g_tls_bcur[class_idx]; if (__builtin_expect(cur != NULL, 0)) { uint8_t* end = g_tls_bend[class_idx]; size_t bs = g_tiny_class_sizes[class_idx]; if (__builtin_expect(cur <= end - bs, 1)) { g_tls_bcur[class_idx] = cur + bs; #if HAKMEM_DEBUG_COUNTERS g_bump_hits[class_idx]++; #endif HAK_TP1(bump_hit, class_idx); return (void*)cur; } // Window exhausted g_tls_bcur[class_idx] = NULL; g_tls_bend[class_idx] = NULL; } // Arm a new window from TLS-cached SuperSlab meta (linear mode only) TinyTLSSlab* tls = &g_tls_slabs[class_idx]; TinySlabMeta* meta = tls->meta; if (!meta || meta->freelist != NULL) return NULL; // linear mode only uint16_t used = meta->used; uint16_t cap = meta->capacity; if (used >= cap) return NULL; uint32_t avail = (uint32_t)cap - (uint32_t)used; uint32_t chunk = (g_bump_chunk > 0 ? 
(uint32_t)g_bump_chunk : 1u); if (chunk > avail) chunk = avail; size_t bs = g_tiny_class_sizes[tls->ss->size_class] + ((tls->ss->size_class != 7) ? 1 : 0); uint8_t* base = tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx); uint8_t* start = base + ((size_t)used * bs); // Reserve the chunk once in header (keeps remote-free accounting valid) meta->used = (uint16_t)(used + (uint16_t)chunk); // Account all reserved blocks as active in SuperSlab ss_active_add(tls->ss, chunk); #if HAKMEM_DEBUG_COUNTERS g_bump_arms[class_idx]++; #endif g_tls_bcur[class_idx] = start + bs; g_tls_bend[class_idx] = base + (size_t)chunk * bs; return (void*)start; } // Frontend: refill FastCache directly from TLS active slab (owner-only) or adopt a slab static inline int frontend_refill_fc(int class_idx) { TinyFastCache* fc = &g_fast_cache[class_idx]; int room = TINY_FASTCACHE_CAP - fc->top; if (room <= 0) return 0; // Target refill (conservative for safety) int need = ultra_batch_for_class(class_idx); int tgt = atomic_load_explicit(&g_frontend_fill_target[class_idx], memory_order_relaxed); if (tgt > 0 && tgt < need) need = tgt; if (need > room) need = room; if (need <= 0) return 0; int filled = 0; // Step A: First bulk transfer from TLS SLL to FastCache (lock-free, O(1)) if (g_tls_sll_enable) { while (need > 0 && g_tls_sll_head[class_idx] != NULL) { void* h = g_tls_sll_head[class_idx]; g_tls_sll_head[class_idx] = *(void**)h; if (g_tls_sll_count[class_idx] > 0) g_tls_sll_count[class_idx]--; // underflow prevention fc->items[fc->top++] = h; need--; filled++; if (fc->top >= TINY_FASTCACHE_CAP) break; } } // Step B: If still not enough, transfer from TLS Magazine (lock-free, O(1)) if (need > 0) { tiny_small_mags_init_once(); if (class_idx > 3) tiny_mag_init_if_needed(class_idx); TinyTLSMag* mag = &g_tls_mags[class_idx]; while (need > 0 && mag->top > 0 && fc->top < TINY_FASTCACHE_CAP) { void* p = mag->items[--mag->top].ptr; fc->items[fc->top++] = p; need--; filled++; } } 
if (filled > 0) { eventq_push(class_idx, (uint32_t)g_tiny_class_sizes[class_idx]); HAK_PATHDBG_INC(g_path_refill_calls, class_idx); return 1; } return 0; } // Move up to 'n' items from TLS magazine to SLL if SLL has room (lock-free). static inline int bulk_mag_to_sll_if_room(int class_idx, TinyTLSMag* mag, int n) { if (g_tls_list_enable) return 0; if (!g_tls_sll_enable || n <= 0) return 0; uint32_t cap = sll_cap_for_class(class_idx, (uint32_t)mag->cap); uint32_t have = g_tls_sll_count[class_idx]; if (have >= cap) return 0; int room = (int)(cap - have); int avail = mag->top; // Hysteresis: avoid frequent tiny moves; take at least 8 if possible int take = (n < room ? n : room); if (take < 8 && avail >= 8 && room >= 8) take = 8; if (take > avail) take = avail; if (take <= 0) return 0; for (int i = 0; i < take; i++) { void* p = mag->items[--mag->top].ptr; *(void**)p = g_tls_sll_head[class_idx]; g_tls_sll_head[class_idx] = p; g_tls_sll_count[class_idx]++; } HAK_PATHDBG_INC(g_path_refill_calls, class_idx); return take; } // Ultra-mode (SLL-only) refill operation static inline void ultra_refill_sll(int class_idx) { int need = ultra_batch_for_class(class_idx); HAK_ULTRADBG_INC(g_ultra_refill_calls, class_idx); int sll_cap = ultra_sll_cap_for_class(class_idx); pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m; pthread_mutex_lock(lock); TinySlab* slab = g_tiny_pool.free_slabs[class_idx]; if (!slab) { slab = allocate_new_slab(class_idx); if (slab) { slab->next = g_tiny_pool.free_slabs[class_idx]; g_tiny_pool.free_slabs[class_idx] = slab; } } if (slab) { size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 
1 : 0); int remaining = need; while (remaining > 0 && slab->free_count > 0) { if ((int)g_tls_sll_count[class_idx] >= sll_cap) break; int first = hak_tiny_find_free_block(slab); if (first < 0) break; // Allocate the first found block hak_tiny_set_used(slab, first); slab->free_count--; void* p0 = (char*)slab->base + ((size_t)first * bs); *(void**)p0 = g_tls_sll_head[class_idx]; g_tls_sll_head[class_idx] = p0; g_tls_sll_count[class_idx]++; remaining--; // Try to allocate more from the same word to amortize scanning int word_idx = first / 64; uint64_t used = slab->bitmap[word_idx]; uint64_t free_bits = ~used; while (remaining > 0 && free_bits && slab->free_count > 0) { if ((int)g_tls_sll_count[class_idx] >= sll_cap) break; int bit_idx = __builtin_ctzll(free_bits); int block_idx = word_idx * 64 + bit_idx; hak_tiny_set_used(slab, block_idx); slab->free_count--; void* p = (char*)slab->base + ((size_t)block_idx * bs); *(void**)p = g_tls_sll_head[class_idx]; g_tls_sll_head[class_idx] = p; g_tls_sll_count[class_idx]++; remaining--; // Update free_bits for next iteration used = slab->bitmap[word_idx]; free_bits = ~used; } if (slab->free_count == 0) { move_to_full_list(class_idx, slab); break; } } } pthread_mutex_unlock(lock); } #endif // HAKMEM_TINY_REFILL_INC_H