hakmem/core/hakmem_tiny_refill.inc.h
Moe Charm (CI) d9b334b968 Tiny: Enable P0 batch refill by default + docs and task update
Summary
- Default P0 ON: build-time HAKMEM_TINY_P0_BATCH_REFILL=1 remains; the runtime gate now defaults to ON
  (HAKMEM_TINY_P0_ENABLE unset or not '0'). The kill switch is preserved via HAKMEM_TINY_P0_DISABLE=1
  (see the sketch after this list).
- Fix critical bug: after a freelist→SLL batch splice, increment TinySlabMeta::used by 'from_freelist'
  to mirror non-P0 behavior (prevents under-accounting and keeps follow-on carve invariants from breaking).
- Add low-overhead A/B toggles for triage: HAKMEM_TINY_P0_NO_DRAIN (skip remote drain),
  HAKMEM_TINY_P0_LOG (emit [P0_COUNTER_OK/MISMATCH] based on total_active_blocks delta).
- Keep linear carve fail-fast guards across simple/general/TLS-bump paths.
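
For reference, a minimal sketch of the gate decision described above (assumes <stdlib.h>; the
HAKMEM_TINY_P0_ENABLE check mirrors the one in hakmem_tiny_refill.inc.h, while the
HAKMEM_TINY_P0_DISABLE handling shown here is an assumption, since that switch is honored outside
this file):

    /* Hedged sketch only, not the actual implementation. */
    static int tiny_p0_runtime_enabled(void) {
        static int cached = -1;                                   /* resolved once per process */
        if (cached == -1) {
            const char* en  = getenv("HAKMEM_TINY_P0_ENABLE");    /* unset or != "0" => ON */
            const char* dis = getenv("HAKMEM_TINY_P0_DISABLE");   /* "1" => forced OFF (assumed) */
            cached = (en && *en == '0') ? 0 : 1;
            if (dis && *dis == '1') cached = 0;
        }
        return cached;
    }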

Perf (1T, 100k×256B)
- P0 OFF: ~2.73M ops/s (stable)
- P0 ON (no drain): ~2.45M ops/s
- P0 ON (normal drain): ~2.76M ops/s (fastest)

Known
- Rare [P0_COUNTER_MISMATCH] warnings persist (non-fatal). Continue auditing active/used
  balance around batch freelist splice and remote drain splice.

Docs
- Add docs/TINY_P0_BATCH_REFILL.md (runtime switches, behavior, perf notes).
- Update CURRENT_TASK.md with Tiny P0 status (default ON) and next steps.
2025-11-09 22:12:34 +09:00

// hakmem_tiny_refill.inc.h
// Phase 2D-1: Hot-path inline functions - Refill operations
//
// This file contains hot-path refill functions for various allocation tiers.
// These functions are extracted from hakmem_tiny.c to improve maintainability and
// reduce the main file size by approximately 280 lines.
//
// Functions handle:
// - tiny_fast_refill_and_take: Fast cache refill (lines 584-622, 39 lines)
// - quick_refill_from_sll: Quick slot refill from SLL (lines 918-936, 19 lines)
// - quick_refill_from_mag: Quick slot refill from magazine (lines 938-949, 12 lines)
// - sll_refill_small_from_ss: SLL refill from superslab (lines 952-996, 45 lines)
// - superslab_tls_bump_fast: TLS bump allocation (lines 1016-1060, 45 lines)
// - frontend_refill_fc: Frontend fast cache refill (lines 1063-1106, 44 lines)
// - bulk_mag_to_sll_if_room: Magazine to SLL bulk transfer (lines 1133-1154, 22 lines)
// - ultra_refill_sll: Ultra-mode SLL refill (lines 1178-1233, 56 lines)
#ifndef HAKMEM_TINY_REFILL_INC_H
#define HAKMEM_TINY_REFILL_INC_H
#include "hakmem_tiny.h"
#include "hakmem_tiny_superslab.h"
#include "hakmem_tiny_magazine.h"
#include "hakmem_tiny_tls_list.h"
#include <stdint.h>
#include <stdatomic.h>
#include <stdio.h>
#include <pthread.h>
#include <stdlib.h>
// External declarations for TLS variables and globals
extern int g_fast_enable;
extern uint16_t g_fast_cap[TINY_NUM_CLASSES];
extern __thread void* g_fast_head[TINY_NUM_CLASSES];
extern __thread uint16_t g_fast_count[TINY_NUM_CLASSES];
extern int g_tls_list_enable;
extern int g_tls_sll_enable;
extern __thread void* g_tls_sll_head[TINY_NUM_CLASSES];
extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];
extern int g_use_superslab;
extern int g_ultra_bump_shadow;
extern int g_bump_chunk;
extern __thread uint8_t* g_tls_bcur[TINY_NUM_CLASSES];
extern __thread uint8_t* g_tls_bend[TINY_NUM_CLASSES];
extern int g_fastcache_enable;
extern int g_quick_enable;
// External variable declarations
// Note: TinyTLSSlab, TinyFastCache, and TinyQuickSlot types must be defined before including this file
extern __thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES];
extern TinyPool g_tiny_pool;
extern PaddedLock g_tiny_class_locks[TINY_NUM_CLASSES];
extern __thread TinyFastCache g_fast_cache[TINY_NUM_CLASSES];
extern __thread TinyQuickSlot g_tls_quick[TINY_NUM_CLASSES];
// Frontend fill target
extern _Atomic uint32_t g_frontend_fill_target[TINY_NUM_CLASSES];
// Debug counters
#if HAKMEM_DEBUG_COUNTERS
extern uint64_t g_bump_hits[TINY_NUM_CLASSES];
extern uint64_t g_bump_arms[TINY_NUM_CLASSES];
extern uint64_t g_path_refill_calls[TINY_NUM_CLASSES];
extern uint64_t g_ultra_refill_calls[TINY_NUM_CLASSES];
#define HAK_PATHDBG_INC(arr, idx) do { if (g_path_debug_enabled) { (arr)[(idx)]++; } } while(0)
#define HAK_ULTRADBG_INC(arr, idx) do { (arr)[(idx)]++; } while(0)
extern int g_path_debug_enabled;
#else
#define HAK_PATHDBG_INC(arr, idx) do { (void)(idx); } while(0)
#define HAK_ULTRADBG_INC(arr, idx) do { (void)(idx); } while(0)
#endif
// Tracepoint macros
#ifndef HAK_TP1
#define HAK_TP1(name, idx) do { (void)(idx); } while(0)
#endif
// Forward declarations for functions used in this file
static inline void* tiny_fast_pop(int class_idx);
static inline int tiny_fast_push(int class_idx, void* ptr);
static inline int tls_refill_from_tls_slab(int class_idx, TinyTLSList* tls, uint32_t want);
static inline uint32_t sll_cap_for_class(int class_idx, uint32_t mag_cap);
static SuperSlab* superslab_refill(int class_idx);
static void* slab_data_start(SuperSlab* ss, int slab_idx);
static inline uint8_t* tiny_slab_base_for(SuperSlab* ss, int slab_idx);
static inline void ss_active_add(SuperSlab* ss, uint32_t n);
static inline void ss_active_inc(SuperSlab* ss);
static TinySlab* allocate_new_slab(int class_idx);
static void move_to_full_list(int class_idx, struct TinySlab* target_slab);
static int hak_tiny_find_free_block(TinySlab* slab);
static void hak_tiny_set_used(TinySlab* slab, int block_idx);
static inline int ultra_batch_for_class(int class_idx);
static inline int ultra_sll_cap_for_class(int class_idx);
// Note: tiny_small_mags_init_once and tiny_mag_init_if_needed are declared in hakmem_tiny_magazine.h
static void eventq_push(int class_idx, uint32_t size);
// Fast cache refill and take operation
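// Flow: pop directly from the fast cache; on miss, top up the TLS list from the TLS slab
// if it is short, bulk-take up to 'need' blocks, hand the first block to the caller, and
// push the remainder into the fast cache (a failed push returns the unconsumed tail to the
// TLS list).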
static inline void* tiny_fast_refill_and_take(int class_idx, TinyTLSList* tls) {
    void* direct = tiny_fast_pop(class_idx);
    if (direct) return direct;
    uint16_t cap = g_fast_cap[class_idx];
    if (cap == 0) return NULL;
    uint16_t count = g_fast_count[class_idx];
    uint16_t need = cap > count ? (uint16_t)(cap - count) : 0;
    if (need == 0) return NULL;
    uint32_t have = tls->count;
    if (have < need) {
        uint32_t want = need - have;
        uint32_t thresh = tls_list_refill_threshold(tls);
        if (want < thresh) want = thresh;
        tls_refill_from_tls_slab(class_idx, tls, want);
    }
    void* batch_head = NULL;
    void* batch_tail = NULL;
    uint32_t taken = tls_list_bulk_take(tls, need, &batch_head, &batch_tail);
    if (taken == 0u || batch_head == NULL) {
        return NULL;
    }
    void* ret = batch_head;
    void* node = *(void**)ret;
    uint32_t remaining = (taken > 0u) ? (taken - 1u) : 0u;
    while (node && remaining > 0u) {
        void* next = *(void**)node;
        if (tiny_fast_push(class_idx, node)) {
            node = next;
            remaining--;
        } else {
            // Push failed, return remaining to TLS
            tls_list_bulk_put(tls, node, batch_tail, remaining);
            return ret;
        }
    }
    return ret;
}
// Quick slot refill from SLL
static inline int quick_refill_from_sll(int class_idx) {
    if (!g_tls_sll_enable) return 0;
    TinyQuickSlot* qs = &g_tls_quick[class_idx];
    int room = (int)(QUICK_CAP - qs->top);
    if (room <= 0) return 0;
    // Limit burst to a tiny constant to reduce loop/branches
    if (room > 2) room = 2;
    int filled = 0;
    while (room > 0) {
        void* head = g_tls_sll_head[class_idx];
        if (!head) break;
        g_tls_sll_head[class_idx] = *(void**)head;
        if (g_tls_sll_count[class_idx] > 0) g_tls_sll_count[class_idx]--;
        qs->items[qs->top++] = head;
        room--; filled++;
    }
    if (filled > 0) HAK_TP1(quick_refill_sll, class_idx);
    if (filled > 0) {
        extern unsigned long long g_front_quick_hit[];
        g_front_quick_hit[class_idx]++;
    }
    return filled;
}
// Quick slot refill from magazine
static inline int quick_refill_from_mag(int class_idx) {
    TinyTLSMag* mag = &g_tls_mags[class_idx];
    if (mag->top <= 0) return 0;
    TinyQuickSlot* qs = &g_tls_quick[class_idx];
    int room = (int)(QUICK_CAP - qs->top);
    if (room <= 0) return 0;
    // Only a single transfer from magazine to minimize overhead
    int take = (mag->top > 0 && room > 0) ? 1 : 0;
    for (int i = 0; i < take; i++) { qs->items[qs->top++] = mag->items[--mag->top].ptr; }
    if (take > 0) HAK_TP1(quick_refill_mag, class_idx);
    return take;
}
// P0 optimization: batch refill, dispatched through a runtime gate for A/B testing.
// - Default is ON; set HAKMEM_TINY_P0_ENABLE=0 to disable (see the gate below).
#include "hakmem_tiny_refill_p0.inc.h"
// Debug helper: verify linear carve stays within slab usable bytes (Fail-Fast)
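// The guard checks that (carved + reserve) * stride still fits within the slab's usable
// bytes; release builds (HAKMEM_BUILD_RELEASE) compile it down to a no-op.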
static inline int tiny_linear_carve_guard(TinyTLSSlab* tls,
                                          TinySlabMeta* meta,
                                          size_t stride,
                                          uint32_t reserve,
                                          const char* stage) {
#if HAKMEM_BUILD_RELEASE
    (void)tls; (void)meta; (void)stride; (void)reserve; (void)stage;
    return 1;
#else
    if (!tls) return 0;
    size_t usable = (tls->slab_idx == 0)
        ? SUPERSLAB_SLAB0_USABLE_SIZE
        : SUPERSLAB_SLAB_USABLE_SIZE;
    size_t needed = ((size_t)meta->carved + (size_t)reserve) * stride;
    if (__builtin_expect(needed > usable, 0)) {
        fprintf(stderr,
                "[LINEAR_GUARD] stage=%s cls=%d slab=%d carved=%u used=%u cap=%u "
                "stride=%zu reserve=%u needed=%zu usable=%zu\n",
                stage ? stage : "linear",
                tls->ss ? tls->ss->size_class : -1,
                tls->slab_idx,
                meta ? meta->carved : 0u,
                meta ? meta->used : 0u,
                meta ? meta->capacity : 0u,
                stride,
                reserve,
                needed,
                usable);
        return 0;
    }
    return 1;
#endif
}
// Refill a few nodes directly into TLS SLL from TLS-cached SuperSlab (owner-thread only)
// Note: If HAKMEM_TINY_P0_BATCH_REFILL is enabled, sll_refill_batch_from_ss is used instead
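// This fallback is compiled only when the build-time flag HAKMEM_TINY_P0_BATCH_REFILL is 0;
// even then, the runtime HAKMEM_TINY_P0_ENABLE check below can still delegate to the batch path.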
#if !HAKMEM_TINY_P0_BATCH_REFILL
// Phase 6-1.7: Export for box refactor (Box 5 needs access from hakmem.c)
// Note: Force non-inline to provide linkable definition for LTO
#ifdef HAKMEM_TINY_PHASE6_BOX_REFACTOR
__attribute__((noinline)) int sll_refill_small_from_ss(int class_idx, int max_take) {
#else
static inline int sll_refill_small_from_ss(int class_idx, int max_take) {
#endif
    if (!g_use_superslab || max_take <= 0) return 0;
    // Runtime A/B: when P0 is enabled, delegate to the batch refill path
    do {
        // Default: ON; set HAKMEM_TINY_P0_ENABLE=0 to turn it off explicitly
        static int g_p0_enable = -1;
        if (__builtin_expect(g_p0_enable == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_P0_ENABLE");
            // Disabled only when the variable is '0'; anything else (including unset) enables it
            g_p0_enable = (e && *e && *e == '0') ? 0 : 1;
        }
        if (__builtin_expect(g_p0_enable, 1)) {
            return sll_refill_batch_from_ss(class_idx, max_take);
        }
    } while (0);
    TinyTLSSlab* tls = &g_tls_slabs[class_idx];
    if (!tls->ss) {
        // Try to obtain a SuperSlab for this class
        if (superslab_refill(class_idx) == NULL) return 0;
    }
    TinySlabMeta* meta = tls->meta;
    if (!meta) return 0;
    // Class 4/5/6/7 special-case: simple batch refill (favor linear carve, minimal branching)
    // Optional gate for class 3 via env: HAKMEM_TINY_SIMPLE_REFILL_C3=1
    static int g_simple_c3 = -1;
    if (__builtin_expect(g_simple_c3 == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_SIMPLE_REFILL_C3");
        g_simple_c3 = (e && *e && *e != '0') ? 1 : 0;
    }
    if (__builtin_expect(class_idx >= 4 || (class_idx == 3 && g_simple_c3), 0)) {
        uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP);
        int room = (int)sll_cap - (int)g_tls_sll_count[class_idx];
        if (room <= 0) return 0;
        int take = max_take < room ? max_take : room;
        int taken = 0;
        size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
        for (; taken < take;) {
            // Linear first (LIKELY for class7)
            if (__builtin_expect(meta->freelist == NULL && meta->carved < meta->capacity, 1)) {
                if (__builtin_expect(!tiny_linear_carve_guard(tls, meta, bs, 1, "simple"), 0)) {
                    abort();
                }
                uint8_t* base = tiny_slab_base_for(tls->ss, tls->slab_idx);
                void* p = (void*)(base + ((size_t)meta->carved * bs));
                meta->carved++;
                meta->used++;
                *(void**)p = g_tls_sll_head[class_idx];
                g_tls_sll_head[class_idx] = p;
                g_tls_sll_count[class_idx]++;
                ss_active_inc(tls->ss);
                taken++;
                continue;
            }
            // Freelist fallback
            if (__builtin_expect(meta->freelist != NULL, 0)) {
                void* p = meta->freelist;
                meta->freelist = *(void**)p;
                meta->used++;
                *(void**)p = g_tls_sll_head[class_idx];
                g_tls_sll_head[class_idx] = p;
                g_tls_sll_count[class_idx]++;
                ss_active_inc(tls->ss);
                taken++;
                continue;
            }
            // Need another slab with space
            if (__builtin_expect(superslab_refill(class_idx) == NULL, 0)) break;
            meta = tls->meta; // refresh after refill
        }
        return taken;
    }
    // Compute how many we can actually push into SLL without overflow
    uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP);
    int room = (int)sll_cap - (int)g_tls_sll_count[class_idx];
    if (room <= 0) return 0;
    int take = max_take < room ? max_take : room;
    int taken = 0;
    size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
    while (taken < take) {
        void* p = NULL;
        if (__builtin_expect(meta->freelist != NULL, 0)) {
            p = meta->freelist; meta->freelist = *(void**)p; meta->used++;
            // Track active blocks reserved into TLS SLL
            ss_active_inc(tls->ss);
        } else if (__builtin_expect(meta->carved < meta->capacity, 1)) {
            if (__builtin_expect(!tiny_linear_carve_guard(tls, meta, bs, 1, "general"), 0)) {
                abort();
            }
            void* slab_start = tiny_slab_base_for(tls->ss, tls->slab_idx);
            p = (char*)slab_start + ((size_t)meta->carved * bs);
            meta->carved++;
            meta->used++;
            // Track active blocks reserved into TLS SLL
            ss_active_inc(tls->ss);
        } else {
            // Move to another slab with space
            if (superslab_refill(class_idx) == NULL) break;
            meta = tls->meta; // refresh after refill
            continue;
        }
        if (!p) break;
        *(void**)p = g_tls_sll_head[class_idx];
        g_tls_sll_head[class_idx] = p;
        g_tls_sll_count[class_idx]++;
        taken++;
    }
    return taken;
}
#endif // !HAKMEM_TINY_P0_BATCH_REFILL
// Ultra-Bump TLS shadow try: returns pointer when a TLS bump window is armed
// or can be armed by reserving a small chunk from the current SuperSlab meta.
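// Window layout: arming reserves 'chunk' blocks in one step, returns the first block
// immediately, and leaves [start + bs, start + chunk * bs) in g_tls_bcur/g_tls_bend so
// later calls can bump-allocate without touching the slab meta.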
static inline void* superslab_tls_bump_fast(int class_idx) {
    if (!g_ultra_bump_shadow || !g_use_superslab) return NULL;
    // Serve from armed TLS window if present
    uint8_t* cur = g_tls_bcur[class_idx];
    if (__builtin_expect(cur != NULL, 0)) {
        uint8_t* end = g_tls_bend[class_idx];
        size_t bs = g_tiny_class_sizes[class_idx];
        if (__builtin_expect(cur <= end - bs, 1)) {
            g_tls_bcur[class_idx] = cur + bs;
#if HAKMEM_DEBUG_COUNTERS
            g_bump_hits[class_idx]++;
#endif
            HAK_TP1(bump_hit, class_idx);
            return (void*)cur;
        }
        // Window exhausted
        g_tls_bcur[class_idx] = NULL;
        g_tls_bend[class_idx] = NULL;
    }
    // Arm a new window from TLS-cached SuperSlab meta (linear mode only)
    TinyTLSSlab* tls = &g_tls_slabs[class_idx];
    TinySlabMeta* meta = tls->meta;
    if (!meta || meta->freelist != NULL) return NULL; // linear mode only
    // Use monotonic 'carved' for window arming
    uint16_t carved = meta->carved;
    uint16_t cap = meta->capacity;
    if (carved >= cap) return NULL;
    uint32_t avail = (uint32_t)cap - (uint32_t)carved;
    uint32_t chunk = (g_bump_chunk > 0 ? (uint32_t)g_bump_chunk : 1u);
    if (chunk > avail) chunk = avail;
    size_t bs = g_tiny_class_sizes[tls->ss->size_class] + ((tls->ss->size_class != 7) ? 1 : 0);
    uint8_t* base = tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx);
    if (__builtin_expect(!tiny_linear_carve_guard(tls, meta, bs, chunk, "tls_bump"), 0)) {
        abort();
    }
    uint8_t* start = base + ((size_t)carved * bs);
    // Reserve the chunk: advance carved and used accordingly
    meta->carved = (uint16_t)(carved + (uint16_t)chunk);
    meta->used = (uint16_t)(meta->used + (uint16_t)chunk);
    // Account all reserved blocks as active in SuperSlab
    ss_active_add(tls->ss, chunk);
#if HAKMEM_DEBUG_COUNTERS
    g_bump_arms[class_idx]++;
#endif
    g_tls_bcur[class_idx] = start + bs;
    g_tls_bend[class_idx] = start + (size_t)chunk * bs;
    return (void*)start;
}
// Frontend: refill the FastCache from thread-local stock (owner-thread only): TLS SLL first, then the TLS magazine
static inline int frontend_refill_fc(int class_idx) {
    TinyFastCache* fc = &g_fast_cache[class_idx];
    int room = TINY_FASTCACHE_CAP - fc->top;
    if (room <= 0) return 0;
    // Target refill (conservative for safety)
    int need = ultra_batch_for_class(class_idx);
    int tgt = atomic_load_explicit(&g_frontend_fill_target[class_idx], memory_order_relaxed);
    if (tgt > 0 && tgt < need) need = tgt;
    if (need > room) need = room;
    if (need <= 0) return 0;
    int filled = 0;
    // Step A: bulk-transfer from the TLS SLL into the FastCache first (lock-free, O(1) per item)
    if (g_tls_sll_enable) {
        while (need > 0 && g_tls_sll_head[class_idx] != NULL) {
            void* h = g_tls_sll_head[class_idx];
            g_tls_sll_head[class_idx] = *(void**)h;
            if (g_tls_sll_count[class_idx] > 0) g_tls_sll_count[class_idx]--; // underflow prevention
            fc->items[fc->top++] = h;
            need--; filled++;
            if (fc->top >= TINY_FASTCACHE_CAP) break;
        }
    }
    // Step B: if still short, transfer from the TLS magazine (lock-free, O(1) per item)
    if (need > 0) {
        tiny_small_mags_init_once();
        if (class_idx > 3) tiny_mag_init_if_needed(class_idx);
        TinyTLSMag* mag = &g_tls_mags[class_idx];
        while (need > 0 && mag->top > 0 && fc->top < TINY_FASTCACHE_CAP) {
            void* p = mag->items[--mag->top].ptr;
            fc->items[fc->top++] = p;
            need--; filled++;
        }
    }
    if (filled > 0) {
        eventq_push(class_idx, (uint32_t)g_tiny_class_sizes[class_idx]);
        HAK_PATHDBG_INC(g_path_refill_calls, class_idx);
        return 1;
    }
    return 0;
}
// Move up to 'n' items from TLS magazine to SLL if SLL has room (lock-free).
static inline int bulk_mag_to_sll_if_room(int class_idx, TinyTLSMag* mag, int n) {
    if (g_tls_list_enable) return 0;
    if (!g_tls_sll_enable || n <= 0) return 0;
    uint32_t cap = sll_cap_for_class(class_idx, (uint32_t)mag->cap);
    uint32_t have = g_tls_sll_count[class_idx];
    if (have >= cap) return 0;
    int room = (int)(cap - have);
    int avail = mag->top;
    // Hysteresis: avoid frequent tiny moves; take at least 8 if possible
    int take = (n < room ? n : room);
    if (take < 8 && avail >= 8 && room >= 8) take = 8;
    if (take > avail) take = avail;
    if (take <= 0) return 0;
    for (int i = 0; i < take; i++) {
        void* p = mag->items[--mag->top].ptr;
        *(void**)p = g_tls_sll_head[class_idx];
        g_tls_sll_head[class_idx] = p;
        g_tls_sll_count[class_idx]++;
    }
    HAK_PATHDBG_INC(g_path_refill_calls, class_idx);
    return take;
}
// Ultra-mode (SLL-only) refill operation
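// Takes the per-class lock, adopts (or allocates) a shared TinySlab, then marks blocks used
// in its bitmap and links them into the TLS SLL; after the first hit it keeps scanning the
// same 64-bit bitmap word to amortize the find-free-block cost.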
static inline void ultra_refill_sll(int class_idx) {
    int need = ultra_batch_for_class(class_idx);
    HAK_ULTRADBG_INC(g_ultra_refill_calls, class_idx);
    int sll_cap = ultra_sll_cap_for_class(class_idx);
    pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m;
    pthread_mutex_lock(lock);
    TinySlab* slab = g_tiny_pool.free_slabs[class_idx];
    if (!slab) {
        slab = allocate_new_slab(class_idx);
        if (slab) {
            slab->next = g_tiny_pool.free_slabs[class_idx];
            g_tiny_pool.free_slabs[class_idx] = slab;
        }
    }
    if (slab) {
        size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
        int remaining = need;
        while (remaining > 0 && slab->free_count > 0) {
            if ((int)g_tls_sll_count[class_idx] >= sll_cap) break;
            int first = hak_tiny_find_free_block(slab);
            if (first < 0) break;
            // Allocate the first found block
            hak_tiny_set_used(slab, first);
            slab->free_count--;
            void* p0 = (char*)slab->base + ((size_t)first * bs);
            *(void**)p0 = g_tls_sll_head[class_idx];
            g_tls_sll_head[class_idx] = p0;
            g_tls_sll_count[class_idx]++;
            remaining--;
            // Try to allocate more from the same word to amortize scanning
            int word_idx = first / 64;
            uint64_t used = slab->bitmap[word_idx];
            uint64_t free_bits = ~used;
            while (remaining > 0 && free_bits && slab->free_count > 0) {
                if ((int)g_tls_sll_count[class_idx] >= sll_cap) break;
                int bit_idx = __builtin_ctzll(free_bits);
                int block_idx = word_idx * 64 + bit_idx;
                hak_tiny_set_used(slab, block_idx);
                slab->free_count--;
                void* p = (char*)slab->base + ((size_t)block_idx * bs);
                *(void**)p = g_tls_sll_head[class_idx];
                g_tls_sll_head[class_idx] = p;
                g_tls_sll_count[class_idx]++;
                remaining--;
                // Update free_bits for next iteration
                used = slab->bitmap[word_idx];
                free_bits = ~used;
            }
            if (slab->free_count == 0) {
                move_to_full_list(class_idx, slab);
                break;
            }
        }
    }
    pthread_mutex_unlock(lock);
}
#endif // HAKMEM_TINY_REFILL_INC_H