Files
hakmem/core/hakmem_tiny_alloc.inc
Moe Charm (CI) d9b334b968 Tiny: Enable P0 batch refill by default + docs and task update
Summary
- Default P0 ON: Build-time HAKMEM_TINY_P0_BATCH_REFILL=1 remains; runtime gate now defaults to ON
  (HAKMEM_TINY_P0_ENABLE unset or not '0'). Kill switch preserved via HAKMEM_TINY_P0_DISABLE=1.
- Fix critical bug: After freelist→SLL batch splice, increment TinySlabMeta::used by 'from_freelist'
  to mirror non-P0 behavior (prevents under-accounting and follow-on carve invariants from breaking).
- Add low-overhead A/B toggles for triage: HAKMEM_TINY_P0_NO_DRAIN (skip remote drain),
  HAKMEM_TINY_P0_LOG (emit [P0_COUNTER_OK/MISMATCH] based on total_active_blocks delta).
- Keep linear carve fail-fast guards across simple/general/TLS-bump paths.

Perf (1T, 100k×256B)
- P0 OFF: ~2.73M ops/s (stable)
- P0 ON (no drain): ~2.45M ops/s
- P0 ON (normal drain): ~2.76M ops/s (fastest)

Known
- Rare [P0_COUNTER_MISMATCH] warnings persist (non-fatal). Continue auditing active/used
  balance around batch freelist splice and remote drain splice.

Docs
- Add docs/TINY_P0_BATCH_REFILL.md (runtime switches, behavior, perf notes).
- Update CURRENT_TASK.md with Tiny P0 status (default ON) and next steps.
2025-11-09 22:12:34 +09:00

311 lines
13 KiB
C++
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// ============================================================================
// Step 3: Cold-path outline - Wrapper Context Handler
// ============================================================================
// Purpose: Handle allocations during wrapper calls (rare execution)
// Rationale: Avoid re-entrancy hazards with pthread locks during wrapper calls
// Step 3d: Force inline for readability without performance loss
__attribute__((always_inline))
static inline void* hak_tiny_alloc_wrapper(int class_idx) {
ROUTE_BEGIN(class_idx);
// Wrapper-context fast path: magazine-only (never take locks or refill)
tiny_small_mags_init_once();
if (__builtin_expect(class_idx > 3, 0)) tiny_mag_init_if_needed(class_idx);
TinyTLSMag* mag = &g_tls_mags[class_idx];
if (mag->top > 0) {
void* p = mag->items[--mag->top].ptr;
HAK_RET_ALLOC(class_idx, p);
}
// Try TLS active slabs (owner-only, lock-free)
TinySlab* tls = g_tls_active_slab_a[class_idx];
if (!(tls && tls->free_count > 0)) tls = g_tls_active_slab_b[class_idx];
if (tls && tls->free_count > 0) {
tiny_remote_drain_owner(tls);
if (tls->free_count > 0) {
int block_idx = hak_tiny_find_free_block(tls);
if (block_idx >= 0) {
hak_tiny_set_used(tls, block_idx);
tls->free_count--;
size_t bs = g_tiny_class_sizes[class_idx];
void* p = (char*)tls->base + (block_idx * bs);
HAK_RET_ALLOC(class_idx, p);
}
}
}
// Optional: attempt limited refill under trylock (no remote drain)
if (g_wrap_tiny_refill) {
pthread_mutex_t* lock = &g_tiny_class_locks[class_idx].m;
if (pthread_mutex_trylock(lock) == 0) {
TinySlab* slab = g_tiny_pool.free_slabs[class_idx];
if (slab && slab->free_count > 0) {
int room = mag->cap - mag->top;
if (room > 16) room = 16; // wrapper refill is small and quick
if (room > slab->free_count) room = slab->free_count;
if (room > 0) {
size_t bs = g_tiny_class_sizes[class_idx];
void* ret = NULL;
for (int i = 0; i < room; i++) {
int idx = hak_tiny_find_free_block(slab);
if (idx < 0) break;
hak_tiny_set_used(slab, idx);
slab->free_count--;
void* p = (char*)slab->base + (idx * bs);
if (i < room - 1) {
mag->items[mag->top].ptr = p;
mag->top++;
} else {
ret = p; // return one directly
}
}
if (slab->free_count == 0) {
move_to_full_list(class_idx, slab);
}
pthread_mutex_unlock(lock);
if (ret) { HAK_RET_ALLOC(class_idx, ret); }
} else {
pthread_mutex_unlock(lock);
}
} else {
pthread_mutex_unlock(lock);
}
}
}
return NULL; // empty → fallback to next allocator tier
}
void* hak_tiny_alloc(size_t size) {
#if !HAKMEM_BUILD_RELEASE
if (!g_tiny_initialized) hak_tiny_init();
#else
if (__builtin_expect(!g_tiny_initialized, 0)) {
hak_tiny_init();
}
#endif
// Default (safe): Avoid using Tiny during wrapper callsTLSガード or 関数)
// If HAKMEM_WRAP_TINY=1, allow Tiny even when called from wrapper.
#if !HAKMEM_BUILD_RELEASE
# if HAKMEM_WRAPPER_TLS_GUARD
if (!g_wrap_tiny_enabled && __builtin_expect(g_tls_in_wrapper != 0, 0)) {
static int log1 = 0;
if (log1 < 2) { fprintf(stderr, "[DEBUG] Tiny blocked: in_wrapper\n"); log1++; }
return NULL;
}
# else
extern int hak_in_wrapper(void);
if (!g_wrap_tiny_enabled && __builtin_expect(hak_in_wrapper() != 0, 0)) {
static int log2 = 0;
if (log2 < 2) { fprintf(stderr, "[DEBUG] Tiny blocked: hak_in_wrapper\n"); log2++; }
return NULL;
}
# endif
#endif
// ========================================================================
// Cooperative stats polling (SIGUSR1 trigger safe point)
hak_tiny_stats_poll();
// ========================================================================
// Phase 6-1.5: Ultra-Simple Fast Path (when enabled)
// ========================================================================
// Design: "Simple Front + Smart Back" - inspired by Mid-Large HAKX +171%
// - 3-4 instruction fast path (Phase 6-1 style)
// - Existing SuperSlab + ACE + Learning backend
// Two variants:
// Phase 6-1.5: -DHAKMEM_TINY_PHASE6_ULTRA_SIMPLE=1 (alignment guessing)
// Phase 6-1.6: -DHAKMEM_TINY_PHASE6_METADATA=1 (metadata header)
#ifdef HAKMEM_TINY_PHASE6_ULTRA_SIMPLE
return hak_tiny_alloc_ultra_simple(size);
#elif defined(HAKMEM_TINY_PHASE6_METADATA)
return hak_tiny_alloc_metadata(size);
#endif
// ========================================================================
// 1. Size → class index
int class_idx = hak_tiny_size_to_class(size);
if (class_idx < 0) {
static int log3 = 0;
if (log3 < 2) { fprintf(stderr, "[DEBUG] Tiny blocked: class_idx < 0 for size %zu\n", size); log3++; }
return NULL; // >1KB
}
// Route fingerprint begin (debug-only; no-op unless HAKMEM_ROUTE=1)
ROUTE_BEGIN(class_idx);
do {
static int g_alloc_ring = -1;
if (__builtin_expect(g_alloc_ring == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_ALLOC_RING");
g_alloc_ring = (e && *e && *e != '0') ? 1 : 0;
}
if (g_alloc_ring) {
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_ENTER, (uint16_t)class_idx, (void*)(uintptr_t)size, 0);
}
} while (0);
#if HAKMEM_TINY_MINIMAL_FRONT
// Minimal Front for hot tiny classes (bench-focused):
// SLL direct pop → minimal refill → pop, bypassing other layers.
if (__builtin_expect(class_idx <= 3, 1)) {
void* head = g_tls_sll_head[class_idx];
if (__builtin_expect(head != NULL, 1)) {
g_tls_sll_head[class_idx] = *(void**)head;
if (g_tls_sll_count[class_idx] > 0) g_tls_sll_count[class_idx]--;
HAK_RET_ALLOC(class_idx, head);
}
// Refill a small batch directly from TLS-cached SuperSlab
#if HAKMEM_TINY_P0_BATCH_REFILL
(void)sll_refill_batch_from_ss(class_idx, 32);
#else
(void)sll_refill_small_from_ss(class_idx, 32);
#endif
head = g_tls_sll_head[class_idx];
if (__builtin_expect(head != NULL, 1)) {
g_tls_sll_head[class_idx] = *(void**)head;
if (g_tls_sll_count[class_idx] > 0) g_tls_sll_count[class_idx]--;
HAK_RET_ALLOC(class_idx, head);
}
// Fall through to slow path if still empty
}
#endif
// Ultra-Front: minimal per-class stack for hot tiny classes (opt-in)
// Try ultra_pop → (optional) ultra_refill_small → ultra_pop before other layers
if (__builtin_expect(g_ultra_simple && class_idx <= 3, 0)) {
void* up = ultra_pop(class_idx);
if (__builtin_expect(up == NULL, 0)) {
(void)ultra_refill_small(class_idx);
up = ultra_pop(class_idx);
}
if (__builtin_expect(up != NULL, 0)) {
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, up, 0xF0);
HAK_RET_ALLOC(class_idx, up);
}
}
if (__builtin_expect(!g_debug_fast0, 1)) {
#ifdef HAKMEM_TINY_BENCH_FASTPATH
if (__builtin_expect(class_idx <= HAKMEM_TINY_BENCH_TINY_CLASSES, 1)) {
if (__builtin_expect(class_idx <= 3, 1)) {
unsigned char* done = &g_tls_bench_warm_done[class_idx];
if (__builtin_expect(*done == 0, 0)) {
int warm = (class_idx == 0) ? HAKMEM_TINY_BENCH_WARMUP8 :
(class_idx == 1) ? HAKMEM_TINY_BENCH_WARMUP16 :
(class_idx == 2) ? HAKMEM_TINY_BENCH_WARMUP32 :
HAKMEM_TINY_BENCH_WARMUP64;
#if HAKMEM_TINY_P0_BATCH_REFILL
if (warm > 0) (void)sll_refill_batch_from_ss(class_idx, warm);
#else
if (warm > 0) (void)sll_refill_small_from_ss(class_idx, warm);
#endif
*done = 1;
}
}
#ifndef HAKMEM_TINY_BENCH_SLL_ONLY
tiny_small_mags_init_once();
if (class_idx > 3) tiny_mag_init_if_needed(class_idx);
#endif
void* head = g_tls_sll_head[class_idx];
if (__builtin_expect(head != NULL, 1)) {
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, head, 0);
g_tls_sll_head[class_idx] = *(void**)head;
if (g_tls_sll_count[class_idx] > 0) g_tls_sll_count[class_idx]--;
HAK_RET_ALLOC(class_idx, head);
}
#ifndef HAKMEM_TINY_BENCH_SLL_ONLY
TinyTLSMag* mag = &g_tls_mags[class_idx];
int t = mag->top;
if (__builtin_expect(t > 0, 1)) {
void* p = mag->items[--t].ptr;
mag->top = t;
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, p, 1);
HAK_RET_ALLOC(class_idx, p);
}
#endif
int bench_refill = (class_idx == 0) ? HAKMEM_TINY_BENCH_REFILL8 :
(class_idx == 1) ? HAKMEM_TINY_BENCH_REFILL16 :
(class_idx == 2) ? HAKMEM_TINY_BENCH_REFILL32 :
HAKMEM_TINY_BENCH_REFILL64;
#if HAKMEM_TINY_P0_BATCH_REFILL
if (__builtin_expect(sll_refill_batch_from_ss(class_idx, bench_refill) > 0, 0)) {
#else
if (__builtin_expect(sll_refill_small_from_ss(class_idx, bench_refill) > 0, 0)) {
#endif
head = g_tls_sll_head[class_idx];
if (head) {
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, head, 2);
g_tls_sll_head[class_idx] = *(void**)head;
if (g_tls_sll_count[class_idx] > 0) g_tls_sll_count[class_idx]--;
HAK_RET_ALLOC(class_idx, head);
}
}
// fallthrough to slow path on miss
}
#endif
// TinyHotMag front: fast-tierが枯渇したとき、キャッシュを再補充してから利用する
if (__builtin_expect(g_hotmag_enable && class_idx <= 2 && g_fast_head[class_idx] == NULL, 0)) {
hotmag_init_if_needed(class_idx);
TinyHotMag* hm = &g_tls_hot_mag[class_idx];
void* hotmag_ptr = hotmag_pop(class_idx);
if (__builtin_expect(hotmag_ptr == NULL, 0)) {
if (hotmag_try_refill(class_idx, hm) > 0) {
hotmag_ptr = hotmag_pop(class_idx);
}
}
if (__builtin_expect(hotmag_ptr != NULL, 1)) {
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, hotmag_ptr, 3);
HAK_RET_ALLOC(class_idx, hotmag_ptr);
}
}
if (g_hot_alloc_fn[class_idx] != NULL) {
void* fast_hot = NULL;
switch (class_idx) {
case 0:
fast_hot = tiny_hot_pop_class0();
break;
case 1:
fast_hot = tiny_hot_pop_class1();
break;
case 2:
fast_hot = tiny_hot_pop_class2();
break;
case 3:
fast_hot = tiny_hot_pop_class3();
break;
default:
fast_hot = NULL;
break;
}
if (__builtin_expect(fast_hot != NULL, 1)) {
#if HAKMEM_BUILD_DEBUG
g_tls_hit_count[class_idx]++;
#endif
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, fast_hot, 4);
HAK_RET_ALLOC(class_idx, fast_hot);
}
}
void* fast = tiny_fast_pop(class_idx);
if (__builtin_expect(fast != NULL, 0)) {
#if HAKMEM_BUILD_DEBUG
g_tls_hit_count[class_idx]++;
#endif
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, fast, 5);
HAK_RET_ALLOC(class_idx, fast);
}
} else {
tiny_debug_ring_record(TINY_RING_EVENT_FRONT_BYPASS, (uint16_t)class_idx, NULL, 0);
}
void* slow_ptr = hak_tiny_alloc_slow(size, class_idx);
if (slow_ptr) {
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_SUCCESS, (uint16_t)class_idx, slow_ptr, 6);
HAK_RET_ALLOC(class_idx, slow_ptr); // Increment stats for slow path success
}
tiny_alloc_dump_tls_state(class_idx, "fail", &g_tls_slabs[class_idx]);
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_NULL, (uint16_t)class_idx, NULL, 0);
return slow_ptr;
}