Tiny: Enable P0 batch refill by default + docs and task update

Summary
- Default P0 ON: the build-time HAKMEM_TINY_P0_BATCH_REFILL=1 default remains; the runtime gate
  now defaults to ON (HAKMEM_TINY_P0_ENABLE unset or any value other than '0'). The kill switch
  via HAKMEM_TINY_P0_DISABLE=1 is preserved.
- Fix critical bug: after the freelist→SLL batch splice, increment TinySlabMeta::used by
  'from_freelist' to mirror non-P0 behavior; otherwise the splice under-accounts and breaks
  follow-on carve invariants (sketched after this list).
- Add low-overhead A/B toggles for triage: HAKMEM_TINY_P0_NO_DRAIN (skip remote drain),
  HAKMEM_TINY_P0_LOG (emit [P0_COUNTER_OK/MISMATCH] based on total_active_blocks delta).
- Keep linear carve fail-fast guards across simple/general/TLS-bump paths.
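A minimal sketch of the fixed splice accounting, assuming a TinySlabMeta-like struct with the
'freelist' and 'used' fields shown in the diff below; the struct definition and loop shape are
illustrative, not the actual hakmem_tiny_refill_p0 code:

#include <stdint.h>

typedef struct {            // hypothetical stand-in for TinySlabMeta
    void*    freelist;      // intrusive singly linked free list
    uint16_t used;          // live blocks handed out of this slab
    uint16_t carved;        // monotonic linear-carve cursor
    uint16_t capacity;
} MetaSketch;

// Pop up to 'want' nodes off the slab freelist and splice them onto the
// TLS SLL head. The fix: bump 'used' by the number actually taken, just
// as the non-P0 single-block path does.
static int splice_freelist_into_sll(MetaSketch* meta, void** sll_head, int want) {
    int from_freelist = 0;
    while (from_freelist < want && meta->freelist) {
        void* p = meta->freelist;
        meta->freelist = *(void**)p;   // pop from slab freelist
        *(void**)p = *sll_head;        // push onto TLS SLL
        *sll_head = p;
        from_freelist++;
    }
    meta->used = (uint16_t)(meta->used + from_freelist);  // the missing increment
    return from_freelist;
}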

Perf (1T, 100k×256B)
- P0 OFF: ~2.73M ops/s (stable)
- P0 ON (no drain): ~2.45M ops/s
- P0 ON (normal drain): ~2.76M ops/s (fastest)

Known
- Rare [P0_COUNTER_MISMATCH] warnings persist (non-fatal). Continue auditing the active/used
  balance around the batch freelist splice and the remote drain splice.
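For reference while auditing, a hedged sketch of the delta check that HAKMEM_TINY_P0_LOG implies;
the accessor names and signatures below are assumptions, only the total_active_blocks counter and
the log tags come from this commit:

#include <stdint.h>
#include <stdio.h>

// Assumed declarations; the real signatures may differ.
extern uint64_t ss_total_active_blocks(void* ss);
extern int sll_refill_batch_from_ss(int class_idx, int max_take);

static void p0_counter_check(void* ss, int class_idx, int want) {
    uint64_t before = ss_total_active_blocks(ss);
    int taken = sll_refill_batch_from_ss(class_idx, want);
    uint64_t delta = ss_total_active_blocks(ss) - before;
    if (delta == (uint64_t)taken)
        fprintf(stderr, "[P0_COUNTER_OK] cls=%d taken=%d\n", class_idx, taken);
    else
        fprintf(stderr, "[P0_COUNTER_MISMATCH] cls=%d taken=%d delta=%llu\n",
                class_idx, taken, (unsigned long long)delta);
}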

Docs
- Add docs/TINY_P0_BATCH_REFILL.md (runtime switches, behavior, perf notes).
- Update CURRENT_TASK.md with Tiny P0 status (default ON) and next steps.
Moe Charm (CI)
2025-11-09 22:12:34 +09:00
parent 1010a961fb
commit d9b334b968
24 changed files with 1240 additions and 69 deletions


@@ -24,6 +24,7 @@
#include "hakmem_tiny_tls_list.h"
#include <stdint.h>
#include <pthread.h>
#include <stdlib.h>
// External declarations for TLS variables and globals
extern int g_fast_enable;
@@ -174,16 +175,44 @@ static inline int quick_refill_from_mag(int class_idx) {
return take;
}
// P0 optimization: Batch refill (enabled by default, set HAKMEM_TINY_P0_BATCH_REFILL=0 to disable)
#ifndef HAKMEM_TINY_P0_BATCH_REFILL
#define HAKMEM_TINY_P0_BATCH_REFILL 1 // Enable P0 by default (verified +5.16% improvement)
#endif
#if HAKMEM_TINY_P0_BATCH_REFILL
// P0 optimization: Batch refill (dispatched via a runtime gate for A/B testing)
// - Default is OFF (enable via the env var HAKMEM_TINY_P0_ENABLE=1)
#include "hakmem_tiny_refill_p0.inc.h"
// Alias for compatibility
#define sll_refill_small_from_ss sll_refill_batch_from_ss
// Debug helper: verify linear carve stays within slab usable bytes (Fail-Fast)
static inline int tiny_linear_carve_guard(TinyTLSSlab* tls,
TinySlabMeta* meta,
size_t stride,
uint32_t reserve,
const char* stage) {
#if HAKMEM_BUILD_RELEASE
(void)tls; (void)meta; (void)stride; (void)reserve; (void)stage;
return 1;
#else
if (!tls) return 0;
size_t usable = (tls->slab_idx == 0)
? SUPERSLAB_SLAB0_USABLE_SIZE
: SUPERSLAB_SLAB_USABLE_SIZE;
size_t needed = ((size_t)meta->carved + (size_t)reserve) * stride;
if (__builtin_expect(needed > usable, 0)) {
fprintf(stderr,
"[LINEAR_GUARD] stage=%s cls=%d slab=%d carved=%u used=%u cap=%u "
"stride=%zu reserve=%u needed=%zu usable=%zu\n",
stage ? stage : "linear",
tls->ss ? tls->ss->size_class : -1,
tls->slab_idx,
meta ? meta->carved : 0u,
meta ? meta->used : 0u,
meta ? meta->capacity : 0u,
stride,
reserve,
needed,
usable);
return 0;
}
return 1;
#endif
}
// Refill a few nodes directly into TLS SLL from TLS-cached SuperSlab (owner-thread only)
// Note: If HAKMEM_TINY_P0_BATCH_REFILL is enabled, sll_refill_batch_from_ss is used instead
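To make the guard inequality concrete: it rejects a carve when (carved + reserve) * stride exceeds
the slab's usable bytes. A worked instance with assumed numbers (stride 257 from the 256B class
plus its 1-byte header; the 32 KiB usable figure is illustrative, not the real
SUPERSLAB_SLAB_USABLE_SIZE):

#include <assert.h>
#include <stddef.h>

int main(void) {
    size_t stride  = 257;     // assumed: 256B class + 1 header byte
    size_t usable  = 32768;   // illustrative slab byte budget
    size_t carved  = 100;     // blocks already linearly carved
    size_t reserve = 8;       // blocks this refill wants to carve
    size_t needed  = (carved + reserve) * stride;   // 108 * 257 = 27756
    assert(needed <= usable);  // guard passes: 27756 <= 32768
    return 0;
}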
@@ -196,6 +225,19 @@ __attribute__((noinline)) int sll_refill_small_from_ss(int class_idx, int max_ta
static inline int sll_refill_small_from_ss(int class_idx, int max_take) {
#endif
if (!g_use_superslab || max_take <= 0) return 0;
// Runtime A/B: when P0 is enabled, delegate to the batch refill
do {
// Default: ON (set HAKMEM_TINY_P0_ENABLE=0 to disable explicitly)
static int g_p0_enable = -1;
if (__builtin_expect(g_p0_enable == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_P0_ENABLE");
// Disabled only when the env var is '0'; anything else (including unset) enables it
g_p0_enable = (e && *e && *e == '0') ? 0 : 1;
}
if (__builtin_expect(g_p0_enable, 1)) {
return sll_refill_batch_from_ss(class_idx, max_take);
}
} while (0);
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
if (!tls->ss) {
// Try to obtain a SuperSlab for this class
@@ -220,9 +262,13 @@ static inline int sll_refill_small_from_ss(int class_idx, int max_take) {
size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
for (; taken < take;) {
// Linear first (LIKELY for class7)
if (__builtin_expect(meta->freelist == NULL && meta->used < meta->capacity, 1)) {
if (__builtin_expect(meta->freelist == NULL && meta->carved < meta->capacity, 1)) {
if (__builtin_expect(!tiny_linear_carve_guard(tls, meta, bs, 1, "simple"), 0)) {
abort();
}
uint8_t* base = tiny_slab_base_for(tls->ss, tls->slab_idx);
void* p = (void*)(base + ((size_t)meta->used * bs));
void* p = (void*)(base + ((size_t)meta->carved * bs));
meta->carved++;
meta->used++;
*(void**)p = g_tls_sll_head[class_idx];
g_tls_sll_head[class_idx] = p;
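The used→carved switch in this hunk matters because 'used' drops when blocks are freed back, so a
carve cursor based on it could move backwards and hand out bytes overlapping blocks still on the
freelist; 'carved' only grows. A compilable sketch of that distinction (CursorSketch and the
function boundary are hypothetical):

#include <stdint.h>
#include <stdio.h>

typedef struct { uint16_t carved, used, capacity; } CursorSketch;

// Next linear-carve offset, driven by the monotonic 'carved' cursor.
static size_t carve_next(CursorSketch* m, size_t stride) {
    size_t off = (size_t)m->carved * stride;
    m->carved++;   // never decremented
    m->used++;     // decremented when a block is freed
    return off;
}

int main(void) {
    CursorSketch m = {0, 0, 64};
    size_t a = carve_next(&m, 257);  // offset 0
    size_t b = carve_next(&m, 257);  // offset 257
    m.used--;                        // 'a' freed back to the slab freelist
    size_t c = carve_next(&m, 257);  // offset 514: no overlap with 'a'
    printf("%zu %zu %zu\n", a, b, c);
    return 0;
}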
@@ -264,9 +310,13 @@ static inline int sll_refill_small_from_ss(int class_idx, int max_take) {
p = meta->freelist; meta->freelist = *(void**)p; meta->used++;
// Track active blocks reserved into TLS SLL
ss_active_inc(tls->ss);
} else if (__builtin_expect(meta->used < meta->capacity, 1)) {
} else if (__builtin_expect(meta->carved < meta->capacity, 1)) {
if (__builtin_expect(!tiny_linear_carve_guard(tls, meta, bs, 1, "general"), 0)) {
abort();
}
void* slab_start = tiny_slab_base_for(tls->ss, tls->slab_idx);
p = (char*)slab_start + ((size_t)meta->used * bs);
p = (char*)slab_start + ((size_t)meta->carved * bs);
meta->carved++;
meta->used++;
// Track active blocks reserved into TLS SLL
ss_active_inc(tls->ss);
@@ -311,24 +361,29 @@ static inline void* superslab_tls_bump_fast(int class_idx) {
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
TinySlabMeta* meta = tls->meta;
if (!meta || meta->freelist != NULL) return NULL; // linear mode only
uint16_t used = meta->used;
// Use monotonic 'carved' for window arming
uint16_t carved = meta->carved;
uint16_t cap = meta->capacity;
if (used >= cap) return NULL;
uint32_t avail = (uint32_t)cap - (uint32_t)used;
if (carved >= cap) return NULL;
uint32_t avail = (uint32_t)cap - (uint32_t)carved;
uint32_t chunk = (g_bump_chunk > 0 ? (uint32_t)g_bump_chunk : 1u);
if (chunk > avail) chunk = avail;
size_t bs = g_tiny_class_sizes[tls->ss->size_class] + ((tls->ss->size_class != 7) ? 1 : 0);
uint8_t* base = tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx);
uint8_t* start = base + ((size_t)used * bs);
// Reserve the chunk once in header (keeps remote-free accounting valid)
meta->used = (uint16_t)(used + (uint16_t)chunk);
if (__builtin_expect(!tiny_linear_carve_guard(tls, meta, bs, chunk, "tls_bump"), 0)) {
abort();
}
uint8_t* start = base + ((size_t)carved * bs);
// Reserve the chunk: advance carved and used accordingly
meta->carved = (uint16_t)(carved + (uint16_t)chunk);
meta->used = (uint16_t)(meta->used + (uint16_t)chunk);
// Account all reserved blocks as active in SuperSlab
ss_active_add(tls->ss, chunk);
#if HAKMEM_DEBUG_COUNTERS
g_bump_arms[class_idx]++;
#endif
g_tls_bcur[class_idx] = start + bs;
g_tls_bend[class_idx] = base + (size_t)chunk * bs;
g_tls_bend[class_idx] = start + (size_t)chunk * bs;
return (void*)start;
}
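Two details of the rewritten window arming are easy to miss: the window start is derived from the
monotonic 'carved' cursor, and the end pointer is now start-relative (the old base-relative end
was short by carved*bs whenever carving had already progressed). A hedged sketch; the function
boundary is illustrative, the pointer arithmetic mirrors the diff:

#include <stdint.h>
#include <stddef.h>

// Arm a TLS bump window over 'chunk' freshly reserved blocks.
// Returns the first block; *cur/*end describe the rest of the window.
static void* arm_bump_window(uint8_t* base, uint16_t carved, uint32_t chunk,
                             size_t bs, uint8_t** cur, uint8_t** end) {
    uint8_t* start = base + (size_t)carved * bs;
    *cur = start + bs;                    // next bump allocation
    *end = start + (size_t)chunk * bs;    // start-relative, not base-relative
    return (void*)start;
}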