Tiny: Enable P0 batch refill by default + docs and task update

Summary
- Default P0 ON: the build-time HAKMEM_TINY_P0_BATCH_REFILL=1 default remains; the runtime gate
  now defaults to ON (HAKMEM_TINY_P0_ENABLE unset or any value other than '0'). The kill switch
  via HAKMEM_TINY_P0_DISABLE=1 is preserved.
- Fix critical bug: after the freelist→SLL batch splice, increment TinySlabMeta::used by
  'from_freelist' to mirror non-P0 behavior; otherwise the splice under-accounts and breaks
  follow-on carve invariants (sketched after this list).
- Add low-overhead A/B toggles for triage: HAKMEM_TINY_P0_NO_DRAIN (skip remote drain),
  HAKMEM_TINY_P0_LOG (emit [P0_COUNTER_OK/MISMATCH] based on total_active_blocks delta).
- Keep linear carve fail-fast guards across simple/general/TLS-bump paths.
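A minimal sketch of the fixed splice accounting, assuming a TinySlabMeta-like struct with the
'freelist' and 'used' fields shown in the diff below; the struct definition and loop shape are
illustrative, not the actual hakmem_tiny_refill_p0 code:

#include <stdint.h>

typedef struct {            // hypothetical stand-in for TinySlabMeta
    void*    freelist;      // intrusive singly linked free list
    uint16_t used;          // live blocks handed out of this slab
    uint16_t carved;        // monotonic linear-carve cursor
    uint16_t capacity;
} MetaSketch;

// Pop up to 'want' nodes off the slab freelist and splice them onto the
// TLS SLL head. The fix: bump 'used' by the number actually taken, just
// as the non-P0 single-block path does.
static int splice_freelist_into_sll(MetaSketch* meta, void** sll_head, int want) {
    int from_freelist = 0;
    while (from_freelist < want && meta->freelist) {
        void* p = meta->freelist;
        meta->freelist = *(void**)p;   // pop from slab freelist
        *(void**)p = *sll_head;        // push onto TLS SLL
        *sll_head = p;
        from_freelist++;
    }
    meta->used = (uint16_t)(meta->used + from_freelist);  // the missing increment
    return from_freelist;
}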

Perf (1T, 100k×256B)
- P0 OFF: ~2.73M ops/s (stable)
- P0 ON (no drain): ~2.45M ops/s
- P0 ON (normal drain): ~2.76M ops/s (fastest)

Known
- Rare [P0_COUNTER_MISMATCH] warnings persist (non-fatal). Continue auditing the active/used
  balance around the batch freelist splice and the remote drain splice.
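For reference while auditing, a hedged sketch of the delta check that HAKMEM_TINY_P0_LOG implies;
the accessor names and signatures below are assumptions, only the total_active_blocks counter and
the log tags come from this commit:

#include <stdint.h>
#include <stdio.h>

// Assumed declarations; the real signatures may differ.
extern uint64_t ss_total_active_blocks(void* ss);
extern int sll_refill_batch_from_ss(int class_idx, int max_take);

static void p0_counter_check(void* ss, int class_idx, int want) {
    uint64_t before = ss_total_active_blocks(ss);
    int taken = sll_refill_batch_from_ss(class_idx, want);
    uint64_t delta = ss_total_active_blocks(ss) - before;
    if (delta == (uint64_t)taken)
        fprintf(stderr, "[P0_COUNTER_OK] cls=%d taken=%d\n", class_idx, taken);
    else
        fprintf(stderr, "[P0_COUNTER_MISMATCH] cls=%d taken=%d delta=%llu\n",
                class_idx, taken, (unsigned long long)delta);
}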

Docs
- Add docs/TINY_P0_BATCH_REFILL.md (runtime switches, behavior, perf notes).
- Update CURRENT_TASK.md with Tiny P0 status (default ON) and next steps.
Moe Charm (CI)
2025-11-09 22:12:34 +09:00
parent 1010a961fb
commit d9b334b968
24 changed files with 1240 additions and 69 deletions


@@ -24,6 +24,7 @@
#include "hakmem_tiny_tls_list.h"
#include <stdint.h>
#include <pthread.h>
#include <stdlib.h>
// External declarations for TLS variables and globals
extern int g_fast_enable;
@@ -174,16 +175,44 @@ static inline int quick_refill_from_mag(int class_idx) {
return take;
}
// P0 optimization: Batch refill (enabled by default, set HAKMEM_TINY_P0_BATCH_REFILL=0 to disable)
#ifndef HAKMEM_TINY_P0_BATCH_REFILL
#define HAKMEM_TINY_P0_BATCH_REFILL 1 // Enable P0 by default (verified +5.16% improvement)
#endif
#if HAKMEM_TINY_P0_BATCH_REFILL
// P0 optimization: Batch refill (dispatched via a runtime gate for A/B testing)
// - Default is OFF (enable via the env var HAKMEM_TINY_P0_ENABLE=1)
#include "hakmem_tiny_refill_p0.inc.h"
// Alias for compatibility
#define sll_refill_small_from_ss sll_refill_batch_from_ss
// Debug helper: verify linear carve stays within slab usable bytes (Fail-Fast)
static inline int tiny_linear_carve_guard(TinyTLSSlab* tls,
TinySlabMeta* meta,
size_t stride,
uint32_t reserve,
const char* stage) {
#if HAKMEM_BUILD_RELEASE
(void)tls; (void)meta; (void)stride; (void)reserve; (void)stage;
return 1;
#else
if (!tls) return 0;
size_t usable = (tls->slab_idx == 0)
? SUPERSLAB_SLAB0_USABLE_SIZE
: SUPERSLAB_SLAB_USABLE_SIZE;
size_t needed = ((size_t)meta->carved + (size_t)reserve) * stride;
if (__builtin_expect(needed > usable, 0)) {
fprintf(stderr,
"[LINEAR_GUARD] stage=%s cls=%d slab=%d carved=%u used=%u cap=%u "
"stride=%zu reserve=%u needed=%zu usable=%zu\n",
stage ? stage : "linear",
tls->ss ? tls->ss->size_class : -1,
tls->slab_idx,
meta ? meta->carved : 0u,
meta ? meta->used : 0u,
meta ? meta->capacity : 0u,
stride,
reserve,
needed,
usable);
return 0;
}
return 1;
#endif
}
// Refill a few nodes directly into TLS SLL from TLS-cached SuperSlab (owner-thread only)
// Note: If HAKMEM_TINY_P0_BATCH_REFILL is enabled, sll_refill_batch_from_ss is used instead
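To make the guard inequality concrete: it rejects a carve when (carved + reserve) * stride exceeds
the slab's usable bytes. A worked instance with assumed numbers (stride 257 from the 256B class
plus its 1-byte header; the 32 KiB usable figure is illustrative, not the real
SUPERSLAB_SLAB_USABLE_SIZE):

#include <assert.h>
#include <stddef.h>

int main(void) {
    size_t stride  = 257;     // assumed: 256B class + 1 header byte
    size_t usable  = 32768;   // illustrative slab byte budget
    size_t carved  = 100;     // blocks already linearly carved
    size_t reserve = 8;       // blocks this refill wants to carve
    size_t needed  = (carved + reserve) * stride;   // 108 * 257 = 27756
    assert(needed <= usable);  // guard passes: 27756 <= 32768
    return 0;
}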
@@ -196,6 +225,19 @@ __attribute__((noinline)) int sll_refill_small_from_ss(int class_idx, int max_ta
static inline int sll_refill_small_from_ss(int class_idx, int max_take) {
#endif
if (!g_use_superslab || max_take <= 0) return 0;
// Runtime A/B: when P0 is enabled, delegate to the batch refill
do {
// Default: ON (set HAKMEM_TINY_P0_ENABLE=0 to disable explicitly)
static int g_p0_enable = -1;
if (__builtin_expect(g_p0_enable == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_P0_ENABLE");
// Disabled only when the env var is '0'; anything else (including unset) enables it
g_p0_enable = (e && *e && *e == '0') ? 0 : 1;
}
if (__builtin_expect(g_p0_enable, 1)) {
return sll_refill_batch_from_ss(class_idx, max_take);
}
} while (0);
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
if (!tls->ss) {
// Try to obtain a SuperSlab for this class
@@ -220,9 +262,13 @@ static inline int sll_refill_small_from_ss(int class_idx, int max_take) {
size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
for (; taken < take;) {
// Linear first (LIKELY for class7)
if (__builtin_expect(meta->freelist == NULL && meta->used < meta->capacity, 1)) {
if (__builtin_expect(meta->freelist == NULL && meta->carved < meta->capacity, 1)) {
if (__builtin_expect(!tiny_linear_carve_guard(tls, meta, bs, 1, "simple"), 0)) {
abort();
}
uint8_t* base = tiny_slab_base_for(tls->ss, tls->slab_idx);
void* p = (void*)(base + ((size_t)meta->used * bs));
void* p = (void*)(base + ((size_t)meta->carved * bs));
meta->carved++;
meta->used++;
*(void**)p = g_tls_sll_head[class_idx];
g_tls_sll_head[class_idx] = p;
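The used→carved switch in this hunk matters because 'used' drops when blocks are freed back, so a
carve cursor based on it could move backwards and hand out bytes overlapping blocks still on the
freelist; 'carved' only grows. A compilable sketch of that distinction (CursorSketch and the
function boundary are hypothetical):

#include <stdint.h>
#include <stdio.h>

typedef struct { uint16_t carved, used, capacity; } CursorSketch;

// Next linear-carve offset, driven by the monotonic 'carved' cursor.
static size_t carve_next(CursorSketch* m, size_t stride) {
    size_t off = (size_t)m->carved * stride;
    m->carved++;   // never decremented
    m->used++;     // decremented when a block is freed
    return off;
}

int main(void) {
    CursorSketch m = {0, 0, 64};
    size_t a = carve_next(&m, 257);  // offset 0
    size_t b = carve_next(&m, 257);  // offset 257
    m.used--;                        // 'a' freed back to the slab freelist
    size_t c = carve_next(&m, 257);  // offset 514: no overlap with 'a'
    printf("%zu %zu %zu\n", a, b, c);
    return 0;
}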
@@ -264,9 +310,13 @@ static inline int sll_refill_small_from_ss(int class_idx, int max_take) {
p = meta->freelist; meta->freelist = *(void**)p; meta->used++;
// Track active blocks reserved into TLS SLL
ss_active_inc(tls->ss);
} else if (__builtin_expect(meta->used < meta->capacity, 1)) {
} else if (__builtin_expect(meta->carved < meta->capacity, 1)) {
if (__builtin_expect(!tiny_linear_carve_guard(tls, meta, bs, 1, "general"), 0)) {
abort();
}
void* slab_start = tiny_slab_base_for(tls->ss, tls->slab_idx);
p = (char*)slab_start + ((size_t)meta->used * bs);
p = (char*)slab_start + ((size_t)meta->carved * bs);
meta->carved++;
meta->used++;
// Track active blocks reserved into TLS SLL
ss_active_inc(tls->ss);
@@ -311,24 +361,29 @@ static inline void* superslab_tls_bump_fast(int class_idx) {
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
TinySlabMeta* meta = tls->meta;
if (!meta || meta->freelist != NULL) return NULL; // linear mode only
uint16_t used = meta->used;
// Use monotonic 'carved' for window arming
uint16_t carved = meta->carved;
uint16_t cap = meta->capacity;
if (used >= cap) return NULL;
uint32_t avail = (uint32_t)cap - (uint32_t)used;
if (carved >= cap) return NULL;
uint32_t avail = (uint32_t)cap - (uint32_t)carved;
uint32_t chunk = (g_bump_chunk > 0 ? (uint32_t)g_bump_chunk : 1u);
if (chunk > avail) chunk = avail;
size_t bs = g_tiny_class_sizes[tls->ss->size_class] + ((tls->ss->size_class != 7) ? 1 : 0);
uint8_t* base = tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx);
uint8_t* start = base + ((size_t)used * bs);
// Reserve the chunk once in header (keeps remote-free accounting valid)
meta->used = (uint16_t)(used + (uint16_t)chunk);
if (__builtin_expect(!tiny_linear_carve_guard(tls, meta, bs, chunk, "tls_bump"), 0)) {
abort();
}
uint8_t* start = base + ((size_t)carved * bs);
// Reserve the chunk: advance carved and used accordingly
meta->carved = (uint16_t)(carved + (uint16_t)chunk);
meta->used = (uint16_t)(meta->used + (uint16_t)chunk);
// Account all reserved blocks as active in SuperSlab
ss_active_add(tls->ss, chunk);
#if HAKMEM_DEBUG_COUNTERS
g_bump_arms[class_idx]++;
#endif
g_tls_bcur[class_idx] = start + bs;
g_tls_bend[class_idx] = base + (size_t)chunk * bs;
g_tls_bend[class_idx] = start + (size_t)chunk * bs;
return (void*)start;
}
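Two details of the rewritten window arming are easy to miss: the window start is derived from the
monotonic 'carved' cursor, and the end pointer is now start-relative (the old base-relative end
was short by carved*bs whenever carving had already progressed). A hedged sketch; the function
boundary is illustrative, the pointer arithmetic mirrors the diff:

#include <stdint.h>
#include <stddef.h>

// Arm a TLS bump window over 'chunk' freshly reserved blocks.
// Returns the first block; *cur/*end describe the rest of the window.
static void* arm_bump_window(uint8_t* base, uint16_t carved, uint32_t chunk,
                             size_t bs, uint8_t** cur, uint8_t** end) {
    uint8_t* start = base + (size_t)carved * bs;
    *cur = start + bs;                    // next bump allocation
    *end = start + (size_t)chunk * bs;    // start-relative, not base-relative
    return (void*)start;
}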