// hakmem_tiny_refill_p0.inc.h
// ChatGPT Pro P0: Complete Batch Refill (for the TLS SLL)
//
// Purpose: Optimize sll_refill_small_from_ss with batch carving
// Based on: tls_refill_from_tls_slab (hakmem_tiny_tls_ops.h:115-126)
//
// Key optimization: ss_active_inc × 64 → ss_active_add × 1
//
// Maintains: Existing g_tls_sll_head fast path (no changes to hot path!)
//
// Compile-time gate for the P0 batch refill path (0 = disabled, 1 = enabled)
#ifndef HAKMEM_TINY_P0_BATCH_REFILL
#define HAKMEM_TINY_P0_BATCH_REFILL 0
#endif

#ifndef HAKMEM_TINY_REFILL_P0_INC_H
#define HAKMEM_TINY_REFILL_P0_INC_H

#include "tiny_box_geometry.h"  // Box 3: Geometry & Capacity Calculator

// Debug counters (compile-time gated)
#if HAKMEM_DEBUG_COUNTERS
extern unsigned long long g_rf_hit_slab[];
// Diagnostic counters for refill early returns
extern unsigned long long g_rf_early_no_ss[];     // early return: !g_use_superslab
extern unsigned long long g_rf_early_no_meta[];   // early return: !meta
extern unsigned long long g_rf_early_no_room[];   // early return: room <= 0
extern unsigned long long g_rf_early_want_zero[]; // early return: want == 0
#endif

// Refill TLS SLL from SuperSlab with batch carving (P0 optimization)
#include "tiny_refill_opt.h"
#include "tiny_fc_api.h"
#include "superslab/superslab_inline.h"  // For _ss_remote_drain_to_freelist_unsafe()

// Optional P0 diagnostic logging helper (HAKMEM_TINY_P0_LOG=1 enables)
static inline int p0_should_log(void) {
    static int en = -1;
    if (__builtin_expect(en == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_P0_LOG");
        en = (e && *e && *e != '0') ? 1 : 0;
    }
    return en;
}
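
// Why batch carving is the P0 win (illustrative sketch, not compiled code):
// the pre-P0 per-block refill pays one atomic active-counter update per block,
// e.g. for a 64-block refill:
//
//     for (int i = 0; i < 64; i++) {
//         void* p = carve_one_block(tls);   // hypothetical per-block carve helper
//         ss_active_inc(tls->ss);           // 64 atomic RMWs on the SuperSlab counter
//         sll_push(class_idx, p);           // hypothetical single-node SLL push
//     }
//
// whereas sll_refill_batch_from_ss() below carves one contiguous run and
// accounts for it once:
//
//     TinyRefillChain chain;
//     trc_linear_carve(slab_base, bs, meta, batch, &chain);
//     trc_splice_to_sll(class_idx, &chain, &g_tls_sll_head[class_idx],
//                       &g_tls_sll_count[class_idx]);
//     ss_active_add(tls->ss, batch);        // 1 atomic RMW for the whole batch
//
// carve_one_block()/sll_push() are placeholder names for the existing per-block
// path in tls_refill_from_tls_slab (hakmem_tiny_tls_ops.h).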

static inline int sll_refill_batch_from_ss(int class_idx, int max_take) {
    // CRITICAL: C7 (1KB) is headerless - incompatible with TLS SLL refill
    // Reason: TLS SLL stores the next pointer in the first 8 bytes (user data for C7)
    // Solution: Skip refill for C7, force slow path allocation
    if (__builtin_expect(class_idx == 7, 0)) {
        return 0;  // C7 uses the slow path exclusively
    }

    // Runtime A/B kill switch (defensive). Set HAKMEM_TINY_P0_DISABLE=1 to bypass the P0 path.
    do {
        static int g_p0_disable = -1;
        if (__builtin_expect(g_p0_disable == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_P0_DISABLE");
            g_p0_disable = (e && *e && *e != '0') ? 1 : 0;
        }
        if (__builtin_expect(g_p0_disable, 0)) {
            return 0;
        }
    } while (0);

    if (!g_use_superslab || max_take <= 0) {
#if HAKMEM_DEBUG_COUNTERS
        if (!g_use_superslab) g_rf_early_no_ss[class_idx]++;
#endif
        return 0;
    }

    TinyTLSSlab* tls = &g_tls_slabs[class_idx];

    uint32_t active_before = 0;
    if (tls->ss) {
        active_before = atomic_load_explicit(&tls->ss->total_active_blocks, memory_order_relaxed);
    }

    // CRITICAL DEBUG: Log class 7 pre-warm
    if (__builtin_expect(class_idx == 7 && p0_should_log(), 0)) {
        fprintf(stderr, "[P0_DEBUG_C7] Entry: tls->ss=%p tls->meta=%p max_take=%d\n",
                (void*)tls->ss, (void*)tls->meta, max_take);
    }

    if (!tls->ss) {
        // Try to obtain a SuperSlab for this class
        if (superslab_refill(class_idx) == NULL) {
            if (__builtin_expect(class_idx == 7 && p0_should_log(), 0)) {
                fprintf(stderr, "[P0_DEBUG_C7] superslab_refill() returned NULL\n");
            }
            return 0;
        }
        if (__builtin_expect(class_idx == 7 && p0_should_log(), 0)) {
            fprintf(stderr, "[P0_DEBUG_C7] After superslab_refill(): tls->ss=%p tls->meta=%p\n",
                    (void*)tls->ss, (void*)tls->meta);
        }
    }

    TinySlabMeta* meta = tls->meta;
    if (!meta) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_no_meta[class_idx]++;
#endif
        if (__builtin_expect(class_idx == 7 && p0_should_log(), 0)) {
            fprintf(stderr, "[P0_DEBUG_C7] meta is NULL after superslab_refill, returning 0\n");
        }
        return 0;
    }
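
    // Environment knobs consulted below (each is read once and cached in a
    // function-local static):
    //   HAKMEM_TINY_P0_DIRECT_FC     - class 5 (256B) direct flush-cache refill; default ON
    //   HAKMEM_TINY_P0_DIRECT_FC_C7  - class 7 (1KB) variant; default OFF, opt-in only
    //   HAKMEM_TINY_P0_DRAIN_THRESH  - remote-queue length that triggers a drain; default 64
    //   HAKMEM_TINY_P0_NO_DRAIN      - set to 1 to skip remote drains in both the
    //                                  Direct-FC and batch-carve paths (A/B isolation)
    // Note: with the unconditional class-7 early return at the top of this
    // function, the class-7 Direct-FC variant is currently unreachable.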

    // Optional: Direct-FC fast path for class 5 (256B) / class 7 (1024B)
    // env:
    //   - HAKMEM_TINY_P0_DIRECT_FC    (default ON for class5)
    //   - HAKMEM_TINY_P0_DIRECT_FC_C7 (default OFF for class7)
    do {
        static int g_direct_fc = -1;
        static int g_direct_fc_c7 = -1;
        if (__builtin_expect(g_direct_fc == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_P0_DIRECT_FC");
            // Default ON when unset
            g_direct_fc = (e && *e && *e == '0') ? 0 : 1;
        }
        if (__builtin_expect(g_direct_fc_c7 == -1, 0)) {
            const char* e7 = getenv("HAKMEM_TINY_P0_DIRECT_FC_C7");
            // Default OFF for class7 (1KB) until stability is fully verified; opt-in via env
            g_direct_fc_c7 = (e7 && *e7) ? ((*e7 == '0') ? 0 : 1) : 0;
        }
        if (__builtin_expect((g_direct_fc && class_idx == 5) || (g_direct_fc_c7 && class_idx == 7), 0)) {
            int room = tiny_fc_room(class_idx);
            if (room <= 0) return 0;
            if (room > 128) room = 128;  // defensive clamp: 'out' below holds at most 128 pointers

            // Drain only if above threshold
            uint32_t rmt = atomic_load_explicit(&tls->ss->remote_counts[tls->slab_idx], memory_order_relaxed);
            static int g_drain_th = -1;
            if (__builtin_expect(g_drain_th == -1, 0)) {
                const char* e = getenv("HAKMEM_TINY_P0_DRAIN_THRESH");
                g_drain_th = (e && *e) ? atoi(e) : 64;
                if (g_drain_th < 0) g_drain_th = 0;
            }
            if (rmt >= (uint32_t)g_drain_th) {
                static int no_drain = -1;
                if (__builtin_expect(no_drain == -1, 0)) {
                    const char* e = getenv("HAKMEM_TINY_P0_NO_DRAIN");
                    no_drain = (e && *e && *e != '0') ? 1 : 0;
                }
                if (!no_drain) {
                    _ss_remote_drain_to_freelist_unsafe(tls->ss, tls->slab_idx, tls->meta);
                }
            }

            // Gather pointers without writing into objects
            void* out[128];
            int produced = 0;
            TinySlabMeta* m = tls->meta;
            // Box 3: Get stride (block size + header, except C7 which is headerless)
            size_t bs = tiny_stride_for_class(class_idx);
            uint8_t* base = tls->slab_base ? tls->slab_base
                                           : tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
            while (produced < room) {
                if (__builtin_expect(m->freelist != NULL, 0)) {
                    void* p = m->freelist;
                    m->freelist = *(void**)p;
                    m->used++;
                    out[produced++] = p;
                    continue;
                }
                if (__builtin_expect(m->carved < m->capacity, 1)) {
                    void* p = (void*)(base + ((size_t)m->carved * bs));
                    m->carved++;
                    m->used++;
                    out[produced++] = p;
                    continue;
                }
                // Need to move to another slab with space
                if (__builtin_expect(superslab_refill(class_idx) == NULL, 0)) break;
                // Rebind
                tls = &g_tls_slabs[class_idx];
                m = tls->meta;
                base = tls->slab_base ? tls->slab_base
                                      : tiny_slab_base_for(tls->ss, tls->slab_idx);
            }
            if (produced > 0) {
                ss_active_add(tls->ss, (uint32_t)produced);
                int pushed = tiny_fc_push_bulk(class_idx, out, produced);
                (void)pushed;  // expected to equal 'produced', since the loop is bounded by 'room'
                if (p0_should_log()) {
                    static _Atomic int g_logged = 0;
                    int exp = 0;
                    if (atomic_compare_exchange_strong(&g_logged, &exp, 1)) {
                        fprintf(stderr, "[P0_DIRECT_FC_TAKE] cls=%d take=%d room=%d drain_th=%d remote_cnt=%u\n",
                                class_idx, produced, room, g_drain_th, rmt);
                    }
                }
                return produced;
            }
            // fallthrough to regular path
        }
    } while (0);

    // Compute how many we can actually push into the SLL without overflow
    uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP);
    int room = (int)sll_cap - (int)g_tls_sll_count[class_idx];
    if (room <= 0) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_no_room[class_idx]++;
#endif
        return 0;
    }
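
    // How the final batch size 'want' is chosen (summary of the logic below):
    //   want = max_take                       (caller's request)
    //   want = HAKMEM_TINY_REFILL_COUNT_HOT   if set and class_idx <= 3 (clamped to 256)
    //   want = HAKMEM_TINY_REFILL_COUNT_MID   if set and class_idx >= 4 (clamped to 256)
    //   want = min(want, room)                never exceed the free SLL capacity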

    // For hot tiny classes (0..3), allow an env override to increase the batch size
    uint32_t want = (uint32_t)max_take;
    if (class_idx <= 3) {
        static int g_hot_override = -2;  // -2 = uninitialized, -1 = no override, >0 = value
        if (__builtin_expect(g_hot_override == -2, 0)) {
            const char* e = getenv("HAKMEM_TINY_REFILL_COUNT_HOT");
            int v = (e && *e) ? atoi(e) : -1;
            if (v < 0) v = -1;
            if (v > 256) v = 256;  // clamp
            g_hot_override = v;
        }
        if (g_hot_override > 0) want = (uint32_t)g_hot_override;
    } else {
        // Mid classes (>=4): optional override for batch size
        static int g_mid_override = -2;  // -2 = uninitialized, -1 = no override, >0 = value
        if (__builtin_expect(g_mid_override == -2, 0)) {
            const char* e = getenv("HAKMEM_TINY_REFILL_COUNT_MID");
            int v = (e && *e) ? atoi(e) : -1;
            if (v < 0) v = -1;
            if (v > 256) v = 256;  // clamp
            g_mid_override = v;
        }
        if (g_mid_override > 0) want = (uint32_t)g_mid_override;
    }
    if (want > (uint32_t)room) want = (uint32_t)room;
    if (want == 0) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_want_zero[class_idx]++;
#endif
        return 0;
    }

    // Box 3: Get stride (block size + header, except C7 which is headerless)
    size_t bs = tiny_stride_for_class(class_idx);
    int total_taken = 0;

    // === P0 Batch Carving Loop ===
    while (want > 0) {
        // Calculate slab base for validation (accounts for the 2048-byte offset in slab 0)
        uintptr_t ss_base = 0;
        uintptr_t ss_limit = 0;
        if (tls->ss && tls->slab_idx >= 0) {
            // Box 3: Get slab base (handles Slab 0 offset)
            uint8_t* slab_base = tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
            ss_base = (uintptr_t)slab_base;
            // Box 3: Get usable bytes for limit calculation
            ss_limit = ss_base + tiny_usable_bytes_for_slab(tls->slab_idx);
        }

        // CRITICAL FIX: Drain remote queue BEFORE popping from freelist.
        // Without this, blocks in both the freelist and the remote queue can be double-allocated
        // (Thread A pops from freelist, Thread B adds to remote queue, Thread A drains remote → overwrites user data).
        // OPTIMIZATION: Only drain if the remote queue is non-empty (check atomic counter).
        if (tls->ss && tls->slab_idx >= 0) {
            uint32_t remote_count = atomic_load_explicit(&tls->ss->remote_counts[tls->slab_idx], memory_order_relaxed);
            if (remote_count > 0) {
                // Runtime A/B: allow skipping the remote drain for fault isolation
                static int no_drain = -1;
                if (__builtin_expect(no_drain == -1, 0)) {
                    const char* e = getenv("HAKMEM_TINY_P0_NO_DRAIN");
                    no_drain = (e && *e && *e != '0') ? 1 : 0;
                }
                if (!no_drain) {
                    _ss_remote_drain_to_freelist_unsafe(tls->ss, tls->slab_idx, meta);
                }
            }
        }

        // Handle freelist items first (usually 0)
        TinyRefillChain chain;
        uint32_t from_freelist = trc_pop_from_freelist(meta, class_idx, ss_base, ss_limit, bs, want, &chain);
        if (from_freelist > 0) {
            trc_splice_to_sll(class_idx, &chain, &g_tls_sll_head[class_idx], &g_tls_sll_count[class_idx]);
            // FIX: Blocks from the freelist were decremented when freed, must increment when allocated
            ss_active_add(tls->ss, from_freelist);
            // FIX: Keep TinySlabMeta::used consistent with the non-P0 path
            meta->used = (uint16_t)((uint32_t)meta->used + from_freelist);
            extern unsigned long long g_rf_freelist_items[];
            g_rf_freelist_items[class_idx] += from_freelist;
            total_taken += from_freelist;
            want -= from_freelist;
            if (want == 0) break;
        }
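
        // 'carved' vs 'used' in TinySlabMeta (context for the linear carve below):
        //   carved - monotonic count of blocks ever handed out linearly from this slab;
        //            never decremented, so (capacity - carved) is the untouched tail.
        //   used   - live-block count; it drops on free, so it cannot locate the next
        //            fresh block.
        // Example: capacity=64, carved=40, used=10 means 40 blocks were carved so far,
        // roughly 30 of them are currently free again (slab freelist / remote queue),
        // and 24 fresh blocks remain available for linear carving.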

        // === Linear Carve (P0 Key Optimization!) ===
        // Use monotonic 'carved' to track linear progression ('used' can decrement on free)
        if (meta->carved >= meta->capacity) {
            // Slab exhausted, try to get another
            if (superslab_refill(class_idx) == NULL) break;
            // CRITICAL FIX: Reload tls pointer after superslab_refill() binds a new slab
            tls = &g_tls_slabs[class_idx];
            meta = tls->meta;
            if (!meta) break;
            continue;
        }

        uint32_t available = meta->capacity - meta->carved;
        uint32_t batch = want;
        if (batch > available) batch = available;
        if (batch == 0) break;

        // Get slab base
        uint8_t* slab_base = tls->slab_base ? tls->slab_base
                                            : tiny_slab_base_for(tls->ss, tls->slab_idx);

        // Diagnostic log (one-shot)
        static _Atomic int g_carve_log_printed = 0;
        if (atomic_load(&g_carve_log_printed) == 0 && atomic_exchange(&g_carve_log_printed, 1) == 0) {
            fprintf(stderr, "[BATCH_CARVE] cls=%d slab=%d used=%u cap=%u batch=%u base=%p bs=%zu\n",
                    class_idx, tls->slab_idx, (unsigned)meta->used, (unsigned)meta->capacity,
                    batch, (void*)slab_base, bs);
            fflush(stderr);
        }

        TinyRefillChain carve;
        trc_linear_carve(slab_base, bs, meta, batch, &carve);
        trc_splice_to_sll(class_idx, &carve, &g_tls_sll_head[class_idx], &g_tls_sll_count[class_idx]);
        // FIX: Update SuperSlab active counter (was missing!)
        ss_active_add(tls->ss, batch);

        extern unsigned long long g_rf_carve_items[];
        g_rf_carve_items[class_idx] += batch;
        total_taken += batch;
        want -= batch;
    }

#if HAKMEM_DEBUG_COUNTERS
    // Track successful SLL refills from SuperSlab (compile-time gated)
    // NOTE: Increment unconditionally to verify the counter is working
    g_rf_hit_slab[class_idx]++;
#endif

    if (tls->ss && meta && p0_should_log()) {
        uint32_t active_after = atomic_load_explicit(&tls->ss->total_active_blocks, memory_order_relaxed);
        int32_t delta = (int32_t)active_after - (int32_t)active_before;
        if ((int32_t)total_taken != delta) {
            fprintf(stderr, "[P0_COUNTER_MISMATCH] cls=%d slab=%d taken=%d active_delta=%d used=%u carved=%u cap=%u freelist=%p\n",
                    class_idx, tls->slab_idx, total_taken, delta,
                    (unsigned)meta->used, (unsigned)meta->carved, (unsigned)meta->capacity, meta->freelist);
        } else {
            fprintf(stderr, "[P0_COUNTER_OK] cls=%d slab=%d taken=%d active_delta=%d\n",
                    class_idx, tls->slab_idx, total_taken, delta);
        }
    }

    return total_taken;
}

#endif  // HAKMEM_TINY_REFILL_P0_INC_H
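
// Usage sketch (illustrative, not compiled): the tiny-allocation slow path is
// expected to call the batch refill when the TLS SLL for a class runs dry and
// then retry the normal SLL pop, roughly:
//
//     if (g_tls_sll_head[class_idx] == NULL &&
//         sll_refill_batch_from_ss(class_idx, TINY_TLS_MAG_CAP) == 0) {
//         return tiny_alloc_fallback(class_idx);   // hypothetical fallback name
//     }
//     void* p = sll_pop_one(class_idx);            // hypothetical SLL pop helper
//
// The real call site is the existing sll_refill_small_from_ss path mentioned in
// the header comment; tiny_alloc_fallback()/sll_pop_one() are placeholders.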