// hakmem_tiny_refill_p0.inc.h
// ChatGPT Pro P0: Complete Batch Refill (for the SLL)
//
// Purpose: Optimize sll_refill_small_from_ss with batch carving
// Based on: tls_refill_from_tls_slab (hakmem_tiny_tls_ops.h:115-126)
//
// Key optimization: ss_active_inc × 64 → ss_active_add × 1
//
// Maintains: Existing g_tls_sll_head fast path (no changes to the hot path!)
//
// Enable P0 by default for testing (set to 0 to disable)
#ifndef HAKMEM_TINY_P0_BATCH_REFILL
#define HAKMEM_TINY_P0_BATCH_REFILL 1
#endif

#ifndef HAKMEM_TINY_REFILL_P0_INC_H
#define HAKMEM_TINY_REFILL_P0_INC_H

#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>

// Debug counters (compile-time gated)
#if HAKMEM_DEBUG_COUNTERS
extern unsigned long long g_rf_hit_slab[];
// Diagnostic counters for the early returns in sll_refill_batch_from_ss()
extern unsigned long long g_rf_early_no_ss[];     // !g_use_superslab
extern unsigned long long g_rf_early_no_meta[];   // !meta
extern unsigned long long g_rf_early_no_room[];   // room <= 0
extern unsigned long long g_rf_early_want_zero[]; // want == 0
#endif

// Per-class refill statistics, updated unconditionally in the carving loop
extern unsigned long long g_rf_freelist_items[];
extern unsigned long long g_rf_carve_items[];

#include "tiny_refill_opt.h"
#include "superslab/superslab_inline.h" // for _ss_remote_drain_to_freelist_unsafe()

// Parse an env override for the refill batch size; returns -1 when unset,
// otherwise a value clamped to [0, 256]. Shared by the hot and mid branches.
static inline int tiny_refill_env_override(const char* name) {
    const char* e = getenv(name);
    int v = (e && *e) ? atoi(e) : -1;
    if (v < 0) v = -1;
    if (v > 256) v = 256; // clamp
    return v;
}

// Refill the TLS SLL from a SuperSlab with batch carving (P0 optimization)
static inline int sll_refill_batch_from_ss(int class_idx, int max_take) {
    if (!g_use_superslab || max_take <= 0) {
#if HAKMEM_DEBUG_COUNTERS
        if (!g_use_superslab) g_rf_early_no_ss[class_idx]++;
#endif
        return 0;
    }

    TinyTLSSlab* tls = &g_tls_slabs[class_idx];
    if (!tls->ss) {
        // Try to obtain a SuperSlab for this class
        if (superslab_refill(class_idx) == NULL) return 0;
    }

    TinySlabMeta* meta = tls->meta;
    if (!meta) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_no_meta[class_idx]++;
#endif
        return 0;
    }

    // Compute how many blocks the SLL can accept without overflowing
    uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP);
    int room = (int)sll_cap - (int)g_tls_sll_count[class_idx];
    if (room <= 0) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_no_room[class_idx]++;
#endif
        return 0;
    }

    // Batch size: max_take, optionally overridden via environment variables
    // (HAKMEM_TINY_REFILL_COUNT_HOT for hot classes 0..3,
    //  HAKMEM_TINY_REFILL_COUNT_MID for mid classes >= 4)
    uint32_t want = (uint32_t)max_take;
    if (class_idx <= 3) {
        static int g_hot_override = -2; // -2 = uninitialized, -1 = no override, >0 = value
        if (__builtin_expect(g_hot_override == -2, 0))
            g_hot_override = tiny_refill_env_override("HAKMEM_TINY_REFILL_COUNT_HOT");
        if (g_hot_override > 0) want = (uint32_t)g_hot_override;
    } else {
        static int g_mid_override = -2; // -2 = uninitialized, -1 = no override, >0 = value
        if (__builtin_expect(g_mid_override == -2, 0))
            g_mid_override = tiny_refill_env_override("HAKMEM_TINY_REFILL_COUNT_MID");
        if (g_mid_override > 0) want = (uint32_t)g_mid_override;
    }
    if (want > (uint32_t)room) want = (uint32_t)room;
    if (want == 0) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_want_zero[class_idx]++;
#endif
        return 0;
    }

    // Effective stride: class block size, plus a 1-byte header for classes 0..6
    size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
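    // Worked example (illustrative figures; assumes g_tiny_class_sizes[2] == 32):
    // for class 2 the carve stride is bs = 32 + 1 = 33 bytes, so a batch of 64
    // blocks advances the bump pointer by 64 * 33 = 2112 bytes and charges the
    // SuperSlab with a single ss_active_add(ss, 64) instead of 64 separate
    // ss_active_inc() calls, which is the "× 64 → × 1" saving noted above.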
    int total_taken = 0;

    // === P0 Batch Carving Loop ===
    while (want > 0) {
        // Slab base/limit for pointer validation. Slab 0's data area starts
        // SUPERSLAB_SLAB0_DATA_OFFSET (2048) bytes into the slab, so its
        // usable range is shorter than a full SLAB_SIZE.
        uintptr_t ss_base = 0;
        uintptr_t ss_limit = 0;
        if (tls->ss && tls->slab_idx >= 0) {
            uint8_t* base = tiny_slab_base_for(tls->ss, tls->slab_idx);
            ss_base = (uintptr_t)base;
            ss_limit = ss_base + SLAB_SIZE;
            if (tls->slab_idx == 0) {
                ss_limit = ss_base + (SLAB_SIZE - SUPERSLAB_SLAB0_DATA_OFFSET);
            }

            // CRITICAL FIX: drain the remote queue BEFORE popping from the
            // freelist. Otherwise a block can sit in both the freelist and the
            // remote queue and be handed out twice (thread A pops it from the
            // freelist, thread B frees it into the remote queue, thread A's
            // later drain overwrites live user data).
            // OPTIMIZATION: only drain when the remote queue is non-empty
            // (cheap relaxed atomic check).
            uint32_t remote_count = atomic_load_explicit(
                &tls->ss->remote_counts[tls->slab_idx], memory_order_relaxed);
            if (remote_count > 0) {
                _ss_remote_drain_to_freelist_unsafe(tls->ss, tls->slab_idx, meta);
            }
        }

        // Take freelist items first (usually there are none)
        TinyRefillChain chain;
        uint32_t from_freelist = trc_pop_from_freelist(
            meta, class_idx, ss_base, ss_limit, bs, want, &chain);
        if (from_freelist > 0) {
            trc_splice_to_sll(class_idx, &chain,
                              &g_tls_sll_head[class_idx], &g_tls_sll_count[class_idx]);
            // FIX: freed blocks were decremented from the active count, so
            // re-allocating them must increment it again
            ss_active_add(tls->ss, from_freelist);
            g_rf_freelist_items[class_idx] += from_freelist;
            total_taken += from_freelist;
            want -= from_freelist;
            if (want == 0) break;
        }

        // === Linear Carve (P0 key optimization!) ===
        if (meta->used >= meta->capacity) {
            // Slab exhausted; try to get another
            if (superslab_refill(class_idx) == NULL) break;
            meta = tls->meta;
            if (!meta) break;
            continue;
        }

        uint32_t available = meta->capacity - meta->used;
        uint32_t batch = want;
        if (batch > available) batch = available;
        if (batch == 0) break;

        uint8_t* slab_base = tls->slab_base ? tls->slab_base
                                            : tiny_slab_base_for(tls->ss, tls->slab_idx);

        // Diagnostic log (one-shot; the initial load avoids the atomic RMW once printed)
        static _Atomic int g_carve_log_printed = 0;
        if (atomic_load(&g_carve_log_printed) == 0 &&
            atomic_exchange(&g_carve_log_printed, 1) == 0) {
            fprintf(stderr,
                    "[BATCH_CARVE] cls=%d slab=%d used=%u cap=%u batch=%u base=%p bs=%zu\n",
                    class_idx, tls->slab_idx, meta->used, meta->capacity, batch,
                    (void*)slab_base, bs);
            fflush(stderr);
        }

        // Carve `batch` contiguous blocks and splice them into the SLL in one go
        TinyRefillChain carve;
        trc_linear_carve(slab_base, bs, meta, batch, &carve);
        trc_splice_to_sll(class_idx, &carve,
                          &g_tls_sll_head[class_idx], &g_tls_sll_count[class_idx]);
        // FIX: update the SuperSlab active counter (was missing!)
        ss_active_add(tls->ss, batch);
        g_rf_carve_items[class_idx] += batch;
        total_taken += batch;
        want -= batch;
    }

#if HAKMEM_DEBUG_COUNTERS
    // Track SLL refill attempts from the SuperSlab path. NOTE: incremented even
    // when total_taken == 0, as a sanity check that the counter plumbing works.
    g_rf_hit_slab[class_idx]++;
#endif

    return total_taken;
}

#endif // HAKMEM_TINY_REFILL_P0_INC_H
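// Usage sketch (illustrative, not part of this file): a caller on the
// allocation slow path might refill the SLL before retrying the pop. The
// names tiny_sll_pop() and tiny_alloc_fallback() are hypothetical stand-ins
// for the surrounding allocator's helpers.
//
//   void* p = tiny_sll_pop(class_idx);               // fast path: TLS SLL
//   if (!p && HAKMEM_TINY_P0_BATCH_REFILL) {
//       if (sll_refill_batch_from_ss(class_idx, (int)TINY_TLS_MAG_CAP) > 0)
//           p = tiny_sll_pop(class_idx);             // retry after batch refill
//   }
//   if (!p) p = tiny_alloc_fallback(class_idx);      // slow path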