Files
hakmem/core/hakmem_tiny_refill_p0.inc.h
Moe Charm (CI) 1010a961fb Tiny: fix header/stride mismatch and harden refill paths
- Root cause: header-based class indexing (HEADER_CLASSIDX=1) wrote a 1-byte
  header during allocation, but linear carve/refill and initial slab capacity
  still used bare class block sizes. This mismatch could overrun slab usable
  space and corrupt freelists, causing reproducible SEGV at ~100k iters.
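
  A minimal sketch of the mismatch (the stride expression matches the fix in
  the code below; the drift commentary is illustrative):

    // alloc wrote a 1-byte header, so each block effectively occupies:
    size_t stride = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
    // ...while carve/capacity stepped by the bare class size, so blocks
    // overlapped their headers and the computed capacity could run past
    // the slab's usable bytes.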

Changes
- Superslab: compute capacity with the effective stride (block_size + header
  for classes 0..6; class7 remains headerless) in superslab_init_slab(); see
  the sketch after this list. Add a debug-only bound check in
  superslab_alloc_from_slab() to fail fast if a carve would exceed usable
  bytes.
- Refill (non-P0 and P0): use header-aware stride for all linear carving and
  TLS window bump operations. Ensure alignment/validation in tiny_refill_opt.h
  also uses stride, not raw class size.
- Drain: keep existing defense-in-depth for remote sentinel and sanitize nodes
  before splicing into freelist (already present).
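
  Sketch of the capacity change in superslab_init_slab() (usable_bytes and
  the exact meta fields are illustrative, not the real struct layout):

    size_t stride = g_tiny_class_sizes[cls] + ((cls != 7) ? 1 : 0);
    meta->capacity = (uint32_t)(usable_bytes / stride); // was: / bare block size
    // debug-only fail-fast in superslab_alloc_from_slab():
    // assert((size_t)(meta->used + 1) * stride <= usable_bytes);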

Notes
- This unifies the memory layout across alloc/linear-carve/refill with a single
  stride definition and keeps class7 (1024B) headerless as designed.
- Debug builds add fail-fast checks; release builds remain lean.

Next
- Re-run the Tiny benches (256B/1024B) in debug to confirm stability, then in
  release. If a crash persists, bisect with HAKMEM_TINY_P0_BATCH_REFILL=0 (a
  compile-time switch; see the note below) to isolate the P0 batch carve, and
  continue reducing branch misses as planned.
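
  Since HAKMEM_TINY_P0_BATCH_REFILL defaults via #ifndef in the file below,
  flipping it means a rebuild (assuming no runtime gate elsewhere); the exact
  build invocation is project-specific, e.g.:

    cc ... -DHAKMEM_TINY_P0_BATCH_REFILL=0 ...   # disable the P0 batch carve
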
2025-11-09 18:55:50 +09:00

// hakmem_tiny_refill_p0.inc.h
// ChatGPT Pro P0: Complete Batch Refill (for the TLS SLL)
//
// Purpose: Optimize sll_refill_small_from_ss with batch carving
// Based on: tls_refill_from_tls_slab (hakmem_tiny_tls_ops.h:115-126)
//
// Key optimization: ss_active_inc × 64 → ss_active_add × 1
//
// Maintains: Existing g_tls_sll_head fast path (no changes to hot path!)
//
// Enable P0 by default for testing (set to 0 to disable)
#ifndef HAKMEM_TINY_P0_BATCH_REFILL
#define HAKMEM_TINY_P0_BATCH_REFILL 1
#endif
#ifndef HAKMEM_TINY_REFILL_P0_INC_H
#define HAKMEM_TINY_REFILL_P0_INC_H
// Debug counters (compile-time gated)
#if HAKMEM_DEBUG_COUNTERS
extern unsigned long long g_rf_hit_slab[];
// Diagnostic counters for refill early returns
extern unsigned long long g_rf_early_no_ss[];     // early return: !g_use_superslab
extern unsigned long long g_rf_early_no_meta[];   // early return: !meta
extern unsigned long long g_rf_early_no_room[];   // early return: room <= 0
extern unsigned long long g_rf_early_want_zero[]; // early return: want == 0
#endif
// Refill TLS SLL from SuperSlab with batch carving (P0 optimization)
// Standard headers used below (getenv/atoi, fprintf, atomics); normally
// already provided by the including translation unit, but harmless here.
#include <stdatomic.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include "tiny_refill_opt.h"
#include "superslab/superslab_inline.h" // For _ss_remote_drain_to_freelist_unsafe()
static inline int sll_refill_batch_from_ss(int class_idx, int max_take) {
    if (!g_use_superslab || max_take <= 0) {
#if HAKMEM_DEBUG_COUNTERS
        if (!g_use_superslab) g_rf_early_no_ss[class_idx]++;
#endif
        return 0;
    }
    TinyTLSSlab* tls = &g_tls_slabs[class_idx];
    if (!tls->ss) {
        // Try to obtain a SuperSlab for this class
        if (superslab_refill(class_idx) == NULL) return 0;
    }
    TinySlabMeta* meta = tls->meta;
    if (!meta) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_no_meta[class_idx]++;
#endif
        return 0;
    }
    // Compute how many we can actually push into SLL without overflow
    uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP);
    int room = (int)sll_cap - (int)g_tls_sll_count[class_idx];
    if (room <= 0) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_no_room[class_idx]++;
#endif
        return 0;
    }
    // For hot tiny classes (0..3), allow an env override to increase batch size
    uint32_t want = (uint32_t)max_take;
    if (class_idx <= 3) {
        static int g_hot_override = -2; // -2 = uninitialized, -1 = no override, >0 = value
        if (__builtin_expect(g_hot_override == -2, 0)) {
            const char* e = getenv("HAKMEM_TINY_REFILL_COUNT_HOT");
            int v = (e && *e) ? atoi(e) : -1;
            if (v < 0) v = -1;
            if (v > 256) v = 256; // clamp
            g_hot_override = v;
        }
        if (g_hot_override > 0) want = (uint32_t)g_hot_override;
    } else {
        // Mid classes (>=4): optional override for batch size
        static int g_mid_override = -2; // -2 = uninitialized, -1 = no override, >0 = value
        if (__builtin_expect(g_mid_override == -2, 0)) {
            const char* e = getenv("HAKMEM_TINY_REFILL_COUNT_MID");
            int v = (e && *e) ? atoi(e) : -1;
            if (v < 0) v = -1;
            if (v > 256) v = 256; // clamp
            g_mid_override = v;
        }
        if (g_mid_override > 0) want = (uint32_t)g_mid_override;
    }
    if (want > (uint32_t)room) want = (uint32_t)room;
    if (want == 0) {
#if HAKMEM_DEBUG_COUNTERS
        g_rf_early_want_zero[class_idx]++;
#endif
        return 0;
    }
    // Effective stride: class block size + 1-byte header for classes 0..6
    size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
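    // (Class 7 (1024B) stays headerless by design, so its stride equals its
    // bare block size; classes 0..6 carve block_size + 1 header byte.)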
    int total_taken = 0;
    // === P0 Batch Carving Loop ===
    while (want > 0) {
        // Calculate slab base for validation (accounts for 2048 offset in slab 0)
        uintptr_t ss_base = 0;
        uintptr_t ss_limit = 0;
        if (tls->ss && tls->slab_idx >= 0) {
            uint8_t* slab_base = tiny_slab_base_for(tls->ss, tls->slab_idx);
            ss_base = (uintptr_t)slab_base;
            // Limit is end of current slab
            ss_limit = ss_base + SLAB_SIZE;
            if (tls->slab_idx == 0) {
                ss_limit = ss_base + (SLAB_SIZE - SUPERSLAB_SLAB0_DATA_OFFSET);
            }
        }
        // CRITICAL FIX: Drain remote queue BEFORE popping from freelist.
        // Without this, blocks in both freelist and remote queue can be
        // double-allocated (Thread A pops from freelist, Thread B adds to
        // remote queue, Thread A drains remote → overwrites user data).
        // OPTIMIZATION: Only drain if remote queue is non-empty (check atomic counter)
        if (tls->ss && tls->slab_idx >= 0) {
            uint32_t remote_count = atomic_load_explicit(
                &tls->ss->remote_counts[tls->slab_idx], memory_order_relaxed);
            if (remote_count > 0) {
                _ss_remote_drain_to_freelist_unsafe(tls->ss, tls->slab_idx, meta);
            }
        }
        // Handle freelist items first (usually 0)
        TinyRefillChain chain;
        uint32_t from_freelist = trc_pop_from_freelist(
            meta, class_idx, ss_base, ss_limit, bs, want, &chain);
        if (from_freelist > 0) {
            trc_splice_to_sll(class_idx, &chain,
                              &g_tls_sll_head[class_idx], &g_tls_sll_count[class_idx]);
            // FIX: Blocks from freelist were decremented when freed, must
            // increment when allocated
            ss_active_add(tls->ss, from_freelist);
            extern unsigned long long g_rf_freelist_items[];
            g_rf_freelist_items[class_idx] += from_freelist;
            total_taken += from_freelist;
            want -= from_freelist;
            if (want == 0) break;
        }
        // === Linear Carve (P0 Key Optimization!) ===
        if (meta->used >= meta->capacity) {
            // Slab exhausted, try to get another
            if (superslab_refill(class_idx) == NULL) break;
            meta = tls->meta;
            if (!meta) break;
            continue;
        }
        uint32_t available = meta->capacity - meta->used;
        uint32_t batch = want;
        if (batch > available) batch = available;
        if (batch == 0) break;
        // Get slab base
        uint8_t* slab_base = tls->slab_base ? tls->slab_base
                                            : tiny_slab_base_for(tls->ss, tls->slab_idx);
        // Diagnostic log (one-shot)
        static _Atomic int g_carve_log_printed = 0;
        if (atomic_load(&g_carve_log_printed) == 0 &&
            atomic_exchange(&g_carve_log_printed, 1) == 0) {
            fprintf(stderr, "[BATCH_CARVE] cls=%d slab=%d used=%u cap=%u batch=%u base=%p bs=%zu\n",
                    class_idx, tls->slab_idx, meta->used, meta->capacity, batch,
                    (void*)slab_base, bs);
            fflush(stderr);
        }
        TinyRefillChain carve;
        trc_linear_carve(slab_base, bs, meta, batch, &carve);
        trc_splice_to_sll(class_idx, &carve,
                          &g_tls_sll_head[class_idx], &g_tls_sll_count[class_idx]);
        // FIX: Update SuperSlab active counter (was missing!)
        ss_active_add(tls->ss, batch);
        extern unsigned long long g_rf_carve_items[];
        g_rf_carve_items[class_idx] += batch;
        total_taken += batch;
        want -= batch;
    }
#if HAKMEM_DEBUG_COUNTERS
    // Track SLL refills from SuperSlab (compile-time gated).
    // NOTE: Incremented even when total_taken == 0, to verify the counter
    // plumbing itself is working.
    g_rf_hit_slab[class_idx]++;
#endif
    return total_taken;
}

#endif // HAKMEM_TINY_REFILL_P0_INC_H