hakmem/core/tiny_superslab_alloc.inc.h
Moe Charm (CI) 1010a961fb Tiny: fix header/stride mismatch and harden refill paths
- Root cause: header-based class indexing (HEADER_CLASSIDX=1) wrote a 1-byte
  header during allocation, but linear carve/refill and the initial slab capacity
  still used bare class block sizes. The mismatch could overrun the slab's usable
  space and corrupt freelists, causing a reproducible SEGV at ~100k iterations.

Changes
- Superslab: compute capacity with the effective stride (block_size + header for
  classes 0..6; class7 remains headerless) in superslab_init_slab(); see the
  sketch after this list. Add a debug-only bounds check in
  superslab_alloc_from_slab() to fail fast if a carve would exceed usable bytes.
- Refill (non-P0 and P0): use header-aware stride for all linear carving and
  TLS window bump operations. Ensure alignment/validation in tiny_refill_opt.h
  also uses stride, not raw class size.
- Drain: keep existing defense-in-depth for remote sentinel and sanitize nodes
  before splicing into freelist (already present).
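
A minimal sketch of the stride unification described above (the helper names here
are hypothetical, not the actual functions in tiny_superslab_alloc.inc.h):

  #include <stddef.h>

  // Effective stride: 1-byte class-index header for classes 0..6, none for class 7.
  static inline size_t tiny_effective_stride_sketch(int class_idx, size_t block_size) {
      return block_size + ((class_idx != 7) ? 1u : 0u);
  }

  // Capacity and linear carve must share the same stride:
  //   capacity = usable_bytes / stride;  block = slab_base + (size_t)used * stride
  static inline size_t slab_capacity_sketch(size_t usable_bytes, int class_idx, size_t block_size) {
      return usable_bytes / tiny_effective_stride_sketch(class_idx, block_size);
  }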

Notes
- This unifies the memory layout across alloc/linear-carve/refill with a single
  stride definition and keeps class7 (1024B) headerless as designed.
- Debug builds add fail-fast checks; release builds remain lean.

Next
- Re-run the Tiny benches (256B/1024B) in debug to confirm stability, then in
  release. If any crash persists, bisect with HAKMEM_TINY_P0_BATCH_REFILL=0
  to isolate the P0 batch carve, and continue reducing branch misses as planned.
2025-11-09 18:55:50 +09:00


// tiny_superslab_alloc.inc.h - SuperSlab Allocation Layer
// Purpose: Slab allocation, refill, and adoption logic
// Extracted from: hakmem_tiny_free.inc lines 626-1170
// Box Theory: Box 4 (Refill/Adoption) integration
//
// Public functions:
// - superslab_alloc_from_slab(): Allocate from specific slab (linear or freelist)
// - superslab_refill(): Refill TLS slab (adoption, registry scan, fresh alloc)
// - hak_tiny_alloc_superslab(): Main SuperSlab allocation entry point
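// Usage sketch (illustrative; the call site below is an assumption, not taken from this file):
//   void* p = hak_tiny_alloc_superslab(class_idx);   // fast path: allocate from the TLS-bound slab
//   // On exhaustion the entry point calls superslab_refill() internally;
//   // a NULL return signals OOM (or a transient refill failure) to the caller.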
// ============================================================================
// Phase 6.23: SuperSlab Allocation Helpers
// ============================================================================
// Phase 6.24: Allocate from SuperSlab slab (lazy freelist + linear allocation)
#include "hakmem_tiny_superslab_constants.h"
static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx) {
TinySlabMeta* meta = &ss->slabs[slab_idx];
// Ensure remote queue is drained before handing blocks back to TLS (UNLIKELY in 1T)
if (__builtin_expect(atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0, 0)) {
uint32_t self_tid = tiny_self_u32();
SlabHandle h = slab_try_acquire(ss, slab_idx, self_tid);
if (slab_is_valid(&h)) {
slab_drain_remote_full(&h);
int pending = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0;
if (__builtin_expect(pending, 0)) {
if (__builtin_expect(g_debug_remote_guard, 0)) {
uintptr_t head = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_relaxed);
tiny_remote_watch_note("alloc_pending_remote",
ss,
slab_idx,
(void*)head,
0xA243u,
self_tid,
0);
}
slab_release(&h);
return NULL;
}
slab_release(&h);
} else {
if (__builtin_expect(g_debug_remote_guard, 0)) {
tiny_remote_watch_note("alloc_acquire_fail",
ss,
slab_idx,
meta,
0xA244u,
self_tid,
0);
}
return NULL;
}
}
if (__builtin_expect(g_debug_remote_guard, 0)) {
uintptr_t head_pending = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire);
if (head_pending != 0) {
tiny_remote_watch_note("alloc_remote_pending",
ss,
slab_idx,
(void*)head_pending,
0xA247u,
tiny_self_u32(),
1);
return NULL;
}
}
// Phase 6.24: Linear allocation mode (freelist == NULL)
// This avoids the 4000-8000 cycle cost of building freelist on init
if (__builtin_expect(meta->freelist == NULL && meta->used < meta->capacity, 1)) {
// Linear allocation: use canonical tiny_slab_base_for() only
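// unit_sz is the header-aware stride: the class block size plus a 1-byte class-index
// header for classes 0..6; class 7 (1024B) stays headerless, so its stride equals the
// raw block size. This must match the capacity computed in superslab_init_slab().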
size_t unit_sz = g_tiny_class_sizes[ss->size_class]
#if HAKMEM_TINY_HEADER_CLASSIDX
+ ((ss->size_class != 7) ? 1 : 0)
#endif
;
uint8_t* base = tiny_slab_base_for(ss, slab_idx);
void* block_base = (void*)(base + ((size_t)meta->used * unit_sz));
#if !HAKMEM_BUILD_RELEASE
// Debug safety: Ensure we never carve past slab usable region (capacity mismatch guard)
size_t dbg_usable = (slab_idx == 0) ? SUPERSLAB_SLAB0_USABLE_SIZE : SUPERSLAB_SLAB_USABLE_SIZE;
uintptr_t dbg_off = (uintptr_t)((uint8_t*)block_base - base);
if (__builtin_expect(dbg_off + unit_sz > dbg_usable, 0)) {
fprintf(stderr, "[TINY_ALLOC_BOUNDS] cls=%u slab=%d used=%u cap=%u unit=%zu off=%lu usable=%zu\n",
ss->size_class, slab_idx, meta->used, meta->capacity, unit_sz,
(unsigned long)dbg_off, dbg_usable);
return NULL;
}
#endif
meta->used++;
void* user =
#if HAKMEM_TINY_HEADER_CLASSIDX
tiny_region_id_write_header(block_base, ss->size_class);
#else
block_base;
#endif
if (__builtin_expect(g_debug_remote_guard, 0)) {
tiny_remote_track_on_alloc(ss, slab_idx, user, "linear_alloc", 0);
tiny_remote_assert_not_remote(ss, slab_idx, user, "linear_alloc_ret", 0);
}
return user; // Fast path: O(1) pointer arithmetic
}
// Freelist mode (after first free())
if (__builtin_expect(meta->freelist != NULL, 0)) {
void* block = meta->freelist;
// CORRUPTION DEBUG: Validate freelist head before popping
if (__builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
size_t blk = g_tiny_class_sizes[ss->size_class];
uint8_t* slab_base = tiny_slab_base_for(ss, slab_idx);
uintptr_t block_addr = (uintptr_t)block;
uintptr_t slab_addr = (uintptr_t)slab_base;
uintptr_t offset = block_addr - slab_addr;
fprintf(stderr, "[ALLOC_POP] cls=%u slab=%d block=%p offset=%zu (used=%u cap=%u)\n",
ss->size_class, slab_idx, block, offset, meta->used, meta->capacity);
if (offset % blk != 0) {
fprintf(stderr, "[ALLOC_CORRUPT] Freelist head is misaligned! block=%p offset=%zu blk=%zu\n",
block, offset, blk);
fprintf(stderr, "[ALLOC_CORRUPT] Expected alignment: %zu, actual: %zu\n",
blk, offset % blk);
tiny_failfast_abort_ptr("alloc_pop_misalign", ss, slab_idx, block, "freelist_head_corrupt");
}
size_t index = offset / blk;
if (index >= meta->capacity) {
fprintf(stderr, "[ALLOC_CORRUPT] Freelist head out of bounds! block=%p index=%zu cap=%u\n",
block, index, meta->capacity);
tiny_failfast_abort_ptr("alloc_pop_oob", ss, slab_idx, block, "freelist_head_oob");
}
}
meta->freelist = *(void**)block; // Pop from freelist
meta->used++;
if (__builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
if (__builtin_expect(meta->used > meta->capacity, 0)) {
fprintf(stderr, "[ALLOC_CORRUPT] meta->used overflow on freelist alloc: used=%u cap=%u cls=%u slab=%d\n",
meta->used, meta->capacity, ss->size_class, slab_idx);
tiny_failfast_abort_ptr("alloc_used_overflow",
ss,
slab_idx,
block,
"freelist_used_over_capacity");
}
}
if (__builtin_expect(g_debug_remote_guard, 0)) {
tiny_remote_track_on_alloc(ss, slab_idx, block, "freelist_alloc", 0);
tiny_remote_assert_not_remote(ss, slab_idx, block, "freelist_alloc_ret", 0);
}
return block;
}
return NULL; // Slab is full
}
// Adopt helper: acquire → drain → bind (single boundary) returns 1 on success
static inline int adopt_bind_if_safe(TinyTLSSlab* tls, SuperSlab* ss, int slab_idx, int class_idx) {
uint32_t self_tid = tiny_self_u32();
SlabHandle h = slab_try_acquire(ss, slab_idx, self_tid);
if (!slab_is_valid(&h)) return 0;
slab_drain_remote_full(&h);
if (__builtin_expect(slab_is_safe_to_bind(&h), 1)) {
// Optional: move a few nodes to Front SLL to boost next hits
tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx);
tiny_tls_bind_slab(tls, h.ss, h.slab_idx);
// Ownership now associated with TLS slab; release handle bookkeeping
slab_release(&h);
return 1;
}
slab_release(&h);
return 0;
}
// Phase 6.24 & 7.6: Refill TLS SuperSlab (with unified TLS cache + deferred allocation)
static SuperSlab* superslab_refill(int class_idx) {
#if HAKMEM_DEBUG_COUNTERS
g_superslab_refill_calls_dbg[class_idx]++;
#endif
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
// ============================================================================
// Phase 2a: Dynamic Expansion - Initialize SuperSlabHead if needed
// ============================================================================
extern SuperSlabHead* g_superslab_heads[TINY_NUM_CLASSES_SS];
extern SuperSlabHead* init_superslab_head(int class_idx);
extern int expand_superslab_head(SuperSlabHead* head);
SuperSlabHead* head = g_superslab_heads[class_idx];
if (!head) {
// First-time initialization for this class
head = init_superslab_head(class_idx);
if (!head) {
extern __thread int g_hakmem_lock_depth;
g_hakmem_lock_depth++;
fprintf(stderr, "[DEBUG] superslab_refill: Failed to init SuperSlabHead for class %d\n", class_idx);
g_hakmem_lock_depth--;
return NULL; // Critical failure
}
g_superslab_heads[class_idx] = head;
}
// Try current chunk first (fast path)
SuperSlab* current_chunk = head->current_chunk;
if (current_chunk) {
// Check if current chunk has available slabs
// Bitmap semantics: 0=FREE, 1=OCCUPIED
// - 0x00000000 = all free (32 available)
// - 0xFFFFFFFF = all occupied (0 available)
int chunk_cap = ss_slabs_capacity(current_chunk);
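// For chunks with fewer than 32 slabs, (1U << chunk_cap) - 1 sets exactly one bit per slab.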
uint32_t full_mask = (chunk_cap >= 32) ? 0xFFFFFFFF : ((1U << chunk_cap) - 1);
if (current_chunk->slab_bitmap != full_mask) {
// Current chunk has free slabs, use normal refill logic below
// (Will be handled by existing code that checks tls->ss)
if (tls->ss != current_chunk) {
// Update TLS to point to current chunk
tls->ss = current_chunk;
}
} else {
// Current chunk exhausted (all slabs occupied), try to expand
#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)
extern __thread int g_hakmem_lock_depth;
g_hakmem_lock_depth++;
fprintf(stderr, "[HAKMEM] SuperSlab chunk exhausted for class %d (bitmap=0x%08x), expanding...\n",
class_idx, current_chunk->slab_bitmap);
g_hakmem_lock_depth--;
#endif
// Protect expansion with global lock (race condition fix)
static pthread_mutex_t expand_lock = PTHREAD_MUTEX_INITIALIZER;
pthread_mutex_lock(&expand_lock);
// Re-check after acquiring lock (another thread may have expanded)
current_chunk = head->current_chunk;
uint32_t recheck_mask = (ss_slabs_capacity(current_chunk) >= 32) ? 0xFFFFFFFF :
((1U << ss_slabs_capacity(current_chunk)) - 1);
if (current_chunk->slab_bitmap == recheck_mask) {
// Still exhausted, expand now
if (expand_superslab_head(head) < 0) {
pthread_mutex_unlock(&expand_lock);
#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)
g_hakmem_lock_depth++;
fprintf(stderr, "[HAKMEM] CRITICAL: Failed to expand SuperSlabHead for class %d (system OOM)\n", class_idx);
g_hakmem_lock_depth--;
#endif
return NULL; // True system OOM
}
#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)
g_hakmem_lock_depth++;
fprintf(stderr, "[HAKMEM] Successfully expanded SuperSlabHead for class %d\n", class_idx);
g_hakmem_lock_depth--;
#endif
}
// Update current_chunk and tls->ss to point to (potentially new) chunk
current_chunk = head->current_chunk;
tls->ss = current_chunk;
pthread_mutex_unlock(&expand_lock);
// Verify the chunk exists and has free slabs (check for NULL before querying its capacity)
if (current_chunk) {
full_mask = (ss_slabs_capacity(current_chunk) >= 32) ? 0xFFFFFFFF :
((1U << ss_slabs_capacity(current_chunk)) - 1);
}
if (!current_chunk || current_chunk->slab_bitmap == full_mask) {
#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)
g_hakmem_lock_depth++;
fprintf(stderr, "[HAKMEM] CRITICAL: Chunk still has no free slabs for class %d after expansion\n", class_idx);
g_hakmem_lock_depth--;
#endif
return NULL;
}
}
}
// ============================================================================
// Continue with existing refill logic
// ============================================================================
static int g_ss_adopt_en = -1; // env: HAKMEM_TINY_SS_ADOPT=1; default auto-on if remote seen
if (g_ss_adopt_en == -1) {
char* e = getenv("HAKMEM_TINY_SS_ADOPT");
if (e) {
g_ss_adopt_en = (*e != '0') ? 1 : 0;
} else {
extern _Atomic int g_ss_remote_seen;
g_ss_adopt_en = (atomic_load_explicit(&g_ss_remote_seen, memory_order_relaxed) != 0) ? 1 : 0;
}
}
extern int g_adopt_cool_period;
extern __thread int g_tls_adopt_cd[];
if (g_adopt_cool_period == -1) {
char* cd = getenv("HAKMEM_TINY_SS_ADOPT_COOLDOWN");
int v = (cd ? atoi(cd) : 0);
if (v < 0) v = 0;
if (v > 1024) v = 1024;
g_adopt_cool_period = v;
}
static int g_superslab_refill_debug_once = 0;
SuperSlab* prev_ss = tls->ss;
TinySlabMeta* prev_meta = tls->meta;
uint8_t prev_slab_idx = tls->slab_idx;
uint8_t prev_active = prev_ss ? prev_ss->active_slabs : 0;
uint32_t prev_bitmap = prev_ss ? prev_ss->slab_bitmap : 0;
uint32_t prev_meta_used = prev_meta ? prev_meta->used : 0;
uint32_t prev_meta_cap = prev_meta ? prev_meta->capacity : 0;
int free_idx_attempted = -2; // -2 = not evaluated, -1 = none, >=0 = chosen
int reused_slabs = 0;
// Optional: Mid-size simple refill to avoid multi-layer scans (class>=4)
do {
static int g_mid_simple_warn = 0;
if (class_idx >= 4 && tiny_mid_refill_simple_enabled()) {
// If current TLS has a SuperSlab, prefer taking a virgin slab directly
if (tls->ss) {
int tls_cap = ss_slabs_capacity(tls->ss);
if (tls->ss->active_slabs < tls_cap) {
int free_idx = superslab_find_free_slab(tls->ss);
if (free_idx >= 0) {
uint32_t my_tid = tiny_self_u32();
superslab_init_slab(tls->ss, free_idx, g_tiny_class_sizes[class_idx], my_tid);
tiny_tls_bind_slab(tls, tls->ss, free_idx);
return tls->ss;
}
}
}
// Otherwise allocate a fresh SuperSlab and bind first slab
SuperSlab* ssn = superslab_allocate((uint8_t)class_idx);
if (!ssn) {
if (!g_superslab_refill_debug_once && g_mid_simple_warn < 2) {
g_mid_simple_warn++;
int err = errno;
fprintf(stderr, "[DEBUG] mid_simple_refill OOM class=%d errno=%d\n", class_idx, err);
}
return NULL;
}
uint32_t my_tid = tiny_self_u32();
superslab_init_slab(ssn, 0, g_tiny_class_sizes[class_idx], my_tid);
SuperSlab* old = tls->ss;
tiny_tls_bind_slab(tls, ssn, 0);
superslab_ref_inc(ssn);
if (old && old != ssn) { superslab_ref_dec(old); }
return ssn;
}
} while (0);
// First, try to adopt a published partial SuperSlab for this class
if (g_ss_adopt_en) {
if (g_adopt_cool_period > 0) {
if (g_tls_adopt_cd[class_idx] > 0) {
g_tls_adopt_cd[class_idx]--;
} else {
// eligible to adopt
}
}
if (g_adopt_cool_period == 0 || g_tls_adopt_cd[class_idx] == 0) {
SuperSlab* adopt = ss_partial_adopt(class_idx);
if (adopt && adopt->magic == SUPERSLAB_MAGIC) {
// ========================================================================
// Quick Win #2: First-Fit Adopt (vs Best-Fit scoring all 32 slabs)
// For Larson, any slab with freelist works - no need to score all 32!
// Expected improvement: -3,000 cycles (from 32 atomic loads + 32 scores)
// ========================================================================
int adopt_cap = ss_slabs_capacity(adopt);
int best = -1;
for (int s = 0; s < adopt_cap; s++) {
TinySlabMeta* m = &adopt->slabs[s];
// Quick check: Does this slab have a freelist?
if (m->freelist) {
// Yes! Try to acquire it immediately (first-fit)
best = s;
break; // ✅ OPTIMIZATION: Stop at first slab with freelist!
}
// Optional: Also check remote_heads if we want to prioritize those
// (But for Larson, freelist is sufficient)
}
if (best >= 0) {
if (adopt_bind_if_safe(tls, adopt, best, class_idx)) {
if (g_adopt_cool_period > 0) g_tls_adopt_cd[class_idx] = g_adopt_cool_period;
return adopt;
}
}
// If no freelist found, ignore and continue (optional: republish)
}
}
}
// Phase 7.6 Step 4: Check existing SuperSlab with priority order
if (tls->ss) {
// Priority 1: Reuse slabs with freelist (already freed blocks)
int tls_cap = ss_slabs_capacity(tls->ss);
uint32_t nonempty_mask = 0;
do {
static int g_mask_en = -1;
if (__builtin_expect(g_mask_en == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_FREELIST_MASK");
g_mask_en = (e && *e && *e != '0') ? 1 : 0;
}
if (__builtin_expect(g_mask_en, 0)) {
nonempty_mask = atomic_load_explicit(&tls->ss->freelist_mask, memory_order_acquire);
break;
}
for (int i = 0; i < tls_cap; i++) {
if (tls->ss->slabs[i].freelist) nonempty_mask |= (1u << i);
}
} while (0);
// O(1) lookup: scan mask with ctz (1 instruction!)
while (__builtin_expect(nonempty_mask != 0, 1)) {
int i = __builtin_ctz(nonempty_mask); // Find first non-empty slab (O(1))
nonempty_mask &= ~(1u << i); // Clear bit for next iteration
// FIX #1 DELETED (Race condition fix):
// Previous drain without ownership caused concurrent freelist corruption.
// Ownership protocol: MUST bind+owner_cas BEFORE drain (see Fix #3 in tiny_refill.h).
// Remote frees will be drained when the slab is adopted (see tiny_refill.h paths).
if (adopt_bind_if_safe(tls, tls->ss, i, class_idx)) {
reused_slabs = 1;
return tls->ss;
}
}
// Priority 2: Use unused slabs (virgin slabs)
if (tls->ss->active_slabs < tls_cap) {
// Find next free slab
int free_idx = superslab_find_free_slab(tls->ss);
free_idx_attempted = free_idx;
if (free_idx >= 0) {
// Initialize this slab
uint32_t my_tid = tiny_self_u32();
superslab_init_slab(tls->ss, free_idx, g_tiny_class_sizes[class_idx], my_tid);
// Update TLS cache (unified update)
tiny_tls_bind_slab(tls, tls->ss, free_idx);
return tls->ss;
}
}
}
// Try to adopt a partial SuperSlab from registry (one-shot, cheap scan)
// This reduces pressure to allocate new SS when other threads freed blocks.
// Phase 6: Registry Optimization - Use per-class registry for O(class_size) scan
if (!tls->ss) {
// Phase 6: Use per-class registry (262K → ~10-100 entries per class!)
extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS];
extern int g_super_reg_class_size[TINY_NUM_CLASSES];
const int scan_max = tiny_reg_scan_max();
int reg_size = g_super_reg_class_size[class_idx];
int scan_limit = (scan_max < reg_size) ? scan_max : reg_size;
for (int i = 0; i < scan_limit; i++) {
SuperSlab* ss = g_super_reg_by_class[class_idx][i];
if (!ss || ss->magic != SUPERSLAB_MAGIC) continue;
// Note: class_idx check is not needed (per-class registry!)
// Pick first slab with freelist (Box 4: adopt boundary helper)
int reg_cap = ss_slabs_capacity(ss);
for (int s = 0; s < reg_cap; s++) {
if (ss->slabs[s].freelist) {
if (adopt_bind_if_safe(tls, ss, s, class_idx)) return ss;
}
}
}
}
// Must-adopt-before-mmap gate: attempt sticky/hot/bench/mailbox/registry small-window
{
SuperSlab* gate_ss = tiny_must_adopt_gate(class_idx, tls);
if (gate_ss) return gate_ss;
}
// Allocate new SuperSlab
SuperSlab* ss = superslab_allocate((uint8_t)class_idx);
if (!ss) {
if (!g_superslab_refill_debug_once) {
g_superslab_refill_debug_once = 1;
int err = errno;
// CRITICAL FIX (BUG #11): Protect fprintf() with lock_depth
// fprintf() can call malloc for buffering → must use libc malloc
extern __thread int g_hakmem_lock_depth;
g_hakmem_lock_depth++;
fprintf(stderr,
"[DEBUG] superslab_refill returned NULL (OOM) detail: class=%d prev_ss=%p active=%u bitmap=0x%08x prev_meta=%p used=%u cap=%u slab_idx=%u reused_freelist=%d free_idx=%d errno=%d\n",
class_idx,
(void*)prev_ss,
(unsigned)prev_active,
prev_bitmap,
(void*)prev_meta,
(unsigned)prev_meta_used,
(unsigned)prev_meta_cap,
(unsigned)prev_slab_idx,
reused_slabs,
free_idx_attempted,
err);
g_hakmem_lock_depth--;
}
// Clear errno to avoid confusion in fallback paths
errno = 0;
return NULL; // OOM
}
// Initialize first slab
uint32_t my_tid = tiny_self_u32();
superslab_init_slab(ss, 0, g_tiny_class_sizes[class_idx], my_tid);
// Cache in unified TLS and release the previous SuperSlab reference
SuperSlab* old = tls->ss;
tiny_tls_bind_slab(tls, ss, 0);
// Maintain refcount: keep the TLS reference counted so empty SuperSlabs can be reclaimed later
superslab_ref_inc(ss);
if (old && old != ss) {
superslab_ref_dec(old);
}
return ss;
}
// Phase 6.24: SuperSlab-based allocation (TLS unified, Medium fix)
static inline void* hak_tiny_alloc_superslab(int class_idx) {
// DEBUG: Function entry trace (gated to avoid ring spam)
do {
static int g_alloc_ring = -1;
if (__builtin_expect(g_alloc_ring == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_ALLOC_RING");
g_alloc_ring = (e && *e && *e != '0') ? 1 : 0;
}
if (g_alloc_ring) {
tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_ENTER, 0x01, (void*)(uintptr_t)class_idx, 0);
}
} while (0);
// MidTC fast path: for 128..1024B (class >= 4), prefer the TLS tcache first
do {
void* mp = midtc_pop(class_idx);
if (mp) {
HAK_RET_ALLOC(class_idx, mp);
}
} while (0);
// Phase 6.24: 1 TLS read (down from 3)
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
TinySlabMeta* meta = tls->meta;
int slab_idx = tls->slab_idx;
if (meta && slab_idx >= 0 && tls->ss) {
// CRITICAL: Verify class consistency BEFORE using tls->ss
// If tls->ss->size_class != class_idx, unbind and refill
if (tls->ss->size_class != class_idx) {
// Class mismatch: TLS is bound to wrong SuperSlab
// This happens when TLS was previously bound to different class
tls->ss = NULL;
tls->meta = NULL;
tls->slab_idx = -1;
tls->slab_base = NULL;
meta = NULL; // Force refill path below
} else {
// Ensure TLS view is consistent with canonical slab_base
uint8_t* canonical = tiny_slab_base_for(tls->ss, slab_idx);
if (tls->slab_base != canonical) {
tls->slab_base = canonical;
}
}
// A/B: Relaxed read for remote head presence check
static int g_alloc_remote_relax = -1; // env: HAKMEM_TINY_ALLOC_REMOTE_RELAX=1 → relaxed
if (__builtin_expect(g_alloc_remote_relax == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_ALLOC_REMOTE_RELAX");
g_alloc_remote_relax = (e && *e && *e != '0') ? 1 : 0;
}
uintptr_t pending = atomic_load_explicit(&tls->ss->remote_heads[slab_idx],
g_alloc_remote_relax ? memory_order_relaxed
: memory_order_acquire);
if (__builtin_expect(pending != 0, 0)) {
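// Drain pending remote frees only if this thread can take slab ownership;
// non-owners must not touch the freelist (see the ownership-protocol notes below).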
uint32_t self_tid = tiny_self_u32();
if (ss_owner_try_acquire(meta, self_tid)) {
_ss_remote_drain_to_freelist_unsafe(tls->ss, slab_idx, meta);
}
}
}
// FIX #2 DELETED (Race condition fix):
// Previous drain-all-slabs without ownership caused concurrent freelist corruption.
// Problem: Thread A owns slab 5, Thread B drains all slabs including 5 → both modify freelist → crash.
// Ownership protocol: MUST bind+owner_cas BEFORE drain (see Fix #3 in tiny_refill.h).
// Remote frees will be drained when the slab is adopted via refill paths.
// Fast path: Direct metadata access (no repeated TLS reads!)
if (meta && meta->freelist == NULL && meta->used < meta->capacity && tls->slab_base) {
// Linear allocation (lazy init)
size_t block_size = g_tiny_class_sizes[tls->ss->size_class];
uint8_t* base = tls->slab_base; // tls->slab_base comes from tiny_slab_base_for(ss, slab_idx) (the single source of truth)
// ULTRATHINK DEBUG: Capture the 53-byte mystery
if (tiny_refill_failfast_level() >= 3 && tls->ss->size_class == 7 && slab_idx == 0) {
fprintf(stderr, "[ULTRA_53_DEBUG] === Before allocation ===\n");
fprintf(stderr, "[ULTRA_53_DEBUG] ss=%p, slab_idx=%d, class=%d\n",
tls->ss, slab_idx, tls->ss->size_class);
fprintf(stderr, "[ULTRA_53_DEBUG] block_size=%zu, meta->used=%d, meta->capacity=%d\n",
block_size, meta->used, meta->capacity);
fprintf(stderr, "[ULTRA_53_DEBUG] tls->slab_base=%p\n", base);
fprintf(stderr, "[ULTRA_53_DEBUG] tiny_slab_base_for(ss,%d)=%p\n",
slab_idx, tiny_slab_base_for(tls->ss, slab_idx));
fprintf(stderr, "[ULTRA_53_DEBUG] sizeof(SuperSlab)=%zu\n", sizeof(SuperSlab));
fprintf(stderr, "[ULTRA_53_DEBUG] Expected base should be: ss + %zu\n", sizeof(SuperSlab));
fprintf(stderr, "[ULTRA_53_DEBUG] Actual base is: ss + 1024\n");
fprintf(stderr, "[ULTRA_53_DEBUG] Base error: %zu - 1024 = %zu bytes\n",
sizeof(SuperSlab), sizeof(SuperSlab) - 1024);
}
void* block = (void*)(base + ((size_t)meta->used * block_size));
// ULTRATHINK DEBUG: After calculation
if (tiny_refill_failfast_level() >= 3 && tls->ss->size_class == 7 && slab_idx == 0) {
size_t offset_from_ss = (uintptr_t)block - (uintptr_t)tls->ss;
size_t expected_offset = 1024 + ((size_t)meta->used * block_size);
fprintf(stderr, "[ULTRA_53_DEBUG] === Calculated block address ===\n");
fprintf(stderr, "[ULTRA_53_DEBUG] block=%p\n", block);
fprintf(stderr, "[ULTRA_53_DEBUG] offset from ss=%zu (0x%zx)\n", offset_from_ss, offset_from_ss);
fprintf(stderr, "[ULTRA_53_DEBUG] expected offset=%zu (0x%zx)\n", expected_offset, expected_offset);
fprintf(stderr, "[ULTRA_53_DEBUG] difference=%zd bytes\n",
(ssize_t)offset_from_ss - (ssize_t)expected_offset);
}
meta->used++;
// Fail-Fast: self-check (enabled only at debug fail-fast levels)
if (__builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
uintptr_t base_ss = (uintptr_t)tls->ss;
size_t ss_size = (size_t)1ULL << tls->ss->lg_size;
uintptr_t limit_ss = base_ss + ss_size;
uintptr_t p = (uintptr_t)block;
size_t off = (p >= base_ss) ? (size_t)(p - base_ss) : 0;
int in_range = (p >= base_ss) && (p < limit_ss);
int aligned = ((p - (uintptr_t)base) % block_size) == 0;
int idx_ok = (tls->slab_idx >= 0) && (tls->slab_idx < ss_slabs_capacity(tls->ss));
if (!in_range || !aligned || !idx_ok || meta->used > (uint32_t)meta->capacity) {
// Diagnostic log before abort
fprintf(stderr, "[ALLOC_CARVE_BUG] cls=%u slab=%d used=%u cap=%u base=%p bs=%zu ptr=%p offset=%zu\n",
tls->ss->size_class, tls->slab_idx, meta->used, meta->capacity,
(void*)base, block_size, block, off);
fprintf(stderr, "[ALLOC_CARVE_BUG] in_range=%d aligned=%d idx_ok=%d used_check=%d\n",
in_range, aligned, idx_ok, meta->used > (uint32_t)meta->capacity);
fflush(stderr);
tiny_failfast_abort_ptr("alloc_ret_align",
tls->ss,
tls->slab_idx,
block,
!in_range ? "out_of_range"
: (!aligned ? "misaligned"
: (!idx_ok ? "bad_slab_idx"
: "over_capacity")));
}
}
// Track active blocks in SuperSlab for conservative reclamation
ss_active_inc(tls->ss);
// Route: slab linear
ROUTE_MARK(11); ROUTE_COMMIT(class_idx, 0x60);
HAK_RET_ALLOC(class_idx, block); // Phase 8.4: Zero hot-path overhead
}
if (meta && meta->freelist) {
// Freelist allocation
void* block = meta->freelist;
// Safety: bounds/alignment check (debug)
if (__builtin_expect(g_tiny_safe_free, 0)) {
size_t blk = g_tiny_class_sizes[tls->ss->size_class];
uint8_t* base = tiny_slab_base_for(tls->ss, tls->slab_idx);
uintptr_t delta = (uintptr_t)block - (uintptr_t)base;
int align_ok = ((delta % blk) == 0);
int range_ok = (delta / blk) < meta->capacity;
if (!align_ok || !range_ok) {
uintptr_t info = ((uintptr_t)(align_ok ? 1u : 0u) << 32) | (uint32_t)(range_ok ? 1u : 0u);
tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)tls->ss->size_class, block, info | 0xA100u);
if (g_tiny_safe_free_strict) { raise(SIGUSR2); return NULL; }
return NULL;
}
}
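// Pop the freelist head: the first word of a free block stores the next pointer.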
void* next = *(void**)block;
meta->freelist = next;
meta->used++;
// Optional: clear freelist bit when becomes empty
do {
static int g_mask_en = -1;
if (__builtin_expect(g_mask_en == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_FREELIST_MASK");
g_mask_en = (e && *e && *e != '0') ? 1 : 0;
}
if (__builtin_expect(g_mask_en, 0) && next == NULL) {
uint32_t bit = (1u << slab_idx);
atomic_fetch_and_explicit(&tls->ss->freelist_mask, ~bit, memory_order_release);
}
} while (0);
// Track active blocks in SuperSlab for conservative reclamation
ss_active_inc(tls->ss);
// Route: slab freelist
ROUTE_MARK(12); ROUTE_COMMIT(class_idx, 0x61);
HAK_RET_ALLOC(class_idx, block); // Phase 8.4: Zero hot-path overhead
}
// Slow path: Refill TLS slab
SuperSlab* ss = superslab_refill(class_idx);
if (!ss) {
static int log_oom = 0;
if (log_oom < 2) { fprintf(stderr, "[DEBUG] superslab_refill returned NULL (OOM)\n"); log_oom++; }
return NULL; // OOM
}
// Retry allocation (metadata already cached in superslab_refill)
meta = tls->meta;
// DEBUG: Check each condition (disabled for benchmarks)
// static int log_retry = 0;
// if (log_retry < 2) {
// fprintf(stderr, "[DEBUG] Retry alloc: meta=%p, freelist=%p, used=%u, capacity=%u, slab_base=%p\n",
// (void*)meta, meta ? meta->freelist : NULL,
// meta ? meta->used : 0, meta ? meta->capacity : 0,
// (void*)tls->slab_base);
// log_retry++;
// }
if (meta && meta->freelist == NULL && meta->used < meta->capacity && tls->slab_base) {
size_t block_size = g_tiny_class_sizes[ss->size_class];
void* block = (void*)(tls->slab_base + ((size_t)meta->used * block_size));
// Disabled for benchmarks
// static int log_success = 0;
// if (log_success < 2) {
// fprintf(stderr, "[DEBUG] Superslab alloc SUCCESS: ptr=%p, class=%d, used=%u->%u\n",
// block, class_idx, meta->used, meta->used + 1);
// log_success++;
// }
meta->used++;
// Track active blocks in SuperSlab for conservative reclamation
ss_active_inc(ss);
HAK_RET_ALLOC(class_idx, block); // Phase 8.4: Zero hot-path overhead
}
// Disabled for benchmarks
// static int log_fail = 0;
// if (log_fail < 2) {
// fprintf(stderr, "[DEBUG] Retry alloc FAILED - returning NULL\n");
// log_fail++;
// }
return NULL;
}