// tiny_superslab_alloc.inc.h - SuperSlab Allocation Layer
// Purpose: Slab allocation, refill, and adoption logic
// Extracted from: hakmem_tiny_free.inc lines 626-1170
// Box Theory: Box 4 (Refill/Adoption) integration
//
// Public functions:
//   - superslab_alloc_from_slab(): Allocate from a specific slab (linear or freelist)
//   - superslab_refill():          Refill the TLS slab (adoption, registry scan, fresh alloc)
//   - hak_tiny_alloc_superslab():  Main SuperSlab allocation entry point

// ============================================================================
// Phase 6.23: SuperSlab Allocation Helpers
// ============================================================================

// Phase 6.24: Allocate from a SuperSlab slab (lazy freelist + linear allocation)
#include "hakmem_tiny_superslab_constants.h"

static inline void* superslab_alloc_from_slab(SuperSlab* ss, int slab_idx) {
    TinySlabMeta* meta = &ss->slabs[slab_idx];

    // Ensure the remote queue is drained before handing blocks back to TLS (unlikely in 1T)
    if (__builtin_expect(atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0, 0)) {
        uint32_t self_tid = tiny_self_u32();
        SlabHandle h = slab_try_acquire(ss, slab_idx, self_tid);
        if (slab_is_valid(&h)) {
            slab_drain_remote_full(&h);
            int pending = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire) != 0;
            if (__builtin_expect(pending, 0)) {
                if (__builtin_expect(g_debug_remote_guard, 0)) {
                    uintptr_t head = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_relaxed);
                    tiny_remote_watch_note("alloc_pending_remote", ss, slab_idx, (void*)head, 0xA243u, self_tid, 0);
                }
                slab_release(&h);
                return NULL;
            }
            slab_release(&h);
        } else {
            if (__builtin_expect(g_debug_remote_guard, 0)) {
                tiny_remote_watch_note("alloc_acquire_fail", ss, slab_idx, meta, 0xA244u, self_tid, 0);
            }
            return NULL;
        }
    }
    if (__builtin_expect(g_debug_remote_guard, 0)) {
        uintptr_t head_pending = atomic_load_explicit(&ss->remote_heads[slab_idx], memory_order_acquire);
        if (head_pending != 0) {
            tiny_remote_watch_note("alloc_remote_pending", ss, slab_idx, (void*)head_pending, 0xA247u, tiny_self_u32(), 1);
            return NULL;
        }
    }

    // Phase 6.24: Linear allocation mode (freelist == NULL).
    // This avoids the 4000-8000 cycle cost of building the freelist on init.
    if (__builtin_expect(meta->freelist == NULL && meta->used < meta->capacity, 1)) {
        // Linear allocation: use the canonical tiny_slab_base_for() only
        size_t unit_sz = g_tiny_class_sizes[ss->size_class]
#if HAKMEM_TINY_HEADER_CLASSIDX
                         + ((ss->size_class != 7) ? 1 : 0)
#endif
                         ;
        uint8_t* base = tiny_slab_base_for(ss, slab_idx);
        void* block_base = (void*)(base + ((size_t)meta->used * unit_sz));
#if !HAKMEM_BUILD_RELEASE
        // Debug safety: never carve past the slab's usable region (capacity mismatch guard)
        size_t dbg_usable = (slab_idx == 0) ? SUPERSLAB_SLAB0_USABLE_SIZE : SUPERSLAB_SLAB_USABLE_SIZE;
        uintptr_t dbg_off = (uintptr_t)((uint8_t*)block_base - base);
        if (__builtin_expect(dbg_off + unit_sz > dbg_usable, 0)) {
            fprintf(stderr, "[TINY_ALLOC_BOUNDS] cls=%u slab=%d used=%u cap=%u unit=%zu off=%lu usable=%zu\n",
                    ss->size_class, slab_idx, meta->used, meta->capacity, unit_sz,
                    (unsigned long)dbg_off, dbg_usable);
            return NULL;
        }
#endif
        meta->used++;
        void* user =
#if HAKMEM_TINY_HEADER_CLASSIDX
            tiny_region_id_write_header(block_base, ss->size_class);
#else
            block_base;
#endif
        if (__builtin_expect(g_debug_remote_guard, 0)) {
            tiny_remote_track_on_alloc(ss, slab_idx, user, "linear_alloc", 0);
            tiny_remote_assert_not_remote(ss, slab_idx, user, "linear_alloc_ret", 0);
        }
        return user;  // Fast path: O(1) pointer arithmetic
    }
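    // Worked example of the linear carve above (illustrative numbers only; real values come
    // from g_tiny_class_sizes[] and the HAKMEM_TINY_HEADER_CLASSIDX setting): assuming a
    // 16-byte class with a 1-byte class-index header, unit_sz = 17, so the N-th carve
    // returns base + N*17 and tiny_region_id_write_header() places the header byte before
    // the user pointer is handed out.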
    // Freelist mode (after the first free())
    if (__builtin_expect(meta->freelist != NULL, 0)) {
        void* block = meta->freelist;

        // CORRUPTION DEBUG: Validate the freelist head before popping
        if (__builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
            size_t blk = g_tiny_class_sizes[ss->size_class];
            uint8_t* slab_base = tiny_slab_base_for(ss, slab_idx);
            uintptr_t block_addr = (uintptr_t)block;
            uintptr_t slab_addr = (uintptr_t)slab_base;
            uintptr_t offset = block_addr - slab_addr;
            fprintf(stderr, "[ALLOC_POP] cls=%u slab=%d block=%p offset=%zu (used=%u cap=%u)\n",
                    ss->size_class, slab_idx, block, (size_t)offset, meta->used, meta->capacity);
            if (offset % blk != 0) {
                fprintf(stderr, "[ALLOC_CORRUPT] Freelist head is misaligned! block=%p offset=%zu blk=%zu\n",
                        block, (size_t)offset, blk);
                fprintf(stderr, "[ALLOC_CORRUPT] Expected alignment: %zu, actual: %zu\n", blk, (size_t)(offset % blk));
                tiny_failfast_abort_ptr("alloc_pop_misalign", ss, slab_idx, block, "freelist_head_corrupt");
            }
            size_t index = offset / blk;
            if (index >= meta->capacity) {
                fprintf(stderr, "[ALLOC_CORRUPT] Freelist head out of bounds! block=%p index=%zu cap=%u\n",
                        block, index, meta->capacity);
                tiny_failfast_abort_ptr("alloc_pop_oob", ss, slab_idx, block, "freelist_head_oob");
            }
        }

        meta->freelist = *(void**)block;  // Pop from the freelist
        meta->used++;
        if (__builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
            if (__builtin_expect(meta->used > meta->capacity, 0)) {
                fprintf(stderr, "[ALLOC_CORRUPT] meta->used overflow on freelist alloc: used=%u cap=%u cls=%u slab=%d\n",
                        meta->used, meta->capacity, ss->size_class, slab_idx);
                tiny_failfast_abort_ptr("alloc_used_overflow", ss, slab_idx, block, "freelist_used_over_capacity");
            }
        }
        if (__builtin_expect(g_debug_remote_guard, 0)) {
            tiny_remote_track_on_alloc(ss, slab_idx, block, "freelist_alloc", 0);
            tiny_remote_assert_not_remote(ss, slab_idx, block, "freelist_alloc_ret", 0);
        }
        return block;
    }

    return NULL;  // Slab is full
}

// Adopt helper: acquire → drain → bind (single boundary); returns 1 on success
static inline int adopt_bind_if_safe(TinyTLSSlab* tls, SuperSlab* ss, int slab_idx, int class_idx) {
    uint32_t self_tid = tiny_self_u32();
    SlabHandle h = slab_try_acquire(ss, slab_idx, self_tid);
    if (!slab_is_valid(&h)) return 0;
    slab_drain_remote_full(&h);
    if (__builtin_expect(slab_is_safe_to_bind(&h), 1)) {
        // Optional: move a few nodes to the front SLL to boost the next hits
        tiny_drain_freelist_to_sll_once(h.ss, h.slab_idx, class_idx);
        tiny_tls_bind_slab(tls, h.ss, h.slab_idx);
        // Ownership is now associated with the TLS slab; release the handle bookkeeping
        slab_release(&h);
        return 1;
    }
    slab_release(&h);
    return 0;
}
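// adopt_bind_if_safe() is the single ownership boundary used by every adoption path in this
// file. A minimal caller sketch (mirroring the registry scan in superslab_refill(); the
// surrounding variables are assumed to be in scope, and the block is not compiled):
#if 0
    for (int s = 0; s < ss_slabs_capacity(ss); s++) {
        if (!ss->slabs[s].freelist) continue;           // only slabs with freed blocks are worth adopting
        if (adopt_bind_if_safe(tls, ss, s, class_idx)) {
            return ss;                                  // TLS now owns slab s; remote frees already drained
        }
    }
#endif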
// Phase 6.24 & 7.6: Refill the TLS SuperSlab (unified TLS cache + deferred allocation)
static SuperSlab* superslab_refill(int class_idx) {
#if HAKMEM_DEBUG_COUNTERS
    g_superslab_refill_calls_dbg[class_idx]++;
#endif
    TinyTLSSlab* tls = &g_tls_slabs[class_idx];

    // ========================================================================
    // Phase 2a: Dynamic Expansion - Initialize the SuperSlabHead if needed
    // ========================================================================
    extern SuperSlabHead* g_superslab_heads[TINY_NUM_CLASSES_SS];
    extern SuperSlabHead* init_superslab_head(int class_idx);
    extern int expand_superslab_head(SuperSlabHead* head);

    SuperSlabHead* head = g_superslab_heads[class_idx];
    if (!head) {
        // First-time initialization for this class
        head = init_superslab_head(class_idx);
        if (!head) {
            extern __thread int g_hakmem_lock_depth;
            g_hakmem_lock_depth++;
            fprintf(stderr, "[DEBUG] superslab_refill: Failed to init SuperSlabHead for class %d\n", class_idx);
            g_hakmem_lock_depth--;
            return NULL;  // Critical failure
        }
        g_superslab_heads[class_idx] = head;
    }

    // Try the current chunk first (fast path)
    SuperSlab* current_chunk = head->current_chunk;
    if (current_chunk) {
        // Check whether the current chunk has available slabs.
        // Bitmap semantics: 0 = FREE, 1 = OCCUPIED
        //   - 0x00000000 = all free (32 available)
        //   - 0xFFFFFFFF = all occupied (0 available)
        int chunk_cap = ss_slabs_capacity(current_chunk);
        uint32_t full_mask = (chunk_cap >= 32) ? 0xFFFFFFFF : ((1U << chunk_cap) - 1);
        if (current_chunk->slab_bitmap != full_mask) {
            // The current chunk has free slabs; the normal refill logic below
            // (which checks tls->ss) will use it.
            if (tls->ss != current_chunk) {
                // Update TLS to point at the current chunk
                tls->ss = current_chunk;
            }
        } else {
            // Current chunk exhausted (all slabs occupied): try to expand
#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)
            extern __thread int g_hakmem_lock_depth;
            g_hakmem_lock_depth++;
            fprintf(stderr, "[HAKMEM] SuperSlab chunk exhausted for class %d (bitmap=0x%08x), expanding...\n",
                    class_idx, current_chunk->slab_bitmap);
            g_hakmem_lock_depth--;
#endif
            // Protect expansion with a global lock (race condition fix)
            static pthread_mutex_t expand_lock = PTHREAD_MUTEX_INITIALIZER;
            pthread_mutex_lock(&expand_lock);

            // Re-check after acquiring the lock (another thread may have expanded)
            current_chunk = head->current_chunk;
            uint32_t recheck_mask = (ss_slabs_capacity(current_chunk) >= 32)
                                        ? 0xFFFFFFFF
                                        : ((1U << ss_slabs_capacity(current_chunk)) - 1);
            if (current_chunk->slab_bitmap == recheck_mask) {
                // Still exhausted: expand now
                if (expand_superslab_head(head) < 0) {
                    pthread_mutex_unlock(&expand_lock);
#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)
                    g_hakmem_lock_depth++;
                    fprintf(stderr, "[HAKMEM] CRITICAL: Failed to expand SuperSlabHead for class %d (system OOM)\n", class_idx);
                    g_hakmem_lock_depth--;
#endif
                    return NULL;  // True system OOM
                }
#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)
                g_hakmem_lock_depth++;
                fprintf(stderr, "[HAKMEM] Successfully expanded SuperSlabHead for class %d\n", class_idx);
                g_hakmem_lock_depth--;
#endif
            }

            // Update current_chunk and tls->ss to point at the (potentially new) chunk
            current_chunk = head->current_chunk;
            tls->ss = current_chunk;
            pthread_mutex_unlock(&expand_lock);

            // Verify the chunk exists and has free slabs (NULL check before dereferencing)
            full_mask = current_chunk
                            ? ((ss_slabs_capacity(current_chunk) >= 32)
                                   ? 0xFFFFFFFF
                                   : ((1U << ss_slabs_capacity(current_chunk)) - 1))
                            : 0;
            if (!current_chunk || current_chunk->slab_bitmap == full_mask) {
#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)
                g_hakmem_lock_depth++;
                fprintf(stderr, "[HAKMEM] CRITICAL: Chunk still has no free slabs for class %d after expansion\n", class_idx);
                g_hakmem_lock_depth--;
#endif
                return NULL;
            }
        }
    }

    // ========================================================================
    // Continue with the existing refill logic
    // ========================================================================
    static int g_ss_adopt_en = -1;  // env: HAKMEM_TINY_SS_ADOPT=1; default: auto-on if remote frees were seen
    if (g_ss_adopt_en == -1) {
        char* e = getenv("HAKMEM_TINY_SS_ADOPT");
        if (e) {
            g_ss_adopt_en = (*e != '0') ? 1 : 0;
        } else {
            extern _Atomic int g_ss_remote_seen;
            g_ss_adopt_en = (atomic_load_explicit(&g_ss_remote_seen, memory_order_relaxed) != 0) ? 1 : 0;
        }
    }
    extern int g_adopt_cool_period;
    extern __thread int g_tls_adopt_cd[];
    if (g_adopt_cool_period == -1) {
        char* cd = getenv("HAKMEM_TINY_SS_ADOPT_COOLDOWN");
        int v = (cd ? atoi(cd) : 0);
        if (v < 0) v = 0;
        if (v > 1024) v = 1024;
        g_adopt_cool_period = v;
    }
    static int g_superslab_refill_debug_once = 0;
    SuperSlab* prev_ss = tls->ss;
    TinySlabMeta* prev_meta = tls->meta;
    uint8_t prev_slab_idx = tls->slab_idx;
    uint8_t prev_active = prev_ss ? prev_ss->active_slabs : 0;
    uint32_t prev_bitmap = prev_ss ? prev_ss->slab_bitmap : 0;
    uint32_t prev_meta_used = prev_meta ? prev_meta->used : 0;
    uint32_t prev_meta_cap = prev_meta ? prev_meta->capacity : 0;
    int free_idx_attempted = -2;  // -2 = not evaluated, -1 = none found, >= 0 = chosen index
    int reused_slabs = 0;

    // Optional: mid-size simple refill to avoid multi-layer scans (class >= 4)
    do {
        static int g_mid_simple_warn = 0;
        if (class_idx >= 4 && tiny_mid_refill_simple_enabled()) {
            // If the current TLS has a SuperSlab, prefer taking a virgin slab directly
            if (tls->ss) {
                int tls_cap = ss_slabs_capacity(tls->ss);
                if (tls->ss->active_slabs < tls_cap) {
                    int free_idx = superslab_find_free_slab(tls->ss);
                    if (free_idx >= 0) {
                        uint32_t my_tid = tiny_self_u32();
                        superslab_init_slab(tls->ss, free_idx, g_tiny_class_sizes[class_idx], my_tid);
                        tiny_tls_bind_slab(tls, tls->ss, free_idx);
                        return tls->ss;
                    }
                }
            }
            // Otherwise allocate a fresh SuperSlab and bind its first slab
            SuperSlab* ssn = superslab_allocate((uint8_t)class_idx);
            if (!ssn) {
                if (!g_superslab_refill_debug_once && g_mid_simple_warn < 2) {
                    g_mid_simple_warn++;
                    int err = errno;
                    fprintf(stderr, "[DEBUG] mid_simple_refill OOM class=%d errno=%d\n", class_idx, err);
                }
                return NULL;
            }
            uint32_t my_tid = tiny_self_u32();
            superslab_init_slab(ssn, 0, g_tiny_class_sizes[class_idx], my_tid);
            SuperSlab* old = tls->ss;
            tiny_tls_bind_slab(tls, ssn, 0);
            superslab_ref_inc(ssn);
            if (old && old != ssn) {
                superslab_ref_dec(old);
            }
            return ssn;
        }
    } while (0);

    // First, try to adopt a published partial SuperSlab for this class
    if (g_ss_adopt_en) {
        if (g_adopt_cool_period > 0 && g_tls_adopt_cd[class_idx] > 0) {
            g_tls_adopt_cd[class_idx]--;  // still cooling down; adoption becomes eligible at 0
        }
        if (g_adopt_cool_period == 0 || g_tls_adopt_cd[class_idx] == 0) {
            SuperSlab* adopt = ss_partial_adopt(class_idx);
            if (adopt && adopt->magic == SUPERSLAB_MAGIC) {
                // ================================================================
                // Quick Win #2: First-Fit Adopt (vs. Best-Fit scoring of all 32 slabs)
                // For Larson, any slab with a freelist works; there is no need to score all 32.
                // Expected improvement: -3,000 cycles (from 32 atomic loads + 32 scores)
                // ================================================================
                int adopt_cap = ss_slabs_capacity(adopt);
                int best = -1;
                for (int s = 0; s < adopt_cap; s++) {
                    TinySlabMeta* m = &adopt->slabs[s];
                    // Quick check: does this slab have a freelist?
                    if (m->freelist) {
                        best = s;
                        break;  // First-fit: stop at the first slab with a freelist
                    }
                    // Optional: also check remote_heads to prioritize those slabs
                    // (for Larson the freelist check is sufficient)
                }
                if (best >= 0) {
                    if (adopt_bind_if_safe(tls, adopt, best, class_idx)) {
                        if (g_adopt_cool_period > 0) g_tls_adopt_cd[class_idx] = g_adopt_cool_period;
                        return adopt;
                    }
                }
                // If no freelist was found, ignore and continue (optional: republish)
            }
        }
    }

    // Phase 7.6 Step 4: Check the existing SuperSlab in priority order
    if (tls->ss) {
        // Priority 1: Reuse slabs with a freelist (already-freed blocks)
        int tls_cap = ss_slabs_capacity(tls->ss);
        uint32_t nonempty_mask = 0;
        do {
            static int g_mask_en = -1;
            if (__builtin_expect(g_mask_en == -1, 0)) {
                const char* e = getenv("HAKMEM_TINY_FREELIST_MASK");
                g_mask_en = (e && *e && *e != '0') ? 1 : 0;
            }
            if (__builtin_expect(g_mask_en, 0)) {
                nonempty_mask = atomic_load_explicit(&tls->ss->freelist_mask, memory_order_acquire);
                break;
            }
            for (int i = 0; i < tls_cap; i++) {
                if (tls->ss->slabs[i].freelist) nonempty_mask |= (1u << i);
            }
        } while (0);

        // O(1) lookup: scan the mask with ctz (a single instruction per candidate)
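        // Worked example (illustrative): nonempty_mask = 0b00100100 means slabs 2 and 5
        // currently have freelists; __builtin_ctz() yields 2 first, that bit is cleared,
        // and the next iteration yields 5. Each candidate costs one ctz plus one adopt attempt.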
        while (__builtin_expect(nonempty_mask != 0, 1)) {
            int i = __builtin_ctz(nonempty_mask);  // First non-empty slab (O(1))
            nonempty_mask &= ~(1u << i);           // Clear the bit for the next iteration

            // FIX #1 DELETED (race condition fix):
            // The previous drain without ownership caused concurrent freelist corruption.
            // Ownership protocol: MUST bind + owner_cas BEFORE draining (see Fix #3 in tiny_refill.h).
            // Remote frees are drained when the slab is adopted (see the tiny_refill.h paths).

            if (adopt_bind_if_safe(tls, tls->ss, i, class_idx)) {
                reused_slabs = 1;
                return tls->ss;
            }
        }

        // Priority 2: Use unused (virgin) slabs
        if (tls->ss->active_slabs < tls_cap) {
            // Find the next free slab
            int free_idx = superslab_find_free_slab(tls->ss);
            free_idx_attempted = free_idx;
            if (free_idx >= 0) {
                // Initialize this slab
                uint32_t my_tid = tiny_self_u32();
                superslab_init_slab(tls->ss, free_idx, g_tiny_class_sizes[class_idx], my_tid);
                // Update the TLS cache (unified update)
                tiny_tls_bind_slab(tls, tls->ss, free_idx);
                return tls->ss;
            }
        }
    }

    // Try to adopt a partial SuperSlab from the registry (one-shot, cheap scan).
    // This reduces the pressure to allocate a new SS when other threads have freed blocks.
    // Phase 6: Registry Optimization - use the per-class registry for an O(class_size) scan
    if (!tls->ss) {
        // Phase 6: Per-class registry (262K entries -> roughly 10-100 entries per class)
        extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS];
        extern int g_super_reg_class_size[TINY_NUM_CLASSES];
        const int scan_max = tiny_reg_scan_max();
        int reg_size = g_super_reg_class_size[class_idx];
        int scan_limit = (scan_max < reg_size) ? scan_max : reg_size;
        for (int i = 0; i < scan_limit; i++) {
            SuperSlab* ss = g_super_reg_by_class[class_idx][i];
            if (!ss || ss->magic != SUPERSLAB_MAGIC) continue;
            // Note: no class_idx check is needed (per-class registry)
            // Pick the first slab with a freelist (Box 4: adopt boundary helper)
            int reg_cap = ss_slabs_capacity(ss);
            for (int s = 0; s < reg_cap; s++) {
                if (ss->slabs[s].freelist) {
                    if (adopt_bind_if_safe(tls, ss, s, class_idx)) return ss;
                }
            }
        }
    }

    // Must-adopt-before-mmap gate: try sticky/hot/bench/mailbox/registry in a small window
    {
        SuperSlab* gate_ss = tiny_must_adopt_gate(class_idx, tls);
        if (gate_ss) return gate_ss;
    }

    // Allocate a new SuperSlab
    SuperSlab* ss = superslab_allocate((uint8_t)class_idx);
    if (!ss) {
        if (!g_superslab_refill_debug_once) {
            g_superslab_refill_debug_once = 1;
            int err = errno;
            // CRITICAL FIX (BUG #11): Protect fprintf() with lock_depth.
            // fprintf() can call malloc for buffering, so it must go through libc malloc.
            extern __thread int g_hakmem_lock_depth;
            g_hakmem_lock_depth++;
            fprintf(stderr,
                    "[DEBUG] superslab_refill returned NULL (OOM) detail: class=%d prev_ss=%p active=%u bitmap=0x%08x "
                    "prev_meta=%p used=%u cap=%u slab_idx=%u reused_freelist=%d free_idx=%d errno=%d\n",
                    class_idx, (void*)prev_ss, (unsigned)prev_active, prev_bitmap, (void*)prev_meta,
                    (unsigned)prev_meta_used, (unsigned)prev_meta_cap, (unsigned)prev_slab_idx,
                    reused_slabs, free_idx_attempted, err);
            g_hakmem_lock_depth--;
        }
        // Clear errno to avoid confusion in fallback paths
        errno = 0;
        return NULL;  // OOM
    }

    // Initialize the first slab
    uint32_t my_tid = tiny_self_u32();
    superslab_init_slab(ss, 0, g_tiny_class_sizes[class_idx], my_tid);

    // Cache in the unified TLS (releasing the previous SS reference)
    SuperSlab* old = tls->ss;
    tiny_tls_bind_slab(tls, ss, 0);
    // Maintain the refcount (count the TLS reference in preparation for future empty-SS reclamation)
    superslab_ref_inc(ss);
    if (old && old != ss) {
        superslab_ref_dec(old);
    }
    return ss;
}
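// Refill priority order (summary of superslab_refill() above):
//   1. Mid-size simple refill (class >= 4, gated by tiny_mid_refill_simple_enabled())
//   2. Adopt a published partial SuperSlab (HAKMEM_TINY_SS_ADOPT, cooldown-gated)
//   3. Current TLS SuperSlab: slabs with freelists first, then virgin slabs
//   4. Per-class registry scan (bounded by tiny_reg_scan_max())
//   5. Must-adopt gate (tiny_must_adopt_gate())
//   6. Fresh superslab_allocate() as the last resort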
// Phase 6.24: SuperSlab-based allocation (TLS unified, Medium fix)
static inline void* hak_tiny_alloc_superslab(int class_idx) {
    // DEBUG: Function entry trace (gated to avoid ring spam)
    do {
        static int g_alloc_ring = -1;
        if (__builtin_expect(g_alloc_ring == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_ALLOC_RING");
            g_alloc_ring = (e && *e && *e != '0') ? 1 : 0;
        }
        if (g_alloc_ring) {
            tiny_debug_ring_record(TINY_RING_EVENT_ALLOC_ENTER, 0x01, (void*)(uintptr_t)class_idx, 0);
        }
    } while (0);

    // MidTC fast path: for 128..1024B (class >= 4), prefer the TLS tcache first
    do {
        void* mp = midtc_pop(class_idx);
        if (mp) {
            HAK_RET_ALLOC(class_idx, mp);
        }
    } while (0);

    // Phase 6.24: 1 TLS read (down from 3)
    TinyTLSSlab* tls = &g_tls_slabs[class_idx];
    TinySlabMeta* meta = tls->meta;
    int slab_idx = tls->slab_idx;

    if (meta && slab_idx >= 0 && tls->ss) {
        // CRITICAL: Verify class consistency BEFORE using tls->ss.
        // If tls->ss->size_class != class_idx, unbind and refill.
        if (tls->ss->size_class != class_idx) {
            // Class mismatch: TLS is bound to the wrong SuperSlab.
            // This happens when TLS was previously bound to a different class.
            tls->ss = NULL;
            tls->meta = NULL;
            tls->slab_idx = -1;
            tls->slab_base = NULL;
            meta = NULL;  // Force the refill path below
        } else {
            // Ensure the TLS view is consistent with the canonical slab_base
            uint8_t* canonical = tiny_slab_base_for(tls->ss, slab_idx);
            if (tls->slab_base != canonical) {
                tls->slab_base = canonical;
            }
        }

        // A/B: Relaxed read for the remote-head presence check.
        // Guarded: tls->ss is NULL here if the class-mismatch path above unbound the TLS slab.
        if (tls->ss) {
            static int g_alloc_remote_relax = -1;  // env: HAKMEM_TINY_ALLOC_REMOTE_RELAX=1 -> relaxed
            if (__builtin_expect(g_alloc_remote_relax == -1, 0)) {
                const char* e = getenv("HAKMEM_TINY_ALLOC_REMOTE_RELAX");
                g_alloc_remote_relax = (e && *e && *e != '0') ? 1 : 0;
            }
            uintptr_t pending = atomic_load_explicit(&tls->ss->remote_heads[slab_idx],
                                                     g_alloc_remote_relax ? memory_order_relaxed
                                                                          : memory_order_acquire);
            if (__builtin_expect(pending != 0, 0)) {
                uint32_t self_tid = tiny_self_u32();
                if (ss_owner_try_acquire(meta, self_tid)) {
                    _ss_remote_drain_to_freelist_unsafe(tls->ss, slab_idx, meta);
                }
            }
        }
    }

    // FIX #2 DELETED (race condition fix):
    // The previous drain-all-slabs without ownership caused concurrent freelist corruption.
    // Problem: thread A owns slab 5, thread B drains all slabs including 5 -> both modify the freelist -> crash.
    // Ownership protocol: MUST bind + owner_cas BEFORE draining (see Fix #3 in tiny_refill.h).
    // Remote frees are drained when the slab is adopted via the refill paths.

    // Fast path: direct metadata access (no repeated TLS reads)
    if (meta && meta->freelist == NULL && meta->used < meta->capacity && tls->slab_base) {
        // Linear allocation (lazy init)
        size_t block_size = g_tiny_class_sizes[tls->ss->size_class];
        uint8_t* base = tls->slab_base;  // tls->slab_base comes from tiny_slab_base_for(ss, slab_idx) (single source of truth)

        // ULTRATHINK DEBUG: Capture the 53-byte mystery
        if (tiny_refill_failfast_level() >= 3 && tls->ss->size_class == 7 && slab_idx == 0) {
            fprintf(stderr, "[ULTRA_53_DEBUG] === Before allocation ===\n");
            fprintf(stderr, "[ULTRA_53_DEBUG] ss=%p, slab_idx=%d, class=%d\n", (void*)tls->ss, slab_idx, tls->ss->size_class);
            fprintf(stderr, "[ULTRA_53_DEBUG] block_size=%zu, meta->used=%u, meta->capacity=%u\n",
                    block_size, (unsigned)meta->used, (unsigned)meta->capacity);
            fprintf(stderr, "[ULTRA_53_DEBUG] tls->slab_base=%p\n", (void*)base);
            fprintf(stderr, "[ULTRA_53_DEBUG] tiny_slab_base_for(ss,%d)=%p\n", slab_idx, (void*)tiny_slab_base_for(tls->ss, slab_idx));
            fprintf(stderr, "[ULTRA_53_DEBUG] sizeof(SuperSlab)=%zu\n", sizeof(SuperSlab));
            fprintf(stderr, "[ULTRA_53_DEBUG] Expected base should be: ss + %zu\n", sizeof(SuperSlab));
            fprintf(stderr, "[ULTRA_53_DEBUG] Actual base is: ss + 1024\n");
            fprintf(stderr, "[ULTRA_53_DEBUG] Base error: %zu - 1024 = %zu bytes\n",
                    sizeof(SuperSlab), sizeof(SuperSlab) - 1024);
        }

        void* block = (void*)(base + ((size_t)meta->used * block_size));

        // ULTRATHINK DEBUG: After calculation
        if (tiny_refill_failfast_level() >= 3 && tls->ss->size_class == 7 && slab_idx == 0) {
            size_t offset_from_ss = (uintptr_t)block - (uintptr_t)tls->ss;
            size_t expected_offset = 1024 + ((size_t)meta->used * block_size);
            fprintf(stderr, "[ULTRA_53_DEBUG] === Calculated block address ===\n");
            fprintf(stderr, "[ULTRA_53_DEBUG] block=%p\n", block);
            fprintf(stderr, "[ULTRA_53_DEBUG] offset from ss=%zu (0x%zx)\n", offset_from_ss, offset_from_ss);
            fprintf(stderr, "[ULTRA_53_DEBUG] expected offset=%zu (0x%zx)\n", expected_offset, expected_offset);
            fprintf(stderr, "[ULTRA_53_DEBUG] difference=%zd bytes\n", (ssize_t)offset_from_ss - (ssize_t)expected_offset);
        }

        meta->used++;

        // Fail-fast self-check (only active at fail-fast level >= 2)
        if (__builtin_expect(tiny_refill_failfast_level() >= 2, 0)) {
            uintptr_t base_ss = (uintptr_t)tls->ss;
            size_t ss_size = (size_t)1ULL << tls->ss->lg_size;
            uintptr_t limit_ss = base_ss + ss_size;
            uintptr_t p = (uintptr_t)block;
            size_t off = (p >= base_ss) ? (size_t)(p - base_ss) : 0;
            int in_range = (p >= base_ss) && (p < limit_ss);
            int aligned = ((p - (uintptr_t)base) % block_size) == 0;
            int idx_ok = (tls->slab_idx >= 0) && (tls->slab_idx < ss_slabs_capacity(tls->ss));
            if (!in_range || !aligned || !idx_ok || meta->used > (uint32_t)meta->capacity) {
                // Diagnostic log before abort
                fprintf(stderr, "[ALLOC_CARVE_BUG] cls=%u slab=%d used=%u cap=%u base=%p bs=%zu ptr=%p offset=%zu\n",
                        tls->ss->size_class, tls->slab_idx, meta->used, meta->capacity,
                        (void*)base, block_size, block, off);
                fprintf(stderr, "[ALLOC_CARVE_BUG] in_range=%d aligned=%d idx_ok=%d used_check=%d\n",
                        in_range, aligned, idx_ok, meta->used > (uint32_t)meta->capacity);
                fflush(stderr);
                tiny_failfast_abort_ptr("alloc_ret_align", tls->ss, tls->slab_idx, block,
                                        !in_range ? "out_of_range"
                                                  : (!aligned ? "misaligned"
                                                              : (!idx_ok ? "bad_slab_idx" : "over_capacity")));
            }
        }

        // Track active blocks in the SuperSlab for conservative reclamation
        ss_active_inc(tls->ss);
        // Route: slab linear
        ROUTE_MARK(11);
        ROUTE_COMMIT(class_idx, 0x60);
        HAK_RET_ALLOC(class_idx, block);  // Phase 8.4: Zero hot-path overhead
    }

    if (meta && meta->freelist) {
        // Freelist allocation
        void* block = meta->freelist;
        // Safety: bounds/alignment check (debug)
        if (__builtin_expect(g_tiny_safe_free, 0)) {
            size_t blk = g_tiny_class_sizes[tls->ss->size_class];
            uint8_t* base = tiny_slab_base_for(tls->ss, tls->slab_idx);
            uintptr_t delta = (uintptr_t)block - (uintptr_t)base;
            int align_ok = ((delta % blk) == 0);
            int range_ok = (delta / blk) < meta->capacity;
            if (!align_ok || !range_ok) {
                uintptr_t info = ((uintptr_t)(align_ok ? 1u : 0u) << 32) | (uint32_t)(range_ok ? 1u : 0u);
                tiny_debug_ring_record(TINY_RING_EVENT_REMOTE_INVALID, (uint16_t)tls->ss->size_class, block, info | 0xA100u);
                if (g_tiny_safe_free_strict) {
                    raise(SIGUSR2);
                    return NULL;
                }
                return NULL;
            }
        }
        void* next = *(void**)block;
        meta->freelist = next;
        meta->used++;
        // Optional: clear the freelist bit when the list becomes empty
        do {
            static int g_mask_en = -1;
            if (__builtin_expect(g_mask_en == -1, 0)) {
                const char* e = getenv("HAKMEM_TINY_FREELIST_MASK");
                g_mask_en = (e && *e && *e != '0') ? 1 : 0;
            }
            if (__builtin_expect(g_mask_en, 0) && next == NULL) {
                uint32_t bit = (1u << slab_idx);
                atomic_fetch_and_explicit(&tls->ss->freelist_mask, ~bit, memory_order_release);
            }
        } while (0);
        // Track active blocks in the SuperSlab for conservative reclamation
        ss_active_inc(tls->ss);
        // Route: slab freelist
        ROUTE_MARK(12);
        ROUTE_COMMIT(class_idx, 0x61);
        HAK_RET_ALLOC(class_idx, block);  // Phase 8.4: Zero hot-path overhead
    }
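    // Slow-path contract: superslab_refill() rebinds tls->ss / tls->meta / tls->slab_base via
    // tiny_tls_bind_slab() on every success path. The retry below re-carves linearly from the
    // freshly bound slab and returns NULL if that slab offers no linear room.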
    // Slow path: Refill the TLS slab
    SuperSlab* ss = superslab_refill(class_idx);
    if (!ss) {
        static int log_oom = 0;
        if (log_oom < 2) {
            fprintf(stderr, "[DEBUG] superslab_refill returned NULL (OOM)\n");
            log_oom++;
        }
        return NULL;  // OOM
    }

    // Retry the allocation (metadata was already cached by superslab_refill)
    meta = tls->meta;

    // DEBUG: Check each condition (disabled for benchmarks)
    // static int log_retry = 0;
    // if (log_retry < 2) {
    //     fprintf(stderr, "[DEBUG] Retry alloc: meta=%p, freelist=%p, used=%u, capacity=%u, slab_base=%p\n",
    //             (void*)meta, meta ? meta->freelist : NULL,
    //             meta ? meta->used : 0, meta ? meta->capacity : 0,
    //             (void*)tls->slab_base);
    //     log_retry++;
    // }

    if (meta && meta->freelist == NULL && meta->used < meta->capacity && tls->slab_base) {
        size_t block_size = g_tiny_class_sizes[ss->size_class];
        void* block = (void*)(tls->slab_base + ((size_t)meta->used * block_size));
        // Disabled for benchmarks
        // static int log_success = 0;
        // if (log_success < 2) {
        //     fprintf(stderr, "[DEBUG] Superslab alloc SUCCESS: ptr=%p, class=%d, used=%u->%u\n",
        //             block, class_idx, meta->used, meta->used + 1);
        //     log_success++;
        // }
        meta->used++;
        // Track active blocks in the SuperSlab for conservative reclamation
        ss_active_inc(ss);
        HAK_RET_ALLOC(class_idx, block);  // Phase 8.4: Zero hot-path overhead
    }

    // Disabled for benchmarks
    // static int log_fail = 0;
    // if (log_fail < 2) {
    //     fprintf(stderr, "[DEBUG] Retry alloc FAILED - returning NULL\n");
    //     log_fail++;
    // }
    return NULL;
}
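// Usage sketch (illustrative only; the class-index lookup below is a hypothetical helper,
// not part of this file - real callers map the request size to a tiny class in the front
// end before calling hak_tiny_alloc_superslab()):
#if 0
    int cls = hypothetical_tiny_class_for_size(24);   // e.g. a 24-byte request -> some tiny class
    void* p = hak_tiny_alloc_superslab(cls);          // NULL when refill fails (e.g. true OOM)
    if (p) {
        /* ... use p ..., then release it through the matching tiny free path */
    }
#endif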