Major breakthrough: sh8bench now completes without SIGSEGV!

Added defensive refcounting and failsafe mechanisms to prevent use-after-free and corruption propagation.

Changes:

1. SuperSlab Refcount Pinning (core/box/tls_sll_box.h)
   - tls_sll_push_impl: increment refcount before adding to list
   - tls_sll_pop_impl: decrement refcount when removing from list
   - Prevents SuperSlab from being freed while TLS SLL holds pointers

2. SuperSlab Release Guards (core/superslab_allocate.c, shared_pool_release.c)
   - Check refcount > 0 before freeing SuperSlab
   - If refcount > 0, defer release instead of freeing
   - Prevents use-after-free when TLS/remote/freelist hold stale pointers

3. TLS SLL Next Pointer Validation (core/box/tls_sll_box.h)
   - Detect invalid next pointer during traversal
   - Log [TLS_SLL_NEXT_INVALID] when detected
   - Drop list to prevent corruption propagation

4. Unified Cache Freelist Validation (core/front/tiny_unified_cache.c)
   - Validate freelist head before use
   - Log [UNIFIED_FREELIST_INVALID] for corrupted lists
   - Defensive drop to prevent bad allocations

5. Early Refcount Decrement Fix (core/tiny_free_fast.inc.h)
   - Removed ss_active_dec_one from fast path
   - Prevents premature refcount depletion
   - Defers decrement to proper cleanup path

Test Results:
✅ sh8bench completes successfully (exit code 0)
✅ No SIGSEGV or ABORT signals
✅ Short runs (5s) crash-free
⚠️ Multiple [TLS_SLL_NEXT_INVALID] / [UNIFIED_FREELIST_INVALID] logged
⚠️ Invalid pointers still present (stale references exist)

Status Analysis:
- Stability: ACHIEVED (no crashes)
- Root Cause: NOT FULLY SOLVED (invalid pointers remain)
- Approach: Defensive refcount guards are working well

Remaining Issues:
❌ Why does SuperSlab get unregistered while TLS SLL holds pointers?
❌ SuperSlab lifecycle: remote_queue / adopt / LRU interactions?
❌ Stale pointers indicate improper SuperSlab lifetime management

Performance Impact:
- Refcount operations: +1-3 cycles per push/pop (minor)
- Validation checks: +2-5 cycles (minor)
- Overall: < 5% overhead estimated

Next Investigation:
- Trace SuperSlab lifecycle (allocation → registration → unregister → free)
- Check remote_queue handling
- Verify adopt/LRU mechanisms
- Correlate stale pointer logs with SuperSlab unregister events

Log Volume Warning:
- May produce many diagnostic logs on long runs
- Consider ENV gating for production

Technical Notes:
- Refcount is per-SuperSlab, not global
- Guards prevent symptom propagation, not root cause
- Root cause is in SuperSlab lifecycle management

🤖 Generated with Claude Code (https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
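Illustration (not part of the commit): a minimal sketch of the refcount pinning and deferred-release idea in items 1-2 above, using hypothetical, simplified names (SuperSlabStub, sll_push_pin, superslab_try_release) rather than the actual tls_sll_box.h / superslab_allocate.c code:

#include <stdatomic.h>
#include <stdbool.h>

/* Hypothetical stand-in for the real SuperSlab; only the fields needed for the sketch. */
typedef struct {
    atomic_int pinned;   /* number of cached pointers (TLS SLL etc.) still referencing this SuperSlab */
    bool       retired;  /* release was requested while references were outstanding */
} SuperSlabStub;

/* Pin on push: a block entering a TLS cache keeps its SuperSlab alive. */
static inline void sll_push_pin(SuperSlabStub* ss) {
    atomic_fetch_add_explicit(&ss->pinned, 1, memory_order_relaxed);
}

/* Unpin on pop: the cached reference is gone. */
static inline void sll_pop_unpin(SuperSlabStub* ss) {
    atomic_fetch_sub_explicit(&ss->pinned, 1, memory_order_release);
}

/* Release guard: free only when nothing is pinned; otherwise defer. */
static inline bool superslab_try_release(SuperSlabStub* ss) {
    if (atomic_load_explicit(&ss->pinned, memory_order_acquire) > 0) {
        ss->retired = true;   /* a later unpin/cleanup pass retries the release */
        return false;         /* deferred, not freed */
    }
    /* ... unmap / return to OS would happen here ... */
    return true;
}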
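Similarly, a sketch of the defensive list validation behind the [TLS_SLL_NEXT_INVALID] / [UNIFIED_FREELIST_INVALID] logs (items 3-4). The helper names and the cheap plausibility check are assumptions for illustration only; the real checks live in tls_sll_box.h and tiny_unified_cache.c:

#include <stdint.h>
#include <stdio.h>
#include <stddef.h>

/* Cheap plausibility check standing in for the real registry/range lookup. */
static int node_looks_valid(const void* p) {
    uintptr_t v = (uintptr_t)p;
    return v >= 4096 && (v & (sizeof(void*) - 1)) == 0;  /* not in page 0, pointer-aligned */
}

/* Walk a singly linked freelist whose next pointer is stored in-place.
 * On the first bad node, log once and drop the tail so the corruption
 * cannot propagate into allocations. Returns the (possibly clipped) head. */
static void* freelist_validate_and_clip(void* head, const char* tag) {
    void* prev = NULL;
    for (void* cur = head; cur != NULL; ) {
        if (!node_looks_valid(cur)) {
            fprintf(stderr, "[%s] invalid node %p, dropping tail\n", tag, cur);
            if (prev) *(void**)prev = NULL;  /* clip at the last good node */
            else      head = NULL;           /* head itself was bad: drop the whole list */
            break;
        }
        prev = cur;
        cur = *(void**)cur;                   /* load next pointer stored at the node */
    }
    return head;
}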
// tiny_alloc_fast.inc.h - Box 5: Allocation Fast Path (3-4 instructions)
// Purpose: Ultra-fast TLS freelist pop (inspired by System tcache & Mid-Large HAKX +171%)
// Invariant: Hit rate > 95% → 3-4 instructions, Miss → refill from backend
// Design: "Simple Front + Smart Back" - Front is dumb & fast, Back is smart
//
// Box 5-NEW: SFC (Super Front Cache) Integration
// Architecture: SFC (Layer 0, 128-256 slots) → SLL (Layer 1, unlimited) → SuperSlab (Layer 2+)
// Cascade Refill: SFC ← SLL (one-way, safe)
// Goal: +200% performance (4.19M → 12M+ ops/s)
//
// Phase 2b: Adaptive TLS Cache Sizing
// Hot classes grow to 2048 slots, cold classes shrink to 16 slots
// Expected: +3-10% performance, -30-50% TLS cache memory overhead
#pragma once
#include "tiny_atomic.h"
#include "hakmem_tiny.h"
#include "tiny_route.h"
#include "tiny_alloc_fast_sfc.inc.h" // Box 5-NEW: SFC Layer
#include "hakmem_tiny_fastcache.inc.h" // Array stack (FastCache) for C0–C3
#include "hakmem_tiny_tls_list.h" // TLS List (for tiny_fast_refill_and_take)
#include "tiny_region_id.h" // Phase 7: Header-based class_idx lookup
#include "tiny_adaptive_sizing.h" // Phase 2b: Adaptive sizing
#include "box/tls_sll_box.h" // Box TLS-SLL: C7-safe push/pop/splice
#include "box/tiny_next_ptr_box.h" // Box API: Next pointer read/write
#include "box/tiny_front_config_box.h" // Phase 7-Step3: Compile-time config for dead code elimination
#include "hakmem_env_cache.h" // Priority-2: ENV cache (eliminate syscalls)
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
#include "box/front_gate_box.h"
#endif
#include "hakmem_tiny_integrity.h" // PRIORITY 1-4: Corruption detection

// Phase 7-Step6-Fix: Config wrapper functions moved to tiny_fastcache.c
// (Forward declarations are in tiny_front_config_box.h)
#if HAKMEM_TINY_HEADER_CLASSIDX
// Ring Cache and Unified Cache removed (A/B test: OFF is faster)
#endif
#include "box/front_metrics_box.h" // Phase 19-1: Frontend layer metrics
#include "front/tiny_heap_v2.h" // Front-V2: TLS magazine (tcache-like) front
#include "hakmem_tiny_lazy_init.inc.h" // Phase 22: Lazy per-class initialization
#include "box/tiny_sizeclass_hist_box.h" // Phase 3-4: Tiny size class histogram (ACE learning)
#include "box/ultra_slim_alloc_box.h" // Phase 19-2: Ultra SLIM 4-layer fast path
#include <stdio.h>
#include <stdatomic.h>

// P1.3/P2.2: Helper to track active/tls_cached when allocating from TLS SLL
// ENV gate: HAKMEM_TINY_ACTIVE_TRACK=1 to enable (default: 0 for performance)
// Flow: TLS SLL → User means active++, tls_cached--
// Priority-2: Use cached ENV (eliminate lazy-init syscall overhead)
static inline void tiny_active_track_alloc(void* base) {
    if (__builtin_expect(HAK_ENV_TINY_ACTIVE_TRACK(), 0)) {
        extern SuperSlab* ss_fast_lookup(void* ptr);
        SuperSlab* ss = ss_fast_lookup(base);
        if (ss && ss->magic == SUPERSLAB_MAGIC) {
            int slab_idx = slab_index_for(ss, base);
            if (slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss)) {
                TinySlabMeta* meta = &ss->slabs[slab_idx];
                atomic_fetch_add_explicit(&meta->active, 1, memory_order_relaxed);
                atomic_fetch_sub_explicit(&meta->tls_cached, 1, memory_order_relaxed); // P2.2
            }
        }
    }
}

// Diag counter: size>=1024 allocations routed to Tiny (env: HAKMEM_TINY_ALLOC_1024_METRIC)
extern _Atomic uint64_t g_tiny_alloc_ge1024[];
// Priority-2: Use cached ENV (eliminate lazy-init syscall overhead)
static inline void tiny_diag_track_size_ge1024_fast(size_t req_size, int class_idx) {
    if (__builtin_expect(req_size < 1024, 1)) return;
    if (!__builtin_expect(HAK_ENV_TINY_ALLOC_1024_METRIC(), 0)) return;
    if (__builtin_expect(class_idx >= 0 && class_idx < TINY_NUM_CLASSES, 1)) {
        atomic_fetch_add_explicit(&g_tiny_alloc_ge1024[class_idx], 1, memory_order_relaxed);
    }
}

// Phase 7 Task 2: Aggressive inline TLS cache access
// Enable with: make HEADER_CLASSIDX=1 AGGRESSIVE_INLINE=1
#ifndef HAKMEM_TINY_AGGRESSIVE_INLINE
#define HAKMEM_TINY_AGGRESSIVE_INLINE 0
#endif

#if HAKMEM_TINY_AGGRESSIVE_INLINE
#include "tiny_alloc_fast_inline.h"
#endif

// ========== Debug Counters (compile-time gated) ==========
#if HAKMEM_DEBUG_COUNTERS
// Refill-stage counters (defined in hakmem_tiny.c)
extern unsigned long long g_rf_total_calls[];
extern unsigned long long g_rf_hit_bench[];
extern unsigned long long g_rf_hit_hot[];
extern unsigned long long g_rf_hit_mail[];
extern unsigned long long g_rf_hit_slab[];
extern unsigned long long g_rf_hit_ss[];
extern unsigned long long g_rf_hit_reg[];
extern unsigned long long g_rf_mmap_calls[];

// Publish hits (defined in hakmem_tiny.c)
extern unsigned long long g_pub_mail_hits[];
extern unsigned long long g_pub_bench_hits[];
extern unsigned long long g_pub_hot_hits[];

// Free pipeline (defined in hakmem_tiny.c)
extern unsigned long long g_free_via_tls_sll[];
#endif

// ========== Box 5: Allocation Fast Path ==========
// Box Theory Fast Allocation layer: pop directly from the TLS freelist (3-4 instructions).
// Invariants:
// - If the TLS freelist is non-empty, return immediately (no lock, no sync)
// - On a miss, delegate to the Backend (Box 3: SuperSlab)
// - Cross-thread allocation is not handled here (the Backend takes care of it)

// External TLS variables (defined in hakmem_tiny.c)
// Phase 3d-B: TLS Cache Merge - Unified TLS SLL structure
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];

// External backend functions
// P0 Fix: Use appropriate refill function based on P0 status
#if HAKMEM_TINY_P0_BATCH_REFILL
extern int sll_refill_batch_from_ss(int class_idx, int max_take);
#else
extern int sll_refill_small_from_ss(int class_idx, int max_take);
#endif
extern void* hak_tiny_alloc_slow(size_t size, int class_idx);
extern int hak_tiny_size_to_class(size_t size);
extern int tiny_refill_failfast_level(void);
extern const size_t g_tiny_class_sizes[];
// Global Front refill config (parsed at init; defined in hakmem_tiny.c)
extern int g_refill_count_global;
extern int g_refill_count_hot;
extern int g_refill_count_mid;
extern int g_refill_count_class[TINY_NUM_CLASSES];

// HAK_RET_ALLOC macro is now defined in core/hakmem_tiny.c
// See lines 116-152 for single definition point based on HAKMEM_TINY_HEADER_CLASSIDX

// ========== RDTSC Profiling (lightweight) ==========
#ifdef __x86_64__
static inline uint64_t tiny_fast_rdtsc(void) {
    unsigned int lo, hi;
    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
    return ((uint64_t)hi << 32) | lo;
}
#else
static inline uint64_t tiny_fast_rdtsc(void) { return 0; }
#endif

// Per-thread profiling counters (enable with HAKMEM_TINY_PROFILE=1)
static __thread uint64_t g_tiny_alloc_hits = 0;
static __thread uint64_t g_tiny_alloc_cycles = 0;
static __thread uint64_t g_tiny_refill_calls = 0;
static __thread uint64_t g_tiny_refill_cycles = 0;

// Priority-2: Use cached ENV (eliminate lazy-init + static var overhead)
static inline int tiny_profile_enabled(void) {
    return HAK_ENV_TINY_PROFILE();
}

// Print profiling results at exit
static void tiny_fast_print_profile(void) __attribute__((destructor));
static void tiny_fast_print_profile(void) {
    if (!tiny_profile_enabled()) return;
    if (g_tiny_alloc_hits == 0 && g_tiny_refill_calls == 0) return;

    fprintf(stderr, "\n========== Box Theory Fast Path Profile ==========\n");
    if (g_tiny_alloc_hits > 0) {
        fprintf(stderr, "[ALLOC HIT] count=%lu, avg_cycles=%lu\n",
                (unsigned long)g_tiny_alloc_hits,
                (unsigned long)(g_tiny_alloc_cycles / g_tiny_alloc_hits));
    }
    if (g_tiny_refill_calls > 0) {
        fprintf(stderr, "[REFILL] count=%lu, avg_cycles=%lu\n",
                (unsigned long)g_tiny_refill_calls,
                (unsigned long)(g_tiny_refill_cycles / g_tiny_refill_calls));
    }
    fprintf(stderr, "===================================================\n\n");
}

// ========== Front-V2 helpers (tcache-like TLS magazine) ==========
// Priority-2: Use cached ENV (eliminate lazy-init overhead)
static inline int tiny_heap_v2_stats_enabled(void) {
    return HAK_ENV_TINY_HEAP_V2_STATS();
}

// TLS HeapV2 initialization barrier (ensures mag->top is zero on first use)
static inline void tiny_heap_v2_ensure_init(void) {
    extern __thread int g_tls_heap_v2_initialized;
    extern __thread TinyHeapV2Mag g_tiny_heap_v2_mag[];

    if (__builtin_expect(!g_tls_heap_v2_initialized, 0)) {
        for (int i = 0; i < TINY_NUM_CLASSES; i++) {
            g_tiny_heap_v2_mag[i].top = 0;
        }
        g_tls_heap_v2_initialized = 1;
    }
}

static inline int tiny_heap_v2_refill_mag(int class_idx) {
    // FIX: Ensure TLS is initialized before first magazine access
    tiny_heap_v2_ensure_init();
    if (class_idx < 0 || class_idx > 3) return 0;
    if (!tiny_heap_v2_class_enabled(class_idx)) return 0;

    // Phase 7-Step7: Use config macro for dead code elimination in PGO mode
    if (!TINY_FRONT_TLS_SLL_ENABLED) return 0;

    TinyHeapV2Mag* mag = &g_tiny_heap_v2_mag[class_idx];
    const int cap = TINY_HEAP_V2_MAG_CAP;
    int filled = 0;

    // FIX: Validate mag->top before use (prevent uninitialized TLS corruption)
    if (mag->top < 0 || mag->top > cap) {
        static __thread int s_reset_logged[TINY_NUM_CLASSES] = {0};
        if (!s_reset_logged[class_idx]) {
            fprintf(stderr, "[HEAP_V2_REFILL] C%d mag->top=%d corrupted, reset to 0\n",
                    class_idx, mag->top);
            s_reset_logged[class_idx] = 1;
        }
        mag->top = 0;
    }

    // First, steal from TLS SLL if already available.
    while (mag->top < cap) {
        void* base = NULL;
        if (!tls_sll_pop(class_idx, &base)) break;
        mag->items[mag->top++] = base;
        filled++;
    }

    // If magazine is still empty, ask backend to refill SLL once, then steal again.
    if (mag->top < cap && filled == 0) {
#if HAKMEM_TINY_P0_BATCH_REFILL
        (void)sll_refill_batch_from_ss(class_idx, cap);
#else
        (void)sll_refill_small_from_ss(class_idx, cap);
#endif
        while (mag->top < cap) {
            void* base = NULL;
            if (!tls_sll_pop(class_idx, &base)) break;
            mag->items[mag->top++] = base;
            filled++;
        }
    }

    if (__builtin_expect(tiny_heap_v2_stats_enabled(), 0)) {
        if (filled > 0) {
            g_tiny_heap_v2_stats[class_idx].refill_calls++;
            g_tiny_heap_v2_stats[class_idx].refill_blocks += (uint64_t)filled;
        }
    }
    return filled;
}

static inline void* tiny_heap_v2_alloc_by_class(int class_idx) {
    // FIX: Ensure TLS is initialized before first magazine access
    tiny_heap_v2_ensure_init();
    if (class_idx < 0 || class_idx > 3) return NULL;
    // Phase 7-Step8: Use config macro for dead code elimination in PGO mode
    if (!TINY_FRONT_HEAP_V2_ENABLED) return NULL;
    if (!tiny_heap_v2_class_enabled(class_idx)) return NULL;

    TinyHeapV2Mag* mag = &g_tiny_heap_v2_mag[class_idx];

    // Hit: magazine has entries
    if (__builtin_expect(mag->top > 0, 1)) {
        // FIX: Add underflow protection before array access
        const int cap = TINY_HEAP_V2_MAG_CAP;
        if (mag->top > cap || mag->top < 0) {
            static __thread int s_reset_logged[TINY_NUM_CLASSES] = {0};
            if (!s_reset_logged[class_idx]) {
                fprintf(stderr, "[HEAP_V2_ALLOC] C%d mag->top=%d corrupted, reset to 0\n",
                        class_idx, mag->top);
                s_reset_logged[class_idx] = 1;
            }
            mag->top = 0;
            return NULL; // Fall through to refill path
        }
        if (__builtin_expect(tiny_heap_v2_stats_enabled(), 0)) {
            g_tiny_heap_v2_stats[class_idx].alloc_calls++;
            g_tiny_heap_v2_stats[class_idx].mag_hits++;
        }
        return mag->items[--mag->top];
    }

    // Miss: try single refill from SLL/backend
    int filled = tiny_heap_v2_refill_mag(class_idx);
    if (filled > 0 && mag->top > 0) {
        if (__builtin_expect(tiny_heap_v2_stats_enabled(), 0)) {
            g_tiny_heap_v2_stats[class_idx].alloc_calls++;
            g_tiny_heap_v2_stats[class_idx].mag_hits++;
        }
        return mag->items[--mag->top];
    }

    if (__builtin_expect(tiny_heap_v2_stats_enabled(), 0)) {
        g_tiny_heap_v2_stats[class_idx].backend_oom++;
    }
    return NULL;
}

// ========== Fast Path: TLS Freelist Pop (3-4 instructions) ==========

// External SFC control (defined in hakmem_tiny_sfc.c)
extern int g_sfc_enabled;

// Allocation fast path (inline for zero-cost)
// Returns: pointer on success, NULL on miss (caller should try refill/slow)
//
// Box 5-NEW Architecture:
// Layer 0: SFC (128-256 slots, high hit rate) [if enabled]
// Layer 1: SLL (unlimited, existing)
// Cascade: SFC miss → try SLL → refill
//
// Assembly (x86-64, optimized):
//   mov rax, QWORD PTR g_sfc_head[class_idx]      ; SFC: Load head
//   test rax, rax                                 ; Check NULL
//   jne .sfc_hit                                  ; If not empty, SFC hit!
//   mov rax, QWORD PTR g_tls_sll_head[class_idx]  ; SLL: Load head
//   test rax, rax                                 ; Check NULL
//   je .miss                                      ; If empty, miss
//   mov rdx, QWORD PTR [rax]                      ; Load next
//   mov QWORD PTR g_tls_sll_head[class_idx], rdx  ; Update head
//   ret                                           ; Return ptr
// .sfc_hit:
//   mov rdx, QWORD PTR [rax]                      ; Load next
//   mov QWORD PTR g_sfc_head[class_idx], rdx      ; Update head
//   ret
// .miss:
//   ; Fall through to refill
//
// Expected: 3-4 instructions on SFC hit, 6-8 on SLL hit
static inline void* tiny_alloc_fast_pop(int class_idx) {
    // PRIORITY 1: Bounds check before any TLS array access
    HAK_CHECK_CLASS_IDX(class_idx, "tiny_alloc_fast_pop");
#if !HAKMEM_BUILD_RELEASE
    // Phase 3: Debug counters eliminated in release builds
    atomic_fetch_add(&g_integrity_check_class_bounds, 1);

    // DEBUG: Log class 2 pops (DISABLED for performance)
    static _Atomic uint64_t g_fast_pop_count = 0;
    uint64_t pop_call = atomic_fetch_add(&g_fast_pop_count, 1);
    if (0 && class_idx == 2 && pop_call > 5840 && pop_call < 5900) {
        fprintf(stderr, "[FAST_POP_C2] call=%lu cls=%d head=%p count=%u\n",
                pop_call, class_idx, g_tls_sll[class_idx].head, g_tls_sll[class_idx].count);
        fflush(stderr);
    }
#endif

    // Phase E1-CORRECT: C7 now has headers, can use fast path
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
    void* out = NULL;
    if (front_gate_try_pop(class_idx, &out)) {
        return out;
    }
    return NULL;
#else
    // ========== Phase 19-1: Quick Prune (Frontend SLIM mode) ==========
    // ENV: HAKMEM_TINY_FRONT_SLIM=1
    // Goal: Skip FastCache + SFC layers, go straight to SLL (88-99% hit rate)
    // Expected: 22M → 27-30M ops/s (+22-36%)
    // Priority-2: Use cached ENV (eliminate lazy-init TLS overhead)

    // SLIM MODE: Skip FastCache + SFC, go straight to SLL
    if (__builtin_expect(HAK_ENV_TINY_FRONT_SLIM(), 0)) {
        // Box Boundary: TLS SLL freelist pop (only layer in SLIM mode)
        // Phase 7-Step7: Use config macro for dead code elimination in PGO mode
        if (__builtin_expect(TINY_FRONT_TLS_SLL_ENABLED, 1)) {
            void* base = NULL;
            if (tls_sll_pop(class_idx, &base)) {
                // Front Gate: SLL hit (SLIM fast path - 3 instructions)
                extern unsigned long long g_front_sll_hit[];
                g_front_sll_hit[class_idx]++;
                // P1.3: Track active when allocating from TLS SLL
                tiny_active_track_alloc(base);
                return base;
            }
        }
        // SLIM mode miss → return NULL (caller refills)
        return NULL;
    }
    // ========== End Phase 19-1: Quick Prune ==========

    // Phase 7 Task 3: Profiling overhead removed in release builds
    // In release mode, compiler can completely eliminate profiling code
#if !HAKMEM_BUILD_RELEASE
    uint64_t start = tiny_profile_enabled() ? tiny_fast_rdtsc() : 0;
#endif

    // Phase 1: Try array stack (FastCache) first for hottest tiny classes (C0–C3)
    // Phase 7-Step4: Use config macro for dead code elimination in PGO mode
    if (__builtin_expect(TINY_FRONT_FASTCACHE_ENABLED && class_idx <= 3, 1)) {
        void* fc = fastcache_pop(class_idx);
        if (__builtin_expect(fc != NULL, 1)) {
            // Frontend FastCache hit (already tracked by g_front_fc_hit)
            extern unsigned long long g_front_fc_hit[];
            g_front_fc_hit[class_idx]++;
            return fc;
        } else {
            // Frontend FastCache miss (already tracked by g_front_fc_miss)
            extern unsigned long long g_front_fc_miss[];
            g_front_fc_miss[class_idx]++;
        }
    }

    // Box 5-NEW: Layer 0 - Try SFC first (if enabled)
    // Phase 7-Step8: Use config macro for dead code elimination in PGO mode
    static __thread int sfc_check_done = 0;
    static __thread int sfc_is_enabled = 0;
    if (__builtin_expect(!sfc_check_done, 0)) {
        sfc_is_enabled = TINY_FRONT_SFC_ENABLED;
        sfc_check_done = 1;
    }

    if (__builtin_expect(sfc_is_enabled, 1)) {
        void* base = sfc_alloc(class_idx);
        if (__builtin_expect(base != NULL, 1)) {
            // Front Gate: SFC hit
            extern unsigned long long g_front_sfc_hit[];
            g_front_sfc_hit[class_idx]++;
            // 🚀 SFC HIT! (Layer 0)
#if !HAKMEM_BUILD_RELEASE
            if (start) {
                g_tiny_alloc_cycles += (tiny_fast_rdtsc() - start);
                g_tiny_alloc_hits++;
            }
#endif
            // ✅ FIX #16: Return BASE pointer (not USER)
            // Caller (tiny_alloc_fast) will call HAK_RET_ALLOC → tiny_region_id_write_header
            // which does the BASE → USER conversion. Double conversion was causing corruption!
            return base;
        }
        // SFC miss → try SLL (Layer 1)
    }

    // Box Boundary: Layer 1 - pop the head of the TLS SLL freelist (can be disabled via env)
    // Note: This is in tiny_alloc_fast_pop(), not tiny_alloc_fast(), so use global variable
    // Phase 7-Step7: Use config macro for dead code elimination in PGO mode
    if (__builtin_expect(TINY_FRONT_TLS_SLL_ENABLED, 1)) {
        // Use Box TLS-SLL API (C7-safe pop)
        // CRITICAL: Pop FIRST, do NOT read g_tls_sll_head directly (race condition!)
        // Reading head before pop causes stale read → rbp=0xa0 SEGV
        void* base = NULL;
        if (tls_sll_pop(class_idx, &base)) {
            // Front Gate: SLL hit (fast path 3 instructions)
            extern unsigned long long g_front_sll_hit[];
            g_front_sll_hit[class_idx]++;

            // P1.3: Track active when allocating from TLS SLL
            tiny_active_track_alloc(base);

#if HAKMEM_DEBUG_COUNTERS
            // Track TLS freelist hits (compile-time gated, zero runtime cost when disabled)
            g_free_via_tls_sll[class_idx]++;
#endif

#if !HAKMEM_BUILD_RELEASE
            // Debug: Track profiling (release builds skip this overhead)
            if (start) {
                g_tiny_alloc_cycles += (tiny_fast_rdtsc() - start);
                g_tiny_alloc_hits++;
            }
#endif
            // ✅ FIX #16: Return BASE pointer (not USER)
            // Caller (tiny_alloc_fast) will call HAK_RET_ALLOC → tiny_region_id_write_header
            // which does the BASE → USER conversion. Double conversion was causing corruption!
            return base;
        }
    }

    // Fast path miss → NULL (caller should refill)
    return NULL;
#endif
}

// ========== Cascade Refill: SFC ← SLL (Box Theory boundary) ==========

// Cascade refill: Transfer blocks from SLL to SFC (one-way, safe)
// Returns: number of blocks transferred
//
// Contract:
// - Transfer ownership: SLL → SFC
// - No circular dependency: one-way only
// - Boundary clear: SLL pop → SFC push
// - Fallback safe: if SFC full, stop (no overflow)
// Env-driven cascade percentage (0-100), default 50%
// Priority-2: Use cached ENV (eliminate lazy-init + atoi() overhead)
static inline int sfc_cascade_pct(void) {
    return HAK_ENV_SFC_CASCADE_PCT();
}

static inline int sfc_refill_from_sll(int class_idx, int target_count) {
    // PRIORITY 1: Bounds check
    HAK_CHECK_CLASS_IDX(class_idx, "sfc_refill_from_sll");
#if !HAKMEM_BUILD_RELEASE
    atomic_fetch_add(&g_integrity_check_class_bounds, 1);
#endif

    int transferred = 0;
    uint32_t cap = g_sfc_capacity[class_idx];

    // Adjust target based on cascade percentage
    int pct = sfc_cascade_pct();
    int want = (target_count * pct) / 100;
    if (want <= 0) want = target_count / 2; // safety fallback

    while (transferred < want && g_tls_sll[class_idx].count > 0) {
        // Check SFC capacity before transfer
        if (g_sfc_count[class_idx] >= cap) {
            break; // SFC full, stop
        }

        // Pop from SLL (Layer 1) using Box TLS-SLL API (C7-safe)
        void* ptr = NULL;
        if (!tls_sll_pop(class_idx, &ptr)) {
            break; // SLL empty
        }

        // Push to SFC (Layer 0) — header-aware
        tiny_next_write(class_idx, ptr, g_sfc_head[class_idx]);
        g_sfc_head[class_idx] = ptr;
        g_sfc_count[class_idx]++;

        transferred++;
    }

    return transferred;
}

// ========== Refill Path: Backend Integration ==========

// Refill TLS freelist from backend (SuperSlab/ACE/Learning layer)
// Returns: number of blocks refilled
//
// Box 5-NEW Architecture:
// SFC enabled: SuperSlab → SLL → SFC (cascade)
// SFC disabled: SuperSlab → SLL (direct, old path)
//
// This integrates with existing HAKMEM infrastructure:
// - SuperSlab provides memory chunks
// - ACE provides adaptive capacity learning
// - L25 provides mid-large integration
//
// Refill count is tunable via HAKMEM_TINY_REFILL_COUNT (default: 16)
// - Smaller count (8-16): better for diverse workloads, faster warmup
// - Larger count (64-128): better for homogeneous workloads, fewer refills
static inline int tiny_alloc_fast_refill(int class_idx) {
    // Phase E1-CORRECT: C7 now has headers, can use refill

    // Phase 7 Task 3: Profiling overhead removed in release builds
    // In release mode, compiler can completely eliminate profiling code
#if !HAKMEM_BUILD_RELEASE
    uint64_t start = tiny_profile_enabled() ? tiny_fast_rdtsc() : 0;
#endif

    // Phase 2b: Check available capacity before refill
    int available_capacity = get_available_capacity(class_idx);
    if (available_capacity <= 0) {
        // Cache is full, don't refill
        return 0;
    }

    // Phase 7 Task 3: Simplified refill count (cached per-class in TLS)
    // Previous: Complex precedence logic on every miss (5-10 cycles overhead)
    // Now: Simple TLS cache lookup (1-2 cycles)
    static __thread int s_refill_count[TINY_NUM_CLASSES] = {0};
    // Simple adaptive booster: bump per-class refill size when refills are frequent.
    static __thread uint8_t s_refill_calls[TINY_NUM_CLASSES] = {0};
    int cnt = s_refill_count[class_idx];
    if (__builtin_expect(cnt == 0, 0)) {
        // First miss: Initialize from globals (parsed at init time)
        int v = HAKMEM_TINY_REFILL_DEFAULT; // Default from hakmem_build_flags.h

        // Precedence: per-class > hot/mid > global
        if (g_refill_count_class[class_idx] > 0) {
            v = g_refill_count_class[class_idx];
        } else if (class_idx <= 3 && g_refill_count_hot > 0) {
            v = g_refill_count_hot;
        } else if (class_idx >= 4 && g_refill_count_mid > 0) {
            v = g_refill_count_mid;
        } else if (g_refill_count_global > 0) {
            v = g_refill_count_global;
        }

        // Clamp to sane range (min: 8, max: 256)
        if (v < 8) v = 8;     // Minimum: avoid thrashing
        if (v > 256) v = 256; // Maximum: avoid excessive TLS memory

        s_refill_count[class_idx] = v;
        cnt = v;
    }

    // Phase 2b: Clamp refill count to available capacity
    if (cnt > available_capacity) {
        cnt = available_capacity;
    }

#if HAKMEM_DEBUG_COUNTERS
    // Track refill calls (compile-time gated)
    g_rf_total_calls[class_idx]++;
#endif

    // Box Boundary: Delegate to Backend (Box 3: SuperSlab)
    // Refill Dispatch: Standard (ss_refill_fc_fill) vs Legacy SLL (A/B only)
    // Standard: Enabled by FRONT_DIRECT=1, REFILL_BATCH=1, or P0_DIRECT_FC_ALL=1
    // Legacy: Fallback for compatibility (will be deprecated)
    int refilled = 0;

    // The Front-Direct A/B implementation is not supported on the current HEAD.
    // Always use the legacy path (SS→SLL→FC).
#if HAKMEM_TINY_P0_BATCH_REFILL
    refilled = sll_refill_batch_from_ss(class_idx, cnt);
#else
    refilled = sll_refill_small_from_ss(class_idx, cnt);
#endif

    // Lightweight adaptation: if refills keep happening, increase per-class refill.
    // Focus on class 7 (1024B) to reduce mmap/refill frequency under Tiny-heavy loads.
    if (refilled > 0) {
        uint8_t c = ++s_refill_calls[class_idx];
        if (class_idx == 7) {
            // Every 4 refills, increase target by +16 up to 128 (unless overridden).
            if ((c & 0x03u) == 0) {
                int target = s_refill_count[class_idx];
                if (target < 128) {
                    target += 16;
                    if (target > 128) target = 128;
                    s_refill_count[class_idx] = target;
                }
            }
        }
    } else {
        // No refill performed (capacity full): slowly decay the counter.
        if (s_refill_calls[class_idx] > 0) s_refill_calls[class_idx]--;
    }

    // Phase 2b: Track refill and adapt cache size
    if (refilled > 0) {
        track_refill_for_adaptation(class_idx);
    }

    // Box 5-NEW: Cascade refill SFC ← SLL (opt-in via HAKMEM_TINY_SFC_CASCADE, off by default)
    // Priority-2: Use cached ENV (eliminate lazy-init TLS overhead)

    // Only cascade if explicitly enabled AND we have refilled blocks in SLL
    // Phase 7-Step8: Use config macro for dead code elimination in PGO mode
    if (HAK_ENV_TINY_SFC_CASCADE() && TINY_FRONT_SFC_ENABLED && refilled > 0) {
        // Transfer half of refilled blocks to SFC (keep half in SLL for future)
        int sfc_target = refilled / 2;
        if (sfc_target > 0) {
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
            front_gate_after_refill(class_idx, refilled);
#else
            int transferred = sfc_refill_from_sll(class_idx, sfc_target);
            (void)transferred; // Unused, but could track stats
#endif
        }
    }

#if !HAKMEM_BUILD_RELEASE
    // Debug: Track profiling (release builds skip this overhead)
    if (start) {
        g_tiny_refill_cycles += (tiny_fast_rdtsc() - start);
        g_tiny_refill_calls++;
    }
#endif

    return refilled;
}

// ========== Combined Fast Path (Alloc + Refill) ==========

// Complete fast path allocation (inline for zero-cost)
// Returns: pointer on success, NULL on failure (OOM or size too large)
//
// Flow:
// 1. TLS freelist pop (3-4 instructions) - Hit rate ~95%
// 2. Miss → Refill from backend (~5% cases)
// 3. Refill success → Retry pop
// 4. Refill failure → Slow path (OOM or new SuperSlab allocation)
//
// Example usage:
//   void* ptr = tiny_alloc_fast(64);
//   if (!ptr) {
//       // OOM handling
//   }
static inline void* tiny_alloc_fast(size_t size) {
#if !HAKMEM_BUILD_RELEASE
    // Phase 3: Debug counters eliminated in release builds
    static _Atomic uint64_t alloc_call_count = 0;
    uint64_t call_num = atomic_fetch_add(&alloc_call_count, 1);
#endif

    // Phase 22: Global init (once per process)
    lazy_init_global();

    // ========== Phase 19-2: Ultra SLIM 4-Layer Fast Path ==========
    // ENV: HAKMEM_TINY_ULTRA_SLIM=1
    // Expected: 90-110M ops/s (mimalloc parity)
    // Architecture: Init Safety + Size-to-Class + ACE Learning + TLS SLL Direct
    // Note: ACE learning preserved (HAKMEM's differentiator vs mimalloc)

    // Debug: Check if Ultra SLIM is enabled (first call only)
#if !HAKMEM_BUILD_RELEASE
    static __thread int debug_checked = 0;
    if (!debug_checked) {
        // Phase 7-Step8: Use config macro for dead code elimination in PGO mode
        int enabled = TINY_FRONT_ULTRA_SLIM_ENABLED;
        if (enabled) {
            fprintf(stderr, "[TINY_ALLOC_FAST] Ultra SLIM gate: ENABLED (will use 4-layer path)\n");
        } else {
            fprintf(stderr, "[TINY_ALLOC_FAST] Ultra SLIM gate: DISABLED (will use standard path)\n");
        }
        debug_checked = 1;
    }
#endif

    // Phase 7-Step4: Use config macro for dead code elimination in PGO mode
    if (__builtin_expect(TINY_FRONT_ULTRA_SLIM_ENABLED, 0)) {
        return ultra_slim_alloc_with_refill(size);
    }
    // ========== End Phase 19-2: Ultra SLIM ==========

    // 1. Size → class index (inline, fast)
    int class_idx = hak_tiny_size_to_class(size);

    if (__builtin_expect(class_idx < 0, 0)) {
        return NULL; // Size > 1KB, not Tiny
    }

    // Phase 3c L1D Opt: Prefetch TLS cache head early
    // Phase 3d-B: Prefetch unified TLS SLL struct (single prefetch for both head+count)
    __builtin_prefetch(&g_tls_sll[class_idx], 0, 3);

    // Phase 22: Lazy per-class init (on first use)
    lazy_init_class(class_idx);

    // Phase 3-4: Record allocation for ACE Profile learning
    // TLS increment only (no atomic operation, amortized flush at threshold)
    tiny_sizeclass_hist_hit(class_idx);

    // P0.1: Cache g_tls_sll_enable once (Phase 3-4 instruction reduction)
    // Eliminates redundant global variable reads (2-3 instructions saved)
    // Phase 7-Step7: Use config macro for dead code elimination in PGO mode
    const int sll_enabled = TINY_FRONT_TLS_SLL_ENABLED;

#if !HAKMEM_BUILD_RELEASE
    // Phase 3: Debug checks eliminated in release builds
    // CRITICAL: Bounds check to catch corruption
    if (__builtin_expect(class_idx >= TINY_NUM_CLASSES, 0)) {
        fprintf(stderr, "[TINY_ALLOC_FAST] FATAL: class_idx=%d out of bounds! size=%zu call=%lu\n",
                class_idx, size, call_num);
        fflush(stderr);
        abort();
    }

    // Debug logging (DISABLED for performance)
    if (0 && call_num > 14250 && call_num < 14280) {
        fprintf(stderr, "[TINY_ALLOC] call=%lu size=%zu class=%d sll_head[%d]=%p count=%u\n",
                call_num, size, class_idx, class_idx,
                g_tls_sll[class_idx].head, g_tls_sll[class_idx].count);
        fflush(stderr);
    }
#endif

    ROUTE_BEGIN(class_idx);

    void* ptr = NULL;

    // Front-V2: TLS magazine front (A/B, default OFF)
    // Phase 7-Step4: Use config macro for dead code elimination in PGO mode
    if (__builtin_expect(TINY_FRONT_HEAP_V2_ENABLED && front_prune_heapv2_enabled() && class_idx <= 3, 0)) {
        void* hv2 = tiny_heap_v2_alloc_by_class(class_idx);
        if (hv2) {
            front_metrics_heapv2_hit(class_idx);
            tiny_diag_track_size_ge1024_fast(size, class_idx);
            HAK_RET_ALLOC(class_idx, hv2);
        } else {
            front_metrics_heapv2_miss(class_idx);
        }
    }

    // Generic front (FastCache/SFC/SLL)
    // Respect SLL global toggle
    // Phase 7-Step7: Use config macro for dead code elimination in PGO mode
    if (__builtin_expect(TINY_FRONT_TLS_SLL_ENABLED, 1)) {
        // For classes 0..3 keep ultra-inline POP; for >=4 use safe Box POP to avoid UB on bad heads.
        if (class_idx <= 3) {
#if HAKMEM_TINY_INLINE_SLL
            // Experimental: Inline SLL pop (A/B only, requires HAKMEM_TINY_INLINE_SLL=1)
            TINY_ALLOC_FAST_POP_INLINE(class_idx, ptr);
#else
            // Default: Safe Box API (Box TLS-SLL) for all standard builds
            ptr = tiny_alloc_fast_pop(class_idx);
#endif
        } else {
            void* base = NULL;
            if (tls_sll_pop(class_idx, &base)) {
                // P1.3: Track active when allocating from TLS SLL
                tiny_active_track_alloc(base);
                ptr = base;
            } else {
                ptr = NULL;
            }
        }
    } else {
        ptr = NULL; // SLL disabled OR Front-Direct active → bypass SLL
    }

    // Phase 3c L1D Opt: Prefetch next freelist entry if we got a pointer
    if (__builtin_expect(ptr != NULL, 1)) {
        __builtin_prefetch(ptr, 0, 3);
    }

    if (__builtin_expect(ptr != NULL, 1)) {
        tiny_diag_track_size_ge1024_fast(size, class_idx);
        HAK_RET_ALLOC(class_idx, ptr);
    }

    // Refill to TLS List/SLL
    extern __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
    void* took = tiny_fast_refill_and_take(class_idx, &g_tls_lists[class_idx]);
    if (took) {
        tiny_diag_track_size_ge1024_fast(size, class_idx);
        HAK_RET_ALLOC(class_idx, took);
    }

    // Retry after a backend refill
    {
        int refilled = tiny_alloc_fast_refill(class_idx);
        if (__builtin_expect(refilled > 0, 1)) {
            // Retry SLL if enabled (P0.1: using cached sll_enabled)
            if (__builtin_expect(sll_enabled, 1)) {
                if (class_idx <= 3) {
#if HAKMEM_TINY_INLINE_SLL
                    // Experimental: Inline SLL pop (A/B only, requires HAKMEM_TINY_INLINE_SLL=1)
                    TINY_ALLOC_FAST_POP_INLINE(class_idx, ptr);
#else
                    // Default: Safe Box API (Box TLS-SLL) for all standard builds
                    ptr = tiny_alloc_fast_pop(class_idx);
#endif
                } else {
                    void* base2 = NULL;
                    if (tls_sll_pop(class_idx, &base2)) {
                        // P1.3: Track active when allocating from TLS SLL
                        tiny_active_track_alloc(base2);
                        ptr = base2;
                    } else {
                        ptr = NULL;
                    }
                }
            } else {
                ptr = NULL; // SLL disabled OR Front-Direct active → bypass SLL
            }
            if (ptr) {
                tiny_diag_track_size_ge1024_fast(size, class_idx);
                HAK_RET_ALLOC(class_idx, ptr);
            }
        }
    }

    // 5. Refill failure or still empty → slow path (OOM or new SuperSlab)
    // Box Boundary: Delegate to Slow Path (Box 3 backend)
    ptr = hak_tiny_alloc_slow(size, class_idx);
    if (ptr) {
        tiny_diag_track_size_ge1024_fast(size, class_idx);
        HAK_RET_ALLOC(class_idx, ptr);
    }

    return ptr; // NULL if OOM
}

// ========== Push to TLS Freelist (for free path) ==========

// Push block to TLS freelist (used by free fast path)
// This is a "helper" for Box 6 (Free Fast Path)
//
// Invariant: ptr must belong to current thread (no ownership check here)
// Caller (Box 6) is responsible for ownership verification
static inline void tiny_alloc_fast_push(int class_idx, void* ptr) {
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
    front_gate_push_tls(class_idx, ptr);
#else
    // Box Boundary: Push to TLS freelist using Box TLS-SLL API (C7-safe)
    uint32_t capacity = UINT32_MAX; // Unlimited for helper function
    if (!tls_sll_push(class_idx, ptr, capacity)) {
        // C7 rejected or SLL somehow full (should not happen)
        // In release builds, this is a no-op (caller expects success)
#if !HAKMEM_BUILD_RELEASE
        fprintf(stderr, "[WARN] tls_sll_push failed in tiny_alloc_fast_push cls=%d ptr=%p\n",
                class_idx, ptr);
#endif
    }
#endif
}

// ========== Statistics & Diagnostics ==========

// Get TLS freelist stats (for debugging/profiling)
typedef struct {
    int class_idx;
    void* head;
    uint32_t count;
} TinyAllocFastStats;

static inline TinyAllocFastStats tiny_alloc_fast_stats(int class_idx) {
    TinyAllocFastStats stats = {
        .class_idx = class_idx,
        .head = g_tls_sll[class_idx].head,
        .count = g_tls_sll[class_idx].count
    };
    return stats;
}

// Reset TLS freelist (for testing/benchmarking)
// WARNING: This leaks memory! Only use in controlled test environments.
static inline void tiny_alloc_fast_reset(int class_idx) {
    tls_sll_set_head_raw(class_idx, NULL, "fast_reset");
    g_tls_sll[class_idx].count = 0;
}

// ========== Performance Notes ==========
//
// Expected metrics (based on System tcache & HAKX +171% results):
// - Fast path hit rate: 95%+ (workload dependent)
// - Fast path latency: 3-4 instructions (1-2 cycles on modern CPUs)
// - Miss penalty: ~20-50 instructions (refill from SuperSlab)
// - Throughput improvement: +10-25% vs current multi-layer design
//
// Key optimizations:
// 1. `__builtin_expect` for branch prediction (hot path first)
// 2. `static inline` for zero-cost abstraction
// 3. TLS variables (no atomic ops, no locks)
// 4. Minimal work in fast path (defer stats/accounting to backend)
//
// Comparison with current design:
// - Current: 20+ instructions (Magazine → SuperSlab → ACE → ...)
// - New: 3-4 instructions (TLS freelist pop only)
// - Reduction: -80% instructions in hot path
//
// Inspired by:
// - System tcache (glibc malloc) - 3-4 instruction fast path
// - HAKX Mid-Large (+171%) - "Simple Front + Smart Back"
// - Box Theory - Clear boundaries, minimal coupling