Add active field to TinySlabMeta to track blocks currently held by users (not in TLS SLL or freelist caches). This enables accurate empty-slab detection that accounts for TLS SLL cached blocks.

Changes:
- superslab_types.h: Add _Atomic uint16_t active field
- ss_allocation_box.c, hakmem_tiny_superslab.c: Initialize active=0
- tiny_free_fast_v2.inc.h: Decrement active on TLS SLL push
- tiny_alloc_fast.inc.h: Add tiny_active_track_alloc() helper, increment active on TLS SLL pop (all code paths)
- ss_hot_cold_box.h: ss_is_slab_empty() uses active when enabled

All tracking is ENV-gated: HAKMEM_TINY_ACTIVE_TRACK=1 to enable. Default is off for zero performance impact.

Invariant: active = used - tls_cached (active <= used)

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
// tiny_alloc_fast.inc.h - Box 5: Allocation Fast Path (3-4 instructions)
// Purpose: Ultra-fast TLS freelist pop (inspired by System tcache & Mid-Large HAKX +171%)
// Invariant: Hit rate > 95% → 3-4 instructions, Miss → refill from backend
// Design: "Simple Front + Smart Back" - Front is dumb & fast, Back is smart
//
// Box 5-NEW: SFC (Super Front Cache) Integration
// Architecture: SFC (Layer 0, 128-256 slots) → SLL (Layer 1, unlimited) → SuperSlab (Layer 2+)
// Cascade Refill: SFC ← SLL (one-way, safe)
// Goal: +200% performance (4.19M → 12M+ ops/s)
//
// Phase 2b: Adaptive TLS Cache Sizing
// Hot classes grow to 2048 slots, cold classes shrink to 16 slots
// Expected: +3-10% performance, -30-50% TLS cache memory overhead
#pragma once

#include "tiny_atomic.h"
#include "hakmem_tiny.h"
#include "tiny_route.h"
#include "tiny_alloc_fast_sfc.inc.h"     // Box 5-NEW: SFC Layer
#include "hakmem_tiny_fastcache.inc.h"   // Array stack (FastCache) for C0–C3
#include "hakmem_tiny_tls_list.h"        // TLS List (for tiny_fast_refill_and_take)
#include "tiny_region_id.h"              // Phase 7: Header-based class_idx lookup
#include "tiny_adaptive_sizing.h"        // Phase 2b: Adaptive sizing
#include "box/tls_sll_box.h"             // Box TLS-SLL: C7-safe push/pop/splice
#include "box/tiny_next_ptr_box.h"       // Box API: Next pointer read/write
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
#include "box/front_gate_box.h"
#endif
#include "hakmem_tiny_integrity.h"       // PRIORITY 1-4: Corruption detection
#ifdef HAKMEM_TINY_HEADER_CLASSIDX
// Ring Cache and Unified Cache removed (A/B test: OFF is faster)
#endif
#include "box/front_metrics_box.h"       // Phase 19-1: Frontend layer metrics
#include "front/tiny_heap_v2.h"          // Front-V2: TLS magazine (tcache-like) front
#include "hakmem_tiny_lazy_init.inc.h"   // Phase 22: Lazy per-class initialization
#include "box/tiny_sizeclass_hist_box.h" // Phase 3-4: Tiny size class histogram (ACE learning)
#include "box/ultra_slim_alloc_box.h"    // Phase 19-2: Ultra SLIM 4-layer fast path
#include <stdio.h>
#include <stdlib.h>                      // getenv()/atoi() used by the ENV gates below
#include <stdatomic.h>

// P1.3: Helper to increment meta->active when allocating from TLS SLL
// ENV gate: HAKMEM_TINY_ACTIVE_TRACK=1 to enable (default: 0 for performance)
static inline void tiny_active_track_alloc(void* base) {
    static __thread int g_active_track = -1;
    if (__builtin_expect(g_active_track == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_ACTIVE_TRACK");
        g_active_track = (e && *e && *e != '0') ? 1 : 0;
    }
    if (__builtin_expect(g_active_track, 0)) {
        extern SuperSlab* ss_fast_lookup(void* ptr);
        SuperSlab* ss = ss_fast_lookup(base);
        if (ss && ss->magic == SUPERSLAB_MAGIC) {
            int slab_idx = slab_index_for(ss, base);
            if (slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss)) {
                TinySlabMeta* meta = &ss->slabs[slab_idx];
                atomic_fetch_add_explicit(&meta->active, 1, memory_order_relaxed);
            }
        }
    }
}

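// Illustrative sketch (not compiled): the matching decrement on the free path.
// Per the commit notes, tiny_free_fast_v2.inc.h decrements meta->active when a
// block is pushed back onto the TLS SLL, preserving the invariant
// active = used - tls_cached (so active <= used). The helper name below is an
// assumption; it simply mirrors tiny_active_track_alloc() above.
#if 0
static inline void tiny_active_track_free(void* base) {
    static __thread int g_active_track = -1;
    if (__builtin_expect(g_active_track == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_ACTIVE_TRACK");
        g_active_track = (e && *e && *e != '0') ? 1 : 0;
    }
    if (__builtin_expect(g_active_track, 0)) {
        extern SuperSlab* ss_fast_lookup(void* ptr);
        SuperSlab* ss = ss_fast_lookup(base);
        if (ss && ss->magic == SUPERSLAB_MAGIC) {
            int slab_idx = slab_index_for(ss, base);
            if (slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss)) {
                TinySlabMeta* meta = &ss->slabs[slab_idx];
                atomic_fetch_sub_explicit(&meta->active, 1, memory_order_relaxed);
            }
        }
    }
}
#endif
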
// Diag counter: size>=1024 allocations routed to Tiny (env: HAKMEM_TINY_ALLOC_1024_METRIC)
extern _Atomic uint64_t g_tiny_alloc_ge1024[];
static inline void tiny_diag_track_size_ge1024_fast(size_t req_size, int class_idx) {
    if (__builtin_expect(req_size < 1024, 1)) return;
    static int s_metric_en = -1;
    if (__builtin_expect(s_metric_en == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_ALLOC_1024_METRIC");
        s_metric_en = (e && *e && *e != '0') ? 1 : 0;
    }
    if (!__builtin_expect(s_metric_en, 0)) return;
    if (__builtin_expect(class_idx >= 0 && class_idx < TINY_NUM_CLASSES, 1)) {
        atomic_fetch_add_explicit(&g_tiny_alloc_ge1024[class_idx], 1, memory_order_relaxed);
    }
}

// Phase 7 Task 2: Aggressive inline TLS cache access
// Enable with: make HEADER_CLASSIDX=1 AGGRESSIVE_INLINE=1
#ifndef HAKMEM_TINY_AGGRESSIVE_INLINE
#define HAKMEM_TINY_AGGRESSIVE_INLINE 0
#endif

#if HAKMEM_TINY_AGGRESSIVE_INLINE
#include "tiny_alloc_fast_inline.h"
#endif

// ========== Debug Counters (compile-time gated) ==========
#if HAKMEM_DEBUG_COUNTERS
// Refill-stage counters (defined in hakmem_tiny.c)
extern unsigned long long g_rf_total_calls[];
extern unsigned long long g_rf_hit_bench[];
extern unsigned long long g_rf_hit_hot[];
extern unsigned long long g_rf_hit_mail[];
extern unsigned long long g_rf_hit_slab[];
extern unsigned long long g_rf_hit_ss[];
extern unsigned long long g_rf_hit_reg[];
extern unsigned long long g_rf_mmap_calls[];

// Publish hits (defined in hakmem_tiny.c)
extern unsigned long long g_pub_mail_hits[];
extern unsigned long long g_pub_bench_hits[];
extern unsigned long long g_pub_hot_hits[];

// Free pipeline (defined in hakmem_tiny.c)
extern unsigned long long g_free_via_tls_sll[];
#endif

// ========== Box 5: Allocation Fast Path ==========
// Box Theory's Fast Allocation layer: pop directly from the TLS freelist (3-4 instructions).
// Invariants:
//   - If the TLS freelist is non-empty, return immediately (no lock, no sync)
//   - On miss, delegate to the Backend (Box 3: SuperSlab)
//   - Cross-thread allocation is not handled here (the Backend takes care of it)

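// Illustrative sketch (not compiled): the bare singly-linked TLS pop that the
// invariants above describe. Type and variable names here are hypothetical;
// the real fast path uses g_tls_sll[] and tls_sll_pop() below.
#if 0
typedef struct DemoNode { struct DemoNode* next; } DemoNode;
static __thread DemoNode* g_demo_head[TINY_NUM_CLASSES];
static inline void* demo_tls_pop(int class_idx) {
    DemoNode* n = g_demo_head[class_idx];   // 1. load TLS head
    if (!n) return NULL;                     // 2. empty → miss (caller refills)
    g_demo_head[class_idx] = n->next;        // 3. unlink head
    return n;                                // hit: no lock, no atomics
}
#endif
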
// External TLS variables (defined in hakmem_tiny.c)
// Phase 3d-B: TLS Cache Merge - Unified TLS SLL structure
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];

// External backend functions
// P0 Fix: Use appropriate refill function based on P0 status
#if HAKMEM_TINY_P0_BATCH_REFILL
extern int sll_refill_batch_from_ss(int class_idx, int max_take);
#else
extern int sll_refill_small_from_ss(int class_idx, int max_take);
#endif
extern void* hak_tiny_alloc_slow(size_t size, int class_idx);
extern int hak_tiny_size_to_class(size_t size);
extern int tiny_refill_failfast_level(void);
extern const size_t g_tiny_class_sizes[];
// Global Front refill config (parsed at init; defined in hakmem_tiny.c)
extern int g_refill_count_global;
extern int g_refill_count_hot;
extern int g_refill_count_mid;
extern int g_refill_count_class[TINY_NUM_CLASSES];

// HAK_RET_ALLOC macro is now defined in core/hakmem_tiny.c
// See lines 116-152 there for the single definition point based on HAKMEM_TINY_HEADER_CLASSIDX

// ========== RDTSC Profiling (lightweight) ==========
#ifdef __x86_64__
static inline uint64_t tiny_fast_rdtsc(void) {
    unsigned int lo, hi;
    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
    return ((uint64_t)hi << 32) | lo;
}
#else
static inline uint64_t tiny_fast_rdtsc(void) { return 0; }
#endif

// Per-thread profiling counters (enable with HAKMEM_TINY_PROFILE=1)
static __thread uint64_t g_tiny_alloc_hits = 0;
static __thread uint64_t g_tiny_alloc_cycles = 0;
static __thread uint64_t g_tiny_refill_calls = 0;
static __thread uint64_t g_tiny_refill_cycles = 0;
static int g_tiny_profile_enabled = -1; // -1: uninitialized

static inline int tiny_profile_enabled(void) {
    if (__builtin_expect(g_tiny_profile_enabled == -1, 0)) {
        const char* env = getenv("HAKMEM_TINY_PROFILE");
        g_tiny_profile_enabled = (env && *env && *env != '0') ? 1 : 0;
    }
    return g_tiny_profile_enabled;
}

// Print profiling results at exit
static void tiny_fast_print_profile(void) __attribute__((destructor));
static void tiny_fast_print_profile(void) {
    if (!tiny_profile_enabled()) return;
    if (g_tiny_alloc_hits == 0 && g_tiny_refill_calls == 0) return;

    fprintf(stderr, "\n========== Box Theory Fast Path Profile ==========\n");
    if (g_tiny_alloc_hits > 0) {
        fprintf(stderr, "[ALLOC HIT] count=%lu, avg_cycles=%lu\n",
                (unsigned long)g_tiny_alloc_hits,
                (unsigned long)(g_tiny_alloc_cycles / g_tiny_alloc_hits));
    }
    if (g_tiny_refill_calls > 0) {
        fprintf(stderr, "[REFILL] count=%lu, avg_cycles=%lu\n",
                (unsigned long)g_tiny_refill_calls,
                (unsigned long)(g_tiny_refill_cycles / g_tiny_refill_calls));
    }
    fprintf(stderr, "===================================================\n\n");
}

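// Usage example (assumption: the destructor runs at normal process exit):
// run any workload with HAKMEM_TINY_PROFILE=1 and the profile above is printed
// to stderr, e.g. lines of the form
//   [ALLOC HIT] count=123456, avg_cycles=12
//   [REFILL] count=789, avg_cycles=450
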
// ========== Front-V2 helpers (tcache-like TLS magazine) ==========
static inline int tiny_heap_v2_stats_enabled(void) {
    static int enabled = -1;
    if (__builtin_expect(enabled == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_HEAP_V2_STATS");
        enabled = (e && *e && *e != '0') ? 1 : 0;
    }
    return enabled;
}

// TLS HeapV2 initialization barrier (ensures mag->top is zero on first use)
static inline void tiny_heap_v2_ensure_init(void) {
    extern __thread int g_tls_heap_v2_initialized;
    extern __thread TinyHeapV2Mag g_tiny_heap_v2_mag[];

    if (__builtin_expect(!g_tls_heap_v2_initialized, 0)) {
        for (int i = 0; i < TINY_NUM_CLASSES; i++) {
            g_tiny_heap_v2_mag[i].top = 0;
        }
        g_tls_heap_v2_initialized = 1;
    }
}

static inline int tiny_heap_v2_refill_mag(int class_idx) {
    // FIX: Ensure TLS is initialized before first magazine access
    tiny_heap_v2_ensure_init();
    if (class_idx < 0 || class_idx > 3) return 0;
    if (!tiny_heap_v2_class_enabled(class_idx)) return 0;

    extern int g_tls_sll_enable;
    if (!g_tls_sll_enable) return 0;

    TinyHeapV2Mag* mag = &g_tiny_heap_v2_mag[class_idx];
    const int cap = TINY_HEAP_V2_MAG_CAP;
    int filled = 0;

    // FIX: Validate mag->top before use (prevent uninitialized TLS corruption)
    if (mag->top < 0 || mag->top > cap) {
        static __thread int s_reset_logged[TINY_NUM_CLASSES] = {0};
        if (!s_reset_logged[class_idx]) {
            fprintf(stderr, "[HEAP_V2_REFILL] C%d mag->top=%d corrupted, reset to 0\n",
                    class_idx, mag->top);
            s_reset_logged[class_idx] = 1;
        }
        mag->top = 0;
    }

    // First, steal from TLS SLL if already available.
    while (mag->top < cap) {
        void* base = NULL;
        if (!tls_sll_pop(class_idx, &base)) break;
        mag->items[mag->top++] = base;
        filled++;
    }

    // If magazine is still empty, ask backend to refill SLL once, then steal again.
    if (mag->top < cap && filled == 0) {
#if HAKMEM_TINY_P0_BATCH_REFILL
        (void)sll_refill_batch_from_ss(class_idx, cap);
#else
        (void)sll_refill_small_from_ss(class_idx, cap);
#endif
        while (mag->top < cap) {
            void* base = NULL;
            if (!tls_sll_pop(class_idx, &base)) break;
            mag->items[mag->top++] = base;
            filled++;
        }
    }

    if (__builtin_expect(tiny_heap_v2_stats_enabled(), 0)) {
        if (filled > 0) {
            g_tiny_heap_v2_stats[class_idx].refill_calls++;
            g_tiny_heap_v2_stats[class_idx].refill_blocks += (uint64_t)filled;
        }
    }
    return filled;
}

static inline void* tiny_heap_v2_alloc_by_class(int class_idx) {
    // FIX: Ensure TLS is initialized before first magazine access
    tiny_heap_v2_ensure_init();
    if (class_idx < 0 || class_idx > 3) return NULL;
    if (!tiny_heap_v2_enabled()) return NULL;
    if (!tiny_heap_v2_class_enabled(class_idx)) return NULL;

    TinyHeapV2Mag* mag = &g_tiny_heap_v2_mag[class_idx];

    // Hit: magazine has entries
    if (__builtin_expect(mag->top > 0, 1)) {
        // FIX: Add underflow protection before array access
        const int cap = TINY_HEAP_V2_MAG_CAP;
        if (mag->top > cap || mag->top < 0) {
            static __thread int s_reset_logged[TINY_NUM_CLASSES] = {0};
            if (!s_reset_logged[class_idx]) {
                fprintf(stderr, "[HEAP_V2_ALLOC] C%d mag->top=%d corrupted, reset to 0\n",
                        class_idx, mag->top);
                s_reset_logged[class_idx] = 1;
            }
            mag->top = 0;
            return NULL; // Fall through to refill path
        }
        if (__builtin_expect(tiny_heap_v2_stats_enabled(), 0)) {
            g_tiny_heap_v2_stats[class_idx].alloc_calls++;
            g_tiny_heap_v2_stats[class_idx].mag_hits++;
        }
        return mag->items[--mag->top];
    }

    // Miss: try single refill from SLL/backend
    int filled = tiny_heap_v2_refill_mag(class_idx);
    if (filled > 0 && mag->top > 0) {
        if (__builtin_expect(tiny_heap_v2_stats_enabled(), 0)) {
            g_tiny_heap_v2_stats[class_idx].alloc_calls++;
            g_tiny_heap_v2_stats[class_idx].mag_hits++;
        }
        return mag->items[--mag->top];
    }

    if (__builtin_expect(tiny_heap_v2_stats_enabled(), 0)) {
        g_tiny_heap_v2_stats[class_idx].backend_oom++;
    }
    return NULL;
}

// ========== Fast Path: TLS Freelist Pop (3-4 instructions) ==========

// External SFC control (defined in hakmem_tiny_sfc.c)
extern int g_sfc_enabled;

// Allocation fast path (inline for zero-cost)
// Returns: pointer on success, NULL on miss (caller should try refill/slow)
//
// Box 5-NEW Architecture:
//   Layer 0: SFC (128-256 slots, high hit rate) [if enabled]
//   Layer 1: SLL (unlimited, existing)
//   Cascade: SFC miss → try SLL → refill
//
// Assembly (x86-64, optimized):
//   mov rax, QWORD PTR g_sfc_head[class_idx]       ; SFC: Load head
//   test rax, rax                                  ; Check NULL
//   jne .sfc_hit                                   ; If not empty, SFC hit!
//   mov rax, QWORD PTR g_tls_sll_head[class_idx]   ; SLL: Load head
//   test rax, rax                                  ; Check NULL
//   je .miss                                       ; If empty, miss
//   mov rdx, QWORD PTR [rax]                       ; Load next
//   mov QWORD PTR g_tls_sll_head[class_idx], rdx   ; Update head
//   ret                                            ; Return ptr
// .sfc_hit:
//   mov rdx, QWORD PTR [rax]                       ; Load next
//   mov QWORD PTR g_sfc_head[class_idx], rdx       ; Update head
//   ret
// .miss:
//   ; Fall through to refill
//
// Expected: 3-4 instructions on SFC hit, 6-8 on SLL hit
static inline void* tiny_alloc_fast_pop(int class_idx) {
    // PRIORITY 1: Bounds check before any TLS array access
    HAK_CHECK_CLASS_IDX(class_idx, "tiny_alloc_fast_pop");
#if !HAKMEM_BUILD_RELEASE
    // Phase 3: Debug counters eliminated in release builds
    atomic_fetch_add(&g_integrity_check_class_bounds, 1);

    // DEBUG: Log class 2 pops (DISABLED for performance)
    static _Atomic uint64_t g_fast_pop_count = 0;
    uint64_t pop_call = atomic_fetch_add(&g_fast_pop_count, 1);
    if (0 && class_idx == 2 && pop_call > 5840 && pop_call < 5900) {
        fprintf(stderr, "[FAST_POP_C2] call=%lu cls=%d head=%p count=%u\n",
                pop_call, class_idx, g_tls_sll[class_idx].head, g_tls_sll[class_idx].count);
        fflush(stderr);
    }
#endif

    // Phase E1-CORRECT: C7 now has headers, can use fast path
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
    void* out = NULL;
    if (front_gate_try_pop(class_idx, &out)) {
        return out;
    }
    return NULL;
#else
    // ========== Phase 19-1: Quick Prune (Frontend SLIM mode) ==========
    // ENV: HAKMEM_TINY_FRONT_SLIM=1
    // Goal: Skip FastCache + SFC layers, go straight to SLL (88-99% hit rate)
    // Expected: 22M → 27-30M ops/s (+22-36%)
    static __thread int g_front_slim_checked = 0;
    static __thread int g_front_slim_enabled = 0;

    if (__builtin_expect(!g_front_slim_checked, 0)) {
        const char* e = getenv("HAKMEM_TINY_FRONT_SLIM");
        g_front_slim_enabled = (e && *e && *e != '0') ? 1 : 0;
        g_front_slim_checked = 1;
    }

    // SLIM MODE: Skip FastCache + SFC, go straight to SLL
    if (__builtin_expect(g_front_slim_enabled, 0)) {
        // Box Boundary: TLS SLL freelist pop (only layer in SLIM mode)
        extern int g_tls_sll_enable;
        if (__builtin_expect(g_tls_sll_enable, 1)) {
            void* base = NULL;
            if (tls_sll_pop(class_idx, &base)) {
                // Front Gate: SLL hit (SLIM fast path - 3 instructions)
                extern unsigned long long g_front_sll_hit[];
                g_front_sll_hit[class_idx]++;
                // P1.3: Track active when allocating from TLS SLL
                tiny_active_track_alloc(base);
                return base;
            }
        }
        // SLIM mode miss → return NULL (caller refills)
        return NULL;
    }
    // ========== End Phase 19-1: Quick Prune ==========

    // Phase 7 Task 3: Profiling overhead removed in release builds
    // In release mode, compiler can completely eliminate profiling code
#if !HAKMEM_BUILD_RELEASE
    uint64_t start = tiny_profile_enabled() ? tiny_fast_rdtsc() : 0;
#endif

    // Phase 1: Try array stack (FastCache) first for hottest tiny classes (C0–C3)
    if (__builtin_expect(g_fastcache_enable && class_idx <= 3, 1)) {
        void* fc = fastcache_pop(class_idx);
        if (__builtin_expect(fc != NULL, 1)) {
            // Frontend FastCache hit (already tracked by g_front_fc_hit)
            extern unsigned long long g_front_fc_hit[];
            g_front_fc_hit[class_idx]++;
            return fc;
        } else {
            // Frontend FastCache miss (already tracked by g_front_fc_miss)
            extern unsigned long long g_front_fc_miss[];
            g_front_fc_miss[class_idx]++;
        }
    }

    // Box 5-NEW: Layer 0 - Try SFC first (if enabled)
    // Cache g_sfc_enabled in TLS to avoid global load on every allocation
    static __thread int sfc_check_done = 0;
    static __thread int sfc_is_enabled = 0;
    if (__builtin_expect(!sfc_check_done, 0)) {
        sfc_is_enabled = g_sfc_enabled;
        sfc_check_done = 1;
    }

    if (__builtin_expect(sfc_is_enabled, 1)) {
        void* base = sfc_alloc(class_idx);
        if (__builtin_expect(base != NULL, 1)) {
            // Front Gate: SFC hit
            extern unsigned long long g_front_sfc_hit[];
            g_front_sfc_hit[class_idx]++;
            // 🚀 SFC HIT! (Layer 0)
#if !HAKMEM_BUILD_RELEASE
            if (start) {
                g_tiny_alloc_cycles += (tiny_fast_rdtsc() - start);
                g_tiny_alloc_hits++;
            }
#endif
            // ✅ FIX #16: Return BASE pointer (not USER)
            // Caller (tiny_alloc_fast) will call HAK_RET_ALLOC → tiny_region_id_write_header
            // which does the BASE → USER conversion. Double conversion was causing corruption!
            return base;
        }
        // SFC miss → try SLL (Layer 1)
    }

    // Box Boundary: Layer 1 - pop the head of the TLS SLL freelist (can be disabled via env)
    // Note: This is in tiny_alloc_fast_pop(), not tiny_alloc_fast(), so use global variable
    extern int g_tls_sll_enable;
    if (__builtin_expect(g_tls_sll_enable, 1)) {
        // Use Box TLS-SLL API (C7-safe pop)
        // CRITICAL: Pop FIRST, do NOT read g_tls_sll_head directly (race condition!)
        // Reading head before pop causes stale read → rbp=0xa0 SEGV
        void* base = NULL;
        if (tls_sll_pop(class_idx, &base)) {
            // Front Gate: SLL hit (fast path 3 instructions)
            extern unsigned long long g_front_sll_hit[];
            g_front_sll_hit[class_idx]++;

            // P1.3: Track active when allocating from TLS SLL
            tiny_active_track_alloc(base);

#if HAKMEM_DEBUG_COUNTERS
            // Track TLS freelist hits (compile-time gated, zero runtime cost when disabled)
            g_free_via_tls_sll[class_idx]++;
#endif

#if !HAKMEM_BUILD_RELEASE
            // Debug: Track profiling (release builds skip this overhead)
            if (start) {
                g_tiny_alloc_cycles += (tiny_fast_rdtsc() - start);
                g_tiny_alloc_hits++;
            }
#endif
            // ✅ FIX #16: Return BASE pointer (not USER)
            // Caller (tiny_alloc_fast) will call HAK_RET_ALLOC → tiny_region_id_write_header
            // which does the BASE → USER conversion. Double conversion was causing corruption!
            return base;
        }
    }

    // Fast path miss → NULL (caller should refill)
    return NULL;
#endif
}

// ========== Cascade Refill: SFC ← SLL (Box Theory boundary) ==========

// Cascade refill: Transfer blocks from SLL to SFC (one-way, safe)
// Returns: number of blocks transferred
//
// Contract:
//   - Transfer ownership: SLL → SFC
//   - No circular dependency: one-way only
//   - Boundary clear: SLL pop → SFC push
//   - Fallback safe: if SFC full, stop (no overflow)
// Env-driven cascade percentage (0-100), default 50%
static inline int sfc_cascade_pct(void) {
    static int pct = -1;
    if (__builtin_expect(pct == -1, 0)) {
        const char* e = getenv("HAKMEM_SFC_CASCADE_PCT");
        int v = e && *e ? atoi(e) : 50;
        if (v < 0) v = 0;
        if (v > 100) v = 100;
        pct = v;
    }
    return pct;
}

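// Worked example (illustrative numbers): with HAKMEM_SFC_CASCADE_PCT=25 and a
// target_count of 32, sfc_refill_from_sll() below computes want = 32 * 25 / 100 = 8,
// so at most 8 blocks move from the SLL to the SFC; with the default 50% it would be 16.
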
static inline int sfc_refill_from_sll(int class_idx, int target_count) {
    // PRIORITY 1: Bounds check
    HAK_CHECK_CLASS_IDX(class_idx, "sfc_refill_from_sll");
#if !HAKMEM_BUILD_RELEASE
    atomic_fetch_add(&g_integrity_check_class_bounds, 1);
#endif

    int transferred = 0;
    uint32_t cap = g_sfc_capacity[class_idx];

    // Adjust target based on cascade percentage
    int pct = sfc_cascade_pct();
    int want = (target_count * pct) / 100;
    if (want <= 0) want = target_count / 2; // safety fallback

    while (transferred < want && g_tls_sll[class_idx].count > 0) {
        // Check SFC capacity before transfer
        if (g_sfc_count[class_idx] >= cap) {
            break; // SFC full, stop
        }

        // Pop from SLL (Layer 1) using Box TLS-SLL API (C7-safe)
        void* ptr = NULL;
        if (!tls_sll_pop(class_idx, &ptr)) {
            break; // SLL empty
        }

        // Push to SFC (Layer 0) — header-aware
        tiny_next_write(class_idx, ptr, g_sfc_head[class_idx]);
        g_sfc_head[class_idx] = ptr;
        g_sfc_count[class_idx]++;

        transferred++;
    }

    return transferred;
}

// ========== Refill Path: Backend Integration ==========

// Refill TLS freelist from backend (SuperSlab/ACE/Learning layer)
// Returns: number of blocks refilled
//
// Box 5-NEW Architecture:
//   SFC enabled: SuperSlab → SLL → SFC (cascade)
//   SFC disabled: SuperSlab → SLL (direct, old path)
//
// This integrates with existing HAKMEM infrastructure:
//   - SuperSlab provides memory chunks
//   - ACE provides adaptive capacity learning
//   - L25 provides mid-large integration
//
// Refill count is tunable via HAKMEM_TINY_REFILL_COUNT (default: 16)
//   - Smaller count (8-16): better for diverse workloads, faster warmup
//   - Larger count (64-128): better for homogeneous workloads, fewer refills
static inline int tiny_alloc_fast_refill(int class_idx) {
    // Phase E1-CORRECT: C7 now has headers, can use refill

    // Phase 7 Task 3: Profiling overhead removed in release builds
    // In release mode, compiler can completely eliminate profiling code
#if !HAKMEM_BUILD_RELEASE
    uint64_t start = tiny_profile_enabled() ? tiny_fast_rdtsc() : 0;
#endif

    // Phase 2b: Check available capacity before refill
    int available_capacity = get_available_capacity(class_idx);
    if (available_capacity <= 0) {
        // Cache is full, don't refill
        return 0;
    }

    // Phase 7 Task 3: Simplified refill count (cached per-class in TLS)
    // Previous: Complex precedence logic on every miss (5-10 cycles overhead)
    // Now: Simple TLS cache lookup (1-2 cycles)
    static __thread int s_refill_count[TINY_NUM_CLASSES] = {0};
    // Simple adaptive booster: bump per-class refill size when refills are frequent.
    static __thread uint8_t s_refill_calls[TINY_NUM_CLASSES] = {0};
    int cnt = s_refill_count[class_idx];
    if (__builtin_expect(cnt == 0, 0)) {
        // First miss: Initialize from globals (parsed at init time)
        int v = HAKMEM_TINY_REFILL_DEFAULT; // Default from hakmem_build_flags.h

        // Precedence: per-class > hot/mid > global
        if (g_refill_count_class[class_idx] > 0) {
            v = g_refill_count_class[class_idx];
        } else if (class_idx <= 3 && g_refill_count_hot > 0) {
            v = g_refill_count_hot;
        } else if (class_idx >= 4 && g_refill_count_mid > 0) {
            v = g_refill_count_mid;
        } else if (g_refill_count_global > 0) {
            v = g_refill_count_global;
        }

        // Clamp to sane range (min: 8, max: 256)
        if (v < 8) v = 8;     // Minimum: avoid thrashing
        if (v > 256) v = 256; // Maximum: avoid excessive TLS memory

        s_refill_count[class_idx] = v;
        cnt = v;
    }

    // Phase 2b: Clamp refill count to available capacity
    if (cnt > available_capacity) {
        cnt = available_capacity;
    }

#if HAKMEM_DEBUG_COUNTERS
    // Track refill calls (compile-time gated)
    g_rf_total_calls[class_idx]++;
#endif

    // Box Boundary: Delegate to Backend (Box 3: SuperSlab)
    // Refill Dispatch: Standard (ss_refill_fc_fill) vs Legacy SLL (A/B only)
    // Standard: Enabled by FRONT_DIRECT=1, REFILL_BATCH=1, or P0_DIRECT_FC_ALL=1
    // Legacy: Fallback for compatibility (will be deprecated)
    int refilled = 0;

    // The Front-Direct A/B implementation is not supported at the current HEAD.
    // Always use the legacy path (SS → SLL → FC).
#if HAKMEM_TINY_P0_BATCH_REFILL
    refilled = sll_refill_batch_from_ss(class_idx, cnt);
#else
    refilled = sll_refill_small_from_ss(class_idx, cnt);
#endif

    // Lightweight adaptation: if refills keep happening, increase per-class refill.
    // Focus on class 7 (1024B) to reduce mmap/refill frequency under Tiny-heavy loads.
    if (refilled > 0) {
        uint8_t c = ++s_refill_calls[class_idx];
        if (class_idx == 7) {
            // Every 4 refills, increase target by +16 up to 128 (unless overridden).
            if ((c & 0x03u) == 0) {
                int target = s_refill_count[class_idx];
                if (target < 128) {
                    target += 16;
                    if (target > 128) target = 128;
                    s_refill_count[class_idx] = target;
                }
            }
        }
    } else {
        // No refill performed (capacity full): slowly decay the counter.
        if (s_refill_calls[class_idx] > 0) s_refill_calls[class_idx]--;
    }

    // Phase 2b: Track refill and adapt cache size
    if (refilled > 0) {
        track_refill_for_adaptation(class_idx);
    }

    // Box 5-NEW: Cascade refill SFC ← SLL (opt-in via HAKMEM_TINY_SFC_CASCADE, off by default)
    static __thread int sfc_cascade_enabled = -1;
    if (__builtin_expect(sfc_cascade_enabled == -1, 0)) {
        // Check ENV flag (default: OFF)
        const char* e = getenv("HAKMEM_TINY_SFC_CASCADE");
        sfc_cascade_enabled = (e && *e && *e != '0') ? 1 : 0;
    }

    // Only cascade if explicitly enabled AND we have refilled blocks in SLL
    if (sfc_cascade_enabled && g_sfc_enabled && refilled > 0) {
        // Transfer half of refilled blocks to SFC (keep half in SLL for future)
        int sfc_target = refilled / 2;
        if (sfc_target > 0) {
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
            front_gate_after_refill(class_idx, refilled);
#else
            int transferred = sfc_refill_from_sll(class_idx, sfc_target);
            (void)transferred; // Unused, but could track stats
#endif
        }
    }

#if !HAKMEM_BUILD_RELEASE
    // Debug: Track profiling (release builds skip this overhead)
    if (start) {
        g_tiny_refill_cycles += (tiny_fast_rdtsc() - start);
        g_tiny_refill_calls++;
    }
#endif

    return refilled;
}

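// Worked example of the refill-count precedence above (illustrative values):
// with no per-class override, g_refill_count_hot=32 and g_refill_count_global=16,
// classes 0-3 refill 32 blocks per miss while classes 4-7 fall back to the global 16;
// for class 7 the adaptive booster can then raise the cached count in +16 steps up to 128.
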
// ========== Combined Fast Path (Alloc + Refill) ==========

// Complete fast path allocation (inline for zero-cost)
// Returns: pointer on success, NULL on failure (OOM or size too large)
//
// Flow:
//   1. TLS freelist pop (3-4 instructions) - Hit rate ~95%
//   2. Miss → Refill from backend (~5% of cases)
//   3. Refill success → Retry pop
//   4. Refill failure → Slow path (OOM or new SuperSlab allocation)
//
// Example usage:
//   void* ptr = tiny_alloc_fast(64);
//   if (!ptr) {
//     // OOM handling
//   }
static inline void* tiny_alloc_fast(size_t size) {
#if !HAKMEM_BUILD_RELEASE
    // Phase 3: Debug counters eliminated in release builds
    static _Atomic uint64_t alloc_call_count = 0;
    uint64_t call_num = atomic_fetch_add(&alloc_call_count, 1);
#endif

    // Phase 22: Global init (once per process)
    lazy_init_global();

    // ========== Phase 19-2: Ultra SLIM 4-Layer Fast Path ==========
    // ENV: HAKMEM_TINY_ULTRA_SLIM=1
    // Expected: 90-110M ops/s (mimalloc parity)
    // Architecture: Init Safety + Size-to-Class + ACE Learning + TLS SLL Direct
    // Note: ACE learning preserved (HAKMEM's differentiator vs mimalloc)

    // Debug: Check if Ultra SLIM is enabled (first call only)
    static __thread int debug_checked = 0;
    if (!debug_checked) {
        int enabled = ultra_slim_mode_enabled();
        if (enabled) {
            fprintf(stderr, "[TINY_ALLOC_FAST] Ultra SLIM gate: ENABLED (will use 4-layer path)\n");
        } else {
            fprintf(stderr, "[TINY_ALLOC_FAST] Ultra SLIM gate: DISABLED (will use standard path)\n");
        }
        debug_checked = 1;
    }

    if (__builtin_expect(ultra_slim_mode_enabled(), 0)) {
        return ultra_slim_alloc_with_refill(size);
    }
    // ========== End Phase 19-2: Ultra SLIM ==========

    // 1. Size → class index (inline, fast)
    int class_idx = hak_tiny_size_to_class(size);

    if (__builtin_expect(class_idx < 0, 0)) {
        return NULL; // Size > 1KB, not Tiny
    }

    // Phase 3c L1D Opt: Prefetch TLS cache head early
    // Phase 3d-B: Prefetch unified TLS SLL struct (single prefetch for both head+count)
    __builtin_prefetch(&g_tls_sll[class_idx], 0, 3);

    // Phase 22: Lazy per-class init (on first use)
    lazy_init_class(class_idx);

    // Phase 3-4: Record allocation for ACE Profile learning
    // TLS increment only (no atomic operation, amortized flush at threshold)
    tiny_sizeclass_hist_hit(class_idx);

    // P0.1: Cache g_tls_sll_enable once (Phase 3-4 instruction reduction)
    // Eliminates redundant global variable reads (2-3 instructions saved)
    extern int g_tls_sll_enable;
    const int sll_enabled = g_tls_sll_enable;

#if !HAKMEM_BUILD_RELEASE
    // Phase 3: Debug checks eliminated in release builds
    // CRITICAL: Bounds check to catch corruption
    if (__builtin_expect(class_idx >= TINY_NUM_CLASSES, 0)) {
        fprintf(stderr, "[TINY_ALLOC_FAST] FATAL: class_idx=%d out of bounds! size=%zu call=%lu\n",
                class_idx, size, call_num);
        fflush(stderr);
        abort();
    }

    // Debug logging (DISABLED for performance)
    if (0 && call_num > 14250 && call_num < 14280) {
        fprintf(stderr, "[TINY_ALLOC] call=%lu size=%zu class=%d sll_head[%d]=%p count=%u\n",
                call_num, size, class_idx, class_idx,
                g_tls_sll[class_idx].head, g_tls_sll[class_idx].count);
        fflush(stderr);
    }
#endif

    ROUTE_BEGIN(class_idx);

    void* ptr = NULL;

    // Front-V2: TLS magazine front (A/B, default OFF)
    if (__builtin_expect(tiny_heap_v2_enabled() && front_prune_heapv2_enabled() && class_idx <= 3, 0)) {
        void* hv2 = tiny_heap_v2_alloc_by_class(class_idx);
        if (hv2) {
            front_metrics_heapv2_hit(class_idx);
            tiny_diag_track_size_ge1024_fast(size, class_idx);
            HAK_RET_ALLOC(class_idx, hv2);
        } else {
            front_metrics_heapv2_miss(class_idx);
        }
    }

    // Generic front (FastCache/SFC/SLL)
    // Respect SLL global toggle
    if (__builtin_expect(g_tls_sll_enable, 1)) {
        // For classes 0..3 keep ultra-inline POP; for >=4 use safe Box POP to avoid UB on bad heads.
        if (class_idx <= 3) {
#if HAKMEM_TINY_INLINE_SLL
            // Experimental: Inline SLL pop (A/B only, requires HAKMEM_TINY_INLINE_SLL=1)
            TINY_ALLOC_FAST_POP_INLINE(class_idx, ptr);
#else
            // Default: Safe Box API (Box TLS-SLL) for all standard builds
            ptr = tiny_alloc_fast_pop(class_idx);
#endif
        } else {
            void* base = NULL;
            if (tls_sll_pop(class_idx, &base)) {
                // P1.3: Track active when allocating from TLS SLL
                tiny_active_track_alloc(base);
                ptr = base;
            } else {
                ptr = NULL;
            }
        }
    } else {
        ptr = NULL; // SLL disabled OR Front-Direct active → bypass SLL
    }

    // Phase 3c L1D Opt: Prefetch next freelist entry if we got a pointer
    if (__builtin_expect(ptr != NULL, 1)) {
        __builtin_prefetch(ptr, 0, 3);
    }

    if (__builtin_expect(ptr != NULL, 1)) {
        tiny_diag_track_size_ge1024_fast(size, class_idx);
        HAK_RET_ALLOC(class_idx, ptr);
    }

    // Refill to TLS List/SLL
    extern __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
    void* took = tiny_fast_refill_and_take(class_idx, &g_tls_lists[class_idx]);
    if (took) {
        tiny_diag_track_size_ge1024_fast(size, class_idx);
        HAK_RET_ALLOC(class_idx, took);
    }

    // Retry after a backend refill
    {
        int refilled = tiny_alloc_fast_refill(class_idx);
        if (__builtin_expect(refilled > 0, 1)) {
            // Retry SLL if enabled (P0.1: using cached sll_enabled)
            if (__builtin_expect(sll_enabled, 1)) {
                if (class_idx <= 3) {
#if HAKMEM_TINY_INLINE_SLL
                    // Experimental: Inline SLL pop (A/B only, requires HAKMEM_TINY_INLINE_SLL=1)
                    TINY_ALLOC_FAST_POP_INLINE(class_idx, ptr);
#else
                    // Default: Safe Box API (Box TLS-SLL) for all standard builds
                    ptr = tiny_alloc_fast_pop(class_idx);
#endif
                } else {
                    void* base2 = NULL;
                    if (tls_sll_pop(class_idx, &base2)) {
                        // P1.3: Track active when allocating from TLS SLL
                        tiny_active_track_alloc(base2);
                        ptr = base2;
                    } else {
                        ptr = NULL;
                    }
                }
            } else {
                ptr = NULL; // SLL disabled OR Front-Direct active → bypass SLL
            }
            if (ptr) {
                tiny_diag_track_size_ge1024_fast(size, class_idx);
                HAK_RET_ALLOC(class_idx, ptr);
            }
        }
    }

    // 5. Refill failure or still empty → slow path (OOM or new SuperSlab)
    // Box Boundary: Delegate to Slow Path (Box 3 backend)
    ptr = hak_tiny_alloc_slow(size, class_idx);
    if (ptr) {
        tiny_diag_track_size_ge1024_fast(size, class_idx);
        HAK_RET_ALLOC(class_idx, ptr);
    }

    return ptr; // NULL if OOM
}

// ========== Push to TLS Freelist (for free path) ==========

// Push block to TLS freelist (used by free fast path)
// This is a "helper" for Box 6 (Free Fast Path)
//
// Invariant: ptr must belong to current thread (no ownership check here)
// Caller (Box 6) is responsible for ownership verification
static inline void tiny_alloc_fast_push(int class_idx, void* ptr) {
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
    front_gate_push_tls(class_idx, ptr);
#else
    // Box Boundary: Push to TLS freelist using Box TLS-SLL API (C7-safe)
    uint32_t capacity = UINT32_MAX; // Unlimited for helper function
    if (!tls_sll_push(class_idx, ptr, capacity)) {
        // C7 rejected or SLL somehow full (should not happen)
        // In release builds, this is a no-op (caller expects success)
#if !HAKMEM_BUILD_RELEASE
        fprintf(stderr, "[WARN] tls_sll_push failed in tiny_alloc_fast_push cls=%d ptr=%p\n",
                class_idx, ptr);
#endif
    }
#endif
}

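// Usage sketch (hypothetical caller): Box 6, after verifying the block's class
// and that it is owned by the current thread, hands the block back with
//   tiny_alloc_fast_push(class_idx, base);
// No further synchronization is needed because the target SLL is thread-local.
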
// ========== Statistics & Diagnostics ==========

// Get TLS freelist stats (for debugging/profiling)
typedef struct {
    int class_idx;
    void* head;
    uint32_t count;
} TinyAllocFastStats;

static inline TinyAllocFastStats tiny_alloc_fast_stats(int class_idx) {
    TinyAllocFastStats stats = {
        .class_idx = class_idx,
        .head = g_tls_sll[class_idx].head,
        .count = g_tls_sll[class_idx].count
    };
    return stats;
}

// Reset TLS freelist (for testing/benchmarking)
// WARNING: This leaks memory! Only use in controlled test environments.
static inline void tiny_alloc_fast_reset(int class_idx) {
    g_tls_sll[class_idx].head = NULL;
    g_tls_sll[class_idx].count = 0;
}

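// Illustrative sketch (not compiled): dumping the per-class TLS freelist state
// from a test or benchmark using the helpers above. The dump function name is
// hypothetical; it relies only on tiny_alloc_fast_stats() and TINY_NUM_CLASSES.
#if 0
static void demo_dump_tls_freelists(void) {
    for (int c = 0; c < TINY_NUM_CLASSES; c++) {
        TinyAllocFastStats s = tiny_alloc_fast_stats(c);
        fprintf(stderr, "class %d: head=%p count=%u\n", s.class_idx, s.head, s.count);
    }
}
#endif
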
// ========== Performance Notes ==========
//
// Expected metrics (based on System tcache & HAKX +171% results):
//   - Fast path hit rate: 95%+ (workload dependent)
//   - Fast path latency: 3-4 instructions (1-2 cycles on modern CPUs)
//   - Miss penalty: ~20-50 instructions (refill from SuperSlab)
//   - Throughput improvement: +10-25% vs current multi-layer design
//
// Key optimizations:
//   1. `__builtin_expect` for branch prediction (hot path first)
//   2. `static inline` for zero-cost abstraction
//   3. TLS variables (no atomic ops, no locks)
//   4. Minimal work in fast path (defer stats/accounting to backend)
//
// Comparison with current design:
//   - Current: 20+ instructions (Magazine → SuperSlab → ACE → ...)
//   - New: 3-4 instructions (TLS freelist pop only)
//   - Reduction: -80% instructions in hot path
//
// Inspired by:
//   - System tcache (glibc malloc) - 3-4 instruction fast path
//   - HAKX Mid-Large (+171%) - "Simple Front + Smart Back"
//   - Box Theory - Clear boundaries, minimal coupling