// tiny_alloc_fast.inc.h - Box 5: Allocation Fast Path (3-4 instructions)
// Purpose: Ultra-fast TLS freelist pop (inspired by System tcache & Mid-Large HAKX +171%)
// Invariant: Hit rate > 95% → 3-4 instructions, Miss → refill from backend
// Design: "Simple Front + Smart Back" - Front is dumb & fast, Back is smart
//
// Box 5-NEW: SFC (Super Front Cache) Integration
// Architecture: SFC (Layer 0, 128-256 slots) → SLL (Layer 1, unlimited) → SuperSlab (Layer 2+)
// Cascade Refill: SFC ← SLL (one-way, safe)
// Goal: +200% performance (4.19M → 12M+ ops/s)
//
// Phase 2b: Adaptive TLS Cache Sizing
// Hot classes grow to 2048 slots, cold classes shrink to 16 slots
// Expected: +3-10% performance, -30-50% TLS cache memory overhead

#pragma once

#include "tiny_atomic.h"
#include "hakmem_tiny.h"
#include "tiny_route.h"
#include "tiny_alloc_fast_sfc.inc.h"      // Box 5-NEW: SFC Layer
#include "hakmem_tiny_fastcache.inc.h"    // Array stack (FastCache) for C0–C3
#include "hakmem_tiny_tls_list.h"         // TLS List (for tiny_fast_refill_and_take)
#include "tiny_region_id.h"               // Phase 7: Header-based class_idx lookup
#include "tiny_adaptive_sizing.h"         // Phase 2b: Adaptive sizing
#include "box/tls_sll_box.h"              // Box TLS-SLL: C7-safe push/pop/splice
#include "box/tiny_next_ptr_box.h"        // Box API: Next pointer read/write
#include "box/tiny_front_config_box.h"    // Phase 7-Step3: Compile-time config for dead code elimination
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
#include "box/front_gate_box.h"
#endif
#include "hakmem_tiny_integrity.h"        // PRIORITY 1-4: Corruption detection

// Phase 7-Step6-Fix: Config wrapper functions moved to tiny_fastcache.c
// (Forward declarations are in tiny_front_config_box.h)

#ifdef HAKMEM_TINY_HEADER_CLASSIDX
// Ring Cache and Unified Cache removed (A/B test: OFF is faster)
#endif

#include "box/front_metrics_box.h"        // Phase 19-1: Frontend layer metrics
#include "front/tiny_heap_v2.h"           // Front-V2: TLS magazine (tcache-like) front
#include "hakmem_tiny_lazy_init.inc.h"    // Phase 22: Lazy per-class initialization
#include "box/tiny_sizeclass_hist_box.h"  // Phase 3-4: Tiny size class histogram (ACE learning)
#include "box/ultra_slim_alloc_box.h"     // Phase 19-2: Ultra SLIM 4-layer fast path

// NOTE: The two system header names were missing here; <stdio.h> and <stdlib.h>
// cover the fprintf/getenv/atoi calls used throughout this file.
#include <stdio.h>
#include <stdlib.h>

// P1.3/P2.2: Helper to track active/tls_cached when allocating from TLS SLL
// ENV gate: HAKMEM_TINY_ACTIVE_TRACK=1 to enable (default: 0 for performance)
// Flow: TLS SLL → User means active++, tls_cached--
static inline void tiny_active_track_alloc(void* base) {
    static __thread int g_active_track = -1;
    if (__builtin_expect(g_active_track == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_ACTIVE_TRACK");
        g_active_track = (e && *e && *e != '0') ?
            1 : 0;
    }
    if (__builtin_expect(g_active_track, 0)) {
        extern SuperSlab* ss_fast_lookup(void* ptr);
        SuperSlab* ss = ss_fast_lookup(base);
        if (ss && ss->magic == SUPERSLAB_MAGIC) {
            int slab_idx = slab_index_for(ss, base);
            if (slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss)) {
                TinySlabMeta* meta = &ss->slabs[slab_idx];
                atomic_fetch_add_explicit(&meta->active, 1, memory_order_relaxed);
                atomic_fetch_sub_explicit(&meta->tls_cached, 1, memory_order_relaxed); // P2.2
            }
        }
    }
}

// Diag counter: size>=1024 allocations routed to Tiny (env: HAKMEM_TINY_ALLOC_1024_METRIC)
extern _Atomic uint64_t g_tiny_alloc_ge1024[];
static inline void tiny_diag_track_size_ge1024_fast(size_t req_size, int class_idx) {
    if (__builtin_expect(req_size < 1024, 1)) return;
    static int s_metric_en = -1;
    if (__builtin_expect(s_metric_en == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_ALLOC_1024_METRIC");
        s_metric_en = (e && *e && *e != '0') ? 1 : 0;
    }
    if (!__builtin_expect(s_metric_en, 0)) return;
    if (__builtin_expect(class_idx >= 0 && class_idx < TINY_NUM_CLASSES, 1)) {
        atomic_fetch_add_explicit(&g_tiny_alloc_ge1024[class_idx], 1, memory_order_relaxed);
    }
}

// Phase 7 Task 2: Aggressive inline TLS cache access
// Enable with: make HEADER_CLASSIDX=1 AGGRESSIVE_INLINE=1
#ifndef HAKMEM_TINY_AGGRESSIVE_INLINE
#define HAKMEM_TINY_AGGRESSIVE_INLINE 0
#endif
#if HAKMEM_TINY_AGGRESSIVE_INLINE
#include "tiny_alloc_fast_inline.h"
#endif

// ========== Debug Counters (compile-time gated) ==========
#if HAKMEM_DEBUG_COUNTERS
// Refill-stage counters (defined in hakmem_tiny.c)
extern unsigned long long g_rf_total_calls[];
extern unsigned long long g_rf_hit_bench[];
extern unsigned long long g_rf_hit_hot[];
extern unsigned long long g_rf_hit_mail[];
extern unsigned long long g_rf_hit_slab[];
extern unsigned long long g_rf_hit_ss[];
extern unsigned long long g_rf_hit_reg[];
extern unsigned long long g_rf_mmap_calls[];
// Publish hits (defined in hakmem_tiny.c)
extern unsigned long long g_pub_mail_hits[];
extern unsigned long long g_pub_bench_hits[];
extern unsigned long long g_pub_hot_hits[];
// Free pipeline (defined in hakmem_tiny.c)
extern unsigned long long g_free_via_tls_sll[];
#endif

// ========== Box 5: Allocation Fast Path ==========
// Box Theory "Fast Allocation" layer: pop straight off the TLS freelist (3-4 instructions).
// Invariants:
// - If the TLS freelist is non-empty, return immediately (no lock, no sync)
// - On a miss, delegate to the Backend (Box 3: SuperSlab)
// - Cross-thread allocation is not handled here (the Backend takes care of it)

// External TLS variables (defined in hakmem_tiny.c)
// Phase 3d-B: TLS Cache Merge - Unified TLS SLL structure
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];

// External backend functions
// P0 Fix: Use appropriate refill function based on P0 status
#if HAKMEM_TINY_P0_BATCH_REFILL
extern int sll_refill_batch_from_ss(int class_idx, int max_take);
#else
extern int sll_refill_small_from_ss(int class_idx, int max_take);
#endif
extern void* hak_tiny_alloc_slow(size_t size, int class_idx);
extern int hak_tiny_size_to_class(size_t size);
extern int tiny_refill_failfast_level(void);
extern const size_t g_tiny_class_sizes[];

// Global Front refill config (parsed at init; defined in hakmem_tiny.c)
extern int g_refill_count_global;
extern int g_refill_count_hot;
extern int g_refill_count_mid;
extern int g_refill_count_class[TINY_NUM_CLASSES];

// HAK_RET_ALLOC macro is now defined in core/hakmem_tiny.c
// See lines 116-152 for single definition point based on HAKMEM_TINY_HEADER_CLASSIDX
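// Caller contract of the Box TLS-SLL API, shown as an illustrative sketch (not
// part of the build). tls_sll_pop()/tls_sll_push() are the real Box APIs used
// throughout this file; the snippet simply mirrors how they are called below:
//
//   void* base = NULL;
//   if (tls_sll_pop(class_idx, &base)) {
//       // Hit: `base` is a BASE pointer owned by this thread; hand it to
//       // HAK_RET_ALLOC, which performs the BASE → USER conversion.
//   } else {
//       // Miss: the caller refills (tiny_alloc_fast_refill) or falls back
//       // to the slow path.
//   }
//   ...
//   tls_sll_push(class_idx, base, /*capacity=*/UINT32_MAX);  // return a block to the freelist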
// ========== RDTSC Profiling (lightweight) ==========
#ifdef __x86_64__
static inline uint64_t tiny_fast_rdtsc(void) {
    unsigned int lo, hi;
    __asm__ __volatile__ ("rdtsc" : "=a" (lo), "=d" (hi));
    return ((uint64_t)hi << 32) | lo;
}
#else
static inline uint64_t tiny_fast_rdtsc(void) { return 0; }
#endif

// Per-thread profiling counters (enable with HAKMEM_TINY_PROFILE=1)
static __thread uint64_t g_tiny_alloc_hits = 0;
static __thread uint64_t g_tiny_alloc_cycles = 0;
static __thread uint64_t g_tiny_refill_calls = 0;
static __thread uint64_t g_tiny_refill_cycles = 0;
static int g_tiny_profile_enabled = -1;  // -1: uninitialized

static inline int tiny_profile_enabled(void) {
    if (__builtin_expect(g_tiny_profile_enabled == -1, 0)) {
        const char* env = getenv("HAKMEM_TINY_PROFILE");
        g_tiny_profile_enabled = (env && *env && *env != '0') ? 1 : 0;
    }
    return g_tiny_profile_enabled;
}

// Print profiling results at exit
static void tiny_fast_print_profile(void) __attribute__((destructor));
static void tiny_fast_print_profile(void) {
    if (!tiny_profile_enabled()) return;
    if (g_tiny_alloc_hits == 0 && g_tiny_refill_calls == 0) return;
    fprintf(stderr, "\n========== Box Theory Fast Path Profile ==========\n");
    if (g_tiny_alloc_hits > 0) {
        fprintf(stderr, "[ALLOC HIT] count=%lu, avg_cycles=%lu\n",
                (unsigned long)g_tiny_alloc_hits,
                (unsigned long)(g_tiny_alloc_cycles / g_tiny_alloc_hits));
    }
    if (g_tiny_refill_calls > 0) {
        fprintf(stderr, "[REFILL] count=%lu, avg_cycles=%lu\n",
                (unsigned long)g_tiny_refill_calls,
                (unsigned long)(g_tiny_refill_cycles / g_tiny_refill_calls));
    }
    fprintf(stderr, "===================================================\n\n");
}

// ========== Front-V2 helpers (tcache-like TLS magazine) ==========
static inline int tiny_heap_v2_stats_enabled(void) {
    static int enabled = -1;
    if (__builtin_expect(enabled == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_HEAP_V2_STATS");
        enabled = (e && *e && *e != '0') ? 1 : 0;
    }
    return enabled;
}

// TLS HeapV2 initialization barrier (ensures mag->top is zero on first use)
static inline void tiny_heap_v2_ensure_init(void) {
    extern __thread int g_tls_heap_v2_initialized;
    extern __thread TinyHeapV2Mag g_tiny_heap_v2_mag[];
    if (__builtin_expect(!g_tls_heap_v2_initialized, 0)) {
        for (int i = 0; i < TINY_NUM_CLASSES; i++) {
            g_tiny_heap_v2_mag[i].top = 0;
        }
        g_tls_heap_v2_initialized = 1;
    }
}

static inline int tiny_heap_v2_refill_mag(int class_idx) {
    // FIX: Ensure TLS is initialized before first magazine access
    tiny_heap_v2_ensure_init();
    if (class_idx < 0 || class_idx > 3) return 0;
    if (!tiny_heap_v2_class_enabled(class_idx)) return 0;
    // Phase 7-Step7: Use config macro for dead code elimination in PGO mode
    if (!TINY_FRONT_TLS_SLL_ENABLED) return 0;
    TinyHeapV2Mag* mag = &g_tiny_heap_v2_mag[class_idx];
    const int cap = TINY_HEAP_V2_MAG_CAP;
    int filled = 0;
    // FIX: Validate mag->top before use (prevent uninitialized TLS corruption)
    if (mag->top < 0 || mag->top > cap) {
        static __thread int s_reset_logged[TINY_NUM_CLASSES] = {0};
        if (!s_reset_logged[class_idx]) {
            fprintf(stderr, "[HEAP_V2_REFILL] C%d mag->top=%d corrupted, reset to 0\n",
                    class_idx, mag->top);
            s_reset_logged[class_idx] = 1;
        }
        mag->top = 0;
    }
    // First, steal from TLS SLL if already available.
    while (mag->top < cap) {
        void* base = NULL;
        if (!tls_sll_pop(class_idx, &base)) break;
        mag->items[mag->top++] = base;
        filled++;
    }
    // If magazine is still empty, ask backend to refill SLL once, then steal again.
    if (mag->top < cap && filled == 0) {
#if HAKMEM_TINY_P0_BATCH_REFILL
        (void)sll_refill_batch_from_ss(class_idx, cap);
#else
        (void)sll_refill_small_from_ss(class_idx, cap);
#endif
        while (mag->top < cap) {
            void* base = NULL;
            if (!tls_sll_pop(class_idx, &base)) break;
            mag->items[mag->top++] = base;
            filled++;
        }
    }
    if (__builtin_expect(tiny_heap_v2_stats_enabled(), 0)) {
        if (filled > 0) {
            g_tiny_heap_v2_stats[class_idx].refill_calls++;
            g_tiny_heap_v2_stats[class_idx].refill_blocks += (uint64_t)filled;
        }
    }
    return filled;
}

static inline void* tiny_heap_v2_alloc_by_class(int class_idx) {
    // FIX: Ensure TLS is initialized before first magazine access
    tiny_heap_v2_ensure_init();
    if (class_idx < 0 || class_idx > 3) return NULL;
    // Phase 7-Step8: Use config macro for dead code elimination in PGO mode
    if (!TINY_FRONT_HEAP_V2_ENABLED) return NULL;
    if (!tiny_heap_v2_class_enabled(class_idx)) return NULL;
    TinyHeapV2Mag* mag = &g_tiny_heap_v2_mag[class_idx];
    // Hit: magazine has entries
    if (__builtin_expect(mag->top > 0, 1)) {
        // FIX: Add underflow protection before array access
        const int cap = TINY_HEAP_V2_MAG_CAP;
        if (mag->top > cap || mag->top < 0) {
            static __thread int s_reset_logged[TINY_NUM_CLASSES] = {0};
            if (!s_reset_logged[class_idx]) {
                fprintf(stderr, "[HEAP_V2_ALLOC] C%d mag->top=%d corrupted, reset to 0\n",
                        class_idx, mag->top);
                s_reset_logged[class_idx] = 1;
            }
            mag->top = 0;
            return NULL;  // Fall through to refill path
        }
        if (__builtin_expect(tiny_heap_v2_stats_enabled(), 0)) {
            g_tiny_heap_v2_stats[class_idx].alloc_calls++;
            g_tiny_heap_v2_stats[class_idx].mag_hits++;
        }
        return mag->items[--mag->top];
    }
    // Miss: try single refill from SLL/backend
    int filled = tiny_heap_v2_refill_mag(class_idx);
    if (filled > 0 && mag->top > 0) {
        if (__builtin_expect(tiny_heap_v2_stats_enabled(), 0)) {
            g_tiny_heap_v2_stats[class_idx].alloc_calls++;
            g_tiny_heap_v2_stats[class_idx].mag_hits++;
        }
        return mag->items[--mag->top];
    }
    if (__builtin_expect(tiny_heap_v2_stats_enabled(), 0)) {
        g_tiny_heap_v2_stats[class_idx].backend_oom++;
    }
    return NULL;
}
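// Illustrative caller sketch for the Front-V2 magazine (not part of the build).
// It mirrors how tiny_alloc_fast() below actually drives the magazine for the
// hot classes C0-C3; the A/B gate and HAK_RET_ALLOC are the real names used there:
//
//   if (class_idx <= 3 && TINY_FRONT_HEAP_V2_ENABLED) {
//       void* hv2 = tiny_heap_v2_alloc_by_class(class_idx);
//       if (hv2) {
//           HAK_RET_ALLOC(class_idx, hv2);  // hv2 is a BASE pointer; the macro does BASE → USER
//       }
//       // NULL → fall through to FastCache/SFC/SLL and the refill path
//   }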
// ========== Fast Path: TLS Freelist Pop (3-4 instructions) ==========

// External SFC control (defined in hakmem_tiny_sfc.c)
extern int g_sfc_enabled;

// Allocation fast path (inline for zero-cost)
// Returns: pointer on success, NULL on miss (caller should try refill/slow)
//
// Box 5-NEW Architecture:
//   Layer 0: SFC (128-256 slots, high hit rate) [if enabled]
//   Layer 1: SLL (unlimited, existing)
//   Cascade: SFC miss → try SLL → refill
//
// Assembly (x86-64, optimized):
//   mov  rax, QWORD PTR g_sfc_head[class_idx]      ; SFC: Load head
//   test rax, rax                                  ; Check NULL
//   jne  .sfc_hit                                  ; If not empty, SFC hit!
//   mov  rax, QWORD PTR g_tls_sll_head[class_idx]  ; SLL: Load head
//   test rax, rax                                  ; Check NULL
//   je   .miss                                     ; If empty, miss
//   mov  rdx, QWORD PTR [rax]                      ; Load next
//   mov  QWORD PTR g_tls_sll_head[class_idx], rdx  ; Update head
//   ret                                            ; Return ptr
// .sfc_hit:
//   mov  rdx, QWORD PTR [rax]                      ; Load next
//   mov  QWORD PTR g_sfc_head[class_idx], rdx      ; Update head
//   ret
// .miss:
//   ; Fall through to refill
//
// Expected: 3-4 instructions on SFC hit, 6-8 on SLL hit
static inline void* tiny_alloc_fast_pop(int class_idx) {
    // PRIORITY 1: Bounds check before any TLS array access
    HAK_CHECK_CLASS_IDX(class_idx, "tiny_alloc_fast_pop");
#if !HAKMEM_BUILD_RELEASE
    // Phase 3: Debug counters eliminated in release builds
    atomic_fetch_add(&g_integrity_check_class_bounds, 1);
    // DEBUG: Log class 2 pops (DISABLED for performance)
    static _Atomic uint64_t g_fast_pop_count = 0;
    uint64_t pop_call = atomic_fetch_add(&g_fast_pop_count, 1);
    if (0 && class_idx == 2 && pop_call > 5840 && pop_call < 5900) {
        fprintf(stderr, "[FAST_POP_C2] call=%lu cls=%d head=%p count=%u\n",
                pop_call, class_idx, g_tls_sll[class_idx].head, g_tls_sll[class_idx].count);
        fflush(stderr);
    }
#endif

    // Phase E1-CORRECT: C7 now has headers, can use fast path
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
    void* out = NULL;
    if (front_gate_try_pop(class_idx, &out)) {
        return out;
    }
    return NULL;
#else
    // ========== Phase 19-1: Quick Prune (Frontend SLIM mode) ==========
    // ENV: HAKMEM_TINY_FRONT_SLIM=1
    // Goal: Skip FastCache + SFC layers, go straight to SLL (88-99% hit rate)
    // Expected: 22M → 27-30M ops/s (+22-36%)
    static __thread int g_front_slim_checked = 0;
    static __thread int g_front_slim_enabled = 0;
    if (__builtin_expect(!g_front_slim_checked, 0)) {
        const char* e = getenv("HAKMEM_TINY_FRONT_SLIM");
        g_front_slim_enabled = (e && *e && *e != '0') ? 1 : 0;
        g_front_slim_checked = 1;
    }

    // SLIM MODE: Skip FastCache + SFC, go straight to SLL
    if (__builtin_expect(g_front_slim_enabled, 0)) {
        // Box Boundary: TLS SLL freelist pop (only layer in SLIM mode)
        // Phase 7-Step7: Use config macro for dead code elimination in PGO mode
        if (__builtin_expect(TINY_FRONT_TLS_SLL_ENABLED, 1)) {
            void* base = NULL;
            if (tls_sll_pop(class_idx, &base)) {
                // Front Gate: SLL hit (SLIM fast path - 3 instructions)
                extern unsigned long long g_front_sll_hit[];
                g_front_sll_hit[class_idx]++;
                // P1.3: Track active when allocating from TLS SLL
                tiny_active_track_alloc(base);
                return base;
            }
        }
        // SLIM mode miss → return NULL (caller refills)
        return NULL;
    }
    // ========== End Phase 19-1: Quick Prune ==========

    // Phase 7 Task 3: Profiling overhead removed in release builds
    // In release mode, compiler can completely eliminate profiling code
#if !HAKMEM_BUILD_RELEASE
    uint64_t start = tiny_profile_enabled() ?
        tiny_fast_rdtsc() : 0;
#endif

    // Phase 1: Try array stack (FastCache) first for hottest tiny classes (C0–C3)
    // Phase 7-Step4: Use config macro for dead code elimination in PGO mode
    if (__builtin_expect(TINY_FRONT_FASTCACHE_ENABLED && class_idx <= 3, 1)) {
        void* fc = fastcache_pop(class_idx);
        if (__builtin_expect(fc != NULL, 1)) {
            // Frontend FastCache hit (already tracked by g_front_fc_hit)
            extern unsigned long long g_front_fc_hit[];
            g_front_fc_hit[class_idx]++;
            return fc;
        } else {
            // Frontend FastCache miss (already tracked by g_front_fc_miss)
            extern unsigned long long g_front_fc_miss[];
            g_front_fc_miss[class_idx]++;
        }
    }

    // Box 5-NEW: Layer 0 - Try SFC first (if enabled)
    // Phase 7-Step8: Use config macro for dead code elimination in PGO mode
    static __thread int sfc_check_done = 0;
    static __thread int sfc_is_enabled = 0;
    if (__builtin_expect(!sfc_check_done, 0)) {
        sfc_is_enabled = TINY_FRONT_SFC_ENABLED;
        sfc_check_done = 1;
    }
    if (__builtin_expect(sfc_is_enabled, 1)) {
        void* base = sfc_alloc(class_idx);
        if (__builtin_expect(base != NULL, 1)) {
            // Front Gate: SFC hit
            extern unsigned long long g_front_sfc_hit[];
            g_front_sfc_hit[class_idx]++;  // 🚀 SFC HIT! (Layer 0)
#if !HAKMEM_BUILD_RELEASE
            if (start) {
                g_tiny_alloc_cycles += (tiny_fast_rdtsc() - start);
                g_tiny_alloc_hits++;
            }
#endif
            // ✅ FIX #16: Return BASE pointer (not USER)
            // Caller (tiny_alloc_fast) will call HAK_RET_ALLOC → tiny_region_id_write_header
            // which does the BASE → USER conversion. Double conversion was causing corruption!
            return base;
        }
        // SFC miss → try SLL (Layer 1)
    }

    // Box Boundary: Layer 1 - pop the head of the TLS SLL freelist (can be disabled via env)
    // Note: This is in tiny_alloc_fast_pop(), not tiny_alloc_fast(), so use global variable
    // Phase 7-Step7: Use config macro for dead code elimination in PGO mode
    if (__builtin_expect(TINY_FRONT_TLS_SLL_ENABLED, 1)) {
        // Use Box TLS-SLL API (C7-safe pop)
        // CRITICAL: Pop FIRST, do NOT read g_tls_sll_head directly (race condition!)
        // Reading head before pop causes stale read → rbp=0xa0 SEGV
        void* base = NULL;
        if (tls_sll_pop(class_idx, &base)) {
            // Front Gate: SLL hit (fast path 3 instructions)
            extern unsigned long long g_front_sll_hit[];
            g_front_sll_hit[class_idx]++;
            // P1.3: Track active when allocating from TLS SLL
            tiny_active_track_alloc(base);
#if HAKMEM_DEBUG_COUNTERS
            // Track TLS freelist hits (compile-time gated, zero runtime cost when disabled)
            g_free_via_tls_sll[class_idx]++;
#endif
#if !HAKMEM_BUILD_RELEASE
            // Debug: Track profiling (release builds skip this overhead)
            if (start) {
                g_tiny_alloc_cycles += (tiny_fast_rdtsc() - start);
                g_tiny_alloc_hits++;
            }
#endif
            // ✅ FIX #16: Return BASE pointer (not USER)
            // Caller (tiny_alloc_fast) will call HAK_RET_ALLOC → tiny_region_id_write_header
            // which does the BASE → USER conversion. Double conversion was causing corruption!
            return base;
        }
    }

    // Fast path miss → NULL (caller should refill)
    return NULL;
#endif
}

// ========== Cascade Refill: SFC ← SLL (Box Theory boundary) ==========

// Cascade refill: Transfer blocks from SLL to SFC (one-way, safe)
// Returns: number of blocks transferred
//
// Contract:
// - Transfer ownership: SLL → SFC
// - No circular dependency: one-way only
// - Boundary clear: SLL pop → SFC push
// - Fallback safe: if SFC full, stop (no overflow)
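// Worked example of the cascade split (illustrative; the numbers are hypothetical).
// With the default HAKMEM_SFC_CASCADE_PCT=50 and a caller asking for target_count=16,
// sfc_refill_from_sll() computes want = 16 * 50 / 100 = 8 and moves at most 8 blocks
// SLL → SFC, stopping early if the SFC reaches g_sfc_capacity[class_idx] or the SLL runs dry:
//
//   // HAKMEM_SFC_CASCADE_PCT=25 → want = 16 * 25 / 100 = 4 (keep more in the SLL)
//   int moved = sfc_refill_from_sll(class_idx, /*target_count=*/16);
//   // moved <= 4; the remaining blocks stay on the SLL for future pops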
// Env-driven cascade percentage (0-100), default 50%
static inline int sfc_cascade_pct(void) {
    static int pct = -1;
    if (__builtin_expect(pct == -1, 0)) {
        const char* e = getenv("HAKMEM_SFC_CASCADE_PCT");
        int v = (e && *e) ? atoi(e) : 50;
        if (v < 0) v = 0;
        if (v > 100) v = 100;
        pct = v;
    }
    return pct;
}

static inline int sfc_refill_from_sll(int class_idx, int target_count) {
    // PRIORITY 1: Bounds check
    HAK_CHECK_CLASS_IDX(class_idx, "sfc_refill_from_sll");
#if !HAKMEM_BUILD_RELEASE
    atomic_fetch_add(&g_integrity_check_class_bounds, 1);
#endif

    int transferred = 0;
    uint32_t cap = g_sfc_capacity[class_idx];

    // Adjust target based on cascade percentage
    int pct = sfc_cascade_pct();
    int want = (target_count * pct) / 100;
    if (want <= 0) want = target_count / 2;  // safety fallback

    while (transferred < want && g_tls_sll[class_idx].count > 0) {
        // Check SFC capacity before transfer
        if (g_sfc_count[class_idx] >= cap) {
            break;  // SFC full, stop
        }
        // Pop from SLL (Layer 1) using Box TLS-SLL API (C7-safe)
        void* ptr = NULL;
        if (!tls_sll_pop(class_idx, &ptr)) {
            break;  // SLL empty
        }
        // Push to SFC (Layer 0) — header-aware
        tiny_next_write(class_idx, ptr, g_sfc_head[class_idx]);
        g_sfc_head[class_idx] = ptr;
        g_sfc_count[class_idx]++;
        transferred++;
    }
    return transferred;
}

// ========== Refill Path: Backend Integration ==========

// Refill TLS freelist from backend (SuperSlab/ACE/Learning layer)
// Returns: number of blocks refilled
//
// Box 5-NEW Architecture:
//   SFC enabled:  SuperSlab → SLL → SFC (cascade)
//   SFC disabled: SuperSlab → SLL (direct, old path)
//
// This integrates with existing HAKMEM infrastructure:
// - SuperSlab provides memory chunks
// - ACE provides adaptive capacity learning
// - L25 provides mid-large integration
//
// Refill count is tunable via HAKMEM_TINY_REFILL_COUNT (default: 16)
// - Smaller count (8-16): better for diverse workloads, faster warmup
// - Larger count (64-128): better for homogeneous workloads, fewer refills
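// Worked example of the refill-count precedence implemented below (illustrative;
// the concrete numbers are hypothetical). On the first miss for a class the count
// is resolved as per-class > hot/mid > global > HAKMEM_TINY_REFILL_DEFAULT, then
// clamped to [8, 256] and to the adaptive cache headroom:
//
//   // Suppose class_idx = 2 (hot, <= 3), g_refill_count_class[2] = 0,
//   // g_refill_count_hot = 64, g_refill_count_global = 32:
//   //   → v = 64 (hot wins over global); the [8, 256] clamp keeps 64.
//   // If get_available_capacity(2) returns 40, the actual request becomes cnt = 40.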
static inline int tiny_alloc_fast_refill(int class_idx) {
    // Phase E1-CORRECT: C7 now has headers, can use refill
    // Phase 7 Task 3: Profiling overhead removed in release builds
    // In release mode, compiler can completely eliminate profiling code
#if !HAKMEM_BUILD_RELEASE
    uint64_t start = tiny_profile_enabled() ? tiny_fast_rdtsc() : 0;
#endif

    // Phase 2b: Check available capacity before refill
    int available_capacity = get_available_capacity(class_idx);
    if (available_capacity <= 0) {
        // Cache is full, don't refill
        return 0;
    }

    // Phase 7 Task 3: Simplified refill count (cached per-class in TLS)
    // Previous: Complex precedence logic on every miss (5-10 cycles overhead)
    // Now: Simple TLS cache lookup (1-2 cycles)
    static __thread int s_refill_count[TINY_NUM_CLASSES] = {0};
    // Simple adaptive booster: bump per-class refill size when refills are frequent.
    static __thread uint8_t s_refill_calls[TINY_NUM_CLASSES] = {0};

    int cnt = s_refill_count[class_idx];
    if (__builtin_expect(cnt == 0, 0)) {
        // First miss: Initialize from globals (parsed at init time)
        int v = HAKMEM_TINY_REFILL_DEFAULT;  // Default from hakmem_build_flags.h
        // Precedence: per-class > hot/mid > global
        if (g_refill_count_class[class_idx] > 0) {
            v = g_refill_count_class[class_idx];
        } else if (class_idx <= 3 && g_refill_count_hot > 0) {
            v = g_refill_count_hot;
        } else if (class_idx >= 4 && g_refill_count_mid > 0) {
            v = g_refill_count_mid;
        } else if (g_refill_count_global > 0) {
            v = g_refill_count_global;
        }
        // Clamp to sane range (min: 8, max: 256)
        if (v < 8) v = 8;      // Minimum: avoid thrashing
        if (v > 256) v = 256;  // Maximum: avoid excessive TLS memory
        s_refill_count[class_idx] = v;
        cnt = v;
    }

    // Phase 2b: Clamp refill count to available capacity
    if (cnt > available_capacity) {
        cnt = available_capacity;
    }

#if HAKMEM_DEBUG_COUNTERS
    // Track refill calls (compile-time gated)
    g_rf_total_calls[class_idx]++;
#endif

    // Box Boundary: Delegate to Backend (Box 3: SuperSlab)
    // Refill Dispatch: Standard (ss_refill_fc_fill) vs Legacy SLL (A/B only)
    //   Standard: Enabled by FRONT_DIRECT=1, REFILL_BATCH=1, or P0_DIRECT_FC_ALL=1
    //   Legacy:   Fallback for compatibility (will be deprecated)
    int refilled = 0;
    // The Front-Direct A/B implementation is not supported at the current HEAD.
    // Always use the legacy path (SS → SLL → FC).
#if HAKMEM_TINY_P0_BATCH_REFILL
    refilled = sll_refill_batch_from_ss(class_idx, cnt);
#else
    refilled = sll_refill_small_from_ss(class_idx, cnt);
#endif

    // Lightweight adaptation: if refills keep happening, increase per-class refill.
    // Focus on class 7 (1024B) to reduce mmap/refill frequency under Tiny-heavy loads.
    if (refilled > 0) {
        uint8_t c = ++s_refill_calls[class_idx];
        if (class_idx == 7) {
            // Every 4 refills, increase target by +16 up to 128 (unless overridden).
            if ((c & 0x03u) == 0) {
                int target = s_refill_count[class_idx];
                if (target < 128) {
                    target += 16;
                    if (target > 128) target = 128;
                    s_refill_count[class_idx] = target;
                }
            }
        }
    } else {
        // No refill performed (capacity full): slowly decay the counter.
        if (s_refill_calls[class_idx] > 0) s_refill_calls[class_idx]--;
    }
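    // Worked example of the class-7 booster above (illustrative; the starting
    // value depends on env overrides). With an initial refill count of 64 for C7,
    // every 4th successful refill raises the target by 16:
    //   refill #4 → 80, #8 → 96, #12 → 112, #16 → 128 (ceiling reached).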
    // Phase 2b: Track refill and adapt cache size
    if (refilled > 0) {
        track_refill_for_adaptation(class_idx);
    }

    // Box 5-NEW: Cascade refill SFC ← SLL (opt-in via HAKMEM_TINY_SFC_CASCADE, off by default)
    static __thread int sfc_cascade_enabled = -1;
    if (__builtin_expect(sfc_cascade_enabled == -1, 0)) {
        // Check ENV flag (default: OFF)
        const char* e = getenv("HAKMEM_TINY_SFC_CASCADE");
        sfc_cascade_enabled = (e && *e && *e != '0') ? 1 : 0;
    }

    // Only cascade if explicitly enabled AND we have refilled blocks in SLL
    // Phase 7-Step8: Use config macro for dead code elimination in PGO mode
    if (sfc_cascade_enabled && TINY_FRONT_SFC_ENABLED && refilled > 0) {
        // Transfer half of refilled blocks to SFC (keep half in SLL for future)
        int sfc_target = refilled / 2;
        if (sfc_target > 0) {
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
            front_gate_after_refill(class_idx, refilled);
#else
            int transferred = sfc_refill_from_sll(class_idx, sfc_target);
            (void)transferred;  // Unused, but could track stats
#endif
        }
    }

#if !HAKMEM_BUILD_RELEASE
    // Debug: Track profiling (release builds skip this overhead)
    if (start) {
        g_tiny_refill_cycles += (tiny_fast_rdtsc() - start);
        g_tiny_refill_calls++;
    }
#endif

    return refilled;
}

// ========== Combined Fast Path (Alloc + Refill) ==========

// Complete fast path allocation (inline for zero-cost)
// Returns: pointer on success, NULL on failure (OOM or size too large)
//
// Flow:
//   1. TLS freelist pop (3-4 instructions) - Hit rate ~95%
//   2. Miss → Refill from backend (~5% cases)
//   3. Refill success → Retry pop
//   4. Refill failure → Slow path (OOM or new SuperSlab allocation)
//
// Example usage:
//   void* ptr = tiny_alloc_fast(64);
//   if (!ptr) {
//       // OOM handling
//   }
static inline void* tiny_alloc_fast(size_t size) {
#if !HAKMEM_BUILD_RELEASE
    // Phase 3: Debug counters eliminated in release builds
    static _Atomic uint64_t alloc_call_count = 0;
    uint64_t call_num = atomic_fetch_add(&alloc_call_count, 1);
#endif

    // Phase 22: Global init (once per process)
    lazy_init_global();

    // ========== Phase 19-2: Ultra SLIM 4-Layer Fast Path ==========
    // ENV: HAKMEM_TINY_ULTRA_SLIM=1
    // Expected: 90-110M ops/s (mimalloc parity)
    // Architecture: Init Safety + Size-to-Class + ACE Learning + TLS SLL Direct
    // Note: ACE learning preserved (HAKMEM's differentiator vs mimalloc)

    // Debug: Check if Ultra SLIM is enabled (first call only)
#if !HAKMEM_BUILD_RELEASE
    static __thread int debug_checked = 0;
    if (!debug_checked) {
        // Phase 7-Step8: Use config macro for dead code elimination in PGO mode
        int enabled = TINY_FRONT_ULTRA_SLIM_ENABLED;
        if (enabled) {
            fprintf(stderr, "[TINY_ALLOC_FAST] Ultra SLIM gate: ENABLED (will use 4-layer path)\n");
        } else {
            fprintf(stderr, "[TINY_ALLOC_FAST] Ultra SLIM gate: DISABLED (will use standard path)\n");
        }
        debug_checked = 1;
    }
#endif
    // Phase 7-Step4: Use config macro for dead code elimination in PGO mode
    if (__builtin_expect(TINY_FRONT_ULTRA_SLIM_ENABLED, 0)) {
        return ultra_slim_alloc_with_refill(size);
    }
    // ========== End Phase 19-2: Ultra SLIM ==========
    // 1. Size → class index (inline, fast)
    int class_idx = hak_tiny_size_to_class(size);
    if (__builtin_expect(class_idx < 0, 0)) {
        return NULL;  // Size > 1KB, not Tiny
    }

    // Phase 3c L1D Opt: Prefetch TLS cache head early
    // Phase 3d-B: Prefetch unified TLS SLL struct (single prefetch for both head+count)
    __builtin_prefetch(&g_tls_sll[class_idx], 0, 3);

    // Phase 22: Lazy per-class init (on first use)
    lazy_init_class(class_idx);

    // Phase 3-4: Record allocation for ACE Profile learning
    // TLS increment only (no atomic operation, amortized flush at threshold)
    tiny_sizeclass_hist_hit(class_idx);

    // P0.1: Cache g_tls_sll_enable once (Phase 3-4 instruction reduction)
    // Eliminates redundant global variable reads (2-3 instructions saved)
    // Phase 7-Step7: Use config macro for dead code elimination in PGO mode
    const int sll_enabled = TINY_FRONT_TLS_SLL_ENABLED;

#if !HAKMEM_BUILD_RELEASE
    // Phase 3: Debug checks eliminated in release builds
    // CRITICAL: Bounds check to catch corruption
    if (__builtin_expect(class_idx >= TINY_NUM_CLASSES, 0)) {
        fprintf(stderr, "[TINY_ALLOC_FAST] FATAL: class_idx=%d out of bounds! size=%zu call=%lu\n",
                class_idx, size, call_num);
        fflush(stderr);
        abort();
    }
    // Debug logging (DISABLED for performance)
    if (0 && call_num > 14250 && call_num < 14280) {
        fprintf(stderr, "[TINY_ALLOC] call=%lu size=%zu class=%d sll_head[%d]=%p count=%u\n",
                call_num, size, class_idx, class_idx,
                g_tls_sll[class_idx].head, g_tls_sll[class_idx].count);
        fflush(stderr);
    }
#endif

    ROUTE_BEGIN(class_idx);
    void* ptr = NULL;

    // Front-V2: TLS magazine front (A/B, default OFF)
    // Phase 7-Step4: Use config macro for dead code elimination in PGO mode
    if (__builtin_expect(TINY_FRONT_HEAP_V2_ENABLED && front_prune_heapv2_enabled() && class_idx <= 3, 0)) {
        void* hv2 = tiny_heap_v2_alloc_by_class(class_idx);
        if (hv2) {
            front_metrics_heapv2_hit(class_idx);
            tiny_diag_track_size_ge1024_fast(size, class_idx);
            HAK_RET_ALLOC(class_idx, hv2);
        } else {
            front_metrics_heapv2_miss(class_idx);
        }
    }

    // Generic front (FastCache/SFC/SLL)
    // Respect SLL global toggle
    // Phase 7-Step7: Use config macro for dead code elimination in PGO mode
    if (__builtin_expect(TINY_FRONT_TLS_SLL_ENABLED, 1)) {
        // For classes 0..3 keep ultra-inline POP; for >=4 use safe Box POP to avoid UB on bad heads.
        if (class_idx <= 3) {
#if HAKMEM_TINY_INLINE_SLL
            // Experimental: Inline SLL pop (A/B only, requires HAKMEM_TINY_INLINE_SLL=1)
            TINY_ALLOC_FAST_POP_INLINE(class_idx, ptr);
#else
            // Default: Safe Box API (Box TLS-SLL) for all standard builds
            ptr = tiny_alloc_fast_pop(class_idx);
#endif
        } else {
            void* base = NULL;
            if (tls_sll_pop(class_idx, &base)) {
                // P1.3: Track active when allocating from TLS SLL
                tiny_active_track_alloc(base);
                ptr = base;
            } else {
                ptr = NULL;
            }
        }
    } else {
        ptr = NULL;  // SLL disabled OR Front-Direct active → bypass SLL
    }

    // Phase 3c L1D Opt: Prefetch next freelist entry if we got a pointer
    if (__builtin_expect(ptr != NULL, 1)) {
        __builtin_prefetch(ptr, 0, 3);
    }

    if (__builtin_expect(ptr != NULL, 1)) {
        tiny_diag_track_size_ge1024_fast(size, class_idx);
        HAK_RET_ALLOC(class_idx, ptr);
    }

    // Refill to TLS List/SLL
    extern __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
    void* took = tiny_fast_refill_and_take(class_idx, &g_tls_lists[class_idx]);
    if (took) {
        tiny_diag_track_size_ge1024_fast(size, class_idx);
        HAK_RET_ALLOC(class_idx, took);
    }

    // Retry after a backend refill
    {
        int refilled = tiny_alloc_fast_refill(class_idx);
        if (__builtin_expect(refilled > 0, 1)) {
            // Retry SLL if enabled (P0.1: using cached sll_enabled)
            if (__builtin_expect(sll_enabled, 1)) {
                if (class_idx <= 3) {
#if HAKMEM_TINY_INLINE_SLL
                    // Experimental: Inline SLL pop (A/B only, requires HAKMEM_TINY_INLINE_SLL=1)
                    TINY_ALLOC_FAST_POP_INLINE(class_idx, ptr);
#else
                    // Default: Safe Box API (Box TLS-SLL) for all standard builds
                    ptr = tiny_alloc_fast_pop(class_idx);
#endif
                } else {
                    void* base2 = NULL;
                    if (tls_sll_pop(class_idx, &base2)) {
                        // P1.3: Track active when allocating from TLS SLL
                        tiny_active_track_alloc(base2);
                        ptr = base2;
                    } else {
                        ptr = NULL;
                    }
                }
            } else {
                ptr = NULL;  // SLL disabled OR Front-Direct active → bypass SLL
            }
            if (ptr) {
                tiny_diag_track_size_ge1024_fast(size, class_idx);
                HAK_RET_ALLOC(class_idx, ptr);
            }
        }
    }
    // 5. Refill failure or still empty → slow path (OOM or new SuperSlab)
    // Box Boundary: Delegate to Slow Path (Box 3 backend)
    ptr = hak_tiny_alloc_slow(size, class_idx);
    if (ptr) {
        tiny_diag_track_size_ge1024_fast(size, class_idx);
        HAK_RET_ALLOC(class_idx, ptr);
    }
    return ptr;  // NULL if OOM
}

// ========== Push to TLS Freelist (for free path) ==========

// Push block to TLS freelist (used by free fast path)
// This is a "helper" for Box 6 (Free Fast Path)
//
// Invariant: ptr must belong to current thread (no ownership check here)
// Caller (Box 6) is responsible for ownership verification
static inline void tiny_alloc_fast_push(int class_idx, void* ptr) {
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
    front_gate_push_tls(class_idx, ptr);
#else
    // Box Boundary: Push to TLS freelist using Box TLS-SLL API (C7-safe)
    uint32_t capacity = UINT32_MAX;  // Unlimited for helper function
    if (!tls_sll_push(class_idx, ptr, capacity)) {
        // C7 rejected or SLL somehow full (should not happen)
        // In release builds, this is a no-op (caller expects success)
#if !HAKMEM_BUILD_RELEASE
        fprintf(stderr, "[WARN] tls_sll_push failed in tiny_alloc_fast_push cls=%d ptr=%p\n",
                class_idx, ptr);
#endif
    }
#endif
}

// ========== Statistics & Diagnostics ==========

// Get TLS freelist stats (for debugging/profiling)
typedef struct {
    int class_idx;
    void* head;
    uint32_t count;
} TinyAllocFastStats;

static inline TinyAllocFastStats tiny_alloc_fast_stats(int class_idx) {
    TinyAllocFastStats stats = {
        .class_idx = class_idx,
        .head = g_tls_sll[class_idx].head,
        .count = g_tls_sll[class_idx].count
    };
    return stats;
}

// Reset TLS freelist (for testing/benchmarking)
// WARNING: This leaks memory! Only use in controlled test environments.
static inline void tiny_alloc_fast_reset(int class_idx) {
    g_tls_sll[class_idx].head = NULL;
    g_tls_sll[class_idx].count = 0;
}

// ========== Performance Notes ==========
//
// Expected metrics (based on System tcache & HAKX +171% results):
// - Fast path hit rate: 95%+ (workload dependent)
// - Fast path latency: 3-4 instructions (1-2 cycles on modern CPUs)
// - Miss penalty: ~20-50 instructions (refill from SuperSlab)
// - Throughput improvement: +10-25% vs current multi-layer design
//
// Key optimizations:
// 1. `__builtin_expect` for branch prediction (hot path first)
// 2. `static inline` for zero-cost abstraction
// 3. TLS variables (no atomic ops, no locks)
// 4. Minimal work in fast path (defer stats/accounting to backend)
//
// Comparison with current design:
// - Current: 20+ instructions (Magazine → SuperSlab → ACE → ...)
// - New: 3-4 instructions (TLS freelist pop only)
// - Reduction: -80% instructions in hot path
//
// Inspired by:
// - System tcache (glibc malloc) - 3-4 instruction fast path
// - HAKX Mid-Large (+171%) - "Simple Front + Smart Back"
// - Box Theory - Clear boundaries, minimal coupling
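// Usage sketch (illustrative only): inspecting the per-class TLS freelists from
// a test or benchmark harness. tiny_alloc_fast_stats() and tiny_alloc_fast_reset()
// are the helpers defined above; dump_tls_freelists() is a hypothetical harness
// function, not part of this file.
//
//   static void dump_tls_freelists(void) {
//       for (int c = 0; c < TINY_NUM_CLASSES; c++) {
//           TinyAllocFastStats st = tiny_alloc_fast_stats(c);
//           fprintf(stderr, "class %d: head=%p count=%u\n", st.class_idx, st.head, st.count);
//       }
//   }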