Files
hakmem/core/tiny_alloc_fast.inc.h
Moe Charm (CI) c2716f5c01 Implement Phase 2: Headerless Allocator Support (Partial)
- Feature: Added HAKMEM_TINY_HEADERLESS toggle (A/B testing)
- Feature: Implemented Headerless layout logic (Offset=0)
- Refactor: Centralized layout definitions in tiny_layout_box.h
- Refactor: Abstracted pointer arithmetic in free path via ptr_conversion_box.h
- Verification: sh8bench passes in Headerless mode (No TLS_SLL_HDR_RESET)
- Known Issue: Regression in Phase 1 mode due to blind pointer conversion logic
2025-12-03 12:11:27 +09:00

944 lines
36 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// tiny_alloc_fast.inc.h - Box 5: Allocation Fast Path (3-4 instructions)
// Purpose: Ultra-fast TLS freelist pop (inspired by System tcache & Mid-Large HAKX +171%)
// Invariant: Hit rate > 95% → 3-4 instructions, Miss → refill from backend
// Design: "Simple Front + Smart Back" - Front is dumb & fast, Back is smart
//
// Box 5-NEW: SFC (Super Front Cache) Integration
// Architecture: SFC (Layer 0, 128-256 slots) → SLL (Layer 1, unlimited) → SuperSlab (Layer 2+)
// Cascade Refill: SFC ← SLL (one-way, safe)
// Goal: +200% performance (4.19M → 12M+ ops/s)
//
// Phase 2b: Adaptive TLS Cache Sizing
// Hot classes grow to 2048 slots, cold classes shrink to 16 slots
// Expected: +3-10% performance, -30-50% TLS cache memory overhead
#pragma once
#include "tiny_atomic.h"
#include "hakmem_tiny.h"
#include "tiny_route.h"
#include "tiny_alloc_fast_sfc.inc.h" // Box 5-NEW: SFC Layer
#include "hakmem_tiny_fastcache.inc.h" // Array stack (FastCache) for C0-C3
#include "hakmem_tiny_tls_list.h" // TLS List (for tiny_fast_refill_and_take)
#include "tiny_region_id.h" // Phase 7: Header-based class_idx lookup
#include "tiny_adaptive_sizing.h" // Phase 2b: Adaptive sizing
#include "box/tls_sll_box.h" // Box TLS-SLL: C7-safe push/pop/splice
#include "box/tiny_next_ptr_box.h" // Box API: Next pointer read/write
#include "box/tiny_front_config_box.h" // Phase 7-Step3: Compile-time config for dead code elimination
#include "hakmem_env_cache.h" // Priority-2: ENV cache (eliminate syscalls)
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
#include "box/front_gate_box.h"
#endif
#include "hakmem_tiny_integrity.h" // PRIORITY 1-4: Corruption detection
// Phase 7-Step6-Fix: Config wrapper functions moved to tiny_fastcache.c
// (Forward declarations are in tiny_front_config_box.h)
#if HAKMEM_TINY_HEADER_CLASSIDX
// Ring Cache and Unified Cache removed (A/B test: OFF is faster)
#endif
#include "box/front_metrics_box.h" // Phase 19-1: Frontend layer metrics
#include "front/tiny_heap_v2.h" // Front-V2: TLS magazine (tcache-like) front
#include "hakmem_tiny_lazy_init.inc.h" // Phase 22: Lazy per-class initialization
#include "box/tiny_sizeclass_hist_box.h" // Phase 3-4: Tiny size class histogram (ACE learning)
#include "box/ultra_slim_alloc_box.h" // Phase 19-2: Ultra SLIM 4-layer fast path
#include <stdio.h>
#include <stdatomic.h>
// P1.3/P2.2: Accounting hook for a block that leaves the TLS SLL and is handed
// to the user (TLS SLL -> User means active++, tls_cached--).
// ENV gate: HAKMEM_TINY_ACTIVE_TRACK=1 to enable (default: 0 for performance);
// the gate is read through the HAK_ENV_TINY_ACTIVE_TRACK() accessor.
//
// base: pointer that was just popped from the TLS SLL.
// Nothing is updated when the gate is off, when ss_fast_lookup() finds no
// SuperSlab with a valid magic, or when the slab index is out of range.
static inline void tiny_active_track_alloc(void* base) {
    extern SuperSlab* ss_fast_lookup(void* ptr);

    // Tracking is opt-in; the common case leaves immediately.
    if (__builtin_expect(!HAK_ENV_TINY_ACTIVE_TRACK(), 1)) return;

    SuperSlab* owner = ss_fast_lookup(base);
    if (!owner || owner->magic != SUPERSLAB_MAGIC) return;

    const int idx = slab_index_for(owner, base);
    if (idx < 0 || idx >= ss_slabs_capacity(owner)) return;

    TinySlabMeta* slab = &owner->slabs[idx];
    atomic_fetch_add_explicit(&slab->active, 1, memory_order_relaxed);
    atomic_fetch_sub_explicit(&slab->tls_cached, 1, memory_order_relaxed); // P2.2
}
// Diag counter: size>=1024 allocations routed to Tiny (env: HAKMEM_TINY_ALLOC_1024_METRIC)
extern _Atomic uint64_t g_tiny_alloc_ge1024[];
// Diagnostic counter bump for requests of 1024 bytes or more that reach Tiny.
// Priority-2: the ENV gate is read through HAK_ENV_TINY_ALLOC_1024_METRIC().
//
// req_size:  size requested by the caller.
// class_idx: class the request was routed to; g_tiny_alloc_ge1024[class_idx]
//            is incremented only when the index is within [0, TINY_NUM_CLASSES).
static inline void tiny_diag_track_size_ge1024_fast(size_t req_size, int class_idx) {
    if (__builtin_expect(req_size >= 1024, 0)) {
        if (__builtin_expect(HAK_ENV_TINY_ALLOC_1024_METRIC(), 0)) {
            if (__builtin_expect(class_idx >= 0 && class_idx < TINY_NUM_CLASSES, 1)) {
                atomic_fetch_add_explicit(&g_tiny_alloc_ge1024[class_idx], 1,
                                          memory_order_relaxed);
            }
        }
    }
}
// Phase 7 Task 2: Aggressive inline TLS cache access
// Enable with: make HEADER_CLASSIDX=1 AGGRESSIVE_INLINE=1
#ifndef HAKMEM_TINY_AGGRESSIVE_INLINE
#define HAKMEM_TINY_AGGRESSIVE_INLINE 0
#endif
#if HAKMEM_TINY_AGGRESSIVE_INLINE
#include "tiny_alloc_fast_inline.h"
#endif
// ========== Debug Counters (compile-time gated) ==========
#if HAKMEM_DEBUG_COUNTERS
// Refill-stage counters (defined in hakmem_tiny.c)
extern unsigned long long g_rf_total_calls[];
extern unsigned long long g_rf_hit_bench[];
extern unsigned long long g_rf_hit_hot[];
extern unsigned long long g_rf_hit_mail[];
extern unsigned long long g_rf_hit_slab[];
extern unsigned long long g_rf_hit_ss[];
extern unsigned long long g_rf_hit_reg[];
extern unsigned long long g_rf_mmap_calls[];
// Publish hits (defined in hakmem_tiny.c)
extern unsigned long long g_pub_mail_hits[];
extern unsigned long long g_pub_bench_hits[];
extern unsigned long long g_pub_hot_hits[];
// Free pipeline (defined in hakmem_tiny.c)
extern unsigned long long g_free_via_tls_sll[];
#endif
// ========== Box 5: Allocation Fast Path ==========
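// Fast Allocation layer of Box Theory: pop directly from the TLS freelist (3-4 instructions).
// Invariants:
// - If the TLS freelist is non-empty, return immediately (no lock, no sync)
// - On a miss, delegate to the Backend (Box 3: SuperSlab)
// - Cross-thread allocation is not considered here (the Backend handles it)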
// External TLS variables (defined in hakmem_tiny.c)
// Phase 3d-B: TLS Cache Merge - Unified TLS SLL structure
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
// External backend functions
// P0 Fix: Use appropriate refill function based on P0 status
#if HAKMEM_TINY_P0_BATCH_REFILL
extern int sll_refill_batch_from_ss(int class_idx, int max_take);
#else
extern int sll_refill_small_from_ss(int class_idx, int max_take);
#endif
extern void* hak_tiny_alloc_slow(size_t size, int class_idx);
extern int hak_tiny_size_to_class(size_t size);
extern int tiny_refill_failfast_level(void);
extern const size_t g_tiny_class_sizes[];
// Global Front refill config (parsed at init; defined in hakmem_tiny.c)
extern int g_refill_count_global;
extern int g_refill_count_hot;
extern int g_refill_count_mid;
extern int g_refill_count_class[TINY_NUM_CLASSES];
// HAK_RET_ALLOC macro is now defined in core/hakmem_tiny.c
// See lines 116-152 for single definition point based on HAKMEM_TINY_HEADER_CLASSIDX
// ========== RDTSC Profiling (lightweight) ==========
// Reads the CPU timestamp counter via the `rdtsc` instruction on x86-64.
// On every other target this returns 0, which the profiling call sites in this
// file treat as "not timing" (they test the start value for non-zero).
static inline uint64_t tiny_fast_rdtsc(void) {
#if defined(__x86_64__)
    unsigned int tsc_lo;
    unsigned int tsc_hi;
    // rdtsc leaves the low 32 bits in EAX and the high 32 bits in EDX.
    __asm__ __volatile__ ("rdtsc" : "=a" (tsc_lo), "=d" (tsc_hi));
    uint64_t stamp = tsc_hi;
    stamp <<= 32;
    stamp |= tsc_lo;
    return stamp;
#else
    return 0;
#endif
}
// Per-thread profiling counters (enable with HAKMEM_TINY_PROFILE=1).
// Each is `static __thread`, so every thread has its own copy; being `static`
// in a header, every translation unit that includes this file has its own set.
// Number of timed fast-path hits (incremented on SFC/SLL hits in tiny_alloc_fast_pop).
static __thread uint64_t g_tiny_alloc_hits = 0;
// Sum of tiny_fast_rdtsc() deltas recorded for those timed hits.
static __thread uint64_t g_tiny_alloc_cycles = 0;
// Number of timed tiny_alloc_fast_refill() calls.
static __thread uint64_t g_tiny_refill_calls = 0;
// Sum of tiny_fast_rdtsc() deltas recorded for those timed refills.
static __thread uint64_t g_tiny_refill_cycles = 0;
// Reports whether fast-path profiling is switched on.
// Priority-2: the value comes from the HAK_ENV_TINY_PROFILE() accessor
// (cached ENV; avoids lazy-init and a function-local static).
// Returns non-zero when profiling is enabled.
static inline int tiny_profile_enabled(void) {
    const int profiling_on = HAK_ENV_TINY_PROFILE();
    return profiling_on;
}
// Print profiling results at exit.
// Registered with __attribute__((destructor)). Prints nothing unless
// tiny_profile_enabled() is true and at least one counter is non-zero.
// The counters are `static __thread`, so only the values belonging to the
// thread that runs this destructor are reported.
static void tiny_fast_print_profile(void) __attribute__((destructor));
static void tiny_fast_print_profile(void) {
    if (!tiny_profile_enabled()) return;
    if (g_tiny_alloc_hits == 0 && g_tiny_refill_calls == 0) return;

    fprintf(stderr, "\n========== Box Theory Fast Path Profile ==========\n");
    // The counters are uint64_t: print them as unsigned long long so that no
    // bits are dropped on targets where unsigned long is only 32 bits wide.
    if (g_tiny_alloc_hits > 0) {
        fprintf(stderr, "[ALLOC HIT] count=%llu, avg_cycles=%llu\n",
                (unsigned long long)g_tiny_alloc_hits,
                (unsigned long long)(g_tiny_alloc_cycles / g_tiny_alloc_hits));
    }
    if (g_tiny_refill_calls > 0) {
        fprintf(stderr, "[REFILL] count=%llu, avg_cycles=%llu\n",
                (unsigned long long)g_tiny_refill_calls,
                (unsigned long long)(g_tiny_refill_cycles / g_tiny_refill_calls));
    }
    fprintf(stderr, "===================================================\n\n");
}
// ========== Front-V2 helpers (tcache-like TLS magazine) ==========
// Reports whether Front-V2 statistics collection is switched on.
// Priority-2: the value comes from the HAK_ENV_TINY_HEAP_V2_STATS() accessor
// (cached ENV; avoids lazy-init overhead).
// Returns non-zero when statistics should be recorded.
static inline int tiny_heap_v2_stats_enabled(void) {
    const int stats_on = HAK_ENV_TINY_HEAP_V2_STATS();
    return stats_on;
}
// TLS HeapV2 initialization barrier.
// On the first call in a thread (g_tls_heap_v2_initialized == 0) this zeroes
// `top` of every per-class magazine and then sets the flag; later calls in the
// same thread return immediately.
static inline void tiny_heap_v2_ensure_init(void) {
    extern __thread int g_tls_heap_v2_initialized;
    extern __thread TinyHeapV2Mag g_tiny_heap_v2_mag[];

    if (__builtin_expect(g_tls_heap_v2_initialized != 0, 1)) return;

    int cls = 0;
    while (cls < TINY_NUM_CLASSES) {
        g_tiny_heap_v2_mag[cls].top = 0;
        cls++;
    }
    g_tls_heap_v2_initialized = 1;
}
// Refills the Front-V2 magazine of class_idx from the TLS SLL.
//
// Steps:
//   1. Move blocks that are already in the TLS SLL into the magazine.
//   2. If that moved nothing and the magazine still has room, ask the backend
//      to refill the SLL once and move blocks again.
//
// Returns the number of blocks moved into the magazine. Returns 0 without
// touching the magazine when class_idx is outside 0..3, the class is disabled,
// or the TLS SLL front is disabled.
static inline int tiny_heap_v2_refill_mag(int class_idx) {
    // FIX: Ensure TLS is initialized before first magazine access
    tiny_heap_v2_ensure_init();

    if (class_idx < 0 || class_idx > 3 || !tiny_heap_v2_class_enabled(class_idx)) {
        return 0;
    }
    // Phase 7-Step7: Use config macro for dead code elimination in PGO mode
    if (!TINY_FRONT_TLS_SLL_ENABLED) return 0;

    TinyHeapV2Mag* mag = &g_tiny_heap_v2_mag[class_idx];
    const int mag_cap = TINY_HEAP_V2_MAG_CAP;
    int taken = 0;

    // FIX: Validate mag->top before use (prevent uninitialized TLS corruption)
    if (mag->top < 0 || mag->top > mag_cap) {
        static __thread int s_warned[TINY_NUM_CLASSES] = {0};
        if (!s_warned[class_idx]) {
            fprintf(stderr, "[HEAP_V2_REFILL] C%d mag->top=%d corrupted, reset to 0\n",
                    class_idx, mag->top);
            s_warned[class_idx] = 1;
        }
        mag->top = 0;
    }

    // Pass 1: take whatever the TLS SLL already holds.
    for (;;) {
        if (mag->top >= mag_cap) break;
        void* blk = NULL;
        if (!tls_sll_pop(class_idx, &blk)) break;
        mag->items[mag->top++] = blk;
        taken++;
    }

    // Pass 2: nothing was taken and there is room -> refill the SLL once from
    // the backend, then take again.
    if (taken == 0 && mag->top < mag_cap) {
#if HAKMEM_TINY_P0_BATCH_REFILL
        (void)sll_refill_batch_from_ss(class_idx, mag_cap);
#else
        (void)sll_refill_small_from_ss(class_idx, mag_cap);
#endif
        for (;;) {
            if (mag->top >= mag_cap) break;
            void* blk = NULL;
            if (!tls_sll_pop(class_idx, &blk)) break;
            mag->items[mag->top++] = blk;
            taken++;
        }
    }

    if (__builtin_expect(tiny_heap_v2_stats_enabled(), 0) && taken > 0) {
        g_tiny_heap_v2_stats[class_idx].refill_calls++;
        g_tiny_heap_v2_stats[class_idx].refill_blocks += (uint64_t)taken;
    }
    return taken;
}
// Allocates one block of class_idx from the Front-V2 magazine.
//
// Returns the popped magazine entry, or NULL when:
//   - class_idx is outside 0..3, Front-V2 is disabled, or the class is disabled;
//   - mag->top was found out of range (it is reset to 0 and this call reports
//     a miss; no refill is attempted in that case);
//   - the magazine was empty and tiny_heap_v2_refill_mag() produced nothing.
static inline void* tiny_heap_v2_alloc_by_class(int class_idx) {
    // FIX: Ensure TLS is initialized before first magazine access
    tiny_heap_v2_ensure_init();
    if (class_idx < 0 || class_idx > 3) return NULL;
    // Phase 7-Step8: Use config macro for dead code elimination in PGO mode
    if (!TINY_FRONT_HEAP_V2_ENABLED) return NULL;
    if (!tiny_heap_v2_class_enabled(class_idx)) return NULL;

    TinyHeapV2Mag* mag = &g_tiny_heap_v2_mag[class_idx];

    if (__builtin_expect(mag->top > 0, 1)) {
        // Magazine has entries: validate the index before touching the array.
        const int mag_cap = TINY_HEAP_V2_MAG_CAP;
        if (mag->top < 0 || mag->top > mag_cap) {
            static __thread int s_warned[TINY_NUM_CLASSES] = {0};
            if (!s_warned[class_idx]) {
                fprintf(stderr, "[HEAP_V2_ALLOC] C%d mag->top=%d corrupted, reset to 0\n",
                        class_idx, mag->top);
                s_warned[class_idx] = 1;
            }
            mag->top = 0;
            return NULL;
        }
    } else {
        // Magazine empty: try a single refill from SLL/backend.
        const int filled = tiny_heap_v2_refill_mag(class_idx);
        if (filled <= 0 || mag->top <= 0) {
            if (__builtin_expect(tiny_heap_v2_stats_enabled(), 0)) {
                g_tiny_heap_v2_stats[class_idx].backend_oom++;
            }
            return NULL;
        }
    }

    // Shared tail for both the direct hit and the hit-after-refill case.
    if (__builtin_expect(tiny_heap_v2_stats_enabled(), 0)) {
        g_tiny_heap_v2_stats[class_idx].alloc_calls++;
        g_tiny_heap_v2_stats[class_idx].mag_hits++;
    }
    return mag->items[--mag->top];
}
// ========== Fast Path: TLS Freelist Pop (3-4 instructions) ==========
// External SFC control (defined in hakmem_tiny_sfc.c)
extern int g_sfc_enabled;
// Allocation fast path (inline for zero-cost)
// Returns: pointer on success, NULL on miss (caller should try refill/slow)
//
// Box 5-NEW Architecture:
// Layer 0: SFC (128-256 slots, high hit rate) [if enabled]
// Layer 1: SLL (unlimited, existing)
// Cascade: SFC miss → try SLL → refill
//
// Assembly (x86-64, optimized):
// mov rax, QWORD PTR g_sfc_head[class_idx] ; SFC: Load head
// test rax, rax ; Check NULL
// jne .sfc_hit ; If not empty, SFC hit!
// mov rax, QWORD PTR g_tls_sll_head[class_idx] ; SLL: Load head
// test rax, rax ; Check NULL
// je .miss ; If empty, miss
// mov rdx, QWORD PTR [rax] ; Load next
// mov QWORD PTR g_tls_sll_head[class_idx], rdx ; Update head
// ret ; Return ptr
// .sfc_hit:
// mov rdx, QWORD PTR [rax] ; Load next
// mov QWORD PTR g_sfc_head[class_idx], rdx ; Update head
// ret
// .miss:
// ; Fall through to refill
//
// Expected: 3-4 instructions on SFC hit, 6-8 on SLL hit
// Pops one block for class_idx from the thread-local front caches.
// Returns the popped pointer, or NULL on a miss (caller should refill or take
// the slow path). Per the FIX #16 notes below, the returned pointer is a BASE
// pointer; the caller performs the BASE -> USER conversion.
//
// Layer order as compiled below:
//   - HAKMEM_TINY_FRONT_GATE_BOX defined: front_gate_try_pop() only.
//   - Otherwise, when HAK_ENV_TINY_FRONT_SLIM() is set: TLS SLL only.
//   - Otherwise: FastCache (class_idx <= 3) -> SFC -> TLS SLL.
static inline void* tiny_alloc_fast_pop(int class_idx) {
// PRIORITY 1: Bounds check before any TLS array access
HAK_CHECK_CLASS_IDX(class_idx, "tiny_alloc_fast_pop");
#if !HAKMEM_BUILD_RELEASE
// Phase 3: Debug counters eliminated in release builds
atomic_fetch_add(&g_integrity_check_class_bounds, 1);
// DEBUG: Log class 2 pops. The leading `0 &&` keeps this permanently disabled;
// only the call counter itself is still maintained.
static _Atomic uint64_t g_fast_pop_count = 0;
uint64_t pop_call = atomic_fetch_add(&g_fast_pop_count, 1);
if (0 && class_idx == 2 && pop_call > 5840 && pop_call < 5900) {
fprintf(stderr, "[FAST_POP_C2] call=%lu cls=%d head=%p count=%u\n",
pop_call, class_idx, g_tls_sll[class_idx].head, g_tls_sll[class_idx].count);
fflush(stderr);
}
#endif
// Phase E1-CORRECT: C7 now has headers, can use fast path
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
// Front Gate build: the gate owns the whole pop; a failed try is a miss.
void* out = NULL;
if (front_gate_try_pop(class_idx, &out)) {
return out;
}
return NULL;
#else
// ========== Phase 19-1: Quick Prune (Frontend SLIM mode) ==========
// ENV: HAKMEM_TINY_FRONT_SLIM=1
// Goal: Skip FastCache + SFC layers, go straight to SLL (88-99% hit rate)
// Expected: 22M -> 27-30M ops/s (+22-36%)
// Priority-2: Use cached ENV (eliminate lazy-init TLS overhead)
// SLIM MODE: Skip FastCache + SFC, go straight to SLL
if (__builtin_expect(HAK_ENV_TINY_FRONT_SLIM(), 0)) {
// Box Boundary: TLS SLL freelist pop (only layer in SLIM mode)
// Phase 7-Step7: Use config macro for dead code elimination in PGO mode
if (__builtin_expect(TINY_FRONT_TLS_SLL_ENABLED, 1)) {
void* base = NULL;
if (tls_sll_pop(class_idx, &base)) {
// Front Gate: SLL hit (SLIM fast path)
extern unsigned long long g_front_sll_hit[];
g_front_sll_hit[class_idx]++;
// P1.3: Track active when allocating from TLS SLL
tiny_active_track_alloc(base);
return base;
}
}
// SLIM mode miss -> return NULL (caller refills)
return NULL;
}
// ========== End Phase 19-1: Quick Prune ==========
// Phase 7 Task 3: Profiling overhead removed in release builds.
// `start` is 0 when profiling is off (or tiny_fast_rdtsc() returns 0), which
// disables the cycle accounting at the hit sites below.
#if !HAKMEM_BUILD_RELEASE
uint64_t start = tiny_profile_enabled() ? tiny_fast_rdtsc() : 0;
#endif
// Phase 1: Try array stack (FastCache) first for hottest tiny classes (C0-C3)
// Phase 7-Step4: Use config macro for dead code elimination in PGO mode
if (__builtin_expect(TINY_FRONT_FASTCACHE_ENABLED && class_idx <= 3, 1)) {
void* fc = fastcache_pop(class_idx);
if (__builtin_expect(fc != NULL, 1)) {
// Frontend FastCache hit.
// NOTE(review): unlike the SFC/SLL hits below, this path neither updates
// g_tiny_alloc_hits/g_tiny_alloc_cycles nor calls tiny_active_track_alloc()
// -- confirm that this is intentional.
extern unsigned long long g_front_fc_hit[];
g_front_fc_hit[class_idx]++;
return fc;
} else {
// Frontend FastCache miss: count it and fall through to SFC/SLL.
extern unsigned long long g_front_fc_miss[];
g_front_fc_miss[class_idx]++;
}
}
// Box 5-NEW: Layer 0 - Try SFC first (if enabled)
// Phase 7-Step8: Use config macro for dead code elimination in PGO mode
// TINY_FRONT_SFC_ENABLED is sampled once per thread and cached in these
// function-local TLS flags.
static __thread int sfc_check_done = 0;
static __thread int sfc_is_enabled = 0;
if (__builtin_expect(!sfc_check_done, 0)) {
sfc_is_enabled = TINY_FRONT_SFC_ENABLED;
sfc_check_done = 1;
}
if (__builtin_expect(sfc_is_enabled, 1)) {
void* base = sfc_alloc(class_idx);
if (__builtin_expect(base != NULL, 1)) {
// Front Gate: SFC hit
extern unsigned long long g_front_sfc_hit[];
g_front_sfc_hit[class_idx]++;
// SFC hit (Layer 0): record cycles when profiling is active.
#if !HAKMEM_BUILD_RELEASE
if (start) {
g_tiny_alloc_cycles += (tiny_fast_rdtsc() - start);
g_tiny_alloc_hits++;
}
#endif
// FIX #16: Return BASE pointer (not USER)
// Caller (tiny_alloc_fast) will call HAK_RET_ALLOC -> tiny_region_id_write_header
// which does the BASE -> USER conversion. Double conversion was causing corruption!
return base;
}
// SFC miss -> try SLL (Layer 1)
}
// Box Boundary: Layer 1 - pop the head of the TLS SLL freelist (can be disabled via env)
// Note: This is in tiny_alloc_fast_pop(), not tiny_alloc_fast(), so use global variable
// Phase 7-Step7: Use config macro for dead code elimination in PGO mode
if (__builtin_expect(TINY_FRONT_TLS_SLL_ENABLED, 1)) {
// Use Box TLS-SLL API (C7-safe pop)
// CRITICAL: Pop FIRST, do NOT read g_tls_sll_head directly (race condition!)
// Reading head before pop causes stale read -> rbp=0xa0 SEGV
void* base = NULL;
if (tls_sll_pop(class_idx, &base)) {
// Front Gate: SLL hit
extern unsigned long long g_front_sll_hit[];
g_front_sll_hit[class_idx]++;
// P1.3: Track active when allocating from TLS SLL
tiny_active_track_alloc(base);
#if HAKMEM_DEBUG_COUNTERS
// Track TLS freelist hits (compile-time gated, zero runtime cost when disabled)
g_free_via_tls_sll[class_idx]++;
#endif
#if !HAKMEM_BUILD_RELEASE
// Debug: Track profiling (release builds skip this overhead)
if (start) {
g_tiny_alloc_cycles += (tiny_fast_rdtsc() - start);
g_tiny_alloc_hits++;
}
#endif
// FIX #16: Return BASE pointer (not USER)
// Caller (tiny_alloc_fast) will call HAK_RET_ALLOC -> tiny_region_id_write_header
// which does the BASE -> USER conversion. Double conversion was causing corruption!
return base;
}
}
// Fast path miss -> NULL (caller should refill)
return NULL;
#endif
}
// ========== Cascade Refill: SFC ← SLL (Box Theory boundary) ==========
// Cascade refill: Transfer blocks from SLL to SFC (one-way, safe)
// Returns: number of blocks transferred
//
// Contract:
// - Transfer ownership: SLL → SFC
// - No circular dependency: one-way only
// - Boundary clear: SLL pop → SFC push
// - Fallback safe: if SFC full, stop (no overflow)
// Env-driven cascade percentage (0-100), default 50%.
// Priority-2: the value comes from the HAK_ENV_SFC_CASCADE_PCT() accessor
// (cached ENV; avoids lazy-init and atoi() overhead).
// Returns the percentage of the cascade target that sfc_refill_from_sll()
// should try to move.
static inline int sfc_cascade_pct(void) {
    const int pct = HAK_ENV_SFC_CASCADE_PCT();
    return pct;
}
// Moves blocks from the TLS SLL (Layer 1) into the SFC (Layer 0).
//
// class_idx:    class to cascade.
// target_count: nominal number of blocks; scaled by sfc_cascade_pct(), with a
//               fallback of target_count / 2 when the scaled value is <= 0.
// Returns the number of blocks actually moved. The loop stops early when the
// SLL count reaches 0, the SFC reaches g_sfc_capacity[class_idx], or
// tls_sll_pop() fails.
static inline int sfc_refill_from_sll(int class_idx, int target_count) {
    // PRIORITY 1: Bounds check
    HAK_CHECK_CLASS_IDX(class_idx, "sfc_refill_from_sll");
#if !HAKMEM_BUILD_RELEASE
    atomic_fetch_add(&g_integrity_check_class_bounds, 1);
#endif
    int moved = 0;
    const uint32_t sfc_cap = g_sfc_capacity[class_idx];

    // Scale the request by the cascade percentage.
    int quota = (target_count * sfc_cascade_pct()) / 100;
    if (quota <= 0) quota = target_count / 2; // safety fallback

    for (; moved < quota; moved++) {
        if (!(g_tls_sll[class_idx].count > 0)) break;   // SLL drained
        if (g_sfc_count[class_idx] >= sfc_cap) break;   // SFC full, stop

        // Pop from SLL (Layer 1) using Box TLS-SLL API (C7-safe)
        void* blk = NULL;
        if (!tls_sll_pop(class_idx, &blk)) break;       // SLL empty

        // Push to SFC (Layer 0): link in front of the current head.
        tiny_next_write(class_idx, blk, g_sfc_head[class_idx]);
        g_sfc_head[class_idx] = blk;
        g_sfc_count[class_idx]++;
    }
    return moved;
}
// ========== Refill Path: Backend Integration ==========
// Refill TLS freelist from backend (SuperSlab/ACE/Learning layer)
// Returns: number of blocks refilled
//
// Box 5-NEW Architecture:
// SFC enabled: SuperSlab → SLL → SFC (cascade)
// SFC disabled: SuperSlab → SLL (direct, old path)
//
// This integrates with existing HAKMEM infrastructure:
// - SuperSlab provides memory chunks
// - ACE provides adaptive capacity learning
// - L25 provides mid-large integration
//
// Refill count is tunable via HAKMEM_TINY_REFILL_COUNT (default: 16)
// - Smaller count (8-16): better for diverse workloads, faster warmup
// - Larger count (64-128): better for homogeneous workloads, fewer refills
// Refills the TLS SLL of class_idx from the backend.
//
// Returns the number of blocks reported by the backend refill call, or 0 when
// get_available_capacity() says the cache has no room.
//
// Side effects visible in this function:
//   - caches a per-class refill count in TLS (clamped to 8..256 on first use);
//   - for class 7, raises that count by 16 (up to 128) on every 4th successful
//     refill;
//   - calls track_refill_for_adaptation() after a successful refill;
//   - optionally cascades half of the refilled blocks into the SFC.
static inline int tiny_alloc_fast_refill(int class_idx) {
    // Phase E1-CORRECT: C7 now has headers, can use refill
    // Phase 7 Task 3: Profiling is compiled out of release builds.
#if !HAKMEM_BUILD_RELEASE
    uint64_t t_begin = 0;
    if (tiny_profile_enabled()) t_begin = tiny_fast_rdtsc();
#endif

    // Phase 2b: no room in the cache -> nothing to refill.
    const int room = get_available_capacity(class_idx);
    if (room <= 0) return 0;

    // Phase 7 Task 3: per-class refill size cached in TLS (0 = not yet set).
    static __thread int s_refill_count[TINY_NUM_CLASSES] = {0};
    // Simple adaptive booster: counts recent successful refills per class.
    static __thread uint8_t s_refill_calls[TINY_NUM_CLASSES] = {0};

    int batch = s_refill_count[class_idx];
    if (__builtin_expect(batch == 0, 0)) {
        // First miss: seed from the globals parsed at init time.
        // Precedence: per-class > hot/mid > global > build default.
        const int is_hot = (class_idx <= 3);
        int seed = HAKMEM_TINY_REFILL_DEFAULT; // Default from hakmem_build_flags.h
        if (g_refill_count_class[class_idx] > 0) {
            seed = g_refill_count_class[class_idx];
        } else if (is_hot && g_refill_count_hot > 0) {
            seed = g_refill_count_hot;
        } else if (!is_hot && g_refill_count_mid > 0) {
            seed = g_refill_count_mid;
        } else if (g_refill_count_global > 0) {
            seed = g_refill_count_global;
        }
        // Clamp to sane range: at least 8 (avoid thrashing), at most 256
        // (avoid excessive TLS memory).
        if (seed < 8) {
            seed = 8;
        } else if (seed > 256) {
            seed = 256;
        }
        s_refill_count[class_idx] = seed;
        batch = seed;
    }

    // Phase 2b: never ask for more than the cache can hold.
    if (batch > room) batch = room;

#if HAKMEM_DEBUG_COUNTERS
    // Track refill calls (compile-time gated)
    g_rf_total_calls[class_idx]++;
#endif

    // Box Boundary: Delegate to Backend (Box 3: SuperSlab).
    // The Front-Direct A/B implementation is not supported at the current HEAD;
    // the legacy path (SS -> SLL -> FC) is always used.
#if HAKMEM_TINY_P0_BATCH_REFILL
    const int refilled = sll_refill_batch_from_ss(class_idx, batch);
#else
    const int refilled = sll_refill_small_from_ss(class_idx, batch);
#endif

    if (refilled > 0) {
        // Lightweight adaptation, focused on class 7 (1024B): every 4th
        // successful refill raises the per-class target by 16, capped at 128.
        const uint8_t streak = ++s_refill_calls[class_idx];
        if (class_idx == 7 && (streak & 0x03u) == 0) {
            const int current = s_refill_count[class_idx];
            if (current < 128) {
                const int raised = current + 16;
                s_refill_count[class_idx] = (raised > 128) ? 128 : raised;
            }
        }
        // Phase 2b: Track refill and adapt cache size
        track_refill_for_adaptation(class_idx);
    } else if (s_refill_calls[class_idx] > 0) {
        // Nothing refilled: slowly decay the counter.
        s_refill_calls[class_idx]--;
    }

    // Box 5-NEW: Cascade refill SFC <- SLL (opt-in via HAKMEM_TINY_SFC_CASCADE,
    // off by default). Only when SFC is enabled and something was refilled.
    if (HAK_ENV_TINY_SFC_CASCADE() && TINY_FRONT_SFC_ENABLED && refilled > 0) {
        // Offer half of the refilled blocks to the SFC, keep the rest in SLL.
        const int half = refilled / 2;
        if (half > 0) {
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
            front_gate_after_refill(class_idx, refilled);
#else
            (void)sfc_refill_from_sll(class_idx, half); // result unused; could feed stats
#endif
        }
    }

#if !HAKMEM_BUILD_RELEASE
    // Debug: Track profiling (release builds skip this overhead)
    if (t_begin) {
        g_tiny_refill_cycles += (tiny_fast_rdtsc() - t_begin);
        g_tiny_refill_calls++;
    }
#endif
    return refilled;
}
// ========== Combined Fast Path (Alloc + Refill) ==========
// Complete fast-path allocation.
//
// size: requested allocation size in bytes.
// Returns: pointer on success, NULL on failure (hak_tiny_size_to_class()
//          rejected the size, or the slow path returned NULL).
//
// Flow:
//   0. Ultra SLIM gate: if enabled, delegate to ultra_slim_alloc_with_refill().
//   1. Size -> class index; (debug builds) abort on an out-of-range class
//      BEFORE the index is used for any per-class access.
//   2. Optional Front-V2 magazine (classes 0..3).
//   3. Front pop: tiny_alloc_fast_pop() for classes 0..3, tls_sll_pop() for
//      classes >= 4.
//   4. Miss -> tiny_fast_refill_and_take(); then tiny_alloc_fast_refill() and
//      one retry of step 3.
//   5. Still empty -> hak_tiny_alloc_slow().
//
// Successful results are handed back through HAK_RET_ALLOC (defined in
// hakmem_tiny.c). NOTE(review): the code below relies on that macro returning
// from this function -- confirm against its definition.
//
// Example usage:
//   void* ptr = tiny_alloc_fast(64);
//   if (!ptr) {
//       // OOM handling
//   }
static inline void* tiny_alloc_fast(size_t size) {
#if !HAKMEM_BUILD_RELEASE
    // Phase 3: Debug counters eliminated in release builds
    static _Atomic uint64_t alloc_call_count = 0;
    uint64_t call_num = atomic_fetch_add(&alloc_call_count, 1);
#endif
    // Phase 22: Global init (once per process)
    lazy_init_global();

    // ========== Phase 19-2: Ultra SLIM 4-Layer Fast Path ==========
    // ENV: HAKMEM_TINY_ULTRA_SLIM=1
    // Expected: 90-110M ops/s (mimalloc parity)
    // Architecture: Init Safety + Size-to-Class + ACE Learning + TLS SLL Direct
    // Note: ACE learning preserved (HAKMEM's differentiator vs mimalloc)
#if !HAKMEM_BUILD_RELEASE
    // Debug: report once per thread whether the Ultra SLIM gate is enabled.
    static __thread int debug_checked = 0;
    if (!debug_checked) {
        // Phase 7-Step8: Use config macro for dead code elimination in PGO mode
        int enabled = TINY_FRONT_ULTRA_SLIM_ENABLED;
        if (enabled) {
            fprintf(stderr, "[TINY_ALLOC_FAST] Ultra SLIM gate: ENABLED (will use 4-layer path)\n");
        } else {
            fprintf(stderr, "[TINY_ALLOC_FAST] Ultra SLIM gate: DISABLED (will use standard path)\n");
        }
        debug_checked = 1;
    }
#endif
    // Phase 7-Step4: Use config macro for dead code elimination in PGO mode
    if (__builtin_expect(TINY_FRONT_ULTRA_SLIM_ENABLED, 0)) {
        return ultra_slim_alloc_with_refill(size);
    }
    // ========== End Phase 19-2: Ultra SLIM ==========

    // 1. Size -> class index (inline, fast)
    int class_idx = hak_tiny_size_to_class(size);
    if (__builtin_expect(class_idx < 0, 0)) {
        return NULL; // Size > 1KB, not Tiny
    }
#if !HAKMEM_BUILD_RELEASE
    // CRITICAL: Bounds check to catch corruption. This must run before the
    // first per-class access (lazy_init_class, histogram, g_tls_sll[...]).
    if (__builtin_expect(class_idx >= TINY_NUM_CLASSES, 0)) {
        // call_num is uint64_t: print as unsigned long long for portability.
        fprintf(stderr, "[TINY_ALLOC_FAST] FATAL: class_idx=%d out of bounds! size=%zu call=%llu\n",
                class_idx, size, (unsigned long long)call_num);
        fflush(stderr);
        abort();
    }
#endif

    // Phase 3c L1D Opt: Prefetch TLS cache head early
    // Phase 3d-B: Prefetch unified TLS SLL struct (single prefetch for both head+count)
    __builtin_prefetch(&g_tls_sll[class_idx], 0, 3);
    // Phase 22: Lazy per-class init (on first use)
    lazy_init_class(class_idx);
    // Phase 3-4: Record allocation for ACE Profile learning
    // TLS increment only (no atomic operation, amortized flush at threshold)
    tiny_sizeclass_hist_hit(class_idx);

    // P0.1: Evaluate the SLL toggle once and reuse it for both the first pop
    // and the post-refill retry.
    // Phase 7-Step7: Use config macro for dead code elimination in PGO mode
    const int sll_enabled = TINY_FRONT_TLS_SLL_ENABLED;

#if !HAKMEM_BUILD_RELEASE
    // Debug logging (DISABLED for performance via the leading `0 &&`)
    if (0 && call_num > 14250 && call_num < 14280) {
        fprintf(stderr, "[TINY_ALLOC] call=%llu size=%zu class=%d sll_head[%d]=%p count=%u\n",
                (unsigned long long)call_num, size, class_idx, class_idx,
                g_tls_sll[class_idx].head, g_tls_sll[class_idx].count);
        fflush(stderr);
    }
#endif

    ROUTE_BEGIN(class_idx);
    void* ptr = NULL;

    // Front-V2: TLS magazine front (A/B, default OFF)
    // Phase 7-Step4: Use config macro for dead code elimination in PGO mode
    if (__builtin_expect(TINY_FRONT_HEAP_V2_ENABLED && front_prune_heapv2_enabled() && class_idx <= 3, 0)) {
        void* hv2 = tiny_heap_v2_alloc_by_class(class_idx);
        if (hv2) {
            front_metrics_heapv2_hit(class_idx);
            tiny_diag_track_size_ge1024_fast(size, class_idx);
            HAK_RET_ALLOC(class_idx, hv2);
        } else {
            front_metrics_heapv2_miss(class_idx);
        }
    }

    // Generic front (FastCache/SFC/SLL); respects the SLL global toggle.
    if (__builtin_expect(sll_enabled, 1)) {
        // Classes 0..3 go through the layered front pop; classes >= 4 use the
        // Box TLS-SLL pop directly.
        if (class_idx <= 3) {
#if HAKMEM_TINY_INLINE_SLL
            // Experimental: Inline SLL pop (A/B only, requires HAKMEM_TINY_INLINE_SLL=1)
            TINY_ALLOC_FAST_POP_INLINE(class_idx, ptr);
#else
            // Default: Safe Box API (Box TLS-SLL) for all standard builds
            ptr = tiny_alloc_fast_pop(class_idx);
#endif
        } else {
            void* base = NULL;
            if (tls_sll_pop(class_idx, &base)) {
                // P1.3: Track active when allocating from TLS SLL
                tiny_active_track_alloc(base);
                ptr = base;
            } else {
                ptr = NULL;
            }
        }
    } else {
        ptr = NULL; // SLL disabled OR Front-Direct active -> bypass SLL
    }

    if (__builtin_expect(ptr != NULL, 1)) {
        // Phase 3c L1D Opt: prefetch the block we are about to hand out.
        __builtin_prefetch(ptr, 0, 3);
        tiny_diag_track_size_ge1024_fast(size, class_idx);
        HAK_RET_ALLOC(class_idx, ptr);
    }

    // Refill to TLS List/SLL
    extern __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
    void* took = tiny_fast_refill_and_take(class_idx, &g_tls_lists[class_idx]);
    if (took) {
        tiny_diag_track_size_ge1024_fast(size, class_idx);
        HAK_RET_ALLOC(class_idx, took);
    }

    // Retry after a backend refill.
    {
        int refilled = tiny_alloc_fast_refill(class_idx);
        if (__builtin_expect(refilled > 0, 1)) {
            // Retry SLL if enabled (P0.1: using cached sll_enabled)
            if (__builtin_expect(sll_enabled, 1)) {
                if (class_idx <= 3) {
#if HAKMEM_TINY_INLINE_SLL
                    // Experimental: Inline SLL pop (A/B only, requires HAKMEM_TINY_INLINE_SLL=1)
                    TINY_ALLOC_FAST_POP_INLINE(class_idx, ptr);
#else
                    // Default: Safe Box API (Box TLS-SLL) for all standard builds
                    ptr = tiny_alloc_fast_pop(class_idx);
#endif
                } else {
                    void* base2 = NULL;
                    if (tls_sll_pop(class_idx, &base2)) {
                        // P1.3: Track active when allocating from TLS SLL
                        tiny_active_track_alloc(base2);
                        ptr = base2;
                    } else {
                        ptr = NULL;
                    }
                }
            } else {
                ptr = NULL; // SLL disabled OR Front-Direct active -> bypass SLL
            }
            if (ptr) {
                tiny_diag_track_size_ge1024_fast(size, class_idx);
                HAK_RET_ALLOC(class_idx, ptr);
            }
        }
    }

    // 5. Refill failure or still empty -> slow path (OOM or new SuperSlab)
    // Box Boundary: Delegate to Slow Path (Box 3 backend)
    ptr = hak_tiny_alloc_slow(size, class_idx);
    if (ptr) {
        tiny_diag_track_size_ge1024_fast(size, class_idx);
        HAK_RET_ALLOC(class_idx, ptr);
    }
    return ptr; // NULL if OOM
}
// ========== Push to TLS Freelist (for free path) ==========
// Pushes a block onto the TLS freelist; helper for Box 6 (Free Fast Path).
//
// class_idx: class the block belongs to.
// ptr:       block to push.
//
// Invariant: ptr must belong to the current thread. No ownership check is done
// here; the caller (Box 6) is responsible for ownership verification.
// A failed tls_sll_push() is only reported (non-release builds); the function
// has no way to signal it to the caller.
static inline void tiny_alloc_fast_push(int class_idx, void* ptr) {
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
    front_gate_push_tls(class_idx, ptr);
#else
    // Box Boundary: Push via the Box TLS-SLL API (C7-safe) with no capacity limit.
    const uint32_t no_limit = UINT32_MAX;
    if (tls_sll_push(class_idx, ptr, no_limit)) return;

    // C7 rejected or SLL somehow full (should not happen).
    // In release builds this is a no-op (caller expects success).
#if !HAKMEM_BUILD_RELEASE
    fprintf(stderr, "[WARN] tls_sll_push failed in tiny_alloc_fast_push cls=%d ptr=%p\n",
            class_idx, ptr);
#endif
#endif
}
// ========== Statistics & Diagnostics ==========
// Snapshot of one class's TLS SLL state, as returned by tiny_alloc_fast_stats()
// (for debugging/profiling).
typedef struct {
int class_idx; // Class index the snapshot was requested for
void* head; // Value read from g_tls_sll[class_idx].head
uint32_t count; // Value read from g_tls_sll[class_idx].count
} TinyAllocFastStats;
static inline TinyAllocFastStats tiny_alloc_fast_stats(int class_idx) {
TinyAllocFastStats stats = {
.class_idx = class_idx,
.head = g_tls_sll[class_idx].head,
.count = g_tls_sll[class_idx].count
};
return stats;
}
// Reset TLS freelist (for testing/benchmarking)
// WARNING: This leaks memory! Only use in controlled test environments.
static inline void tiny_alloc_fast_reset(int class_idx) {
g_tls_sll[class_idx].head = NULL;
g_tls_sll[class_idx].count = 0;
}
// ========== Performance Notes ==========
//
// Expected metrics (based on System tcache & HAKX +171% results):
// - Fast path hit rate: 95%+ (workload dependent)
// - Fast path latency: 3-4 instructions (1-2 cycles on modern CPUs)
// - Miss penalty: ~20-50 instructions (refill from SuperSlab)
// - Throughput improvement: +10-25% vs current multi-layer design
//
// Key optimizations:
// 1. `__builtin_expect` for branch prediction (hot path first)
// 2. `static inline` for zero-cost abstraction
// 3. TLS variables (no atomic ops, no locks)
// 4. Minimal work in fast path (defer stats/accounting to backend)
//
// Comparison with current design:
// - Current: 20+ instructions (Magazine → SuperSlab → ACE → ...)
// - New: 3-4 instructions (TLS freelist pop only)
// - Reduction: -80% instructions in hot path
//
// Inspired by:
// - System tcache (glibc malloc) - 3-4 instruction fast path
// - HAKX Mid-Large (+171%) - "Simple Front + Smart Back"
// - Box Theory - Clear boundaries, minimal coupling