Files
hakmem/core/tiny_alloc_fast.inc.h
Moe Charm (CI) bb70d422dc Phase 13-B: TinyHeapV2 supply path with dual-mode A/B framework (Stealing vs Leftover)
Summary:
- Implemented free path supply with ENV-gated A/B modes (HAKMEM_TINY_HEAP_V2_LEFTOVER_MODE)
- Mode 0 (Stealing, default): L0 gets freed blocks first → +18% @ 32B
- Mode 1 (Leftover): L1 primary owner, L0 gets leftovers → Box-clean but -5% @ 16B
- Decision: Default to Stealing for performance (ChatGPT analysis: L0 doesn't corrupt learning layer signals)

Performance (100K iterations, workset=128):
- 16B: 43.9M → 45.6M ops/s (+3.9%)
- 32B: 41.9M → 49.6M ops/s (+18.4%) 
- 64B: 51.2M → 51.5M ops/s (+0.6%)
- 100% magazine hit rate (supply from free path working correctly)

Implementation:
- tiny_free_fast_v2.inc.h: Dual-mode supply (lines 134-166)
- tiny_heap_v2.h: Add tiny_heap_v2_leftover_mode() flag + rationale doc
- tiny_alloc_fast.inc.h: Alloc hook with tiny_heap_v2_alloc_by_class()
- CURRENT_TASK.md: Updated Phase 13-B status (complete) with A/B results

ENV flags:
- HAKMEM_TINY_HEAP_V2=1                      # Enable TinyHeapV2
- HAKMEM_TINY_HEAP_V2_LEFTOVER_MODE=0        # Mode 0 (Stealing, default)
- HAKMEM_TINY_HEAP_V2_CLASS_MASK=0xE         # C1-C3 only (skip C0 -5% regression)
- HAKMEM_TINY_HEAP_V2_STATS=1                # Print statistics

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-15 16:28:40 +09:00

792 lines
30 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// tiny_alloc_fast.inc.h - Box 5: Allocation Fast Path (3-4 instructions)
// Purpose: Ultra-fast TLS freelist pop (inspired by System tcache & Mid-Large HAKX +171%)
// Invariant: Hit rate > 95% → 3-4 instructions, Miss → refill from backend
// Design: "Simple Front + Smart Back" - Front is dumb & fast, Back is smart
//
// Box 5-NEW: SFC (Super Front Cache) Integration
// Architecture: SFC (Layer 0, 128-256 slots) → SLL (Layer 1, unlimited) → SuperSlab (Layer 2+)
// Cascade Refill: SFC ← SLL (one-way, safe)
// Goal: +200% performance (4.19M → 12M+ ops/s)
//
// Phase 2b: Adaptive TLS Cache Sizing
// Hot classes grow to 2048 slots, cold classes shrink to 16 slots
// Expected: +3-10% performance, -30-50% TLS cache memory overhead
#pragma once
#include "tiny_atomic.h"
#include "hakmem_tiny.h"
#include "tiny_route.h"
#include "tiny_alloc_fast_sfc.inc.h" // Box 5-NEW: SFC Layer
#include "hakmem_tiny_fastcache.inc.h" // Array stack (FastCache) for C0C3
#include "hakmem_tiny_tls_list.h" // TLS List (for tiny_fast_refill_and_take)
#include "tiny_region_id.h" // Phase 7: Header-based class_idx lookup
#include "tiny_adaptive_sizing.h" // Phase 2b: Adaptive sizing
#include "box/tls_sll_box.h" // Box TLS-SLL: C7-safe push/pop/splice
#include "box/tiny_next_ptr_box.h" // Box API: Next pointer read/write
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
#include "box/front_gate_box.h"
#endif
#include "hakmem_tiny_integrity.h" // PRIORITY 1-4: Corruption detection
#ifdef HAKMEM_TINY_HEADER_CLASSIDX
#include "front/tiny_front_c23.h" // Phase B: Ultra-simple C2/C3 front
#include "front/tiny_heap_v2.h" // Phase 13-A: TinyHeapV2 magazine front
#endif
#include <stdio.h>
#include <stdlib.h> // getenv, strtol, abort
// Phase 7 Task 2: Aggressive inline TLS cache access
// Enable with: make HEADER_CLASSIDX=1 AGGRESSIVE_INLINE=1
#ifndef HAKMEM_TINY_AGGRESSIVE_INLINE
#define HAKMEM_TINY_AGGRESSIVE_INLINE 0
#endif
#if HAKMEM_TINY_AGGRESSIVE_INLINE
#include "tiny_alloc_fast_inline.h"
#endif
// ========== Debug Counters (compile-time gated) ==========
// Everything in this section is declared only when HAKMEM_DEBUG_COUNTERS is non-zero,
// so every use must sit behind the same #if.
#if HAKMEM_DEBUG_COUNTERS
// Refill-stage counters (defined in hakmem_tiny.c)
// g_rf_total_calls[class_idx] is bumped by tiny_alloc_fast_refill() just before it
// dispatches to the backend. The remaining g_rf_* arrays are not referenced in the
// portion of this file visible here (presumably updated by the backend; verify).
extern unsigned long long g_rf_total_calls[];
extern unsigned long long g_rf_hit_bench[];
extern unsigned long long g_rf_hit_hot[];
extern unsigned long long g_rf_hit_mail[];
extern unsigned long long g_rf_hit_slab[];
extern unsigned long long g_rf_hit_ss[];
extern unsigned long long g_rf_hit_reg[];
extern unsigned long long g_rf_mmap_calls[];
// Publish hits (defined in hakmem_tiny.c)
extern unsigned long long g_pub_mail_hits[];
extern unsigned long long g_pub_bench_hits[];
extern unsigned long long g_pub_hot_hits[];
// Free pipeline (defined in hakmem_tiny.c)
// NOTE: despite the name, tiny_alloc_fast_pop() increments g_free_via_tls_sll[class_idx]
// on every TLS SLL *allocation* hit.
extern unsigned long long g_free_via_tls_sll[];
#endif
// ========== Box 5: Allocation Fast Path ==========
// Fast Allocation layer of Box Theory: pops directly from the TLS freelist (3-4 instructions).
// Invariants:
// - If the TLS freelist is non-empty, return immediately (no lock, no sync)
// - On a miss, delegate to the Backend (Box 3: SuperSlab)
// - Cross-thread allocation is not considered here (the Backend handles it)
// External TLS variables (defined in hakmem_tiny.c)
// Per-thread SLL head pointer and block count, one slot per tiny class.
extern __thread void* g_tls_sll_head[TINY_NUM_CLASSES];
extern __thread uint32_t g_tls_sll_count[TINY_NUM_CLASSES];
// External backend functions
// P0 Fix: Use appropriate refill function based on P0 status
// Exactly one of the two SLL refill functions is declared, selected at compile time;
// tiny_alloc_fast_refill() mirrors this #if when it calls them.
#if HAKMEM_TINY_P0_BATCH_REFILL
extern int sll_refill_batch_from_ss(int class_idx, int max_take);
#else
extern int sll_refill_small_from_ss(int class_idx, int max_take);
#endif
// NEW: Direct SS→FC refill (bypasses SLL)
extern int ss_refill_fc_fill(int class_idx, int want);
// Slow path used by tiny_alloc_fast() once every front/refill attempt has missed.
extern void* hak_tiny_alloc_slow(size_t size, int class_idx);
// Size → class index; tiny_alloc_fast() treats a negative result as "not a Tiny size".
extern int hak_tiny_size_to_class(size_t size);
extern int tiny_refill_failfast_level(void);
extern const size_t g_tiny_class_sizes[];
// Hot-class toggle: class5 (256B) dedicated TLS fast path
extern int g_tiny_hotpath_class5;
// Class-5 dedicated take: pop one block from this thread's class-5 TLS List and,
// if the list is empty, fall back to the generic refill-and-take helper.
// Preconditions (not checked here): class_idx == 5 and g_tiny_hotpath_class5 == 1.
// Returns: a BASE pointer (not USER) on a list hit; the caller applies HAK_RET_ALLOC,
//          which performs the BASE → USER conversion. On a list miss the result of
//          tiny_fast_refill_and_take() is returned unchanged.
static inline void* tiny_class5_minirefill_take(void) {
    extern __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
    TinyTLSList* const c5_list = &g_tls_lists[5];

    // Fast case: a block is already cached in the TLS List.
    void* block = tls_list_pop(c5_list, 5);
    if (block == NULL) {
        // Miss: refill through the generic helper and take one block from it.
        block = tiny_fast_refill_and_take(5, c5_list);
    }
    return block;
}
// Global Front refill config (parsed at init; defined in hakmem_tiny.c)
// Consumed by tiny_alloc_fast_refill() on a thread's first miss per class.
// Precedence there: per-class > hot (class_idx <= 3) / mid (class_idx >= 4) > global;
// a value <= 0 means "not set" and falls through to the next level.
extern int g_refill_count_global;
extern int g_refill_count_hot;
extern int g_refill_count_mid;
extern int g_refill_count_class[TINY_NUM_CLASSES];
// HAK_RET_ALLOC macro is now defined in core/hakmem_tiny.c
// See lines 116-152 for single definition point based on HAKMEM_TINY_HEADER_CLASSIDX
// ========== RDTSC Profiling (lightweight) ==========
// Cycle-counter read used by the profiling hooks in this file.
// x86-64: executes RDTSC and joins EDX:EAX into a single 64-bit value.
// Other targets: always returns 0. Callers in this file treat a zero start stamp as
// "do not record", so profiling records nothing on those targets.
#if defined(__x86_64__)
static inline uint64_t tiny_fast_rdtsc(void) {
    unsigned int tsc_lo, tsc_hi;
    __asm__ __volatile__ ("rdtsc" : "=a" (tsc_lo), "=d" (tsc_hi));
    uint64_t tsc = (uint64_t)tsc_hi;
    tsc <<= 32;
    tsc |= tsc_lo;
    return tsc;
}
#else
static inline uint64_t tiny_fast_rdtsc(void) {
    return 0;
}
#endif
// Per-thread profiling counters (enable with HAKMEM_TINY_PROFILE=1).
// Hits/cycles are accumulated by the pop path, refill calls/cycles by the refill path.
static __thread uint64_t g_tiny_alloc_hits = 0;
static __thread uint64_t g_tiny_alloc_cycles = 0;
static __thread uint64_t g_tiny_refill_calls = 0;
static __thread uint64_t g_tiny_refill_cycles = 0;
// Cached result of the HAKMEM_TINY_PROFILE lookup: -1 = not read yet, else 0 or 1.
static int g_tiny_profile_enabled = -1; // -1: uninitialized

// Report whether profiling is switched on.
// The environment is consulted once; the answer is cached in g_tiny_profile_enabled.
// Profiling is on when HAKMEM_TINY_PROFILE is set, non-empty and does not start with '0'.
// Returns: 1 if enabled, 0 otherwise.
static inline int tiny_profile_enabled(void) {
    if (__builtin_expect(g_tiny_profile_enabled != -1, 1)) {
        return g_tiny_profile_enabled;
    }
    const char* flag = getenv("HAKMEM_TINY_PROFILE");
    int on = 0;
    if (flag != NULL && flag[0] != '\0' && flag[0] != '0') {
        on = 1;
    }
    g_tiny_profile_enabled = on;
    return on;
}
// Print profiling results at exit.
// Registered as a destructor, so it runs automatically during program teardown.
// Does nothing unless profiling is enabled and at least one counter is non-zero.
// The counters are __thread variables, so only the counts of the thread that runs
// the destructor are reported (presumably the thread that exits the process).
// Output goes to stderr.
static void tiny_fast_print_profile(void) __attribute__((destructor));
static void tiny_fast_print_profile(void) {
    if (!tiny_profile_enabled()) return;
    if (g_tiny_alloc_hits == 0 && g_tiny_refill_calls == 0) return;
    fprintf(stderr, "\n========== Box Theory Fast Path Profile ==========\n");
    // Counters are uint64_t: print through unsigned long long so they are not
    // truncated on ABIs where unsigned long is 32 bits wide.
    if (g_tiny_alloc_hits > 0) {
        fprintf(stderr, "[ALLOC HIT] count=%llu, avg_cycles=%llu\n",
                (unsigned long long)g_tiny_alloc_hits,
                (unsigned long long)(g_tiny_alloc_cycles / g_tiny_alloc_hits));
    }
    if (g_tiny_refill_calls > 0) {
        fprintf(stderr, "[REFILL] count=%llu, avg_cycles=%llu\n",
                (unsigned long long)g_tiny_refill_calls,
                (unsigned long long)(g_tiny_refill_cycles / g_tiny_refill_calls));
    }
    fprintf(stderr, "===================================================\n\n");
}
// ========== Fast Path: TLS Freelist Pop (3-4 instructions) ==========
// External SFC control (defined in hakmem_tiny_sfc.c)
extern int g_sfc_enabled;
// Allocation fast path (inline for zero-cost)
// Returns: pointer on success, NULL on miss (caller should try refill/slow)
//
// Box 5-NEW Architecture:
// Layer 0: SFC (128-256 slots, high hit rate) [if enabled]
// Layer 1: SLL (unlimited, existing)
// Cascade: SFC miss → try SLL → refill
//
// Assembly (x86-64, optimized):
// mov rax, QWORD PTR g_sfc_head[class_idx] ; SFC: Load head
// test rax, rax ; Check NULL
// jne .sfc_hit ; If not empty, SFC hit!
// mov rax, QWORD PTR g_tls_sll_head[class_idx] ; SLL: Load head
// test rax, rax ; Check NULL
// je .miss ; If empty, miss
// mov rdx, QWORD PTR [rax] ; Load next
// mov QWORD PTR g_tls_sll_head[class_idx], rdx ; Update head
// ret ; Return ptr
// .sfc_hit:
// mov rdx, QWORD PTR [rax] ; Load next
// mov QWORD PTR g_sfc_head[class_idx], rdx ; Update head
// ret
// .miss:
// ; Fall through to refill
//
// Expected: 3-4 instructions on SFC hit, 6-8 on SLL hit
// Pop one block for `class_idx` from the thread-local front caches.
// Lookup order (build without HAKMEM_TINY_FRONT_GATE_BOX):
//   FastCache (classes 0-3, if g_fastcache_enable) → SFC (if enabled) → TLS SLL (if g_tls_sll_enable).
// With HAKMEM_TINY_FRONT_GATE_BOX defined the lookup is delegated to front_gate_try_pop().
// Returns: BASE pointer on a hit (the caller converts BASE → USER via HAK_RET_ALLOC),
//          NULL on a miss (the caller should refill).
static inline void* tiny_alloc_fast_pop(int class_idx) {
    // PRIORITY 1: Bounds check before any TLS array access
    HAK_CHECK_CLASS_IDX(class_idx, "tiny_alloc_fast_pop");
#if !HAKMEM_BUILD_RELEASE
    // Phase 3: Debug counters eliminated in release builds
    atomic_fetch_add(&g_integrity_check_class_bounds, 1);
    // DEBUG: Log class 2 pops (DISABLED for performance)
    static _Atomic uint64_t g_fast_pop_count = 0;
    uint64_t pop_call = atomic_fetch_add(&g_fast_pop_count, 1);
    if (0 && class_idx == 2 && pop_call > 5840 && pop_call < 5900) {
        // uint64_t is not `unsigned long` on every ABI; cast so the argument matches %llu.
        fprintf(stderr, "[FAST_POP_C2] call=%llu cls=%d head=%p count=%u\n",
                (unsigned long long)pop_call, class_idx,
                g_tls_sll_head[class_idx], g_tls_sll_count[class_idx]);
        fflush(stderr);
    }
#endif
    // Phase E1-CORRECT: C7 now has headers, can use fast path
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
    void* out = NULL;
    if (front_gate_try_pop(class_idx, &out)) {
        return out;
    }
    return NULL;
#else
    // Phase 7 Task 3: Profiling overhead removed in release builds
    // In release mode, compiler can completely eliminate profiling code
#if !HAKMEM_BUILD_RELEASE
    // start == 0 means "do not record" (profiling off, or no cycle counter on this target).
    uint64_t start = tiny_profile_enabled() ? tiny_fast_rdtsc() : 0;
#endif
    // Phase 1: Try array stack (FastCache) first for hottest tiny classes (C0-C3)
    // NOTE: FastCache hits return before the profiling update, so they are not part of
    // the [ALLOC HIT] numbers; only SFC and SLL hits are.
    if (__builtin_expect(g_fastcache_enable && class_idx <= 3, 1)) {
        void* fc = fastcache_pop(class_idx);
        if (__builtin_expect(fc != NULL, 1)) {
            // Frontend FastCache hit
            extern unsigned long long g_front_fc_hit[];
            g_front_fc_hit[class_idx]++;
            return fc;
        } else {
            extern unsigned long long g_front_fc_miss[];
            g_front_fc_miss[class_idx]++;
        }
    }
    // Box 5-NEW: Layer 0 - Try SFC first (if enabled)
    // Cache g_sfc_enabled in TLS to avoid global load on every allocation.
    // The value is sampled once per thread, on that thread's first call.
    static __thread int sfc_check_done = 0;
    static __thread int sfc_is_enabled = 0;
    if (__builtin_expect(!sfc_check_done, 0)) {
        sfc_is_enabled = g_sfc_enabled;
        sfc_check_done = 1;
    }
    if (__builtin_expect(sfc_is_enabled, 1)) {
        void* base = sfc_alloc(class_idx);
        if (__builtin_expect(base != NULL, 1)) {
            // Front Gate: SFC hit (Layer 0)
            extern unsigned long long g_front_sfc_hit[];
            g_front_sfc_hit[class_idx]++;
#if !HAKMEM_BUILD_RELEASE
            if (start) {
                g_tiny_alloc_cycles += (tiny_fast_rdtsc() - start);
                g_tiny_alloc_hits++;
            }
#endif
            // FIX #16: Return BASE pointer (not USER)
            // Caller (tiny_alloc_fast) will call HAK_RET_ALLOC → tiny_region_id_write_header
            // which does the BASE → USER conversion. Double conversion was causing corruption!
            return base;
        }
        // SFC miss → try SLL (Layer 1)
    }
    // Box Boundary: Layer 1 - pop the head of the TLS SLL freelist (can be disabled via env)
    extern int g_tls_sll_enable; // set at init via HAKMEM_TINY_TLS_SLL
    if (__builtin_expect(g_tls_sll_enable, 1)) {
        // Use Box TLS-SLL API (C7-safe pop)
        // CRITICAL: Pop FIRST, do NOT read g_tls_sll_head directly (race condition!)
        // Reading head before pop causes stale read → rbp=0xa0 SEGV
        void* base = NULL;
        if (tls_sll_pop(class_idx, &base)) {
            // Front Gate: SLL hit (fast path 3 instructions)
            extern unsigned long long g_front_sll_hit[];
            g_front_sll_hit[class_idx]++;
#if HAKMEM_DEBUG_COUNTERS
            // Track TLS freelist hits (compile-time gated, zero runtime cost when disabled)
            g_free_via_tls_sll[class_idx]++;
#endif
#if !HAKMEM_BUILD_RELEASE
            // Debug: Track profiling (release builds skip this overhead)
            if (start) {
                g_tiny_alloc_cycles += (tiny_fast_rdtsc() - start);
                g_tiny_alloc_hits++;
            }
#endif
            // FIX #16: Return BASE pointer (not USER)
            // Caller (tiny_alloc_fast) will call HAK_RET_ALLOC → tiny_region_id_write_header
            // which does the BASE → USER conversion. Double conversion was causing corruption!
            return base;
        }
    }
    // Fast path miss → NULL (caller should refill)
    return NULL;
#endif
}
// ========== Cascade Refill: SFC ← SLL (Box Theory boundary) ==========
// Cascade refill: Transfer blocks from SLL to SFC (one-way, safe)
// Returns: number of blocks transferred
//
// Contract:
// - Transfer ownership: SLL → SFC
// - No circular dependency: one-way only
// - Boundary clear: SLL pop → SFC push
// - Fallback safe: if SFC full, stop (no overflow)
// Env-driven cascade percentage (0-100), default 50%.
// Reads HAKMEM_SFC_CASCADE_PCT once and caches the result for the process lifetime.
// - unset, empty, or not starting with a number → 50 (the default)
// - numeric values are clamped to [0, 100]; trailing characters after the number
//   (e.g. "75%") are ignored
// Returns: the cached percentage.
static inline int sfc_cascade_pct(void) {
    static int pct = -1; // -1: environment not consulted yet
    if (__builtin_expect(pct == -1, 0)) {
        int v = 50;
        const char* e = getenv("HAKMEM_SFC_CASCADE_PCT");
        if (e && *e) {
            // strtol instead of atoi: detects "no digits" and saturates on overflow
            // (atoi has undefined behaviour for out-of-range input).
            char* end = NULL;
            long parsed = strtol(e, &end, 10);
            if (end != e) {
                if (parsed < 0) parsed = 0;
                if (parsed > 100) parsed = 100;
                v = (int)parsed;
            }
        }
        pct = v;
    }
    return pct;
}
// Move blocks from the TLS SLL (Layer 1) into the SFC (Layer 0) for `class_idx`.
// The number of blocks attempted is target_count * sfc_cascade_pct() / 100; if that
// works out to <= 0, target_count / 2 is used instead.
// The transfer stops early when the SFC reaches g_sfc_capacity[class_idx] or the SLL
// runs out of blocks.
// Returns: number of blocks actually transferred.
static inline int sfc_refill_from_sll(int class_idx, int target_count) {
    // PRIORITY 1: Bounds check
    HAK_CHECK_CLASS_IDX(class_idx, "sfc_refill_from_sll");
#if !HAKMEM_BUILD_RELEASE
    // Debug-only counter: gated like the same counter in tiny_alloc_fast_pop(),
    // so release builds do not pay for an atomic RMW here.
    atomic_fetch_add(&g_integrity_check_class_bounds, 1);
#endif
    int transferred = 0;
    uint32_t cap = g_sfc_capacity[class_idx];
    // Adjust target based on cascade percentage
    int pct = sfc_cascade_pct();
    int want = (target_count * pct) / 100;
    if (want <= 0) want = target_count / 2; // safety fallback
    while (transferred < want && g_tls_sll_count[class_idx] > 0) {
        // Check SFC capacity before transfer
        if (g_sfc_count[class_idx] >= cap) {
            break; // SFC full, stop
        }
        // Pop from SLL (Layer 1) using Box TLS-SLL API (C7-safe)
        void* ptr = NULL;
        if (!tls_sll_pop(class_idx, &ptr)) {
            break; // SLL empty
        }
        // Push to SFC (Layer 0): link the block in front of the current head (header-aware)
        tiny_next_write(class_idx, ptr, g_sfc_head[class_idx]);
        g_sfc_head[class_idx] = ptr;
        g_sfc_count[class_idx]++;
        transferred++;
    }
    return transferred;
}
// ========== Refill Path: Backend Integration ==========
// Refill TLS freelist from backend (SuperSlab/ACE/Learning layer)
// Returns: number of blocks refilled
//
// Box 5-NEW Architecture:
// SFC enabled: SuperSlab → SLL → SFC (cascade)
// SFC disabled: SuperSlab → SLL (direct, old path)
//
// This integrates with existing HAKMEM infrastructure:
// - SuperSlab provides memory chunks
// - ACE provides adaptive capacity learning
// - L25 provides mid-large integration
//
// Refill count is tunable via HAKMEM_TINY_REFILL_COUNT (default: 16)
// - Smaller count (8-16): better for diverse workloads, faster warmup
// - Larger count (64-128): better for homogeneous workloads, fewer refills
static inline int tiny_alloc_fast_refill(int class_idx) {
// Phase E1-CORRECT: C7 now has headers, can use refill
// Phase 7 Task 3: Profiling overhead removed in release builds
// In release mode, compiler can completely eliminate profiling code
#if !HAKMEM_BUILD_RELEASE
uint64_t start = tiny_profile_enabled() ? tiny_fast_rdtsc() : 0;
#endif
// Phase 2b: Check available capacity before refill
int available_capacity = get_available_capacity(class_idx);
if (available_capacity <= 0) {
// Cache is full, don't refill
return 0;
}
// Phase 7 Task 3: Simplified refill count (cached per-class in TLS)
// Previous: Complex precedence logic on every miss (5-10 cycles overhead)
// Now: Simple TLS cache lookup (1-2 cycles)
static __thread int s_refill_count[TINY_NUM_CLASSES] = {0};
// Simple adaptive booster: bump per-class refill size when refills are frequent.
static __thread uint8_t s_refill_calls[TINY_NUM_CLASSES] = {0};
int cnt = s_refill_count[class_idx];
if (__builtin_expect(cnt == 0, 0)) {
// First miss: Initialize from globals (parsed at init time)
int v = HAKMEM_TINY_REFILL_DEFAULT; // Default from hakmem_build_flags.h
// Precedence: per-class > hot/mid > global
if (g_refill_count_class[class_idx] > 0) {
v = g_refill_count_class[class_idx];
} else if (class_idx <= 3 && g_refill_count_hot > 0) {
v = g_refill_count_hot;
} else if (class_idx >= 4 && g_refill_count_mid > 0) {
v = g_refill_count_mid;
} else if (g_refill_count_global > 0) {
v = g_refill_count_global;
}
// Clamp to sane range (min: 8, max: 256)
if (v < 8) v = 8; // Minimum: avoid thrashing
if (v > 256) v = 256; // Maximum: avoid excessive TLS memory
s_refill_count[class_idx] = v;
cnt = v;
}
// Phase 2b: Clamp refill count to available capacity
if (cnt > available_capacity) {
cnt = available_capacity;
}
#if HAKMEM_DEBUG_COUNTERS
// Track refill calls (compile-time gated)
g_rf_total_calls[class_idx]++;
#endif
// Box Boundary: Delegate to Backend (Box 3: SuperSlab)
// Refill Dispatch: Standard (ss_refill_fc_fill) vs Legacy SLL (A/B only)
// Standard: Enabled by FRONT_DIRECT=1, REFILL_BATCH=1, or P0_DIRECT_FC_ALL=1
// Legacy: Fallback for compatibility (will be deprecated)
int refilled = 0;
// NEW: Front-Direct refill control (A/B toggle)
static __thread int s_use_front_direct = -1;
if (__builtin_expect(s_use_front_direct == -1, 0)) {
// Check multiple ENV flags (any one enables Front-Direct)
const char* e1 = getenv("HAKMEM_TINY_FRONT_DIRECT");
const char* e2 = getenv("HAKMEM_TINY_P0_DIRECT_FC_ALL");
const char* e3 = getenv("HAKMEM_TINY_REFILL_BATCH");
s_use_front_direct = ((e1 && *e1 && *e1 != '0') ||
(e2 && *e2 && *e2 != '0') ||
(e3 && *e3 && *e3 != '0')) ? 1 : 0;
}
// Refill dispatch
if (s_use_front_direct) {
// NEW: Direct SS→FC (bypasses SLL)
refilled = ss_refill_fc_fill(class_idx, cnt);
} else {
// Legacy: SS→SLL→FC (via batch or generic)
#if HAKMEM_TINY_P0_BATCH_REFILL
refilled = sll_refill_batch_from_ss(class_idx, cnt);
#else
refilled = sll_refill_small_from_ss(class_idx, cnt);
#endif
}
// Lightweight adaptation: if refills keep happening, increase per-class refill.
// Focus on class 7 (1024B) to reduce mmap/refill frequency under Tiny-heavy loads.
if (refilled > 0) {
uint8_t c = ++s_refill_calls[class_idx];
if (class_idx == 7) {
// Every 4 refills, increase target by +16 up to 128 (unless overridden).
if ((c & 0x03u) == 0) {
int target = s_refill_count[class_idx];
if (target < 128) {
target += 16;
if (target > 128) target = 128;
s_refill_count[class_idx] = target;
}
}
}
} else {
// No refill performed (capacity full): slowly decay the counter.
if (s_refill_calls[class_idx] > 0) s_refill_calls[class_idx]--;
}
// Phase 2b: Track refill and adapt cache size
if (refilled > 0) {
track_refill_for_adaptation(class_idx);
}
// Box 5-NEW: Cascade refill SFC ← SLL (opt-in via HAKMEM_TINY_SFC_CASCADE, off by default)
// NEW: Default OFF, enable via HAKMEM_TINY_SFC_CASCADE=1
// Skip entirely when Front-Direct is active (direct SS→FC path)
static __thread int sfc_cascade_enabled = -1;
if (__builtin_expect(sfc_cascade_enabled == -1, 0)) {
// Front-Direct bypasses SLL, so SFC cascade is pointless
if (s_use_front_direct) {
sfc_cascade_enabled = 0;
} else {
// Check ENV flag (default: OFF)
const char* e = getenv("HAKMEM_TINY_SFC_CASCADE");
sfc_cascade_enabled = (e && *e && *e != '0') ? 1 : 0;
}
}
// Only cascade if explicitly enabled AND we have refilled blocks in SLL
if (sfc_cascade_enabled && g_sfc_enabled && refilled > 0) {
// Skip SFC cascade for class5 when dedicated hotpath is enabled
if (g_tiny_hotpath_class5 && class_idx == 5) {
// no-op: keep refilled blocks in TLS List/SLL
} else {
// Transfer half of refilled blocks to SFC (keep half in SLL for future)
int sfc_target = refilled / 2;
if (sfc_target > 0) {
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
front_gate_after_refill(class_idx, refilled);
#else
int transferred = sfc_refill_from_sll(class_idx, sfc_target);
(void)transferred; // Unused, but could track stats
#endif
}
}
}
#if !HAKMEM_BUILD_RELEASE
// Debug: Track profiling (release builds skip this overhead)
if (start) {
g_tiny_refill_cycles += (tiny_fast_rdtsc() - start);
g_tiny_refill_calls++;
}
#endif
return refilled;
}
// Pop one block from the generic TLS front for `class_idx` (helper for tiny_alloc_fast).
// Classes 0-3 use tiny_alloc_fast_pop() (or the experimental inline SLL pop when
// HAKMEM_TINY_INLINE_SLL is set); classes >= 4 use the Box TLS-SLL pop directly.
// Returns: BASE pointer on a hit, NULL on a miss.
static inline void* tiny_alloc_fast_front_pop(int class_idx) {
    void* ptr = NULL;
    if (class_idx <= 3) {
#if HAKMEM_TINY_INLINE_SLL
        // Experimental: Inline SLL pop (A/B only, requires HAKMEM_TINY_INLINE_SLL=1)
        TINY_ALLOC_FAST_POP_INLINE(class_idx, ptr);
#else
        // Default: Safe Box API (Box TLS-SLL) for all standard builds
        ptr = tiny_alloc_fast_pop(class_idx);
#endif
    } else {
        void* base = NULL;
        if (tls_sll_pop(class_idx, &base)) {
            ptr = base;
        }
    }
    return ptr;
}

// ========== Combined Fast Path (Alloc + Refill) ==========
// Complete fast path allocation (inline for zero-cost)
// Returns: pointer on success, NULL on failure (slow path failed, or size is not a Tiny size)
//
// Flow:
// 1. Size → class index; negative class → return NULL
// 2. Optional experimental fronts (C2/C3 simple front, TinyHeapV2), both only in
//    HAKMEM_TINY_HEADER_CLASSIDX builds
// 3. Class-5 dedicated hot path (if g_tiny_hotpath_class5)
// 4. Generic front pop (skipped when SLL is disabled or Front-Direct is active)
// 5. Miss → refill-and-take (Front-Direct: SS→FC, Legacy: TLS List/SLL)
// 6. Still empty → backend refill, retry the front pop
// 7. Everything missed → slow path
//
// Every successful path hands a BASE pointer to HAK_RET_ALLOC, which is expected to
// write the header and return the USER pointer from this function.
//
// NOTE(review): Front-Direct here is enabled by HAKMEM_TINY_FRONT_DIRECT only, while
// tiny_alloc_fast_refill() also honours HAKMEM_TINY_P0_DIRECT_FC_ALL and
// HAKMEM_TINY_REFILL_BATCH; confirm the two toggles are meant to differ.
// NOTE(review): the Front-Direct branch calls fastcache_pop() for any class, whereas
// tiny_alloc_fast_pop() limits FastCache to class_idx <= 3; confirm fastcache_pop()
// accepts classes >= 4.
//
// Example usage:
// void* ptr = tiny_alloc_fast(64);
// if (!ptr) {
// // OOM handling
// }
static inline void* tiny_alloc_fast(size_t size) {
#if !HAKMEM_BUILD_RELEASE
    // Phase 3: Debug counters eliminated in release builds
    static _Atomic uint64_t alloc_call_count = 0;
    uint64_t call_num = atomic_fetch_add(&alloc_call_count, 1);
#endif
    // 1. Size → class index (inline, fast)
    int class_idx = hak_tiny_size_to_class(size);
    if (__builtin_expect(class_idx < 0, 0)) {
        return NULL; // Size > 1KB, not Tiny
    }
#if !HAKMEM_BUILD_RELEASE
    // Phase 3: Debug checks eliminated in release builds
    // CRITICAL: Bounds check to catch corruption
    if (__builtin_expect(class_idx >= TINY_NUM_CLASSES, 0)) {
        // call_num is uint64_t: cast so the argument matches %llu on every ABI.
        fprintf(stderr, "[TINY_ALLOC_FAST] FATAL: class_idx=%d out of bounds! size=%zu call=%llu\n",
                class_idx, size, (unsigned long long)call_num);
        fflush(stderr);
        abort();
    }
    // Debug logging (DISABLED for performance)
    if (0 && call_num > 14250 && call_num < 14280) {
        fprintf(stderr, "[TINY_ALLOC] call=%llu size=%zu class=%d sll_head[%d]=%p count=%u\n",
                (unsigned long long)call_num, size, class_idx, class_idx,
                g_tls_sll_head[class_idx], g_tls_sll_count[class_idx]);
        fflush(stderr);
    }
#endif
    ROUTE_BEGIN(class_idx);
    void* ptr = NULL;
    const int hot_c5 = (g_tiny_hotpath_class5 && class_idx == 5);
#ifdef HAKMEM_TINY_HEADER_CLASSIDX
    // Phase B: Ultra-simple front for C2/C3 (128B/256B)
    // ENV-gated: HAKMEM_TINY_FRONT_C23_SIMPLE=1
    // Target: 15-20M ops/s (vs current 8-9M ops/s)
    if (tiny_front_c23_enabled() && (class_idx == 2 || class_idx == 3)) {
        void* c23_ptr = tiny_front_c23_alloc(size, class_idx);
        if (c23_ptr) {
            HAK_RET_ALLOC(class_idx, c23_ptr);
        }
        // Fall through to existing path if C23 path failed (NULL)
    }
    // Phase 13-A: TinyHeapV2 (per-thread magazine, experimental)
    // ENV-gated: HAKMEM_TINY_HEAP_V2=1
    // Targets class 0-3 (8-64B) only, falls back to existing path if NULL
    // PERF: Pass class_idx directly to avoid redundant size→class conversion
    // Kept inside the HEADER_CLASSIDX guard because front/tiny_heap_v2.h is only
    // included by this file under that guard.
    if (__builtin_expect(tiny_heap_v2_enabled(), 0) && class_idx <= 3) {
        void* base = tiny_heap_v2_alloc_by_class(class_idx);
        if (base) {
            HAK_RET_ALLOC(class_idx, base); // Header write + return USER pointer
        }
    }
#endif
    // NEW: Front-Direct/SLL-OFF bypass control (TLS cached, lazy init)
    static __thread int s_front_direct_alloc = -1;
    if (__builtin_expect(s_front_direct_alloc == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_FRONT_DIRECT");
        s_front_direct_alloc = (e && *e && *e != '0') ? 1 : 0;
    }
    if (__builtin_expect(hot_c5, 0)) {
        // class5: dedicated shortest path (never goes through the generic front)
        void* p = tiny_class5_minirefill_take();
        if (p) {
            HAK_RET_ALLOC(class_idx, p);
        }
        int refilled = tiny_alloc_fast_refill(class_idx);
        if (__builtin_expect(refilled > 0, 1)) {
            p = tiny_class5_minirefill_take();
            if (p) {
                HAK_RET_ALLOC(class_idx, p);
            }
        }
        // Go to the slow path (still bypassing the generic front)
        ptr = hak_tiny_alloc_slow(size, class_idx);
        if (ptr) {
            HAK_RET_ALLOC(class_idx, ptr);
        }
        return ptr; // NULL if OOM
    }
    // Generic front (FastCache/SFC/SLL)
    // Respect SLL global toggle AND Front-Direct mode; when SLL is disabled or
    // Front-Direct is active, skip the TLS front pop entirely.
    const int use_tls_front = (g_tls_sll_enable && !s_front_direct_alloc);
    if (__builtin_expect(use_tls_front, 1)) {
        ptr = tiny_alloc_fast_front_pop(class_idx);
    } else {
        ptr = NULL; // SLL disabled OR Front-Direct active → bypass SLL
    }
    if (__builtin_expect(ptr != NULL, 1)) {
        HAK_RET_ALLOC(class_idx, ptr);
    }
    // Generic: Refill and take (Front-Direct vs Legacy)
    if (s_front_direct_alloc) {
        // Front-Direct: Direct SS→FC refill (bypasses SLL/TLS List)
        int refilled_fc = tiny_alloc_fast_refill(class_idx);
        if (__builtin_expect(refilled_fc > 0, 1)) {
            void* fc_ptr = fastcache_pop(class_idx);
            if (fc_ptr) {
                HAK_RET_ALLOC(class_idx, fc_ptr);
            }
        }
    } else {
        // Legacy: Refill to TLS List/SLL
        extern __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
        void* took = tiny_fast_refill_and_take(class_idx, &g_tls_lists[class_idx]);
        if (took) {
            HAK_RET_ALLOC(class_idx, took);
        }
    }
    // Retry after a backend refill
    {
        int refilled = tiny_alloc_fast_refill(class_idx);
        if (__builtin_expect(refilled > 0, 1)) {
            // Skip SLL retry if Front-Direct OR SLL disabled
            if (__builtin_expect(use_tls_front, 1)) {
                ptr = tiny_alloc_fast_front_pop(class_idx);
            } else {
                ptr = NULL; // SLL disabled OR Front-Direct active → bypass SLL
            }
            if (ptr) {
                HAK_RET_ALLOC(class_idx, ptr);
            }
        }
    }
    // Refill failure or still empty → slow path (OOM or new SuperSlab)
    // Box Boundary: Delegate to Slow Path (Box 3 backend)
    ptr = hak_tiny_alloc_slow(size, class_idx);
    if (ptr) {
        HAK_RET_ALLOC(class_idx, ptr);
    }
    return ptr; // NULL if OOM
}
// ========== Push to TLS Freelist (for free path) ==========
// Helper for Box 6 (Free Fast Path): hand a freed block back to the thread-local front.
// With HAKMEM_TINY_FRONT_GATE_BOX the block goes to front_gate_push_tls(); otherwise it
// is pushed onto the TLS SLL with no capacity limit (UINT32_MAX).
//
// Invariant: ptr must belong to the current thread. No ownership check is done here;
// the caller (Box 6) is responsible for it.
// A failed SLL push is only reported (to stderr) in non-release builds; release builds
// ignore it silently.
static inline void tiny_alloc_fast_push(int class_idx, void* ptr) {
#ifdef HAKMEM_TINY_FRONT_GATE_BOX
    front_gate_push_tls(class_idx, ptr);
#else
    // Box Boundary: Box TLS-SLL API (C7-safe); the helper imposes no capacity limit.
    const uint32_t no_limit = UINT32_MAX;
    const int pushed = tls_sll_push(class_idx, ptr, no_limit) ? 1 : 0;
#if !HAKMEM_BUILD_RELEASE
    if (!pushed) {
        // C7 rejected or SLL somehow full (should not happen)
        fprintf(stderr, "[WARN] tls_sll_push failed in tiny_alloc_fast_push cls=%d ptr=%p\n",
                class_idx, ptr);
    }
#else
    (void)pushed; // release builds: failure is a no-op (caller expects success)
#endif
#endif
}
// ========== Statistics & Diagnostics ==========
// Get TLS freelist stats (for debugging/profiling)
typedef struct {
int class_idx;
void* head;
uint32_t count;
} TinyAllocFastStats;
static inline TinyAllocFastStats tiny_alloc_fast_stats(int class_idx) {
TinyAllocFastStats stats = {
.class_idx = class_idx,
.head = g_tls_sll_head[class_idx],
.count = g_tls_sll_count[class_idx]
};
return stats;
}
// Reset TLS freelist (for testing/benchmarking)
// WARNING: This leaks memory! Only use in controlled test environments.
static inline void tiny_alloc_fast_reset(int class_idx) {
g_tls_sll_head[class_idx] = NULL;
g_tls_sll_count[class_idx] = 0;
}
// ========== Performance Notes ==========
//
// Expected metrics (based on System tcache & HAKX +171% results):
// - Fast path hit rate: 95%+ (workload dependent)
// - Fast path latency: 3-4 instructions (1-2 cycles on modern CPUs)
// - Miss penalty: ~20-50 instructions (refill from SuperSlab)
// - Throughput improvement: +10-25% vs current multi-layer design
//
// Key optimizations:
// 1. `__builtin_expect` for branch prediction (hot path first)
// 2. `static inline` for zero-cost abstraction
// 3. TLS variables (no atomic ops, no locks)
// 4. Minimal work in fast path (defer stats/accounting to backend)
//
// Comparison with current design:
// - Current: 20+ instructions (Magazine → SuperSlab → ACE → ...)
// - New: 3-4 instructions (TLS freelist pop only)
// - Reduction: -80% instructions in hot path
//
// Inspired by:
// - System tcache (glibc malloc) - 3-4 instruction fast path
// - HAKX Mid-Large (+171%) - "Simple Front + Smart Back"
// - Box Theory - Clear boundaries, minimal coupling