diff --git a/core/hakmem_tiny_refill_p0.inc.h b/core/hakmem_tiny_refill_p0.inc.h
index a6500daf..820ea5b8 100644
--- a/core/hakmem_tiny_refill_p0.inc.h
+++ b/core/hakmem_tiny_refill_p0.inc.h
@@ -58,6 +58,13 @@ static inline int sll_refill_batch_from_ss(int class_idx, int max_take) {
     }
 
     TinyTLSSlab* tls = &g_tls_slabs[class_idx];
+
+    // Phase 3c L1D Opt: Prefetch SuperSlab hot fields early
+    if (tls->ss) {
+        __builtin_prefetch(&tls->ss->slab_bitmap, 0, 3);
+        __builtin_prefetch(&tls->ss->total_active_blocks, 0, 3);
+    }
+
     uint32_t active_before = 0;
     if (tls->ss) {
         active_before = atomic_load_explicit(&tls->ss->total_active_blocks, memory_order_relaxed);
@@ -77,6 +84,9 @@ static inline int sll_refill_batch_from_ss(int class_idx, int max_take) {
         return 0;
     }
 
+    // Phase 3c L1D Opt: Prefetch SlabMeta hot fields (freelist, used, capacity)
+    __builtin_prefetch(&meta->freelist, 0, 3);
+
 #if HAKMEM_INTEGRITY_LEVEL >= 4
     uint8_t* initial_slab_base = tls->slab_base ? tls->slab_base
                                                 : tiny_slab_base_for(tls->ss, tls->slab_idx);
@@ -224,6 +234,12 @@ static inline int sll_refill_batch_from_ss(int class_idx, int max_take) {
                            &g_tls_sll_count[class_idx]);
         ss_active_add(tls->ss, from_freelist);
         meta->used = (uint16_t)((uint32_t)meta->used + from_freelist);
+
+        // Phase 3c L1D Opt: Prefetch next freelist entry after refill
+        if (meta->freelist) {
+            __builtin_prefetch(meta->freelist, 0, 3);
+        }
+
 #if HAKMEM_DEBUG_COUNTERS
         extern unsigned long long g_rf_freelist_items[];
         g_rf_freelist_items[class_idx] += from_freelist;
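For reference, __builtin_prefetch(addr, rw, locality) is the GCC/Clang builtin used in the hunks above: rw=0 requests a read prefetch, and locality=3 asks for the line to be kept in all cache levels. Below is a minimal sketch of the pointer-chasing pattern the last hunk targets, assuming the usual intrusive freelist layout (next pointer stored in the first word of each free block); FreeNode and freelist_pop_prefetch are illustrative names, not HAKMEM API.

#include <stddef.h>

// Hypothetical intrusive freelist node: the next pointer lives in the
// first word of each free block.
typedef struct FreeNode { struct FreeNode* next; } FreeNode;

// Pop one block and prefetch the following node, so the next pop's
// pointer chase hits L1D instead of stalling on a cache miss.
static inline void* freelist_pop_prefetch(FreeNode** head) {
    FreeNode* node = *head;
    if (node == NULL) return NULL;
    *head = node->next;
    if (node->next) {
        __builtin_prefetch(node->next, /*rw=*/0, /*locality=*/3);
    }
    return node;
}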
diff --git a/core/tiny_alloc_fast.inc.h b/core/tiny_alloc_fast.inc.h
index a54bb60c..91584965 100644
--- a/core/tiny_alloc_fast.inc.h
+++ b/core/tiny_alloc_fast.inc.h
@@ -27,14 +27,11 @@
 #endif
 #include "hakmem_tiny_integrity.h"     // PRIORITY 1-4: Corruption detection
 #ifdef HAKMEM_TINY_HEADER_CLASSIDX
-#include "front/tiny_front_c23.h"      // Phase B: Ultra-simple C2/C3 front
-#include "front/tiny_ring_cache.h"     // Phase 21-1: Ring cache (C2/C3 array-based TLS cache)
-#include "front/tiny_unified_cache.h"  // Phase 23: Unified frontend cache (tcache-style, all classes)
-#include "front/tiny_heap_v2.h"        // Phase 13-A: TinyHeapV2 magazine front
-#include "front/tiny_ultra_hot.h"      // Phase 14: TinyUltraHot C1/C2 ultra-fast path
+// Ring Cache and Unified Cache removed (A/B test: OFF is faster)
 #endif
 #include "box/front_metrics_box.h"     // Phase 19-1: Frontend layer metrics
 #include "hakmem_tiny_lazy_init.inc.h" // Phase 22: Lazy per-class initialization
+#include "box/tiny_sizeclass_hist_box.h" // Phase 3-4: Tiny size class histogram (ACE learning)
 #include   // Phase 7 Task 2: Aggressive inline TLS cache access
@@ -86,31 +83,10 @@
 extern int sll_refill_batch_from_ss(int class_idx, int max_take);
 #else
 extern int sll_refill_small_from_ss(int class_idx, int max_take);
 #endif
-// NEW: Direct SS→FC refill (bypasses SLL)
-extern int ss_refill_fc_fill(int class_idx, int want);
 extern void* hak_tiny_alloc_slow(size_t size, int class_idx);
 extern int hak_tiny_size_to_class(size_t size);
 extern int tiny_refill_failfast_level(void);
 extern const size_t g_tiny_class_sizes[];
-// Hot-class toggle: class5 (256B) dedicated TLS fast path
-extern int g_tiny_hotpath_class5;
-
-// Minimal class5 refill helper: fixed, branch-light refill into TLS List, then take one
-// Preconditions: class_idx==5 and g_tiny_hotpath_class5==1
-static inline void* tiny_class5_minirefill_take(void) {
-    extern __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
-    TinyTLSList* tls5 = &g_tls_lists[5];
-    // Fast pop if available
-    void* base = tls_list_pop(tls5, 5);
-    if (base) {
-        // FIX #16: Return BASE pointer (not USER)
-        // Caller will apply HAK_RET_ALLOC, which does the BASE → USER conversion
-        return base;
-    }
-    // Robust refill via the generic helper (header-aware, bounds-checked)
-    return tiny_fast_refill_and_take(5, tls5);
-}
-
 // Global Front refill config (parsed at init; defined in hakmem_tiny.c)
 extern int g_refill_count_global;
 extern int g_refill_count_hot;
@@ -274,7 +250,8 @@ static inline void* tiny_alloc_fast_pop(int class_idx) {
     }
 
     // Box Boundary: Layer 1 - pop the head of the TLS SLL freelist (can be disabled via env)
-    extern int g_tls_sll_enable; // set at init via HAKMEM_TINY_TLS_SLL
+    // Note: this is tiny_alloc_fast_pop(), not tiny_alloc_fast(), so the cached copy is out of scope and the global must be read directly
+    extern int g_tls_sll_enable;
     if (__builtin_expect(g_tls_sll_enable, 1)) {
         // Use Box TLS-SLL API (C7-safe pop)
         // CRITICAL: Pop FIRST, do NOT read g_tls_sll_head directly (race condition!)
@@ -334,7 +311,9 @@ static inline int sfc_cascade_pct(void) {
 static inline int sfc_refill_from_sll(int class_idx, int target_count) {
     // PRIORITY 1: Bounds check
     HAK_CHECK_CLASS_IDX(class_idx, "sfc_refill_from_sll");
+#if !HAKMEM_BUILD_RELEASE
     atomic_fetch_add(&g_integrity_check_class_bounds, 1);
+#endif
 
     int transferred = 0;
     uint32_t cap = g_sfc_capacity[class_idx];
@@ -446,30 +425,13 @@ static inline int tiny_alloc_fast_refill(int class_idx) {
     // Legacy: Fallback for compatibility (will be deprecated)
     int refilled = 0;
 
-    // NEW: Front-Direct refill control (A/B toggle)
-    static __thread int s_use_front_direct = -1;
-    if (__builtin_expect(s_use_front_direct == -1, 0)) {
-        // Check multiple ENV flags (any one enables Front-Direct)
-        const char* e1 = getenv("HAKMEM_TINY_FRONT_DIRECT");
-        const char* e2 = getenv("HAKMEM_TINY_P0_DIRECT_FC_ALL");
-        const char* e3 = getenv("HAKMEM_TINY_REFILL_BATCH");
-        s_use_front_direct = ((e1 && *e1 && *e1 != '0') ||
-                              (e2 && *e2 && *e2 != '0') ||
-                              (e3 && *e3 && *e3 != '0')) ? 1 : 0;
-    }
-
-    // Refill dispatch
-    if (s_use_front_direct) {
-        // NEW: Direct SS→FC (bypasses SLL)
-        refilled = ss_refill_fc_fill(class_idx, cnt);
-    } else {
-        // Legacy: SS→SLL→FC (via batch or generic)
+    // The Front-Direct A/B experiment is not supported at the current HEAD;
+    // always take the legacy path (SS→SLL→FC).
 #if HAKMEM_TINY_P0_BATCH_REFILL
-        refilled = sll_refill_batch_from_ss(class_idx, cnt);
+    refilled = sll_refill_batch_from_ss(class_idx, cnt);
 #else
-        refilled = sll_refill_small_from_ss(class_idx, cnt);
+    refilled = sll_refill_small_from_ss(class_idx, cnt);
 #endif
-    }
 
     // Lightweight adaptation: if refills keep happening, increase per-class refill.
     // Focus on class 7 (1024B) to reduce mmap/refill frequency under Tiny-heavy loads.
@@ -497,26 +459,15 @@ static inline int tiny_alloc_fast_refill(int class_idx) {
     }
 
     // Box 5-NEW: Cascade refill SFC ← SLL (opt-in via HAKMEM_TINY_SFC_CASCADE, off by default)
-    // NEW: Default OFF, enable via HAKMEM_TINY_SFC_CASCADE=1
-    // Skip entirely when Front-Direct is active (direct SS→FC path)
     static __thread int sfc_cascade_enabled = -1;
     if (__builtin_expect(sfc_cascade_enabled == -1, 0)) {
-        // Front-Direct bypasses SLL, so SFC cascade is pointless
-        if (s_use_front_direct) {
-            sfc_cascade_enabled = 0;
-        } else {
-            // Check ENV flag (default: OFF)
-            const char* e = getenv("HAKMEM_TINY_SFC_CASCADE");
-            sfc_cascade_enabled = (e && *e && *e != '0') ? 1 : 0;
-        }
+        // Check ENV flag (default: OFF)
+        const char* e = getenv("HAKMEM_TINY_SFC_CASCADE");
+        sfc_cascade_enabled = (e && *e && *e != '0') ? 1 : 0;
     }
 
     // Only cascade if explicitly enabled AND we have refilled blocks in SLL
     if (sfc_cascade_enabled && g_sfc_enabled && refilled > 0) {
-        // Skip SFC cascade for class5 when dedicated hotpath is enabled
-        if (g_tiny_hotpath_class5 && class_idx == 5) {
-            // no-op: keep refilled blocks in TLS List/SLL
-        } else {
         // Transfer half of refilled blocks to SFC (keep half in SLL for future)
         int sfc_target = refilled / 2;
         if (sfc_target > 0) {
@@ -527,7 +478,6 @@ static inline int tiny_alloc_fast_refill(int class_idx) {
             (void)transferred; // Unused, but could track stats
 #endif
         }
-        }
     }
 
 #if !HAKMEM_BUILD_RELEASE
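The static __thread flag idiom retained in the cascade hunk above parses getenv() once per thread and caches the result, so the steady-state cost on the hot path is a single well-predicted branch rather than a libc call. A standalone sketch of the same idiom (HAKMEM_TINY_SFC_CASCADE is the real variable; the helper name is made up):

#include <stdlib.h>

// Returns 1 if HAKMEM_TINY_SFC_CASCADE is set to a non-empty value other
// than "0". Parsed once per thread; -1 marks "not yet parsed".
static inline int sfc_cascade_env_enabled(void) {
    static __thread int cached = -1;
    if (__builtin_expect(cached == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_SFC_CASCADE");
        cached = (e && *e && *e != '0') ? 1 : 0;
    }
    return cached;
}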
@@ -574,9 +524,22 @@ static inline void* tiny_alloc_fast(size_t size) {
         return NULL; // Size > 1KB, not Tiny
     }
 
+    // Phase 3c L1D Opt: Prefetch TLS cache head early
+    __builtin_prefetch(&g_tls_sll_head[class_idx], 0, 3);
+    __builtin_prefetch(&g_tls_sll_count[class_idx], 0, 3);
+
     // Phase 22: Lazy per-class init (on first use)
     lazy_init_class(class_idx);
 
+    // Phase 3-4: Record allocation for ACE Profile learning
+    // TLS increment only (no atomic operation, amortized flush at threshold)
+    tiny_sizeclass_hist_hit(class_idx);
+
+    // P0.1: Cache g_tls_sll_enable once (Phase 3-4 instruction reduction)
+    // Eliminates redundant global variable reads (2-3 instructions saved)
+    extern int g_tls_sll_enable;
+    const int sll_enabled = g_tls_sll_enable;
+
 #if !HAKMEM_BUILD_RELEASE
     // Phase 3: Debug checks eliminated in release builds
     // CRITICAL: Bounds check to catch corruption
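tiny_sizeclass_hist_hit() is described above as a plain TLS increment with an amortized flush at a threshold. A sketch of how such a histogram box can keep atomics off the fast path; every name and the flush threshold here are assumptions, not the real tiny_sizeclass_hist_box API:

#include <stdatomic.h>
#include <stdint.h>

#define NUM_CLASSES      8     // assumed Tiny class count
#define FLUSH_THRESHOLD  1024  // assumed amortization period

// Shared histogram, only touched on flush.
static _Atomic uint64_t g_hist[NUM_CLASSES];

// Per-thread counters: a non-atomic increment on the alloc fast path.
static __thread uint32_t t_hist[NUM_CLASSES];
static __thread uint32_t t_pending;

static inline void sizeclass_hist_hit(int class_idx) {
    t_hist[class_idx]++;
    if (__builtin_expect(++t_pending >= FLUSH_THRESHOLD, 0)) {
        // Threshold reached: drain the TLS counters into the shared
        // histogram with relaxed atomics, then reset.
        for (int i = 0; i < NUM_CLASSES; i++) {
            if (t_hist[i]) {
                atomic_fetch_add_explicit(&g_hist[i], t_hist[i],
                                          memory_order_relaxed);
                t_hist[i] = 0;
            }
        }
        t_pending = 0;
    }
}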
@@ -599,139 +562,10 @@ static inline void* tiny_alloc_fast(size_t size) {
     ROUTE_BEGIN(class_idx);
 
     void* ptr = NULL;
-    const int hot_c5 = (g_tiny_hotpath_class5 && class_idx == 5);
-
-    // Phase B: Ultra-simple front for C2/C3 (128B/256B)
-    // ENV-gated: HAKMEM_TINY_FRONT_C23_SIMPLE=1
-    // Target: 15-20M ops/s (vs current 8-9M ops/s)
-#ifdef HAKMEM_TINY_HEADER_CLASSIDX
-    if (tiny_front_c23_enabled() && (class_idx == 2 || class_idx == 3)) {
-        void* c23_ptr = tiny_front_c23_alloc(size, class_idx);
-        if (c23_ptr) {
-            HAK_RET_ALLOC(class_idx, c23_ptr);
-        }
-        // Fall through to existing path if C23 path failed (NULL)
-    }
-#endif
-
-    // Phase 23-E: Unified Frontend Cache (self-contained, single-layer tcache)
-    // ENV-gated: HAKMEM_TINY_UNIFIED_CACHE=1 (default: OFF)
-    // Design: Pop-or-Refill → Direct SuperSlab batch refill (bypasses ALL frontend layers)
-    // Target: 20-30% improvement (25-27M ops/s) via cache miss reduction (8-10 → 2-3)
-    if (__builtin_expect(unified_cache_enabled(), 0)) {
-        void* base = unified_cache_pop_or_refill(class_idx);
-        if (base) {
-            // Unified cache hit OR refill success - return USER pointer (BASE + 1)
-            HAK_RET_ALLOC(class_idx, base);
-        }
-        // Unified cache is enabled but refill failed (OOM) → go directly to slow path.
-        ptr = hak_tiny_alloc_slow(size, class_idx);
-        if (ptr) {
-            HAK_RET_ALLOC(class_idx, ptr);
-        }
-        return ptr;
-    }
-
-    // Phase 21-1: Ring Cache (C2/C3 only) - Array-based TLS cache
-    // ENV-gated: HAKMEM_TINY_HOT_RING_ENABLE=1 (default: ON after Phase 21-1-D)
-    // Target: +15-20% (54.4M → 62-65M ops/s) by eliminating pointer chasing
-    // Design: Ring (L0) → SLL (L1) → SuperSlab (L2) cascade hierarchy
-    if (class_idx == 2 || class_idx == 3) {
-        void* base = ring_cache_pop(class_idx);
-        if (base) {
-            // Ring hit - return USER pointer (BASE + 1)
-            HAK_RET_ALLOC(class_idx, base);
-        }
-
-        // Phase 21-1-C: Ring miss - try refill from TLS SLL (cascade)
-        // ENV-gated: HAKMEM_TINY_HOT_RING_CASCADE=1
-        if (ring_cascade_enabled()) {
-            int refilled = ring_refill_from_sll(class_idx, 32); // Refill 32 blocks
-            if (refilled > 0) {
-                // Retry after refill
-                base = ring_cache_pop(class_idx);
-                if (base) HAK_RET_ALLOC(class_idx, base);
-            }
-        }
-        // Still miss → fall through to existing path (TLS SLL/UltraHot/HeapV2)
-    }
-
-    // Phase 14-C: TinyUltraHot borrowing design (borrows from the canonical path)
-    // ENV-gated: HAKMEM_TINY_ULTRA_HOT=1 (internal control)
-    // Phase 19-4: HAKMEM_TINY_FRONT_ENABLE_ULTRAHOT=1 to enable (DEFAULT: OFF for +12.9% perf)
-    // Targets C2-C5 (16B-128B)
-    // Design: UltraHot keeps blocks borrowed from the TLS SLL in a magazine
-    //   - Hit: return from the magazine (L0, fastest)
-    //   - Miss: refill from the TLS SLL and retry
-    // A/B Test Result: UltraHot adds branch overhead (11.7% hit) → HeapV2-only is faster
-    if (__builtin_expect(ultra_hot_enabled() && front_prune_ultrahot_enabled(), 0)) { // expect=0 (default OFF)
-        void* base = ultra_hot_alloc(size);
-        if (base) {
-            front_metrics_ultrahot_hit(class_idx); // Phase 19-1: Metrics
-            HAK_RET_ALLOC(class_idx, base); // Header write + return USER pointer
-        }
-        // Miss → borrow from the TLS SLL and refill
-        if (class_idx >= 2 && class_idx <= 5) {
-            front_metrics_ultrahot_miss(class_idx); // Phase 19-1: Metrics
-            ultra_hot_try_refill(class_idx);
-            // Retry after refill
-            base = ultra_hot_alloc(size);
-            if (base) {
-                front_metrics_ultrahot_hit(class_idx); // Phase 19-1: Metrics (refill hit)
-                HAK_RET_ALLOC(class_idx, base);
-            }
-        }
-    }
-
-    // Phase 13-A: TinyHeapV2 (per-thread magazine, experimental)
-    // ENV-gated: HAKMEM_TINY_HEAP_V2=1
-    // Phase 19-3: + HAKMEM_TINY_FRONT_DISABLE_HEAPV2=1 to disable (Box FrontPrune)
-    // Targets class 0-3 (8-64B) only, falls back to existing path if NULL
-    // PERF: Pass class_idx directly to avoid redundant size→class conversion
-    if (__builtin_expect(tiny_heap_v2_enabled() && front_prune_heapv2_enabled(), 0) && class_idx <= 3) {
-        void* base = tiny_heap_v2_alloc_by_class(class_idx);
-        if (base) {
-            front_metrics_heapv2_hit(class_idx); // Phase 19-1: Metrics
-            HAK_RET_ALLOC(class_idx, base); // Header write + return USER pointer
-        } else {
-            front_metrics_heapv2_miss(class_idx); // Phase 19-1: Metrics
-        }
-    }
-
-    // NEW: Front-Direct/SLL-OFF bypass control (TLS cached, lazy init)
-    static __thread int s_front_direct_alloc = -1;
-    if (__builtin_expect(s_front_direct_alloc == -1, 0)) {
-        const char* e = getenv("HAKMEM_TINY_FRONT_DIRECT");
-        s_front_direct_alloc = (e && *e && *e != '0') ? 1 : 0;
-    }
-
-    if (__builtin_expect(hot_c5, 0)) {
-        // class5: dedicated shortest path (skips the generic front entirely)
-        void* p = tiny_class5_minirefill_take();
-        if (p) {
-            front_metrics_class5_hit(class_idx); // Phase 19-1: Metrics
-            HAK_RET_ALLOC(class_idx, p);
-        }
-
-        front_metrics_class5_miss(class_idx); // Phase 19-1: Metrics (first miss)
-        int refilled = tiny_alloc_fast_refill(class_idx);
-        if (__builtin_expect(refilled > 0, 1)) {
-            p = tiny_class5_minirefill_take();
-            if (p) {
-                front_metrics_class5_hit(class_idx); // Phase 19-1: Metrics (refill hit)
-                HAK_RET_ALLOC(class_idx, p);
-            }
-        }
-
-        // Go to the slow path (bypassing the generic front)
-        ptr = hak_tiny_alloc_slow(size, class_idx);
-        if (ptr) HAK_RET_ALLOC(class_idx, ptr);
-        return ptr; // NULL if OOM
-    }
 
     // Generic front (FastCache/SFC/SLL)
-    // Respect SLL global toggle AND Front-Direct mode; when either disabled, skip TLS SLL entirely
-    if (__builtin_expect(g_tls_sll_enable && !s_front_direct_alloc, 1)) {
+    // Respect SLL global toggle (P0.1: use the cached sll_enabled)
+    if (__builtin_expect(sll_enabled, 1)) {
         // For classes 0..3 keep ultra-inline POP; for >=4 use safe Box POP to avoid UB on bad heads.
         if (class_idx <= 3) {
 #if HAKMEM_TINY_INLINE_SLL
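The removed code above repeatedly distinguishes BASE and USER pointers ("return USER pointer (BASE + 1)"): under HAKMEM_TINY_HEADER_CLASSIDX each block carries a one-byte header holding its class index, and HAK_RET_ALLOC performs the BASE → USER conversion on return. A sketch of that convention; the helper names are hypothetical, and the real macro likely does more (metrics, route tracking):

#include <stdint.h>

// BASE points at the block as it sits on freelists; USER is what the
// application receives. One header byte in between records the class.
static inline void* tiny_base_to_user(void* base, int class_idx) {
    *(uint8_t*)base = (uint8_t)class_idx; // header write
    return (uint8_t*)base + 1;            // USER = BASE + 1
}

// On free, the class index is recovered from the byte before USER.
static inline int tiny_user_to_class(void* user) {
    return (int)*((uint8_t*)user - 1);
}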
@@ -749,35 +583,28 @@
-        ptr = NULL; // SLL disabled OR Front-Direct active → bypass SLL
+        ptr = NULL; // SLL disabled → bypass SLL
     }
 
+    // Phase 3c L1D Opt: Prefetch the returned block's first cache line before use
     if (__builtin_expect(ptr != NULL, 1)) {
+        __builtin_prefetch(ptr, 0, 3);
         HAK_RET_ALLOC(class_idx, ptr);
     }
 
-    // Generic: Refill and take (Front-Direct vs Legacy)
-    if (s_front_direct_alloc) {
-        // Front-Direct: Direct SS→FC refill (bypasses SLL/TLS List)
-        int refilled_fc = tiny_alloc_fast_refill(class_idx);
-        if (__builtin_expect(refilled_fc > 0, 1)) {
-            void* fc_ptr = fastcache_pop(class_idx);
-            if (fc_ptr) {
-                HAK_RET_ALLOC(class_idx, fc_ptr);
-            }
-        }
-    } else {
-        // Legacy: Refill to TLS List/SLL
-        extern __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
-        void* took = tiny_fast_refill_and_take(class_idx, &g_tls_lists[class_idx]);
-        if (took) {
-            HAK_RET_ALLOC(class_idx, took);
-        }
+    // Refill to TLS List/SLL
+    extern __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
+    void* took = tiny_fast_refill_and_take(class_idx, &g_tls_lists[class_idx]);
+    if (took) {
+        HAK_RET_ALLOC(class_idx, took);
     }
 
     // Retry after the backend refill
     {
         int refilled = tiny_alloc_fast_refill(class_idx);
         if (__builtin_expect(refilled > 0, 1)) {
-            // Skip SLL retry if Front-Direct OR SLL disabled
-            if (__builtin_expect(g_tls_sll_enable && !s_front_direct_alloc, 1)) {
+            // Retry SLL if enabled (P0.1: using cached sll_enabled)
+            if (__builtin_expect(sll_enabled, 1)) {
                 if (class_idx <= 3) {
 #if HAKMEM_TINY_INLINE_SLL
                     // Experimental: Inline SLL pop (A/B only, requires HAKMEM_TINY_INLINE_SLL=1)
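Finally, the __builtin_expect(cond, 1) / __builtin_expect(cond, 0) annotations on nearly every branch above hint the expected truth value, so the compiler lays the hot path out as straight-line fall-through code. A compact illustration using the common LIKELY/UNLIKELY wrappers (a widespread convention, not necessarily HAKMEM's own macros):

#define LIKELY(x)   __builtin_expect(!!(x), 1)
#define UNLIKELY(x) __builtin_expect(!!(x), 0)

int checked_inc(const int* p) {
    if (UNLIKELY(p == NULL)) {
        return -1;        // cold: error path is moved out of line
    }
    return *p + 1;        // hot: falls through without a taken branch
}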