Front-Direct implementation: SS→FC direct refill + SLL complete bypass
## Summary
Implemented Front-Direct architecture with complete SLL bypass:
- Direct SuperSlab → FastCache refill (1-hop, bypasses SLL)
- SLL-free allocation/free paths when Front-Direct enabled
- Legacy path sealing (SLL inline opt-in, SFC cascade ENV-only)
## New Modules
- core/refill/ss_refill_fc.h (236 lines): Standard SS→FC refill entry point
  - Remote drain → Freelist → Carve priority
  - Header restoration for C1-C6 (NOT C0/C7)
  - ENV: HAKMEM_TINY_P0_DRAIN_THRESH, HAKMEM_TINY_P0_NO_DRAIN
- core/front/fast_cache.h: FastCache (L1) type definition
- core/front/quick_slot.h: QuickSlot (L0) type definition
## Allocation Path (core/tiny_alloc_fast.inc.h)
- Added s_front_direct_alloc TLS flag (lazy ENV check)
- SLL pop guarded by: g_tls_sll_enable && !s_front_direct_alloc
- Refill dispatch:
  - Front-Direct: ss_refill_fc_fill() → fastcache_pop() (1-hop)
  - Legacy: sll_refill_batch_from_ss() → SLL → FC (2-hop, A/B only)
- SLL inline pop sealed (requires HAKMEM_TINY_INLINE_SLL=1 opt-in)
## Free Path (core/hakmem_tiny_free.inc, core/hakmem_tiny_fastcache.inc.h)
- FC priority: Try fastcache_push() first (same-thread free)
- tiny_fast_push() bypass: Returns 0 when s_front_direct_free || !g_tls_sll_enable
- Fallback: Magazine/slow path (safe, bypasses SLL)
## Legacy Sealing
- SFC cascade: Default OFF (ENV-only via HAKMEM_TINY_SFC_CASCADE=1)
- Deleted: core/hakmem_tiny_free.inc.bak, core/pool_refill_legacy.c.bak
- Documentation: ss_refill_fc_fill() promoted as CANONICAL refill entry
## ENV Controls
- HAKMEM_TINY_FRONT_DIRECT=1: Enable Front-Direct (SS→FC direct)
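- HAKMEM_TINY_P0_DIRECT_FC_ALL=1: Alternate name for the above in refill dispatch (the alloc-path SLL bypass in tiny_alloc_fast() reads only HAKMEM_TINY_FRONT_DIRECT)
- HAKMEM_TINY_REFILL_BATCH=1: Enable batch refill (also selects Front-Direct in refill dispatch; the alloc-path SLL bypass still requires HAKMEM_TINY_FRONT_DIRECT=1)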
- HAKMEM_TINY_SFC_CASCADE=1: Enable SFC cascade (default OFF)
- HAKMEM_TINY_INLINE_SLL=1: Enable inline SLL pop (default OFF, requires AGGRESSIVE_INLINE)
## Benchmarks (Front-Direct Enabled)
```bash
ENV: HAKMEM_BENCH_FAST_FRONT=1 HAKMEM_TINY_FRONT_DIRECT=1
HAKMEM_TINY_REFILL_BATCH=1 HAKMEM_TINY_P0_DIRECT_FC_ALL=1
HAKMEM_TINY_REFILL_COUNT_HOT=256 HAKMEM_TINY_REFILL_COUNT_MID=96
HAKMEM_TINY_BUMP_CHUNK=256
bench_random_mixed (16-1040B random, 200K iter):
256 slots: 1.44M ops/s (STABLE, 0 SEGV)
128 slots: 1.44M ops/s (STABLE, 0 SEGV)
bench_fixed_size (fixed size, 200K iter):
256B: 4.06M ops/s (has debug logs, expected >10M without logs)
128B: Similar (also slowed by debug logs)
```
## Verification
- TRACE_RING test (10K iter): **0 SLL events** detected ✅
- Complete SLL bypass confirmed when Front-Direct=1
- Stable execution: 200K iterations × multiple sizes, 0 SEGV
## Next Steps
- Disable debug logs in hak_alloc_api.inc.h (call_num 14250-14280 range)
- Re-benchmark with clean Release build (target: 10-15M ops/s)
- 128/256B shortcut path optimization (FC hit rate improvement)
Co-Authored-By: ChatGPT <chatgpt@openai.com>
Suggested-By: ultrathink
This commit is contained in:
@ -77,6 +77,8 @@ extern int sll_refill_batch_from_ss(int class_idx, int max_take);
|
||||
#else
|
||||
extern int sll_refill_small_from_ss(int class_idx, int max_take);
|
||||
#endif
|
||||
// NEW: Direct SS→FC refill (bypasses SLL)
|
||||
extern int ss_refill_fc_fill(int class_idx, int want);
|
||||
extern void* hak_tiny_alloc_slow(size_t size, int class_idx);
|
||||
extern int hak_tiny_size_to_class(size_t size);
|
||||
extern int tiny_refill_failfast_level(void);
|
||||
@ -429,13 +431,35 @@ static inline int tiny_alloc_fast_refill(int class_idx) {
|
||||
#endif
|
||||
|
||||
// Box Boundary: Delegate to Backend (Box 3: SuperSlab)
|
||||
// This gives us ACE, Learning layer, L25 integration for free!
|
||||
// P0 Fix: Use appropriate refill function based on P0 status
|
||||
// Refill Dispatch: Standard (ss_refill_fc_fill) vs Legacy SLL (A/B only)
|
||||
// Standard: Enabled by FRONT_DIRECT=1, REFILL_BATCH=1, or P0_DIRECT_FC_ALL=1
|
||||
// Legacy: Fallback for compatibility (will be deprecated)
|
||||
int refilled = 0;
|
||||
|
||||
// NEW: Front-Direct refill control (A/B toggle)
|
||||
static __thread int s_use_front_direct = -1;
|
||||
if (__builtin_expect(s_use_front_direct == -1, 0)) {
|
||||
// Check multiple ENV flags (any one enables Front-Direct)
|
||||
const char* e1 = getenv("HAKMEM_TINY_FRONT_DIRECT");
|
||||
const char* e2 = getenv("HAKMEM_TINY_P0_DIRECT_FC_ALL");
|
||||
const char* e3 = getenv("HAKMEM_TINY_REFILL_BATCH");
|
||||
s_use_front_direct = ((e1 && *e1 && *e1 != '0') ||
|
||||
(e2 && *e2 && *e2 != '0') ||
|
||||
(e3 && *e3 && *e3 != '0')) ? 1 : 0;
|
||||
}
|
||||
|
||||
// Refill dispatch
|
||||
if (s_use_front_direct) {
|
||||
// NEW: Direct SS→FC (bypasses SLL)
|
||||
refilled = ss_refill_fc_fill(class_idx, cnt);
|
||||
} else {
|
||||
// Legacy: SS→SLL→FC (via batch or generic)
|
||||
#if HAKMEM_TINY_P0_BATCH_REFILL
|
||||
int refilled = sll_refill_batch_from_ss(class_idx, cnt);
|
||||
refilled = sll_refill_batch_from_ss(class_idx, cnt);
|
||||
#else
|
||||
int refilled = sll_refill_small_from_ss(class_idx, cnt);
|
||||
refilled = sll_refill_small_from_ss(class_idx, cnt);
|
||||
#endif
|
||||
}
|
||||
|
||||
// Lightweight adaptation: if refills keep happening, increase per-class refill.
|
||||
// Focus on class 7 (1024B) to reduce mmap/refill frequency under Tiny-heavy loads.
|
||||
@ -462,16 +486,23 @@ static inline int tiny_alloc_fast_refill(int class_idx) {
|
||||
track_refill_for_adaptation(class_idx);
|
||||
}
|
||||
|
||||
// Box 5-NEW: Cascade refill SFC ← SLL (if SFC enabled)
|
||||
// This happens AFTER SuperSlab → SLL refill, so SLL has blocks
|
||||
static __thread int sfc_check_done_refill = 0;
|
||||
static __thread int sfc_is_enabled_refill = 0;
|
||||
if (__builtin_expect(!sfc_check_done_refill, 0)) {
|
||||
sfc_is_enabled_refill = g_sfc_enabled;
|
||||
sfc_check_done_refill = 1;
|
||||
// Box 5-NEW: Cascade refill SFC ← SLL (opt-in via HAKMEM_TINY_SFC_CASCADE, off by default)
|
||||
// NEW: Default OFF, enable via HAKMEM_TINY_SFC_CASCADE=1
|
||||
// Skip entirely when Front-Direct is active (direct SS→FC path)
|
||||
static __thread int sfc_cascade_enabled = -1;
|
||||
if (__builtin_expect(sfc_cascade_enabled == -1, 0)) {
|
||||
// Front-Direct bypasses SLL, so SFC cascade is pointless
|
||||
if (s_use_front_direct) {
|
||||
sfc_cascade_enabled = 0;
|
||||
} else {
|
||||
// Check ENV flag (default: OFF)
|
||||
const char* e = getenv("HAKMEM_TINY_SFC_CASCADE");
|
||||
sfc_cascade_enabled = (e && *e && *e != '0') ? 1 : 0;
|
||||
}
|
||||
}
|
||||
|
||||
if (sfc_is_enabled_refill && refilled > 0) {
|
||||
// Only cascade if explicitly enabled AND we have refilled blocks in SLL
|
||||
if (sfc_cascade_enabled && g_sfc_enabled && refilled > 0) {
|
||||
// Skip SFC cascade for class5 when dedicated hotpath is enabled
|
||||
if (g_tiny_hotpath_class5 && class_idx == 5) {
|
||||
// no-op: keep refilled blocks in TLS List/SLL
|
||||
@ -552,6 +583,13 @@ static inline void* tiny_alloc_fast(size_t size) {
|
||||
void* ptr = NULL;
|
||||
const int hot_c5 = (g_tiny_hotpath_class5 && class_idx == 5);
|
||||
|
||||
// NEW: Front-Direct/SLL-OFF bypass control (TLS cached, lazy init)
|
||||
static __thread int s_front_direct_alloc = -1;
|
||||
if (__builtin_expect(s_front_direct_alloc == -1, 0)) {
|
||||
const char* e = getenv("HAKMEM_TINY_FRONT_DIRECT");
|
||||
s_front_direct_alloc = (e && *e && *e != '0') ? 1 : 0;
|
||||
}
|
||||
|
||||
if (__builtin_expect(hot_c5, 0)) {
|
||||
// class5: dedicated shortest path (never goes through the generic front)
|
||||
void* p = tiny_class5_minirefill_take();
|
||||
@ -570,15 +608,15 @@ static inline void* tiny_alloc_fast(size_t size) {
|
||||
}
|
||||
|
||||
// Generic front (FastCache/SFC/SLL)
|
||||
// Respect SLL global toggle; when disabled, skip TLS SLL fast pop entirely
|
||||
if (__builtin_expect(g_tls_sll_enable, 1)) {
|
||||
// Respect SLL global toggle AND Front-Direct mode; skip TLS SLL entirely when SLL is disabled or Front-Direct is active
|
||||
if (__builtin_expect(g_tls_sll_enable && !s_front_direct_alloc, 1)) {
|
||||
// For classes 0..3 keep ultra-inline POP; for >=4 use safe Box POP to avoid UB on bad heads.
|
||||
if (class_idx <= 3) {
|
||||
#if HAKMEM_TINY_AGGRESSIVE_INLINE
|
||||
// Phase 2: Use inline macro (3-4 instructions, zero call overhead)
|
||||
#if defined(HAKMEM_TINY_INLINE_SLL) && HAKMEM_TINY_AGGRESSIVE_INLINE
|
||||
// Experimental: Use inline SLL pop macro (enable via HAKMEM_TINY_INLINE_SLL=1)
|
||||
TINY_ALLOC_FAST_POP_INLINE(class_idx, ptr);
|
||||
#else
|
||||
// Legacy: Function call (10-15 instructions, 5-10 cycle overhead)
|
||||
// Default: Safe Box API (bypasses inline SLL when Front-Direct)
|
||||
ptr = tiny_alloc_fast_pop(class_idx);
|
||||
#endif
|
||||
} else {
|
||||
@ -586,14 +624,24 @@ static inline void* tiny_alloc_fast(size_t size) {
|
||||
if (tls_sll_pop(class_idx, &base)) ptr = base; else ptr = NULL;
|
||||
}
|
||||
} else {
|
||||
ptr = NULL;
|
||||
ptr = NULL; // SLL disabled OR Front-Direct active → bypass SLL
|
||||
}
|
||||
if (__builtin_expect(ptr != NULL, 1)) {
|
||||
HAK_RET_ALLOC(class_idx, ptr);
|
||||
}
|
||||
|
||||
// Generic: Refill and take (into FastCache or TLS List)
|
||||
{
|
||||
// Generic: Refill and take (Front-Direct vs Legacy)
|
||||
if (s_front_direct_alloc) {
|
||||
// Front-Direct: Direct SS→FC refill (bypasses SLL/TLS List)
|
||||
int refilled_fc = tiny_alloc_fast_refill(class_idx);
|
||||
if (__builtin_expect(refilled_fc > 0, 1)) {
|
||||
void* fc_ptr = fastcache_pop(class_idx);
|
||||
if (fc_ptr) {
|
||||
HAK_RET_ALLOC(class_idx, fc_ptr);
|
||||
}
|
||||
}
|
||||
} else {
|
||||
// Legacy: Refill to TLS List/SLL
|
||||
extern __thread TinyTLSList g_tls_lists[TINY_NUM_CLASSES];
|
||||
void* took = tiny_fast_refill_and_take(class_idx, &g_tls_lists[class_idx]);
|
||||
if (took) {
|
||||
@ -605,13 +653,14 @@ static inline void* tiny_alloc_fast(size_t size) {
|
||||
{
|
||||
int refilled = tiny_alloc_fast_refill(class_idx);
|
||||
if (__builtin_expect(refilled > 0, 1)) {
|
||||
if (__builtin_expect(g_tls_sll_enable, 1)) {
|
||||
// Skip SLL retry if Front-Direct OR SLL disabled
|
||||
if (__builtin_expect(g_tls_sll_enable && !s_front_direct_alloc, 1)) {
|
||||
if (class_idx <= 3) {
|
||||
#if HAKMEM_TINY_AGGRESSIVE_INLINE
|
||||
// Phase 2: Use inline macro (3-4 instructions, zero call overhead)
|
||||
#if defined(HAKMEM_TINY_INLINE_SLL) && HAKMEM_TINY_AGGRESSIVE_INLINE
|
||||
// Experimental: Use inline SLL pop macro (enable via HAKMEM_TINY_INLINE_SLL=1)
|
||||
TINY_ALLOC_FAST_POP_INLINE(class_idx, ptr);
|
||||
#else
|
||||
// Legacy: Function call (10-15 instructions, 5-10 cycle overhead)
|
||||
// Default: Safe Box API (bypasses inline SLL when Front-Direct)
|
||||
ptr = tiny_alloc_fast_pop(class_idx);
|
||||
#endif
|
||||
} else {
|
||||
@ -619,7 +668,7 @@ static inline void* tiny_alloc_fast(size_t size) {
|
||||
if (tls_sll_pop(class_idx, &base2)) ptr = base2; else ptr = NULL;
|
||||
}
|
||||
} else {
|
||||
ptr = NULL;
|
||||
ptr = NULL; // SLL disabled OR Front-Direct active → bypass SLL
|
||||
}
|
||||
if (ptr) {
|
||||
HAK_RET_ALLOC(class_idx, ptr);
|
||||
|
||||
Reference in New Issue
Block a user