Front-Direct implementation: direct SS→FC refill + complete SLL bypass
## Summary
Implemented Front-Direct architecture with complete SLL bypass:
- Direct SuperSlab → FastCache refill (1-hop, bypasses SLL)
- SLL-free allocation/free paths when Front-Direct enabled
- Legacy path sealing (SLL inline opt-in, SFC cascade ENV-only)
## New Modules
- core/refill/ss_refill_fc.h (236 lines): Standard SS→FC refill entry point
  - Remote drain → Freelist → Carve priority (sketched after this list)
  - Header restoration for C1-C6 (NOT C0/C7)
  - ENV: HAKMEM_TINY_P0_DRAIN_THRESH, HAKMEM_TINY_P0_NO_DRAIN
- core/front/fast_cache.h: FastCache (L1) type definition
- core/front/quick_slot.h: QuickSlot (L0) type definition
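The refill priority and the C1-C6 header restoration are easiest to picture as a sketch. All helper names and signatures below (`ss_drain_remote_frees`, `ss_freelist_pop`, `ss_carve_new_block`, `tiny_restore_header`, and the `fastcache_push` prototype) are illustrative placeholders, not the actual symbols in ss_refill_fc.h:

```c
#include <stddef.h>

/* Hypothetical prototypes, for illustration only */
void  ss_drain_remote_frees(int class_idx);           /* drain cross-thread frees */
void* ss_freelist_pop(int class_idx);                 /* local SuperSlab freelist */
void* ss_carve_new_block(int class_idx);              /* carve from current chunk */
void  tiny_restore_header(void* blk, int class_idx);  /* C1-C6 header fixup       */
int   fastcache_push(int class_idx, void* blk);       /* FC (L1) push             */

/* Sketch of the SS→FC refill order: remote drain → freelist → carve */
static int ss_refill_fc_fill_sketch(int class_idx, int want) {
    int filled = 0;
    void* blk;

    /* 1) Pull pending remote frees back into the local freelist first */
    ss_drain_remote_frees(class_idx);

    /* 2) Reuse blocks already sitting on the SuperSlab freelist */
    while (filled < want && (blk = ss_freelist_pop(class_idx)) != NULL) {
        if (class_idx >= 1 && class_idx <= 6)   /* C1-C6 only, NOT C0/C7 */
            tiny_restore_header(blk, class_idx);
        if (!fastcache_push(class_idx, blk)) break;
        filled++;
    }

    /* 3) Carve fresh blocks once the freelist runs dry */
    while (filled < want && (blk = ss_carve_new_block(class_idx)) != NULL) {
        if (class_idx >= 1 && class_idx <= 6)
            tiny_restore_header(blk, class_idx);
        if (!fastcache_push(class_idx, blk)) break;
        filled++;
    }
    return filled;   /* blocks now available to fastcache_pop() */
}
```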
## Allocation Path (core/tiny_alloc_fast.inc.h)
- Added s_front_direct_alloc TLS flag (lazy ENV check)
- SLL pop guarded by: g_tls_sll_enable && !s_front_direct_alloc
- Refill dispatch (see the sketch after this list):
  - Front-Direct: ss_refill_fc_fill() → fastcache_pop() (1-hop)
  - Legacy: sll_refill_batch_from_ss() → SLL → FC (2-hop, A/B only)
- SLL inline pop sealed (requires HAKMEM_TINY_INLINE_SLL=1 opt-in)
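A compressed view of the gate and dispatch. `s_front_direct_alloc`, `g_tls_sll_enable`, `fastcache_pop()`, `ss_refill_fc_fill()` and `sll_refill_batch_from_ss()` are names from this commit, but the types/signatures shown are partly assumed; `sll_pop()` and the batch size of 64 are placeholders:

```c
#include <stdlib.h>

/* Symbols from the commit (signatures partly assumed) */
extern __thread int g_tls_sll_enable;
void* fastcache_pop(int class_idx);
void* sll_pop(int class_idx);                               /* hypothetical name  */
int   ss_refill_fc_fill(int class_idx, int want);           /* signature assumed  */
int   sll_refill_batch_from_ss(int class_idx, int max_take);

static __thread int s_front_direct_alloc = -1;   /* -1 = ENV not checked yet */

static inline int front_direct_alloc_enabled(void) {
    if (__builtin_expect(s_front_direct_alloc == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_FRONT_DIRECT");
        s_front_direct_alloc = (e && *e && *e != '0') ? 1 : 0;
    }
    return s_front_direct_alloc;
}

static inline void* tiny_alloc_fast_sketch(int class_idx) {
    void* p = fastcache_pop(class_idx);                      /* FC (L1) hit */
    if (p) return p;

    /* SLL pop only when SLL is enabled AND Front-Direct is off */
    if (g_tls_sll_enable && !front_direct_alloc_enabled()) {
        p = sll_pop(class_idx);
        if (p) return p;
    }

    if (front_direct_alloc_enabled())
        ss_refill_fc_fill(class_idx, 64);           /* 1-hop: SS → FC        */
    else
        sll_refill_batch_from_ss(class_idx, 64);    /* 2-hop: SS → SLL → FC  */

    return fastcache_pop(class_idx);   /* NULL here → fall back to slow path */
}
```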
## Free Path (core/hakmem_tiny_free.inc, core/hakmem_tiny_fastcache.inc.h)
- FC priority: Try fastcache_push() first (same-thread free); ordering sketched after this list
- tiny_fast_push() bypass: Returns 0 when s_front_direct_free || !g_tls_sll_enable
- Fallback: Magazine/slow path (safe, bypasses SLL)
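Minimal sketch of that ordering, assuming `fastcache_push()` and `tiny_fast_push()` return non-zero on success and 0 to decline; the magazine fallback name is a placeholder:

```c
int  fastcache_push(int class_idx, void* p);    /* FC (L1) push, signature assumed */
int  tiny_fast_push(int class_idx, void* p);    /* returns 0 under Front-Direct    */
void tiny_magazine_free_slow(int class_idx, void* p);  /* hypothetical fallback     */

static inline void tiny_free_fast_sketch(int class_idx, void* p) {
    /* 1) Same-thread free: FastCache first */
    if (fastcache_push(class_idx, p))
        return;

    /* 2) SLL stash: tiny_fast_push() returns 0 when
          s_front_direct_free || !g_tls_sll_enable, so the SLL is never touched */
    if (tiny_fast_push(class_idx, p))
        return;

    /* 3) Fallback: magazine / slow path (safe, bypasses SLL) */
    tiny_magazine_free_slow(class_idx, p);
}
```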
## Legacy Sealing
- SFC cascade: Default OFF (ENV-only via HAKMEM_TINY_SFC_CASCADE=1)
- Deleted: core/hakmem_tiny_free.inc.bak, core/pool_refill_legacy.c.bak
- Documentation: ss_refill_fc_fill() promoted as CANONICAL refill entry
## ENV Controls
- HAKMEM_TINY_FRONT_DIRECT=1: Enable Front-Direct (SS→FC direct)
- HAKMEM_TINY_P0_DIRECT_FC_ALL=1: Same as above (alt name)
- HAKMEM_TINY_REFILL_BATCH=1: Enable batch refill (also enables Front-Direct; see combination sketch below)
- HAKMEM_TINY_SFC_CASCADE=1: Enable SFC cascade (default OFF)
- HAKMEM_TINY_INLINE_SLL=1: Enable inline SLL pop (default OFF, requires AGGRESSIVE_INLINE)
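Assumed (not the actual parsing code) combination of the three enabling variables; any one of them turns the direct SS→FC path on:

```c
#include <stdlib.h>

static int env_on(const char* name) {
    const char* e = getenv(name);
    return (e && *e && *e != '0') ? 1 : 0;
}

static int front_direct_requested(void) {
    return env_on("HAKMEM_TINY_FRONT_DIRECT")        /* primary switch           */
        || env_on("HAKMEM_TINY_P0_DIRECT_FC_ALL")    /* alternate name           */
        || env_on("HAKMEM_TINY_REFILL_BATCH");       /* batch refill implies it  */
}
```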
## Benchmarks (Front-Direct Enabled)
```text
ENV: HAKMEM_BENCH_FAST_FRONT=1 HAKMEM_TINY_FRONT_DIRECT=1
HAKMEM_TINY_REFILL_BATCH=1 HAKMEM_TINY_P0_DIRECT_FC_ALL=1
HAKMEM_TINY_REFILL_COUNT_HOT=256 HAKMEM_TINY_REFILL_COUNT_MID=96
HAKMEM_TINY_BUMP_CHUNK=256
bench_random_mixed (16-1040B random, 200K iter):
256 slots: 1.44M ops/s (STABLE, 0 SEGV)
128 slots: 1.44M ops/s (STABLE, 0 SEGV)
bench_fixed_size (fixed size, 200K iter):
256B: 4.06M ops/s (debug logging enabled; expected >10M ops/s without logs)
128B: similar (also limited by debug logging)
```
## Verification
- TRACE_RING test (10K iter): **0 SLL events** detected ✅
- Complete SLL bypass confirmed when Front-Direct=1
- Stable execution: 200K iterations × multiple sizes, 0 SEGV
## Next Steps
- Disable debug logs in hak_alloc_api.inc.h (call_num 14250-14280 range)
- Re-benchmark with clean Release build (target: 10-15M ops/s)
- 128/256B shortcut path optimization (FC hit rate improvement)
Co-Authored-By: ChatGPT <chatgpt@openai.com>
Suggested-By: ultrathink
## Diff Excerpts
```c
@@ -1184,16 +1184,10 @@ static inline __attribute__((always_inline)) int tiny_refill_max_for_class(int c
    return g_tiny_refill_max;
}

// Phase 9.5: Frontend/Backend split - Tiny FastCache (array stack)
// Enabled via HAKMEM_TINY_FASTCACHE=1 (default: 0)
// Compile-out: define HAKMEM_TINY_NO_FRONT_CACHE=1 to exclude this path
#define TINY_FASTCACHE_CAP 128
typedef struct __attribute__((aligned(64))) {
    void* items[TINY_FASTCACHE_CAP];
    int top;
    int _pad[15];
} TinyFastCache;
static __thread TinyFastCache g_fast_cache[TINY_NUM_CLASSES];
// Phase 9.5: Frontend/Backend split - Tiny Front modules (QuickSlot / FastCache)
#include "front/quick_slot.h"
#include "front/fast_cache.h"
__thread TinyFastCache g_fast_cache[TINY_NUM_CLASSES];
static int g_frontend_enable = 0; // HAKMEM_TINY_FRONTEND=1 (experimental ultra-fast frontend)
// SLL capacity multiplier for hot tiny classes (env: HAKMEM_SLL_MULTIPLIER)
int g_sll_multiplier = 2;

@@ -1270,21 +1264,17 @@ static __thread TinyHotMag g_tls_hot_mag[TINY_NUM_CLASSES];
// TinyQuickSlot: 1 cache line per class (quick 6 items + small metadata)
// Opt-in via HAKMEM_TINY_QUICK=1
// NOTE: This type definition must come BEFORE the Phase 2D-1 includes below
typedef struct __attribute__((aligned(64))) {
    void* items[6];   // 48B
    uint8_t top;      // 1B (0..6)
    uint8_t _pad1;    // 1B
    uint16_t _pad2;   // 2B
    uint32_t _pad3;   // 4B (padding to 64B)
} TinyQuickSlot;
static int g_quick_enable = 0; // HAKMEM_TINY_QUICK=1
static __thread TinyQuickSlot g_tls_quick[TINY_NUM_CLASSES]; // compile-out via guards below
int g_quick_enable = 0; // HAKMEM_TINY_QUICK=1
__thread TinyQuickSlot g_tls_quick[TINY_NUM_CLASSES]; // compile-out via guards below

// Phase 2D-1: Hot-path inline function extractions
// NOTE: These includes require TinyFastCache, TinyQuickSlot, and TinyTLSSlab to be fully defined
// Phase 2D-1: Hot-path inline function extractions (Front)
// NOTE: TinyFastCache/TinyQuickSlot are already defined in front/
#include "hakmem_tiny_hot_pop.inc.h"   // 4 functions: tiny_hot_pop_class{0..3}
#include "hakmem_tiny_fastcache.inc.h" // 5 functions: tiny_fast_pop/push, fastcache_pop/push, quick_pop
#include "hakmem_tiny_refill.inc.h"    // 8 functions: refill operations
#if HAKMEM_TINY_P0_BATCH_REFILL
#include "hakmem_tiny_refill_p0.inc.h" // P0 batch refill → direct FastCache refill
#endif
#include "refill/ss_refill_fc.h"       // NEW: Direct SS→FC refill

// Phase 7 Task 3: Pre-warm TLS cache at init
// Pre-allocate blocks to reduce first-allocation miss penalty

@@ -1775,6 +1765,17 @@ TinySlab* hak_tiny_owner_slab(void* ptr) {
// Export wrapper functions for hakmem.c to call
// Phase 6-1.7 Optimization: Remove diagnostic overhead, rely on LTO for inlining
void* hak_tiny_alloc_fast_wrapper(size_t size) {
    // Bench-only ultra-short path: bypass diagnostics and pointer tracking
    // Enable with: HAKMEM_BENCH_FAST_FRONT=1
    static int g_bench_fast_front = -1;
    if (__builtin_expect(g_bench_fast_front == -1, 0)) {
        const char* e = getenv("HAKMEM_BENCH_FAST_FRONT");
        g_bench_fast_front = (e && *e && *e != '0') ? 1 : 0;
    }
    if (__builtin_expect(g_bench_fast_front, 0)) {
        return tiny_alloc_fast(size);
    }

    static _Atomic uint64_t wrapper_call_count = 0;
    uint64_t call_num = atomic_fetch_add(&wrapper_call_count, 1);

@@ -1798,7 +1799,6 @@ TinySlab* hak_tiny_owner_slab(void* ptr) {
        fflush(stderr);
    }
#endif
    // Diagnostic removed - use HAKMEM_TINY_FRONT_DIAG in tiny_alloc_fast_pop if needed
    void* result = tiny_alloc_fast(size);
#if !HAKMEM_BUILD_RELEASE
    if (call_num > 14250 && call_num < 14280 && size <= 1024) {

@@ -1864,6 +1864,16 @@ TinySlab* hak_tiny_owner_slab(void* ptr) {
// Free path implementations
#include "hakmem_tiny_free.inc"

// ---- Phase 1: Provide default batch-refill symbol (fallback to small refill)
// Allows runtime gate HAKMEM_TINY_REFILL_BATCH=1 without requiring a rebuild.
#ifndef HAKMEM_TINY_P0_BATCH_REFILL
int sll_refill_small_from_ss(int class_idx, int max_take);
__attribute__((weak)) int sll_refill_batch_from_ss(int class_idx, int max_take)
{
    return sll_refill_small_from_ss(class_idx, max_take);
}
#endif

// ============================================================================
// EXTRACTED TO hakmem_tiny_lifecycle.inc (Phase 2D-3)
// ============================================================================
```