<!--
NOTE: This file is a snapshot for copy/paste review.
Regenerate with:

  scripts/make_chatgpt_pro_packet_free_path.sh > docs/analysis/FREE_PATH_REVIEW_PACKET_CHATGPT.md
-->

# Hakmem free-path review packet (compact)

Goal: understand the remaining fixed costs on the free path relative to mimalloc/tcmalloc, within Box Theory constraints (single boundary, reversible ENV gates).

SSOT bench conditions (current practice):

- `HAKMEM_PROFILE=MIXED_TINYV3_C7_SAFE`
- `ITERS=20000000 WS=400 RUNS=10`
- run via `scripts/run_mixed_10_cleanenv.sh`

Request:

1) Where is the dominant fixed cost on the free path now?
2) What structural change would give +5–10% without breaking Box Theory?
3) What NOT to do (layout-tax pitfalls)?

## Code excerpts (clipped)

### `core/box/tiny_free_gate_box.h`

```c
static inline int tiny_free_gate_try_fast(void* user_ptr)
{
#if !HAKMEM_TINY_HEADER_CLASSIDX
    (void)user_ptr;
    // With headers disabled, the Tiny fast path is not used at all.
    return 0;
#else
    if (__builtin_expect(!user_ptr, 0)) {
        return 0;
    }

    // Layer 3a: lightweight fail-fast (always ON).
    // Obviously invalid addresses (extremely small values) are not handled on
    // the fast path; defer to the slow path (hak_free_at + registry/header).
    {
        uintptr_t addr = (uintptr_t)user_ptr;
        if (__builtin_expect(addr < 4096, 0)) {
#if !HAKMEM_BUILD_RELEASE
            static _Atomic uint32_t g_free_gate_range_invalid = 0;
            uint32_t n = atomic_fetch_add_explicit(&g_free_gate_range_invalid, 1, memory_order_relaxed);
            if (n < 8) {
                fprintf(stderr,
                        "[TINY_FREE_GATE_RANGE_INVALID] ptr=%p\n",
                        user_ptr);
                fflush(stderr);
            }
#endif
            return 0;
        }
    }

    // Future extension point:
    // - Only when DIAG is ON, run Bridge + Guard, and skip the fast path
    //   if the pointer is classified as outside Tiny management.
#if !HAKMEM_BUILD_RELEASE
    if (__builtin_expect(tiny_free_gate_diag_enabled(), 0)) {
        TinyFreeGateContext ctx;
        if (!tiny_free_gate_classify(user_ptr, &ctx)) {
            // Outside Tiny management, or Bridge failure → do not use the fast path.
            return 0;
        }
        (void)ctx; // Logging only for now; Guard hooks will be inserted here later.
    }
#endif

    // Delegate to the existing ultra-fast free (behavior unchanged).
    return hak_tiny_free_fast_v2(user_ptr);
#endif
}
```
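
For context on how this gate is consumed: a minimal caller sketch. The gate's contract is to return 1 only when the pointer was fully handled; 0 means "defer to the slow path". `hak_free_slow()` is a hypothetical name for the full path, which is not shown in this packet:

```c
// Hypothetical caller sketch (hak_free_slow is an assumed name, not from the
// codebase): the gate either fully handles the pointer on the Tiny fast path
// or returns 0, and the slow path re-classifies via registry/header and also
// handles Mid/Large and foreign pointers.
void hak_free(void* ptr) {
    if (tiny_free_gate_try_fast(ptr)) {
        return; // handled on the Tiny fast path
    }
    hak_free_slow(ptr);
}
```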

### `core/front/malloc_tiny_fast.h`

```c
static inline int free_tiny_fast(void* ptr) {
    if (__builtin_expect(!ptr, 0)) return 0;

#if HAKMEM_TINY_HEADER_CLASSIDX
    // 1. Page-boundary guard:
    //    If ptr sits at a page start (offset == 0), ptr-1 may land in a different
    //    page or an unmapped region. In that case skip the header read and fall
    //    back to the normal free path.
    uintptr_t off = (uintptr_t)ptr & 0xFFFu;
    if (__builtin_expect(off == 0, 0)) {
        return 0;
    }

    // 2. Fast header magic validation (mandatory).
    //    Release builds skip the magic check inside tiny_region_id_read_header(),
    //    so validate the Tiny-specific header (0xA0) here ourselves.
    uint8_t* header_ptr = (uint8_t*)ptr - 1;
    uint8_t header = *header_ptr;
    uint8_t magic = header & 0xF0u;
    if (__builtin_expect(magic != HEADER_MAGIC, 0)) {
        // Not a Tiny header → Mid/Large/foreign pointer, take the normal free path.
        return 0;
    }

    // 3. Extract class_idx (low 4 bits).
    int class_idx = (int)(header & HEADER_CLASS_MASK);
    if (__builtin_expect(class_idx < 0 || class_idx >= TINY_NUM_CLASSES, 0)) {
        return 0;
    }

    // 4. Compute BASE and push to the Unified Cache.
    void* base = tiny_user_to_base_inline(ptr);
    tiny_front_free_stat_inc(class_idx);

    // Phase FREE-LEGACY-BREAKDOWN-1: counter scatter (1. function entry)
    FREE_PATH_STAT_INC(total_calls);

    // Phase 19-3b: Consolidate ENV snapshot reads (capture once per free_tiny_fast call).
    const HakmemEnvSnapshot* env = hakmem_env_snapshot_enabled() ? hakmem_env_snapshot() : NULL;

    // Phase 9: MONO DUALHOT early-exit for C0-C3 (skip policy snapshot, direct to legacy)
    // Conditions:
    // - ENV: HAKMEM_FREE_TINY_FAST_MONO_DUALHOT=1
    // - class_idx <= 3 (C0-C3)
    // - !HAKMEM_TINY_LARSON_FIX (cross-thread handling requires full validation)
    // - g_tiny_route_snapshot_done == 1 && route == TINY_ROUTE_LEGACY
    //   (when the route cannot be asserted, keep the existing path)
    if ((unsigned)class_idx <= 3u) {
        if (free_tiny_fast_mono_dualhot_enabled()) {
            static __thread int g_larson_fix = -1;
            if (__builtin_expect(g_larson_fix == -1, 0)) {
                const char* e = getenv("HAKMEM_TINY_LARSON_FIX");
                g_larson_fix = (e && *e && *e != '0') ? 1 : 0;
            }

            if (!g_larson_fix &&
                g_tiny_route_snapshot_done == 1 &&
                g_tiny_route_class[class_idx] == TINY_ROUTE_LEGACY) {
                // Direct path: skip the policy snapshot, go straight to the legacy fallback.
                FREE_PATH_STAT_INC(mono_dualhot_hit);
                tiny_legacy_fallback_free_base_with_env(base, (uint32_t)class_idx, env);
                return 1;
            }
        }
    }

    // Phase 10: MONO LEGACY DIRECT early-exit for C4-C7 (skip policy snapshot, direct to legacy)
    // Conditions:
    // - ENV: HAKMEM_FREE_TINY_FAST_MONO_LEGACY_DIRECT=1
    // - cached nonlegacy_mask: class is NOT in the non-legacy mask (= ULTRA/MID/V7 not active)
    // - g_tiny_route_snapshot_done == 1 && route == TINY_ROUTE_LEGACY
    //   (when the route cannot be asserted, keep the existing path)
    // - !HAKMEM_TINY_LARSON_FIX (cross-thread handling requires full validation)
    if (free_tiny_fast_mono_legacy_direct_enabled()) {
        // 1. Check the nonlegacy mask (computed once at init).
        uint8_t nonlegacy_mask = free_tiny_fast_mono_legacy_direct_nonlegacy_mask();
        if ((nonlegacy_mask & (1u << class_idx)) == 0) {
            // 2. Check the route snapshot.
            if (g_tiny_route_snapshot_done == 1 && g_tiny_route_class[class_idx] == TINY_ROUTE_LEGACY) {
                // 3. Check the Larson fix.
                static __thread int g_larson_fix = -1;
                if (__builtin_expect(g_larson_fix == -1, 0)) {
                    const char* e = getenv("HAKMEM_TINY_LARSON_FIX");
                    g_larson_fix = (e && *e && *e != '0') ? 1 : 0;
                }

                if (!g_larson_fix) {
                    // Direct path: skip the policy snapshot, go straight to the legacy fallback.
                    FREE_PATH_STAT_INC(mono_legacy_direct_hit);
                    tiny_legacy_fallback_free_base_with_env(base, (uint32_t)class_idx, env);
                    return 1;
                }
            }
        }
    }

    // Phase v11b-1: C7 ULTRA early-exit (skip policy snapshot for the most common case)
    // Phase 4 E1: Use the ENV snapshot when enabled (consolidates 3 TLS reads → 1)
    // Phase 19-3a: Removed the UNLIKELY hint (snapshot is ON by default in presets; the hint was backwards)
    const bool c7_ultra_free = env ? env->tiny_c7_ultra_enabled : tiny_c7_ultra_enabled_env();

    if (class_idx == 7 && c7_ultra_free) {
        tiny_c7_ultra_free(ptr);
        return 1;
    }

    // Phase POLICY-FAST-PATH-V2: Skip the policy snapshot for known-legacy classes.
    if (free_policy_fast_v2_can_skip((uint8_t)class_idx)) {
        FREE_PATH_STAT_INC(policy_fast_v2_skip);
        goto legacy_fallback;
    }

    // Phase v11b-1: Policy-based single switch (replaces serial ULTRA checks).
    const SmallPolicyV7* policy_free = small_policy_v7_snapshot();
    SmallRouteKind route_kind_free = policy_free->route_kind[class_idx];

    switch (route_kind_free) {
    case SMALL_ROUTE_ULTRA: {
        // Phase TLS-UNIFY-1: Unified ULTRA TLS push for C4-C6 (C7 handled above).
        if (class_idx >= 4 && class_idx <= 6) {
            tiny_ultra_tls_push((uint8_t)class_idx, base);
            return 1;
        }
        // ULTRA for other classes → fall back to LEGACY.
        break;
    }

    case SMALL_ROUTE_MID_V35: {
        // Phase v11a-3: MID v3.5 free.
        small_mid_v35_free(ptr, class_idx);
        FREE_PATH_STAT_INC(smallheap_v7_fast);
        return 1;
    }

    case SMALL_ROUTE_V7: {
        // Phase v7: SmallObject v7 free (research box).
        if (small_heap_free_fast_v7_stub(ptr, (uint8_t)class_idx)) {
            FREE_PATH_STAT_INC(smallheap_v7_fast);
            return 1;
        }
        // V7 miss → fall back to LEGACY.
        break;
    }

    case SMALL_ROUTE_MID_V3: {
        // Phase MID-V3: delegate to MID v3.5.
        small_mid_v35_free(ptr, class_idx);
        FREE_PATH_STAT_INC(smallheap_v7_fast);
        return 1;
    }

    case SMALL_ROUTE_LEGACY:
    default:
        break;
    }

legacy_fallback:
    // LEGACY fallback path.
    // Phase 19-6C: Compute the route once using a helper (avoids a redundant tiny_route_for_class).
    tiny_route_kind_t route;
    int use_tiny_heap;
    free_tiny_fast_compute_route_and_heap(class_idx, &route, &use_tiny_heap);

    // TWO-SPEED: The SuperSlab registration check is DEBUG-ONLY to keep the HOT PATH fast.
    // In Release builds we trust the header magic (0xA0) as sufficient validation.
#if !HAKMEM_BUILD_RELEASE
    // 5. SuperSlab registration check (misclassification guard).
    SuperSlab* ss_guard = hak_super_lookup(ptr);
    if (__builtin_expect(!(ss_guard && ss_guard->magic == SUPERSLAB_MAGIC), 0)) {
        return 0; // not managed by hakmem → normal free path
    }
#endif // !HAKMEM_BUILD_RELEASE

    // Cross-thread free detection (Larson MT crash fix, ENV gated) + TinyHeap free path
    {
        static __thread int g_larson_fix = -1;
        if (__builtin_expect(g_larson_fix == -1, 0)) {
            const char* e = getenv("HAKMEM_TINY_LARSON_FIX");
            g_larson_fix = (e && *e && *e != '0') ? 1 : 0;
#if !HAKMEM_BUILD_RELEASE
            fprintf(stderr, "[LARSON_FIX_INIT] g_larson_fix=%d (env=%s)\n", g_larson_fix, e ? e : "NULL");
            fflush(stderr);
#endif
        }

        if (__builtin_expect(g_larson_fix || use_tiny_heap, 0)) {
            // Phase 12 optimization: fast mask-based lookup (~5-10 cycles vs 50-100).
            SuperSlab* ss = ss_fast_lookup(base);
            // Phase FREE-LEGACY-BREAKDOWN-1: counter scatter (5. super_lookup call)
            FREE_PATH_STAT_INC(super_lookup_called);
            if (ss) {
                int slab_idx = slab_index_for(ss, base);
                if (__builtin_expect(slab_idx >= 0 && slab_idx < ss_slabs_capacity(ss), 1)) {
                    uint32_t self_tid = tiny_self_u32_local();
                    uint8_t owner_tid_low = ss_slab_meta_owner_tid_low_get(ss, slab_idx);
                    TinySlabMeta* meta = &ss->slabs[slab_idx];
                    // LARSON FIX: compare bits 8-15 (pthread TIDs are aligned to 256 bytes).
                    uint8_t self_tid_cmp = (uint8_t)((self_tid >> 8) & 0xFFu);
#if !HAKMEM_BUILD_RELEASE
                    static _Atomic uint64_t g_owner_check_count = 0;
                    uint64_t oc = atomic_fetch_add(&g_owner_check_count, 1);
                    if (oc < 10) {
                        fprintf(stderr, "[LARSON_FIX] Owner check: ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x match=%d\n",
                                ptr, owner_tid_low, self_tid_cmp, self_tid, (owner_tid_low == self_tid_cmp));
                        fflush(stderr);
                    }
#endif

                    if (__builtin_expect(owner_tid_low != self_tid_cmp, 0)) {
                        // Cross-thread free → route to the remote queue instead of poisoning the TLS cache.
#if !HAKMEM_BUILD_RELEASE
                        static _Atomic uint64_t g_cross_thread_count = 0;
                        uint64_t ct = atomic_fetch_add(&g_cross_thread_count, 1);
                        if (ct < 20) {
                            fprintf(stderr, "[LARSON_FIX] Cross-thread free detected! ptr=%p owner_tid_low=0x%02x self_tid_cmp=0x%02x self_tid=0x%08x\n",
                                    ptr, owner_tid_low, self_tid_cmp, self_tid);
                            fflush(stderr);
                        }
#endif
                        if (tiny_free_remote_box(ss, slab_idx, meta, ptr, self_tid)) {
                            // Phase FREE-LEGACY-BREAKDOWN-1: counter scatter (6. cross-thread free)
                            FREE_PATH_STAT_INC(remote_free);
                            return 1; // handled via remote queue
                            // … (excerpt clipped mid-function)
```
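
The 1-byte header scheme that steps 1-3 above rely on, restated as a self-contained sketch. The `HEADER_MAGIC` and `HEADER_CLASS_MASK` values are assumptions consistent with the `0xA0` and low-4-bit comments in the excerpt; the exact base/user offset is hidden behind `tiny_user_to_base_inline()` and is not assumed here:

```c
#include <stdint.h>

#define HEADER_MAGIC      0xA0u  /* assumed: high-nibble tag for Tiny allocations */
#define HEADER_CLASS_MASK 0x0Fu  /* assumed: low nibble carries class_idx */

// Decode: mirrors steps 2-3 of free_tiny_fast() (page-boundary guard omitted).
// Returns 1 and writes class_idx on a Tiny header; 0 for anything else.
static inline int tiny_header_decode(const void* user_ptr, int* class_idx_out) {
    uint8_t header = *((const uint8_t*)user_ptr - 1); // header byte sits at user-1
    if ((header & 0xF0u) != HEADER_MAGIC) {
        return 0; // not a Tiny header: Mid/Large/foreign pointer
    }
    *class_idx_out = (int)(header & HEADER_CLASS_MASK);
    return 1;
}
```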

### `core/box/tiny_front_hot_box.h`

```c
static inline int tiny_hot_free_fast(int class_idx, void* base) {
    extern __thread TinyUnifiedCache g_unified_cache[];

    // TLS cache access (1 cache miss)
    // NOTE: Range check removed: the caller guarantees a valid class_idx.
    TinyUnifiedCache* cache = &g_unified_cache[class_idx];

#if HAKMEM_TINY_UNIFIED_LIFO_COMPILED
    // Phase 15 v1: Mode check at entry (once per call, not scattered in the hot path)
    // Phase 22: Compiled out when disabled (default OFF)
    int lifo_mode = tiny_unified_lifo_enabled();

    // Phase 15 v1: LIFO vs FIFO mode switch
    if (lifo_mode) {
        // === LIFO MODE: stack-based ===
        // Try to push onto the stack (tail is the stack depth).
        if (unified_cache_try_push_lifo(class_idx, base)) {
#if !HAKMEM_BUILD_RELEASE
            extern __thread uint64_t g_unified_cache_push[];
            g_unified_cache_push[class_idx]++;
#endif
            return 1; // SUCCESS
        }
        // LIFO overflow → fall through to the cold path.
#if !HAKMEM_BUILD_RELEASE
        extern __thread uint64_t g_unified_cache_full[];
        g_unified_cache_full[class_idx]++;
#endif
        return 0; // FULL
    }
#endif

    // === FIFO MODE: ring-based (existing, default) ===
    // Calculate the next tail (for the full check).
    uint16_t next_tail = (cache->tail + 1) & cache->mask;

    // Branch 1: cache-full check (UNLIKELY full)
    // Hot path:  cache has space (next_tail != head)
    // Cold path: cache full (next_tail == head) → drain needed
    if (TINY_HOT_LIKELY(next_tail != cache->head)) {
        // === HOT PATH: cache has space (2-3 instructions) ===

        // Push to the cache (1 cache miss for the array write).
        cache->slots[cache->tail] = base;
        cache->tail = next_tail;

        // Debug metrics (zero overhead in release)
#if !HAKMEM_BUILD_RELEASE
        extern __thread uint64_t g_unified_cache_push[];
        g_unified_cache_push[class_idx]++;
#endif

        return 1; // SUCCESS
    }

    // === COLD PATH: cache full ===
    // Do not drain here; let the caller handle it via tiny_cold_drain_and_free().
#if !HAKMEM_BUILD_RELEASE
    extern __thread uint64_t g_unified_cache_full[];
    g_unified_cache_full[class_idx]++;
#endif

    return 0; // FULL
}
```
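
For symmetry, a sketch of the FIFO pop that the allocation side would run against the same ring. The field names (`head`, `tail`, `mask`, `slots`) follow the push above; the function name is hypothetical and the real consumer is not included in this packet:

```c
// Hypothetical pop counterpart (single-threaded TLS ring, so no atomics):
// empty when head == tail. The push above treats next_tail == head as full,
// i.e. the ring deliberately keeps one slot unused to distinguish the two.
static inline void* tiny_hot_alloc_pop(TinyUnifiedCache* cache) {
    if (cache->head == cache->tail) {
        return NULL; // EMPTY: caller refills from the backing slab
    }
    void* base = cache->slots[cache->head];
    cache->head = (cache->head + 1) & cache->mask;
    return base;
}
```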

### `core/box/tiny_legacy_fallback_box.h`

```c
static inline void tiny_legacy_fallback_free_base_with_env(void* base, uint32_t class_idx, const HakmemEnvSnapshot* env) {
    // Phase 80-1: Switch dispatch for C4/C5/C6 (branch-reduction optimization)
    // Phase 83-1: Per-op branch removed via fixed-mode caching
    // C2/C3 excluded (NO-GO from Phase 77-1/79-1)
    if (tiny_inline_slots_switch_dispatch_enabled_fast()) {
        // Switch mode: direct jump to the case (zero comparison overhead for C4/C5/C6)
        switch (class_idx) {
        case 4:
            if (tiny_c4_inline_slots_enabled_fast()) {
                if (c4_inline_push(c4_inline_tls(), base)) {
                    FREE_PATH_STAT_INC(legacy_fallback);
                    if (__builtin_expect(free_path_stats_enabled(), 0)) {
                        g_free_path_stats.legacy_by_class[class_idx]++;
                    }
                    return;
                }
            }
            break;
        case 5:
            if (tiny_c5_inline_slots_enabled_fast()) {
                if (c5_inline_push(c5_inline_tls(), base)) {
                    FREE_PATH_STAT_INC(legacy_fallback);
                    if (__builtin_expect(free_path_stats_enabled(), 0)) {
                        g_free_path_stats.legacy_by_class[class_idx]++;
                    }
                    return;
                }
            }
            break;
        case 6:
            if (tiny_c6_inline_slots_enabled_fast()) {
                if (c6_inline_push(c6_inline_tls(), base)) {
                    FREE_PATH_STAT_INC(legacy_fallback);
                    if (__builtin_expect(free_path_stats_enabled(), 0)) {
                        g_free_path_stats.legacy_by_class[class_idx]++;
                    }
                    return;
                }
            }
            break;
        default:
            // C0-C3, C7: fall through to the unified_cache push
            break;
        }
        // Switch mode: fall through to the unified_cache push after a miss
    } else {
        // If-chain mode (Phase 80-1 baseline): C3/C4/C5/C6 sequential checks
        // NOTE: the C2 local cache (Phase 79-1 NO-GO) was removed from the hot path

        // Phase 77-1: C3 inline-slots early-exit (ENV gated)
        // Try C3 inline slots (before C4/C5/C6/unified cache) for class 3
        if (class_idx == 3 && tiny_c3_inline_slots_enabled_fast()) {
            if (c3_inline_push(c3_inline_tls(), base)) {
                // Success: pushed to the C3 inline slots
                FREE_PATH_STAT_INC(legacy_fallback);
                if (__builtin_expect(free_path_stats_enabled(), 0)) {
                    g_free_path_stats.legacy_by_class[class_idx]++;
                }
                return;
            }
            // FULL → fall through to C4/C5/C6/unified cache
        }

        // Phase 76-1: C4 inline-slots early-exit (ENV gated)
        // Try C4 inline slots (before C5/C6/unified cache) for class 4
        if (class_idx == 4 && tiny_c4_inline_slots_enabled_fast()) {
            if (c4_inline_push(c4_inline_tls(), base)) {
                // Success: pushed to the C4 inline slots
                FREE_PATH_STAT_INC(legacy_fallback);
                if (__builtin_expect(free_path_stats_enabled(), 0)) {
                    g_free_path_stats.legacy_by_class[class_idx]++;
                }
                return;
            }
            // FULL → fall through to C5/C6/unified cache
        }

        // Phase 75-2: C5 inline-slots early-exit (ENV gated)
        // Try C5 inline slots (before C6 and unified cache) for class 5
        if (class_idx == 5 && tiny_c5_inline_slots_enabled_fast()) {
            if (c5_inline_push(c5_inline_tls(), base)) {
                // Success: pushed to the C5 inline slots
                FREE_PATH_STAT_INC(legacy_fallback);
                if (__builtin_expect(free_path_stats_enabled(), 0)) {
                    g_free_path_stats.legacy_by_class[class_idx]++;
                }
                return;
            }
            // FULL → fall through to C6/unified cache
        }

        // Phase 75-1: C6 inline-slots early-exit (ENV gated)
        // Try C6 inline slots (before unified cache) for class 6
        if (class_idx == 6 && tiny_c6_inline_slots_enabled_fast()) {
            if (c6_inline_push(c6_inline_tls(), base)) {
                // Success: pushed to the C6 inline slots
                FREE_PATH_STAT_INC(legacy_fallback);
                if (__builtin_expect(free_path_stats_enabled(), 0)) {
                    g_free_path_stats.legacy_by_class[class_idx]++;
                }
                return;
            }
            // FULL → fall through to unified cache
        }
    } // End of if-chain mode

    const TinyFrontV3Snapshot* front_snap =
        env ? (env->tiny_front_v3_enabled ? tiny_front_v3_snapshot_get() : NULL)
            : (__builtin_expect(tiny_front_v3_enabled(), 0) ? tiny_front_v3_snapshot_get() : NULL);
    const bool metadata_cache_on = env ? env->tiny_metadata_cache_eff : tiny_metadata_cache_enabled();

    // Phase 3 C2 Patch 2: First-page cache hint (optional fast path)
    // Check whether the pointer is in the cached page (avoids a metadata lookup in future optimizations).
    if (__builtin_expect(metadata_cache_on, 0)) {
        // Note: this is a hint-only check. Even on a hit, we still use the standard path.
        // The cache is populated during refill operations for future use.
        // Currently this just validates the cache state; the actual optimization is TBD.
        if (tiny_first_page_cache_hit(class_idx, base, 4096)) {
            // Future: metadata access could be optimized here.
        }
    }

    // Legacy fallback: Unified Cache push
    if (!front_snap || front_snap->unified_cache_on) {
        // Phase 74-3 (P0): FASTAPI path (ENV gated)
        if (tiny_uc_fastapi_enabled()) {
            // Preconditions guaranteed:
            // - unified_cache_on == true (checked above)
            // - TLS init guaranteed by front_gate_unified_enabled() in malloc_tiny_fast.h
            // - Stats compiled out in FAST builds
            if (unified_cache_push_fast(class_idx, HAK_BASE_FROM_RAW(base))) {
                FREE_PATH_STAT_INC(legacy_fallback);

                // Per-class breakdown (Phase 4-1)
                if (__builtin_expect(free_path_stats_enabled(), 0)) {
                    if (class_idx < 8) {
                        g_free_path_stats.legacy_by_class[class_idx]++;
                    }
                }
                return;
            }
            // FULL → fall back to the slow path (rare)
        }

        // Original path (FASTAPI=0 or fallback)
        if (unified_cache_push(class_idx, HAK_BASE_FROM_RAW(base))) {
            FREE_PATH_STAT_INC(legacy_fallback);

            // Per-class breakdown (Phase 4-1)
            if (__builtin_expect(free_path_stats_enabled(), 0)) {
                if (class_idx < 8) {
                    g_free_path_stats.legacy_by_class[class_idx]++;
                }
            }
            return;
        }
    }

    // Final fallback
    tiny_hot_free_fast(class_idx, base);
}
```
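
One idiom worth calling out before the questions: the lazy-init ENV gate (`g_larson_fix`) appears three times in `free_tiny_fast()` alone. Distilled as a sketch (the helper name is ours, not from the codebase), since questions 1 and 3 below ask whether these per-op checks matter:

```c
#include <stdlib.h>

// Hypothetical helper for the repeated idiom: read the ENV variable once per
// thread, cache the result in a TLS int, branch on the cached value afterwards.
// The -1 sentinel means "not yet read"; any non-empty value other than "0"
// counts as enabled, matching the parsing in the excerpts above.
static inline int env_flag_cached_tls(const char* name, int* tls_slot) {
    if (__builtin_expect(*tls_slot == -1, 0)) {
        const char* e = getenv(name);
        *tls_slot = (e && *e && *e != '0') ? 1 : 0;
    }
    return *tls_slot;
}

// Usage mirroring the excerpts:
//   static __thread int g_larson_fix = -1;
//   if (env_flag_cached_tls("HAKMEM_TINY_LARSON_FIX", &g_larson_fix)) { ... }
```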

## Questions to answer (please be concrete)

1) In these snippets, which checks/branches are still "per-op fixed taxes" on the hot free path?
   - Please point to specific lines/conditions and estimate the cost (branches/instructions or dependency-chain length).

2) Is `tiny_hot_free_fast()` already close to optimal, with the real bottleneck upstream (user->base conversion / classification / routing)?
   - If yes, what is the smallest structural refactor that removes that upstream fixed tax?

3) Should we introduce a "commit once" plan (freeze the chosen free path), or is branch prediction already making the lazy-init checks ~free here? (A sketch of what we mean follows this question.)
   - If "commit once", where should it live to avoid runtime gate overhead (the bench_profile refresh boundary vs per-op)?
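
To make the question concrete, one possible shape of "commit once", sketched under our assumptions (nothing below exists in the codebase): resolve all gates at a single boundary and publish a frozen entry point, trading the current chain of predictable gate branches for one indirect call.

```c
// Hypothetical commit-once sketch: decide the free path exactly once (e.g. at
// the bench_profile refresh boundary), then publish it as a function pointer.
// Per-op cost drops to one indirect call with zero ENV/route snapshot reads.
typedef int (*free_fast_fn)(void* ptr);

static free_fast_fn g_free_fast_committed; // set once before steady state

static void free_path_commit_once(void) {
    // All ENV gates, route snapshots, and policy reads happen here, once.
    g_free_fast_committed = free_tiny_fast; // or a class-specialized variant
}
```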

4) We have had many layout-tax regressions from code removal/reordering.
   - Which patterns here are most likely to trigger layout tax if changed?
   - How would you stage a safe A/B (same binary, ENV toggle) for your proposal?

5) If you could change just ONE of:
   - pointer classification to base/class_idx,
   - route determination,
   - unified cache push/pop structure,

   which is the highest-ROI change for +5–10% at WS=400?

[packet] done