Files
hakmem/core/hakmem_tiny_refill_p0.inc.h
Moe Charm (CI) ccf604778c Front-Direct implementation: SS→FC direct refill + SLL complete bypass
## Summary

Implemented Front-Direct architecture with complete SLL bypass:
- Direct SuperSlab → FastCache refill (1-hop, bypasses SLL)
- SLL-free allocation/free paths when Front-Direct enabled
- Legacy path sealing (SLL inline opt-in, SFC cascade ENV-only)

## New Modules

- core/refill/ss_refill_fc.h (236 lines): Standard SS→FC refill entry point
  - Remote drain → Freelist → Carve priority
  - Header restoration for C1-C6 (NOT C0/C7)
  - ENV: HAKMEM_TINY_P0_DRAIN_THRESH, HAKMEM_TINY_P0_NO_DRAIN

- core/front/fast_cache.h: FastCache (L1) type definition
- core/front/quick_slot.h: QuickSlot (L0) type definition

## Allocation Path (core/tiny_alloc_fast.inc.h)

- Added s_front_direct_alloc TLS flag (lazy ENV check)
- SLL pop guarded by: g_tls_sll_enable && !s_front_direct_alloc
- Refill dispatch:
  - Front-Direct: ss_refill_fc_fill() → fastcache_pop() (1-hop)
  - Legacy: sll_refill_batch_from_ss() → SLL → FC (2-hop, A/B only)
- SLL inline pop sealed (requires HAKMEM_TINY_INLINE_SLL=1 opt-in)

## Free Path (core/hakmem_tiny_free.inc, core/hakmem_tiny_fastcache.inc.h)

- FC priority: Try fastcache_push() first (same-thread free)
- tiny_fast_push() bypass: Returns 0 when s_front_direct_free || !g_tls_sll_enable
- Fallback: Magazine/slow path (safe, bypasses SLL)

## Legacy Sealing

- SFC cascade: Default OFF (ENV-only via HAKMEM_TINY_SFC_CASCADE=1)
- Deleted: core/hakmem_tiny_free.inc.bak, core/pool_refill_legacy.c.bak
- Documentation: ss_refill_fc_fill() promoted as CANONICAL refill entry

## ENV Controls

- HAKMEM_TINY_FRONT_DIRECT=1: Enable Front-Direct (SS→FC direct)
- HAKMEM_TINY_P0_DIRECT_FC_ALL=1: Same as above (alt name)
- HAKMEM_TINY_REFILL_BATCH=1: Enable batch refill (also enables Front-Direct)
- HAKMEM_TINY_SFC_CASCADE=1: Enable SFC cascade (default OFF)
- HAKMEM_TINY_INLINE_SLL=1: Enable inline SLL pop (default OFF, requires AGGRESSIVE_INLINE)

## Benchmarks (Front-Direct Enabled)

```bash
ENV: HAKMEM_BENCH_FAST_FRONT=1 HAKMEM_TINY_FRONT_DIRECT=1
     HAKMEM_TINY_REFILL_BATCH=1 HAKMEM_TINY_P0_DIRECT_FC_ALL=1
     HAKMEM_TINY_REFILL_COUNT_HOT=256 HAKMEM_TINY_REFILL_COUNT_MID=96
     HAKMEM_TINY_BUMP_CHUNK=256

bench_random_mixed (16-1040B random, 200K iter):
  256 slots: 1.44M ops/s (STABLE, 0 SEGV)
  128 slots: 1.44M ops/s (STABLE, 0 SEGV)

bench_fixed_size (fixed size, 200K iter):
  256B: 4.06M ops/s (has debug logs, expected >10M without logs)
  128B: Similar (debug logs affect)
```

## Verification

- TRACE_RING test (10K iter): **0 SLL events** detected 
- Complete SLL bypass confirmed when Front-Direct=1
- Stable execution: 200K iterations × multiple sizes, 0 SEGV

## Next Steps

- Disable debug logs in hak_alloc_api.inc.h (call_num 14250-14280 range)
- Re-benchmark with clean Release build (target: 10-15M ops/s)
- 128/256B shortcut path optimization (FC hit rate improvement)

Co-Authored-By: ChatGPT <chatgpt@openai.com>
Suggested-By: ultrathink
2025-11-14 05:41:49 +09:00

287 lines
10 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#ifndef HAKMEM_TINY_REFILL_P0_INC_H
#define HAKMEM_TINY_REFILL_P0_INC_H
// hakmem_tiny_refill_p0.inc.h
// P0: Batch refill implementation (sll_refill_batch_from_ss only).
// Phase 12: DO NOT alias or redefine sll_refill_small_from_ss here.
// NOTE: This file is active only when HAKMEM_TINY_P0_BATCH_REFILL=1.
#if HAKMEM_TINY_P0_BATCH_REFILL
#include "tiny_box_geometry.h" // Box 3: Geometry & Capacity Calculator
#include "tiny_refill_opt.h"
#include "tiny_fc_api.h"
#include "superslab/superslab_inline.h" // For _ss_remote_drain_to_freelist_unsafe()
#include "box/integrity_box.h" // Box I: Integrity verification (Priority ALPHA)
#include "box/tiny_next_ptr_box.h" // Box API: Next pointer read/write
// Debug counters (compile-time gated)
#if HAKMEM_DEBUG_COUNTERS
extern unsigned long long g_rf_hit_slab[];
extern unsigned long long g_rf_early_no_ss[];
extern unsigned long long g_rf_early_no_meta[];
extern unsigned long long g_rf_early_no_room[];
extern unsigned long long g_rf_early_want_zero[];
#endif
// Optional P0 diagnostic logging helper
static inline int p0_should_log(void) {
static int en = -1;
if (__builtin_expect(en == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_P0_LOG");
en = (e && *e && *e != '0') ? 1 : 0;
}
return en;
}
// P0 batch refill entry point
static inline int sll_refill_batch_from_ss(int class_idx, int max_take) {
// Phase E1-CORRECT: C7 now has headers, can use P0 batch refill
// Runtime A/B kill switch (defensive). Set HAKMEM_TINY_P0_DISABLE=1 to bypass P0 path.
do {
static int g_p0_disable = -1;
if (__builtin_expect(g_p0_disable == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_P0_DISABLE");
g_p0_disable = (e && *e && *e != '0') ? 1 : 0;
}
if (__builtin_expect(g_p0_disable, 0)) {
return 0;
}
} while (0);
if (!g_use_superslab || max_take <= 0) {
#if HAKMEM_DEBUG_COUNTERS
if (!g_use_superslab) g_rf_early_no_ss[class_idx]++;
#endif
return 0;
}
TinyTLSSlab* tls = &g_tls_slabs[class_idx];
uint32_t active_before = 0;
if (tls->ss) {
active_before = atomic_load_explicit(&tls->ss->total_active_blocks, memory_order_relaxed);
}
if (!tls->ss) {
if (!superslab_refill(class_idx)) {
return 0;
}
}
TinySlabMeta* meta = tls->meta;
if (!meta) {
#if HAKMEM_DEBUG_COUNTERS
g_rf_early_no_meta[class_idx]++;
#endif
return 0;
}
#if HAKMEM_INTEGRITY_LEVEL >= 4
uint8_t* initial_slab_base =
tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx);
SlabMetadataState meta_initial =
integrity_capture_slab_metadata(meta, initial_slab_base, class_idx);
INTEGRITY_CHECK_SLAB_METADATA(meta_initial, "P0 refill entry");
#endif
// Optional: Direct-FC fast path全クラス対応 A/B
// Env:
// - HAKMEM_TINY_P0_DIRECT_FC=1 → C5優先互換
// - HAKMEM_TINY_P0_DIRECT_FC_C7=1 → C7のみ互換
// - HAKMEM_TINY_P0_DIRECT_FC_ALL=1 → 全クラス推奨、Phase 1 目標)
do {
static int g_direct_fc = -1;
static int g_direct_fc_c7 = -1;
static int g_direct_fc_all = -1;
if (__builtin_expect(g_direct_fc == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_P0_DIRECT_FC");
g_direct_fc = (e && *e && *e == '0') ? 0 : 1;
}
if (__builtin_expect(g_direct_fc_c7 == -1, 0)) {
const char* e7 = getenv("HAKMEM_TINY_P0_DIRECT_FC_C7");
g_direct_fc_c7 = (e7 && *e7) ? ((*e7 == '0') ? 0 : 1) : 0;
}
if (__builtin_expect(g_direct_fc_all == -1, 0)) {
const char* ea = getenv("HAKMEM_TINY_P0_DIRECT_FC_ALL");
g_direct_fc_all = (ea && *ea && *ea != '0') ? 1 : 0;
}
if (__builtin_expect(g_direct_fc_all ||
(g_direct_fc && class_idx == 5) ||
(g_direct_fc_c7 && class_idx == 7), 0)) {
int room = tiny_fc_room(class_idx);
if (room <= 0) return 0;
uint32_t rmt = atomic_load_explicit(
&tls->ss->remote_counts[tls->slab_idx], memory_order_relaxed);
static int g_drain_th = -1;
if (__builtin_expect(g_drain_th == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_P0_DRAIN_THRESH");
int v = (e && *e) ? atoi(e) : 64;
g_drain_th = (v < 0) ? 0 : v;
}
if (rmt >= (uint32_t)g_drain_th) {
static int no_drain = -1;
if (__builtin_expect(no_drain == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_P0_NO_DRAIN");
no_drain = (e && *e && *e != '0') ? 1 : 0;
}
if (!no_drain) {
_ss_remote_drain_to_freelist_unsafe(
tls->ss, tls->slab_idx, tls->meta);
}
}
void* out[128];
int produced = 0;
TinySlabMeta* m = tls->meta;
size_t bs = tiny_stride_for_class(class_idx);
uint8_t* base = tls->slab_base
? tls->slab_base
: tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
while (produced < room) {
if (m->freelist) {
void* p = m->freelist;
m->freelist = tiny_next_read(class_idx, p);
m->used++;
out[produced++] = p;
} else if (m->carved < m->capacity) {
void* p = (void*)(base + ((size_t)m->carved * bs));
m->carved++;
m->used++;
out[produced++] = p;
} else {
if (!superslab_refill(class_idx)) break;
tls = &g_tls_slabs[class_idx];
m = tls->meta;
base = tls->slab_base
? tls->slab_base
: tiny_slab_base_for(tls->ss, tls->slab_idx);
}
}
if (produced > 0) {
ss_active_add(tls->ss, (uint32_t)produced);
(void)tiny_fc_push_bulk(class_idx, out, produced);
return produced;
}
// fallthrough to regular path
}
} while (0);
uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP);
int room = (int)sll_cap - (int)g_tls_sll_count[class_idx];
if (room <= 0) {
#if HAKMEM_DEBUG_COUNTERS
g_rf_early_no_room[class_idx]++;
#endif
return 0;
}
uint32_t want = (uint32_t)max_take;
if (want > (uint32_t)room) want = (uint32_t)room;
if (want == 0) {
#if HAKMEM_DEBUG_COUNTERS
g_rf_early_want_zero[class_idx]++;
#endif
return 0;
}
size_t bs = tiny_stride_for_class(class_idx);
int total_taken = 0;
while (want > 0) {
uintptr_t ss_base = 0;
uintptr_t ss_limit = 0;
if (tls->ss && tls->slab_idx >= 0) {
uint8_t* slab_base =
tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
ss_base = (uintptr_t)slab_base;
ss_limit = ss_base + tiny_usable_bytes_for_slab(tls->slab_idx);
}
if (tls->ss && tls->slab_idx >= 0) {
uint32_t remote_count = atomic_load_explicit(
&tls->ss->remote_counts[tls->slab_idx], memory_order_relaxed);
if (remote_count > 0) {
static int no_drain = -1;
if (__builtin_expect(no_drain == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_P0_NO_DRAIN");
no_drain = (e && *e && *e != '0') ? 1 : 0;
}
if (!no_drain) {
_ss_remote_drain_to_freelist_unsafe(tls->ss, tls->slab_idx, meta);
}
}
}
TinyRefillChain chain;
uint32_t from_freelist = trc_pop_from_freelist(
meta, class_idx, ss_base, ss_limit, bs, want, &chain);
if (from_freelist > 0) {
trc_splice_to_sll(
class_idx, &chain,
&g_tls_sll_head[class_idx],
&g_tls_sll_count[class_idx]);
ss_active_add(tls->ss, from_freelist);
meta->used = (uint16_t)((uint32_t)meta->used + from_freelist);
#if HAKMEM_DEBUG_COUNTERS
extern unsigned long long g_rf_freelist_items[];
g_rf_freelist_items[class_idx] += from_freelist;
#endif
total_taken += from_freelist;
want -= from_freelist;
if (want == 0) break;
}
if (meta->carved >= meta->capacity) {
if (!superslab_refill(class_idx)) break;
tls = &g_tls_slabs[class_idx];
meta = tls->meta;
if (!meta) break;
continue;
}
uint32_t available = meta->capacity - meta->carved;
uint32_t batch = want;
if (batch > available) batch = available;
if (batch == 0) break;
uint8_t* slab_base = tls->slab_base
? tls->slab_base
: tiny_slab_base_for(tls->ss, tls->slab_idx);
TinyRefillChain carve;
trc_linear_carve(slab_base, bs, meta, batch, class_idx, &carve);
trc_splice_to_sll(
class_idx, &carve,
&g_tls_sll_head[class_idx],
&g_tls_sll_count[class_idx]);
ss_active_add(tls->ss, batch);
#if HAKMEM_DEBUG_COUNTERS
extern unsigned long long g_rf_carve_items[];
g_rf_carve_items[class_idx] += batch;
#endif
total_taken += batch;
want -= batch;
}
#if HAKMEM_DEBUG_COUNTERS
g_rf_hit_slab[class_idx]++;
#endif
if (tls->ss && p0_should_log()) {
uint32_t active_after = atomic_load_explicit(
&tls->ss->total_active_blocks, memory_order_relaxed);
int32_t delta =
(int32_t)active_after - (int32_t)active_before;
fprintf(stderr,
"[P0_COUNTER] cls=%d slab=%d taken=%d active_delta=%d\n",
class_idx, tls->slab_idx, total_taken, delta);
}
return total_taken;
}
#endif // HAKMEM_TINY_P0_BATCH_REFILL
#endif // HAKMEM_TINY_REFILL_P0_INC_H