Files
hakmem/core/hakmem_tiny_refill.inc.h
Moe Charm (CI) acc64f2438 Phase ML1: Pool v1 memset 89.73% overhead 軽量化 (+15.34% improvement)
## Summary
- ChatGPT により bench_profile.h の setenv segfault を修正(RTLD_NEXT 経由に切り替え)
- core/box/pool_zero_mode_box.h 新設:ENV キャッシュ経由で ZERO_MODE を統一管理
- core/hakmem_pool.c で zero mode に応じた memset 制御(FULL/header/off)
- A/B テスト結果:ZERO_MODE=header で +15.34% improvement(1M iterations, C6-heavy)

## Files Modified
- core/box/pool_api.inc.h: pool_zero_mode_box.h include
- core/bench_profile.h: glibc setenv → malloc+putenv(segfault 回避)
- core/hakmem_pool.c: zero mode 参照・制御ロジック
- core/box/pool_zero_mode_box.h (新設): enum/getter
- CURRENT_TASK.md: Phase ML1 結果記載

## Test Results
| Iterations | ZERO_MODE=full | ZERO_MODE=header | Improvement |
|-----------|----------------|-----------------|------------|
| 10K       | 3.06 M ops/s   | 3.17 M ops/s    | +3.65%     |
| 1M        | 23.71 M ops/s  | 27.34 M ops/s   | **+15.34%** |

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-10 09:08:18 +09:00

512 lines
19 KiB
C
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

// hakmem_tiny_refill.inc.h
// Phase 12: Minimal refill helpers needed by Box fast path.
//
// 本ヘッダは、以下を提供する:
// - superslab_tls_bump_fast: TinyTLSSlab + SuperSlab メタからのTLSバンプ窓
// - tiny_fast_refill_and_take: FastCache/TLS SLL からの最小 refill + 1個取得
// - bulk_mag_to_sll_if_room: Magazine→SLL へのバルク移送(容量チェック付き)
// - sll_refill_small_from_ss: Phase12 shared SuperSlab pool 向けの最小実装
//
// 旧来の g_sll_cap_override / getenv ベースの多経路ロジックは一切含めない。
#ifndef HAKMEM_TINY_REFILL_INC_H
#define HAKMEM_TINY_REFILL_INC_H
#include "hakmem_tiny.h"
#include "hakmem_tiny_superslab.h"
#include "hakmem_tiny_tls_list.h"
#include "tiny_box_geometry.h"
#include "superslab/superslab_inline.h" // Provides hak_super_lookup() and SUPERSLAB_MAGIC
#include "box/tls_sll_box.h"
#include "box/c7_meta_used_counter_box.h"
#include "box/tiny_header_box.h" // Header Box: Single Source of Truth for header operations
#include "box/tiny_front_config_box.h" // Phase 7-Step6-Fix: Config macros for dead code elimination
#include "box/tiny_heap_env_box.h" // TinyHeap front gate (C7 TinyHeapBox)
#include "hakmem_tiny_integrity.h"
#include "box/tiny_next_ptr_box.h"
#include "tiny_region_id.h" // For HEADER_MAGIC/HEADER_CLASS_MASK (prepare header before SLL push)
#include <stdint.h>
#include <stdatomic.h>
#include <stdio.h> // For fprintf diagnostics
// ========= Externs from hakmem_tiny.c and friends =========
extern int g_use_superslab;
extern __thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES];
extern int g_fastcache_enable;
extern uint16_t g_fast_cap[TINY_NUM_CLASSES];
extern __thread TinyFastCache g_fast_cache[TINY_NUM_CLASSES];
// Phase 7-Step7: g_tls_sll_enable now accessed via TINY_FRONT_TLS_SLL_ENABLED macro
extern __thread TinyTLSSLL g_tls_sll[TINY_NUM_CLASSES];
extern _Atomic uint32_t g_frontend_fill_target[TINY_NUM_CLASSES];
extern int g_ultra_bump_shadow;
extern int g_bump_chunk;
extern __thread uint8_t* g_tls_bcur[TINY_NUM_CLASSES];
extern __thread uint8_t* g_tls_bend[TINY_NUM_CLASSES];
#if HAKMEM_DEBUG_COUNTERS
extern uint64_t g_bump_hits[TINY_NUM_CLASSES];
extern uint64_t g_bump_arms[TINY_NUM_CLASSES];
extern uint64_t g_path_refill_calls[TINY_NUM_CLASSES];
extern uint64_t g_ultra_refill_calls[TINY_NUM_CLASSES];
extern int g_path_debug_enabled;
#endif
// ========= From other units =========
SuperSlab* superslab_refill(int class_idx);
void ss_active_inc(SuperSlab* ss);
void ss_active_add(SuperSlab* ss, uint32_t n);
size_t tiny_stride_for_class(int class_idx);
uint8_t* tiny_slab_base_for_geometry(SuperSlab* ss, int slab_idx);
extern uint32_t sll_cap_for_class(int class_idx, uint32_t mag_cap);
/* ultra_* 系は hakmem_tiny.c 側に定義があるため、ここでは宣言しない */
/* tls_sll_push は box/tls_sll_box.h で static inline bool tls_sll_push(...) 提供済み */
/* tiny_small_mags_init_once / tiny_mag_init_if_needed も hakmem_tiny_magazine.h で宣言済みなので、ここでは再宣言しない */
/* tiny_fast_pop / tiny_fast_push / fastcache_* は hakmem_tiny_fastcache.inc.h 側の static inline なので、ここでは未宣言でOK */
#if !HAKMEM_BUILD_RELEASE
/*
 * Debug-build sanity guard for a node pointer about to enter a fast list.
 *
 * Only a minimal defence is applied: a pointer whose numeric value is below
 * 4096 is reported on stderr and the process is aborted. Any other pointer
 * is accepted without further inspection.
 *
 * class_idx  size class of the node (used only in the diagnostic)
 * node       pointer under test
 * where      label of the call site (used only in the diagnostic)
 */
static inline void tiny_debug_validate_node_base(int class_idx, void* node, const char* where)
{
    const uintptr_t node_addr = (uintptr_t)node;

    if (node_addr >= 4096) {
        return; // plausible address: nothing to report
    }

    fprintf(stderr,
            "[TINY_REFILL_GUARD] %s: suspicious node=%p cls=%d\n",
            where, node, class_idx);
    abort();
}
#else
/*
 * Release-build variant of the node guard: performs no check at all.
 * The casts below exist solely to mark every parameter as intentionally unused.
 */
static inline void tiny_debug_validate_node_base(int class_idx, void* node, const char* where)
{
    (void)where;
    (void)node;
    (void)class_idx;
}
#endif
/*
 * Rate-limited diagnostic for class 7 slab metadata.
 *
 * For class_idx == 7 only, prints used/capacity/carved/freelist of `meta`
 * to stderr together with `tag`. A function-local atomic counter limits the
 * output to the first four calls that reach the counter. The message prefix
 * is "REL_" when HAKMEM_BUILD_RELEASE is set and "DBG_" otherwise.
 *
 * meta       slab metadata to dump (dereferenced only when a line is printed)
 * class_idx  size class; every value other than 7 returns immediately
 * tag        caller-supplied label included in the message
 */
static inline void c7_log_used_assign_cap(TinySlabMeta* meta,
                                          int class_idx,
                                          const char* tag) {
    if (__builtin_expect(class_idx == 7, 0)) {
        // Each build flavour compiles exactly one counter, as before.
        static _Atomic uint32_t emitted = 0;
        const uint32_t seq =
            atomic_fetch_add_explicit(&emitted, 1, memory_order_relaxed);
        if (seq >= 4) {
            return; // quota used up
        }
#if HAKMEM_BUILD_RELEASE
        fprintf(stderr,
                "[REL_C7_USED_ASSIGN] tag=%s used=%u cap=%u carved=%u freelist=%p\n",
                tag,
                (unsigned)meta->used,
                (unsigned)meta->capacity,
                (unsigned)meta->carved,
                meta->freelist);
#else
        fprintf(stderr,
                "[DBG_C7_USED_ASSIGN] tag=%s used=%u cap=%u carved=%u freelist=%p\n",
                tag,
                (unsigned)meta->used,
                (unsigned)meta->capacity,
                (unsigned)meta->carved,
                meta->freelist);
#endif
    }
}
// ========= superslab_tls_bump_fast =========
//
// Ultra bump shadow: when the current slab has an empty freelist and
// carved < capacity, a contiguous run of blocks is reserved in one step and
// kept as a per-class TLS window (g_tls_bcur / g_tls_bend).
// Called from tiny_hot_pop_class{0..3}.
//
// Returns a raw block pointer, or NULL when the feature is switched off,
// no TLS slab is bound, the slab still has a freelist, or nothing is left
// to carve.
static inline void* superslab_tls_bump_fast(int class_idx) {
    if (!(g_ultra_bump_shadow && g_use_superslab)) {
        return NULL;
    }

    // Fast path: hand out the next block of a window armed by an earlier call.
    uint8_t* const win_cur = g_tls_bcur[class_idx];
    if (win_cur != NULL) {
        uint8_t* const win_end = g_tls_bend[class_idx];
        const size_t step = tiny_stride_for_class(class_idx);
        uint8_t* const win_next = win_cur + step;
        if (win_next <= win_end) {
            g_tls_bcur[class_idx] = win_next;
#if HAKMEM_DEBUG_COUNTERS
            g_bump_hits[class_idx]++;
#endif
            // The header is expected to be written by the caller (or to be
            // accounted for in the stride); the raw pointer is returned as is.
            return win_cur;
        }
        // Window exhausted: disarm it before trying to carve a new one.
        g_tls_bcur[class_idx] = NULL;
        g_tls_bend[class_idx] = NULL;
    }

    // Slow path: carve a fresh window out of the current TLS slab.
    TinyTLSSlab* slot = &g_tls_slabs[class_idx];
    TinySlabMeta* sm = slot->meta;
    if (!slot->ss || !sm || sm->freelist) {
        return NULL;
    }

    const uint16_t carved_now = sm->carved;
    const uint16_t capacity = sm->capacity;
    if (carved_now >= capacity) {
        return NULL;
    }

    // Window size: g_bump_chunk when positive (else 1), clamped to what is left.
    const uint32_t remaining = (uint32_t)capacity - (uint32_t)carved_now;
    uint32_t take = 1u;
    if (g_bump_chunk > 0) {
        take = (uint32_t)g_bump_chunk;
    }
    if (take > remaining) {
        take = remaining;
    }

    const size_t step = tiny_stride_for_class(class_idx);
    uint8_t* slab_start = tls_slab_start_placeholder_unused_guard(slot), *unused_guard = NULL;
    (void)unused_guard;
    uint8_t* const first = slab_start + (size_t)carved_now * step;

    // The whole window is accounted as carved and used up front.
    sm->carved = (uint16_t)(carved_now + (uint16_t)take);
    sm->used = (uint16_t)(sm->used + (uint16_t)take);
    if (class_idx == 7) {
        uint32_t pending = take;
        while (pending > 0) {
            c7_meta_used_note(class_idx, C7_META_USED_SRC_FRONT);
            --pending;
        }
    }
    ss_active_add(slot->ss, take);
#if HAKMEM_DEBUG_COUNTERS
    g_bump_arms[class_idx]++;
#endif

    // The first block is returned right away; the rest stays in the TLS window.
    g_tls_bcur[class_idx] = first + step;
    g_tls_bend[class_idx] = first + (size_t)take * step;
    return first;
}
// ========= tiny_fast_refill_and_take =========
//
// Called when the fast cache is empty: refills it in a batch and returns one
// block. The older multi-path logic is gone; only FastCache and TLS SLL are
// consulted, with the superslab bump window as a last resort.
//
// class_idx  size class to serve
// tls        unused (kept for the existing call signature)
//
// Returns a raw block pointer, or NULL when nothing could be obtained.
static inline void* tiny_fast_refill_and_take(int class_idx, TinyTLSList* tls) {
    (void)tls;

    // 1) Straight from the front FastCache (classes 0..3 only).
    // Phase 7-Step6-Fix: config macro allows dead code elimination in PGO mode
    if (__builtin_expect(TINY_FRONT_FASTCACHE_ENABLED && class_idx <= 3, 1)) {
        hak_base_ptr_t hit = fastcache_pop(class_idx);
        if (!hak_base_is_null(hit)) {
            extern unsigned long long g_front_fc_hit[TINY_NUM_CLASSES];
            g_front_fc_hit[class_idx]++;
            return HAK_BASE_TO_RAW(hit);
        }
    }

    // 2) Local fast list.
    hak_base_ptr_t local = tiny_fast_pop(class_idx);
    if (!hak_base_is_null(local)) {
        return HAK_BASE_TO_RAW(local);
    }

    const uint16_t limit = g_fast_cap[class_idx];
    if (limit == 0) {
        return NULL;
    }
    TinyFastCache* cache = &g_fast_cache[class_idx];
    int space = (int)limit - cache->top;
    if (space <= 0) {
        return NULL;
    }

    // 3) Move blocks from the TLS SLL into the fast cache until either is exhausted.
    // Phase 7-Step7: config macro allows dead code elimination in PGO mode
    int moved = 0;
    for (; space > 0 && TINY_FRONT_TLS_SLL_ENABLED; --space) {
        void* node = NULL;
        if (!tls_sll_pop(class_idx, &node)) {
            break;
        }
        tiny_debug_validate_node_base(class_idx, node, "tiny_fast_refill_and_take");
        cache->items[cache->top] = node;
        cache->top++;
        ++moved;
    }

    if (moved != 0) {
        // 5) Hand out the most recently stored block.
        --cache->top;
        return cache->items[cache->top];
    }

    // 4) Nothing came from the SLL: fall back to the superslab bump window
    //    (optional; yields NULL when unavailable).
    return superslab_tls_bump_fast(class_idx);
}
// ========= bulk_mag_to_sll_if_room =========
//
// Safely drains blocks from a magazine into the TLS SLL, never exceeding the
// SLL capacity reported by sll_cap_for_class().
// Referenced from tiny_free_magazine.inc.h.
//
// class_idx  size class whose SLL receives the blocks
// mag        source magazine; its `top` shrinks by the number of blocks moved
// n          upper bound on the number of blocks to move
//
// Returns the number of blocks actually pushed (0 when disabled or full).
static inline int bulk_mag_to_sll_if_room(int class_idx, TinyTLSMag* mag, int n) {
    // Phase 7-Step7: config macro allows dead code elimination in PGO mode
    if (!TINY_FRONT_TLS_SLL_ENABLED || n <= 0) {
        return 0;
    }

    const uint32_t sll_limit = sll_cap_for_class(class_idx, (uint32_t)mag->cap);
    const uint32_t sll_len = g_tls_sll[class_idx].count;
    if (sll_len >= sll_limit) {
        return 0;
    }

    // Quota = min(n, free SLL slots), then capped by what the magazine holds.
    const int free_slots = (int)(sll_limit - sll_len);
    int quota = free_slots;
    if (n < free_slots) {
        quota = n;
    }
    if (quota <= 0) {
        return 0;
    }
    if (quota > mag->top) {
        quota = mag->top;
    }
    if (quota <= 0) {
        return 0;
    }

    int moved = 0;
    while (moved < quota) {
        // Pop the top magazine entry before attempting the push.
        mag->top -= 1;
        void* raw = mag->items[mag->top].ptr;
        hak_base_ptr_t node = HAK_BASE_FROM_RAW(raw);
        if (!tls_sll_push(class_idx, node, sll_limit)) {
            mag->top += 1; // push refused: put the entry back and stop
            break;
        }
        moved += 1;
    }

#if HAKMEM_DEBUG_COUNTERS
    if (moved > 0) g_path_refill_calls[class_idx]++;
#endif
    return moved;
}
/*
 * ========= Minimal Phase 12 sll_refill_small_from_ss =========
 *
 * Box policy:
 *   - Front end (tiny_fast_refill and friends):
 *       - TLS SLL: goes through the tls_sll_box.h API only
 *       - Superslab: uses this function as the single "small-size SLL refill Box"
 *   - Back end:
 *       - For now (Stage A/B) the existing TLS Superslab / TinySlabMeta is used directly
 *       - Superslab internals are confined to this function so that a later
 *         stage (Stage C) can switch to shared_pool_acquire_slab()
 *
 * Contract:
 *   - Tiny classes only (0 <= class_idx < TINY_NUM_CLASSES)
 *   - max_take is the maximum number of blocks to push onto the SLL in this call
 *   - The return value is the number of blocks actually pushed (>= 0)
 *   - Callers do not touch head/count/meta; they use only the Box API (tls_sll_box)
 *
 * Failure behaviour:
 *   - Corrupt freelist links, an implausible stride, or a carve address outside
 *     the slab print a diagnostic to stderr and abort().
 *   - used > capacity is clamped and ends the refill quietly.
 */
__attribute__((noinline))
int sll_refill_small_from_ss(int class_idx, int max_take)
{
    // Hard defensive gate: Tiny classes only, never trust caller.
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) {
        return 0;
    }
    // Classes routed to the TinyHeap front do not use the TLS SLL
    // (they are handled entirely inside TinyHeapBox).
    if (tiny_heap_class_route_enabled(class_idx)) {
        return 0;
    }
    HAK_CHECK_CLASS_IDX(class_idx, "sll_refill_small_from_ss");
    atomic_fetch_add(&g_integrity_check_class_bounds, 1);

    // Phase12: never run while the shared pool / superslab is not enabled yet
    // (e.g. right after startup).
    if (!g_use_superslab || max_take <= 0) {
        return 0;
    }

    // &g_tls_slabs[class_idx] is the address of an array element and can never
    // be NULL, so no pointer check is needed here.
    TinyTLSSlab* tls = &g_tls_slabs[class_idx];

    // FIX: an uninitialised TLS slab is initialised through superslab_refill()
    // (the former early return was removed). Returning 0 here used to leave the
    // TLS SLL empty forever and caused a 70x slowdown in the Larson benchmark.
    //
    // Ensure we have a valid TLS slab for this class via shared pool.
    // superslab_refill() contract:
    //   - success: g_tls_slabs[class_idx] gets consistent ss/meta/slab_base/slab_idx
    //   - failure: TLS is left unchanged or rolled back, and NULL is returned
    if (!tls->ss || !tls->meta ||
        tls->meta->class_idx != (uint8_t)class_idx ||
        !tls->slab_base) {
        if (!superslab_refill(class_idx)) {
            return 0;
        }
        tls = &g_tls_slabs[class_idx];
        if (!tls->ss || !tls->meta ||
            tls->meta->class_idx != (uint8_t)class_idx ||
            !tls->slab_base) {
            return 0;
        }
    }

    TinySlabMeta* meta = tls->meta;
    // Meta invariants: class and capacity must be sane.
    if (!meta ||
        meta->class_idx != (uint8_t)class_idx ||
        meta->capacity == 0) {
        return 0;
    }

    const uint32_t cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP);
    const uint32_t cur = g_tls_sll[class_idx].count;
    if (cur >= cap) {
        return 0;
    }
    int room = (int)(cap - cur);
    int target = (max_take < room) ? max_take : room;
    if (target <= 0) {
        return 0;
    }

    int taken = 0;
    const size_t stride = tiny_stride_for_class(class_idx);

    while (taken < target) {
        void* p = NULL;

        // Freelist first.
        if (meta->freelist) {
            p = meta->freelist;

            // Point 4: Freelist chain integrity check (CRITICAL - detect corruption early)
            void* next_raw = tiny_next_read(class_idx, p);
            uintptr_t next_addr = (uintptr_t)next_raw;

            // Check 4a: NULL is valid (end of freelist)
            if (next_raw != NULL) {
                // Check 4b: Valid address range (not obviously corrupted)
                if (next_addr < 4096 || next_addr > 0x00007fffffffffffULL) {
                    // uintptr_t has no dedicated printf length modifier here, so
                    // widen explicitly instead of relying on it being unsigned long.
                    fprintf(stderr,
                            "[FREELIST_NEXT_INVALID] cls=%d p=%p next=%p addr=%#llx (out of valid range)\n",
                            class_idx, p, next_raw, (unsigned long long)next_addr);
                    fprintf(stderr, "[FREELIST_NEXT_INVALID] ss=%p meta=%p freelist_head=%p\n",
                            (void*)tls->ss, (void*)meta, p);
                    abort();
                }

                // Check 4c: SuperSlab ownership validation
                SuperSlab* ss_check = hak_super_lookup(next_raw);
                if (!ss_check || ss_check->magic != SUPERSLAB_MAGIC) {
                    fprintf(stderr,
                            "[FREELIST_NEXT_INVALID] cls=%d p=%p next=%p ss_check=%p (not in valid SuperSlab)\n",
                            class_idx, p, next_raw, (void*)ss_check);
                    if (ss_check) {
                        fprintf(stderr, "[FREELIST_NEXT_INVALID] ss_check->magic=%#llx (expected %#llx)\n",
                                (unsigned long long)ss_check->magic, (unsigned long long)SUPERSLAB_MAGIC);
                    }
                    abort();
                }
            }

            meta->freelist = next_raw;
            meta->used++;
            c7_meta_used_note(class_idx, C7_META_USED_SRC_FRONT);
            if (__builtin_expect(meta->used > meta->capacity, 0)) {
                // Anomaly: clamp `used` back and stop quietly instead of failing
                // fast. The block popped above is not pushed onto the SLL.
                c7_log_used_assign_cap(meta, class_idx, "FREELIST_OVERRUN");
                meta->used = meta->capacity;
                break;
            }
            ss_active_inc(tls->ss);
        }
        // Freelist exhausted but carved < capacity: carve linearly.
        else if (meta->carved < meta->capacity) {
            uint8_t* base = tls->slab_base
                ? tls->slab_base
                : tiny_slab_base_for_geometry(tls->ss, tls->slab_idx);
            if (!base) {
                break;
            }
            uint16_t idx = meta->carved;
            if (idx >= meta->capacity) {
                break;
            }

            // Point 5: Stride calculation bounds check (CRITICAL - prevent out-of-bounds carving)
            // Check 5a: Stride must be valid (not 0, not suspiciously large)
            if (stride == 0 || stride > 100000) {
                fprintf(stderr,
                        "[STRIDE_INVALID] cls=%d stride=%zu idx=%u cap=%u\n",
                        class_idx, stride, (unsigned)idx, (unsigned)meta->capacity);
                fprintf(stderr, "[STRIDE_INVALID] ss=%p meta=%p base=%p\n",
                        (void*)tls->ss, (void*)meta, (void*)base);
                abort();
            }

            uint8_t* addr = base + ((size_t)idx * stride);

            // Check 5b: the whole block [addr, addr + stride) must lie inside
            // the slab's capacity * stride bytes. A block starting exactly at
            // the end offset is out of bounds too, hence the stride-aware test.
            uintptr_t base_addr = (uintptr_t)base;
            uintptr_t addr_addr = (uintptr_t)addr;
            size_t max_offset = (size_t)meta->capacity * stride;
            size_t offset = (size_t)(addr_addr - base_addr);
            if (addr_addr < base_addr ||
                offset >= max_offset ||
                (max_offset - offset) < stride) {
                fprintf(stderr,
                        "[ADDR_OUT_OF_BOUNDS] cls=%d base=%p addr=%p offset=%zu max=%zu\n",
                        class_idx, (void*)base, (void*)addr, offset, max_offset);
                fprintf(stderr, "[ADDR_OUT_OF_BOUNDS] idx=%u cap=%u stride=%zu\n",
                        (unsigned)idx, (unsigned)meta->capacity, stride);
                abort();
            }

            meta->carved++;
            meta->used++;
            c7_meta_used_note(class_idx, C7_META_USED_SRC_FRONT);
            if (__builtin_expect(meta->used > meta->capacity, 0)) {
                // Same quiet clamp as on the freelist path; the carved block
                // is not pushed onto the SLL.
                c7_log_used_assign_cap(meta, class_idx, "CARVE_OVERRUN");
                meta->used = meta->capacity;
                break;
            }
            ss_active_inc(tls->ss);
            p = addr;
        }
        // Neither freelist nor carve space left: fetch a new slab from the shared pool.
        else {
            if (!superslab_refill(class_idx)) {
                break;
            }
            tls = &g_tls_slabs[class_idx];
            meta = tls->meta;
            if (!tls->ss || !meta ||
                meta->class_idx != (uint8_t)class_idx ||
                !tls->slab_base ||
                meta->capacity == 0) {
                break;
            }
            continue;
        }

        if (!p) {
            break;
        }

        tiny_debug_validate_node_base(class_idx, p, "sll_refill_small_from_ss");

        // Prepare header for header-classes so that safeheader mode accepts the push
        // Uses Header Box API (C1-C6 only; C0/C7 skip - offset=0 overwrites header)
        tiny_header_write_if_preserved(p, class_idx);

        // Stop at the first refused push. The original note states that p stays
        // under TLS slab management, so dropping it here is acceptable.
        // NOTE(review): `used` and the active counter were already bumped for p
        // at this point - confirm that accounting is reconciled elsewhere.
        if (!tls_sll_push(class_idx, p, cap)) {
            break;
        }
        taken++;
    }

    return taken;
}
#endif // HAKMEM_TINY_REFILL_INC_H