From fda6cd2e675a2d0ef12b423e579fdf7ec02e9f7f Mon Sep 17 00:00:00 2001 From: "Moe Charm (CI)" Date: Sun, 7 Dec 2025 03:12:27 +0900 Subject: [PATCH] Boxify superslab registry, add bench profile, and document C7 hotpath experiments --- CURRENT_TASK.md | 141 ++++++++++++++++----- Makefile | 8 +- PERF_COMPARISON_ALLOCATORS.md | 27 ++++ README_PERF_ANALYSIS.md | 33 ++++- bench_random_mixed.c | 49 +++++++- core/box/c7_hotpath_env_box.h | 15 +++ core/box/c7_meta_used_counter_box.c | 8 ++ core/box/carve_push_box.d | 6 +- core/box/front_gate_box.d | 30 ++--- core/box/front_gate_classifier.d | 6 +- core/box/remote_side_box.c | 88 +++++++++++++ core/box/remote_side_box.h | 21 ++++ core/box/shared_pool_box.c | 50 ++++++++ core/box/shared_pool_box.h | 18 +++ core/box/ss_ace_box.c | 8 +- core/box/ss_allocation_box.c | 4 + core/box/ss_budget_box.c | 122 ++++++++++++++++++ core/box/ss_budget_box.h | 19 +++ core/box/ss_slab_reset_box.h | 7 +- core/box/ss_stats_box.c | 57 +++++++++ core/box/ss_stats_box.h | 18 +++ core/box/ss_tls_bind_box.h | 2 +- core/box/super_reg_box.c | 143 ++++++++++++++++++++++ core/box/super_reg_box.h | 77 ++++++++++++ core/box/tiny_c7_hotpath_box.h | 63 ++++++++++ core/box/tiny_c7_stats_sample_box.h | 9 ++ core/box/tiny_c7_uc_hit_box.h | 58 +++++++++ core/box/tiny_c7_warm_spill_box.h | 9 ++ core/box/tiny_class_policy_box.c | 41 ++++++- core/box/tiny_class_policy_box.h | 16 ++- core/box/tiny_class_stats_box.c | 33 ++++- core/box/tiny_class_stats_box.h | 20 +++ core/box/tiny_mem_stats_box.c | 65 ++++++++++ core/box/tiny_mem_stats_box.h | 38 ++++++ core/box/tiny_page_box.c | 3 +- core/box/tiny_page_box.h | 52 +++++--- core/box/tiny_policy_learner_box.c | 67 +++++++--- core/box/tiny_tls_carve_one_block_box.h | 5 + core/box/warm_pool_prefill_box.h | 15 ++- core/box/warm_pool_stats_box.h | 12 -- core/front/malloc_tiny_fast.h | 14 +++ core/front/tiny_unified_cache.c | 74 ++++++----- core/front/tiny_warm_pool.h | 2 + core/hakmem_shared_pool.c | 2 + 
core/hakmem_shared_pool_acquire.c | 96 +++++++++++---- core/hakmem_shared_pool_release.c | 8 ++ core/hakmem_super_registry.c | 80 +++++++++--- core/hakmem_super_registry.h | 17 ++- core/hakmem_tiny.c | 9 +- core/hakmem_tiny_lifecycle.inc | 8 +- core/hakmem_tiny_magazine.c | 11 ++ core/hakmem_tiny_publish_box.inc | 7 +- core/superslab_ace.c | 9 +- core/superslab_stats.c | 55 +++++++++ core/tiny_alloc_fast_push.d | 6 +- core/tiny_remote.c | 85 ++++++++----- docs/analysis/C7_FREE_HOTPATH.md | 35 ++++++ docs/analysis/C7_HOTPATH_FLATTENING.md | 39 ++++++ docs/analysis/CPU_HOTPATH_OVERVIEW.md | 53 ++++++++ docs/analysis/LARGE_GLOBALS_OVERVIEW.md | 84 +++++++++++++ docs/analysis/SUPERSLAB_STATS_SNAPSHOT.md | 40 ++++++ hakmem.d | 34 ++++- hakmem_shared_pool.d | 12 +- hakmem_super_registry.d | 7 +- hakmem_tiny_bg_spill.d | 8 +- hakmem_tiny_magazine.d | 19 +-- hakmem_tiny_query.d | 5 +- hakmem_tiny_sfc.d | 31 ++--- tiny_adaptive_sizing.d | 9 +- tiny_fastcache.d | 8 +- tiny_remote.d | 8 +- 71 files changed, 2052 insertions(+), 286 deletions(-) create mode 100644 PERF_COMPARISON_ALLOCATORS.md create mode 100644 core/box/c7_hotpath_env_box.h create mode 100644 core/box/c7_meta_used_counter_box.c create mode 100644 core/box/remote_side_box.c create mode 100644 core/box/remote_side_box.h create mode 100644 core/box/shared_pool_box.c create mode 100644 core/box/shared_pool_box.h create mode 100644 core/box/ss_budget_box.c create mode 100644 core/box/ss_budget_box.h create mode 100644 core/box/super_reg_box.c create mode 100644 core/box/super_reg_box.h create mode 100644 core/box/tiny_c7_hotpath_box.h create mode 100644 core/box/tiny_c7_stats_sample_box.h create mode 100644 core/box/tiny_c7_uc_hit_box.h create mode 100644 core/box/tiny_c7_warm_spill_box.h create mode 100644 core/box/tiny_mem_stats_box.c create mode 100644 core/box/tiny_mem_stats_box.h create mode 100644 docs/analysis/C7_FREE_HOTPATH.md create mode 100644 docs/analysis/C7_HOTPATH_FLATTENING.md create mode 100644 
docs/analysis/CPU_HOTPATH_OVERVIEW.md create mode 100644 docs/analysis/LARGE_GLOBALS_OVERVIEW.md create mode 100644 docs/analysis/SUPERSLAB_STATS_SNAPSHOT.md diff --git a/CURRENT_TASK.md b/CURRENT_TASK.md index 9ef2258d..3923c4d9 100644 --- a/CURRENT_TASK.md +++ b/CURRENT_TASK.md @@ -31,6 +31,7 @@ - FROZEN デフォルト(legacy プロファイル):Page Box は C5〜C7 のみ ON、Warm は C0〜C7 すべて ON(C0〜C4 cap=4、C5〜C7 cap=8)。 - ENV `HAKMEM_TINY_POLICY_PROFILE=legacy|c5_7_only|tinyplus_all` で切替可能(未指定は legacy)。 - Stats は OBSERVE 用に積むだけ、Learner は空実装のまま。 +- mimalloc/system との最新ベンチ (Release, prefault デフォルト, policy=legacy, mode=2) を README_PERF に追記。C7-only 48.8M vs mimalloc 95.3M / system 73.9M、129–1024B 50.0M vs 128.4M / 97.7M、full 50.9M vs 123.6M / 83.5M、Tiny-only 8–128B 93.2M vs 123.7M / 66.3M。 - TLS Bind Box の導入: - `core/box/ss_tls_bind_box.h` に `ss_tls_bind_one()` を追加し、「Superslab + slab_idx → TLS」のバインド処理(`superslab_init_slab` / `meta->class_idx` 設定 / `tiny_tls_bind_slab`)を 1 箇所に集約。 - `superslab_refill()`(Shared Pool 経路)および Warm Pool 実験経路から、この Box を経由して TLS に接続するよう統一。 @@ -51,44 +52,68 @@ - `core/box/tiny_class_policy_box.{h,c}` にクラス別ポリシー構造体 `TinyClassPolicy` と `tiny_policy_get(class_idx)` を追加。 - FROZEN デフォルト: Page Box = C5–C7, Warm = 全クラス(C0–C4 cap=4 / C5–C7 cap=8)。 - `HAKMEM_TINY_POLICY_PROFILE=legacy|c5_7_only|tinyplus_all` でプロファイル切替可能(未知値は legacy にフォールバック)。 - - `core/box/tiny_class_stats_box.{h,c}` に OBSERVE 用の軽量カウンタ(UC miss / Warm hit / Shared Pool lock など)を追加。 +- `core/box/tiny_class_stats_box.{h,c}` に OBSERVE 用の軽量カウンタ(UC miss / Warm hit / Shared Pool lock など)を追加。 - `core/box/tiny_policy_learner_box.{h,c}` に Learner の骨組みを追加(現状は FROZEN/OBSERVE モード向けの雛形)。 - `core/front/tiny_unified_cache.c` / Page Box / Warm Pool 経路を `tiny_policy_get(class_idx)` ベースでゲートし、Hot path からは Policy Box を読む形に統一。 +- `bench_random_mixed` に RSS ダンプ(`getrusage(RUSAGE_SELF).ru_maxrss`)を追加し、各 allocator で ops/s と合わせて常駐メモリを記録できるようにした。 +- 新規比較表 `PERF_COMPARISON_ALLOCATORS.md` を追加。C7-only / 129–1024B / 16–1024B で 
HAKMEM(full/larson_guard) は ~50M ops/s / ~29MB RSS、system は ~95–78M ops/s / ~1.6MB RSS、mimalloc は ~74–126M ops/s / ~1.8MB RSS。 +- SS stats (HAKMEM_SS_STATS_DUMP=1, full profile, 16–1024B ws=256/1M): live Superslab は C2=1, C7=1(empty_events: C7=1)、RSS は ~29MB。予算を 2 に絞っても同じ配置で RSS 変化なし → RSS は Superslab 枚数より TLS/Warm/Page stack 等の常駐分が支配的。 ### 性能の現状(Random Mixed, HEAD) -- 注記 (2025-12-05, policy legacy プロファイル試験値): - - Release: `HAKMEM_TINY_PROFILE=full HAKMEM_TINY_POLICY_PROFILE=legacy ./bench_random_mixed_hakmem 1000000 256 42` → 約 4.9M ops/s(導入前 27M との乖離あり、要フォロー)。 - - Release C7-only: `HAKMEM_BENCH_C7_ONLY=1 ... HAKMEM_TINY_POLICY_PROFILE=legacy` → 約 2.7M ops/s(空スラブガード導入前の遅さに戻っており要再調査)。 -- 条件: `bench_random_mixed_hakmem 1000000 256 42`(1T, ws=256, RELEASE, 16–1024B) - - HAKMEM: 約 27.6M ops/s(C7 Warm/TLS 修復後) - - system malloc: 約 90–100M ops/s - - mimalloc: 約 120–130M ops/s -- 条件: `bench_random_mixed_hakmem 1000000 256 42` + - `HAKMEM_BENCH_MIN_SIZE=8 HAKMEM_BENCH_MAX_SIZE=128`(Tiny-only, 8–128B) - - HAKMEM Tiny Front: 約 80–90M ops/s(mimalloc と同オーダー) -- 条件: `bench_random_mixed_hakmem 1000000 256 42` + - `HAKMEM_BENCH_MIN_SIZE=129 HAKMEM_BENCH_MAX_SIZE=1024`(Tiny C5–C7 のみ) - - HAKMEM: 約 28.0M ops/s(Warm/TLS ガード適用後) -- 条件: C7 専用 micro-bench(Debug, `HAKMEM_BENCH_C7_ONLY=1 HAKMEM_TINY_PROFILE=full HAKMEM_WARM_C7_MAX=8 HAKMEM_WARM_C7_PREFETCH=4` ほか) - - mode 0(Legacy Warm): 約 2.0M ops/s、C7 Warm ヒット 0・Shared Pool ロック多数(`slab_carve_from_ss` が 0 を頻発) - - mode 1(Bind-only): 約 20M ops/s(iters=200K, ws=32)、Warm hit ≈100%・Shared Pool ロック 5 回まで減少 - - mode 2(Bind+TLS carve 実験): mode 1 と同等〜わずかに上(UC ミスは増えるが `uc_miss_tls` に集中し、avg_refill は短縮) -- 条件: C7 専用 micro-bench(Release, `HAKMEM_BENCH_C7_ONLY=1 HAKMEM_TINY_PROFILE=full HAKMEM_WARM_C7_MAX=8 HAKMEM_WARM_C7_PREFETCH=4`) - - HAKMEM: 約 18.8M ops/s(空スラブ強制ガード + リセット導入後、Debug と同オーダーまで回復) -- 結論: - - Tiny front 自体(8–128B)は十分速く、mimalloc と同オーダーまで出ている。 - - C5–C7 経路は「満杯 C7 slab を Warm に再供給していた」問題を空スラブ限定ガード+Release/Debug 共通リセットで解消し、 - C7-only 
Release も ~18.8M ops/s に回復。Random Mixed Release も 27M クラスまで改善。 +- 条件: Release, `HAKMEM_TINY_PROFILE=full HAKMEM_WARM_TLS_BIND_C7=2 HAKMEM_WARM_C7_MAX=8 HAKMEM_WARM_C7_PREFETCH=4`, ws=256 + - **C7-only (size=1024, iters=200K, ws=32)** + - policy=legacy: 47.3M / 47.3M / 43.9M ops/s(平均 ≈ **46M**)。C7 uc_miss=6660 / warm_hit=3329 / shared_lock=5 / tls_carve_success=3329。 + - policy=auto(Learner: score=lock*4+miss): 45.6M / 44.6M / 39.7M ops/s(平均 ≈ **43–45M**)、統計は legacy と同一(C7 固定 ON)。 + - guard 比較: full **42.4M ops/s** vs larson_guard **40.7M ops/s**(-4%程度で安全側ガードを維持)。 + - **129–1024B (iters=1M, ws=256)** + - legacy: **51.5M ops/s**。C5 uc_miss=1/warm_hit=0/shared_lock=1、C6 uc_miss=1/warm_hit=0/shared_lock=2、C7 uc_miss=17196/warm_hit=8597/shared_lock=5/tls_carve_success=8597。 + - auto: **51.9M ops/s**(Learner=lock 重視でも C7 のみ ON、統計ほぼ同じ)。 + - guard 比較: full **49.0M ops/s** vs larson_guard **48.4M ops/s**(-1.2%)。 + - **full random_mixed 16–1024B (iters=1M, ws=256)** + - legacy: **51.0M ops/s**。C7 uc_miss=16702/warm_hit=8350/shared_lock=5/tls_carve_success=8350(C5/C6 は uc_miss=1〜2)。 + - auto: **50.0M ops/s**(C7 固定 ON のまま、他クラスはほぼ動かず)。 +- 補足: + - WarmPool-STATS と TinyClassStats を統合。`HAKMEM_WARM_POOL_STATS=1` で C7-only 実行時に hits=3329 / misses=1 / prefilled=1 を確認(TinyClassStats の warm_hit=3329 と一致)。 + - `TinyClassPolicy` に `tls_carve_enabled` を追加し、デフォルトで C5–C7 を ON。`TinyClassStats` に tls_carve_attempt/success を追加済み。 + - Learner のスコアを `score = shared_lock * 4 + uc_miss` に変更済み(auto プロファイル専用)。現状のワークロードでは C7 が圧倒的に優勢で、C5/C6 はまだほぼ選ばれない。 + +### サイズ→クラス対応(HAKMEM_TINY_HEADER_CLASSIDX=1 のため size+1 で判定) +- `hak_tiny_size_to_class(size)` は `needed=size+1` で `g_size_to_class_lut_2k` を引くため、512B 要求は 513B として class 7 判定になる(現状の挙動は仕様どおり)。 +- 代表サイズのマップ(データサイズ→class_idx / 総バイト数) + - 8B → C1(16B stride) + - 16B → C2(32B) + - 32B → C3(64B) + - 64B → C4(128B) + - 128B → C5(256B) + - 256B → C6(512B) + - 512B → C7(2048B stride / 32 blocks per slab) + - 1024B → C7(同上) +- 512B 固定ベンチで C7 
経路が動くのはこのヘッダ加算による設計上の結果。現時点では「C7 支配」を前提に C5/C6 は拡張枠として観測を続ける。 + +### C5/C6 専用ワークロードの速報(Release, ws=512, iters=1,000,000, size fixed) +- 条件: `HAKMEM_BENCH_MIN_SIZE=256 HAKMEM_BENCH_MAX_SIZE=256 (実質 C6)`、`HAKMEM_TINY_PROFILE=full`、`HAKMEM_WARM_TLS_BIND_C7=2`、`HAKMEM_TINY_STATS_DUMP=1` + - policy=legacy: Throughput ≈ **89.9M ops/s**。C6: uc_miss=5, warm_hit=1, shared_lock=2, tls_carve_attempt=1, tls_carve_success=1。 + - policy=auto: Throughput ≈ **87.5M ops/s**。C6 の統計はほぼ同じ(uc_miss=5, warm_hit=1, tls_carve_attempt/success=1)。C5 ほぼ負荷なし。 +- 補足: C5/C6 はワーキングセットを広げても Warm/TLS carve のヒットは少数(キャッシュヒット優位なため)。専用負荷を増やす場合はさらに ws を広げて観測予定。 +- Larson ベンチ(Release, 10 runs, `./test_larson.sh`) + - profile=full: 1.15〜1.26M ops/s + - profile=larson_guard: 1.10〜1.27M ops/s(≈-3〜0%でほぼ同等)。`HAKMEM_SS_STATS_DUMP=1` で Superslab live が 1 前後に収まり、SEGV/OOM なし。サンプルログは `docs/analysis/SUPERSLAB_STATS_SNAPSHOT.md` に記録。 + +### 新しいログ/ENV スイッチ +- `HAKMEM_TINY_POLICY_LOG=0/1`: Policy 初期化/auto update のログ抑制(デフォルト ON)。 +- `HAKMEM_TINY_WARM_LOG=0/1`: C7 prefill 関連ログ(PREFILL_META/skip 等)の抑制(デフォルト ON)。 +- `HAKMEM_TINY_PAGEBOX_LOG=0/1`: Page Box の登録ログ抑制(Debug のみ、デフォルト ON)。 +- 長時間ラン時は上記を 0 にしてノイズを抑える運用を推奨。短時間デバッグ時のみ 1 にする。 ### 次にやること(広い条件での安定化確認) 1. `HAKMEM_BENCH_MIN_SIZE=129 HAKMEM_BENCH_MAX_SIZE=1024` や通常の `bench_random_mixed_hakmem 1000000 256 42` で - 空スラブ限定ガードが副作用なく動くかを継続確認(現状 Release で 27–28M ops/s を確認済み)。 + 空スラブ限定ガードが副作用なく動くかを継続確認(現状 Release で 29–30M ops/s を確認済み)。 2. ドキュメント更新: - Release だけ C7 Warm が死んでいた根本原因 = 満杯 C7 slab を Shared Pool がリセットせず再供給していた。 - - Acquire の空スラブ強制ガード+Release/Debug 共通リセットで C7-only Release が ~18.8M ops/s まで回復した。 + - Acquire の空スラブ強制ガード+Stage3(LRU) 再利用時の Superslab 全スロットリセット+Warm/TLS carve 有効化で、 + C7-only Release が ~20–25M ops/s クラスに回復し、Random Mixed 16–1024B Release も ~29–30M ops/s クラスまで改善した。 3. 
次フェーズ案: - - C5/C6 でも同様の Warm/TLS 最適化・空スラブガードを適用するか、 - - Random Mixed 全体のボトルネック(Shared Pool ロック/Wrapper/mid-size path など)を洗うかを選択。 + - Superslab ガード(Stats/Reset/Stage3/Budget/larson_guard)まで完了。以降は mimalloc/system との比較最適化や、必要に応じた C5/C6 Tiny-Plus 拡張を検討。 ### 次フェーズ(Tiny 全クラス向け Page Box / Warm / Policy 汎用化の検討) - 方向性: @@ -109,7 +134,67 @@ - `TinyClassPolicyBox`/`TinyClassStatsBox`/`TinyPolicyLearnerBox` を追加し、デフォルトで C5〜C7 に Page Box + Warm を許可(Warm cap=8)。 - unified_cache_refill の Page/Warm 経路は `tiny_policy_get()` の返り値でゲートし、Warm push は per-class cap を尊重。 - Page Box 初期化もデフォルトで C5〜C7 を有効化。OBSERVE 用の軽量 stats increment を UC miss / Warm hit に接続済み。 +- 次ステップの設計メモ: + - TinyPageBoxContext を class 汎用構造に広げ、C5/C6 も「TLS Bind で page 登録 → UC refill で page 内 freelist からバッチ供給」を C7 と共有できるようにする(実装は未着手、設計メモのみ)。 ### メモ - ページフォルト問題は Prefault Box + ウォームアップで一定水準まで解消済みで、現在の主ボトルネックはユーザー空間の箱(Unified Cache / free / Pool)側に移っている。 - 以降の最適化は「箱を削る」ではなく、「HOT 層で踏む箱を減らし、Tiny 的なシンプル経路と Tiny-Plus 経路(Page Box + Warm)をクラス別ポリシーでどう使い分けるか」にフォーカスする。 + +## 今後のフォーカス(C7 支配を前提に一旦整理) +- 設計明記: 257–512→C6, 513–2048→C7(size+1 判定)。実負荷は C7 が受ける設計として確定。C5/C6 は拡張枠・観測対象。 +- 優先度: C5-only ≈91M ops/s、512B 固定も C7 経路で ≈47M ops/s → C5/C6 最適化は auto/実験用に留め、本命は C7 Tiny-Plus+Policy。 +- プロファイル運用: legacy=本番、auto=C7固定+上位2クラス観測用のまま据え置き。学習拡張は新ワークロードで C5/C6 がホットになった際に検討。 +- 次の大きな箱候補: (1) mimalloc/system とのフルベンチ整理(論文/README 更新)、(2) hakorune 側 PHI/JoinIR の開発にリソースを戻す。 + +## 巨大 BSS グローバルの棚卸しと今後 + +- `nm -S --size-sort bench_random_mixed_hakmem` と SS_STATS のサンプルから、RSS を支配しているのは Tiny 層ではなく巨大 BSS 配列であることを確認。 + - 代表例: `g_super_reg` ≈24MB, `g_shared_pool` ≈2.3MB, `g_super_reg_by_class` ≈1MB, `g_rem_side` ≈1MB など。 + - SS_STATS(ws=64, iters=10k)では live Superslab は C2=1, C7=1 程度で、巨大レジストリの大半は未使用キャパシティになっている。 + - Tiny 用メモリ会計 Box(`tiny_mem_stats_box`)では UC/Warm/Page/TLS/Policy-Stats 合計でも ≈40KB 程度と判明し、RSS≈29MB の主因ではないことを確認。 +- docs/analysis/LARGE_GLOBALS_OVERVIEW.md に各大型シンボルのサイズ/役割と SS_STATS とのギャップを一覧化済み。 + +次フェーズ候補: +- Superslab Registry / Shared Pool / 
Remote Queue を Box 化し、プロファイル別に「必要なだけ動的確保」できる SuperRegBox / SharedPoolBox / RemoteSideBox への移行を検討。 +- `HAKMEM_PROFILE` や ENV から「bench 向け縮小設定」と「本番向けフル設定」を切り替えられるようにし、RSS を抑えつつ Box 構造は維持する。 + +進捗(巨大BSS Box化フェーズ) +- docs/analysis/LARGE_GLOBALS_OVERVIEW.md に大型シンボルの定義元・役割・縮小目安を追記(SuperReg/SharedPool/Remote など)。 +- 設計スタブを追加: + - `core/box/super_reg_box.h` … レジストリ容量をプロファイルで切替するための API メモ。 + - `core/box/shared_pool_box.h` … Shared Pool の容量/ガードをプロファイルに紐づけるための API メモ。 + - `core/box/remote_side_box.h` … Remote Queue テーブルをプロファイルで縮小するための API メモ。 +- `HAKMEM_PROFILE=bench` を追加し、SuperReg/SharedPool/Remote の「論理有効スロット」を 1/8〜1/16 に制限するラッパを実装(配列は現状サイズのまま)。`bench_random_mixed_hakmem` は full/bench ともビルド・完走済み。C7-only/129–1024B/16–1024B で ops/s は ±数% 以内、RSS は ~32.6MB でほぼ不変(論理制限のみのため)。 +- SuperReg/Remote を Box 内で動的確保に置き換え、`HAKMEM_PROFILE=bench` では実容量も縮小(SuperReg: 1/8〜1/16、Remote: log2 を 12〜)。C7-only 200k/ws32 では full=29.6MB → bench=7.2MB (ops ≈44.4M 同レンジ) まで RSS を削減できた。 +- bench 実容量版での広いワークロード検証: 129–1024B ws=256/1M は full=48.9M ops/s & 29.6MB → bench=49.2M & 7.2MB。16–1024B ws=256/1M は full=48.3M & 29.7MB → bench=48.8M & 7.2MB。SS_STATS(bench)でも live Superslab は C2=1, C7=1 に収まり、Tiny 層メモリは ~41KB のまま。 +- 次ステップ: SharedPool 側も必要なら動的化/縮小を検討しつつ、RSS をさらに攻めるか、CPU パス最適化に戻るか判断。*** + +### フェーズ整理と次の方針 +- SharedPool は現状サイズを維持し、`HAKMEM_PROFILE=full` を本番、`HAKMEM_PROFILE=bench` を対 mimalloc/system の軽量プロファイルとして運用(bench は SuperReg/Remote 縮小済み、RSS≈7.2MB)。 +- 巨大BSS Box化フェーズは「bench で RSS≈7.2MB / ops≈同等」まで完了。今後は perf(CPUサイクル)最適化にフォーカス。 + +ホットパス perf フェーズの TODO(案) +1. tiny_alloc_fast / tiny_free_fast_v2 の再プロファイル:残存分岐・間接呼び出し・重い箱を特定。 +2. Unified Cache ヒットパスを最短化:ヒット時を 1–2 load + 軽分岐に近づける(必要なら C7 専用インライン版検討)。 +3. 
free パス Gatekeeper/Box の再配線:C7 ホットケースだけ分岐極小のストレートラインにする。 + +目標: シングルスレッド小オブジェクトで ~50M ops/s → 70M〜80M 帯を狙う(mimalloc との差を半減イメージ)。*** + +補足(CPU ホットパス観測メモ) +- `HAKMEM_PROFILE=bench HAKMEM_TINY_PROFILE=full HAKMEM_WARM_TLS_BIND_C7=2` で perf を試行したが、`perf_event_paranoid` 制約で `cycles` が取れず page-fault サンプルのみ(`__memset_avx2_unaligned_erms` が warmup を支配)。`perf.data` は即削除済み。集計結果と次の測定案は `docs/analysis/CPU_HOTPATH_OVERVIEW.md` に記載。 +- C7 alloc/free flattening と UC ヒット簡略化の設計メモを追加:`docs/analysis/C7_HOTPATH_FLATTENING.md`, `docs/analysis/C7_FREE_HOTPATH.md`。実装はこれから。 +- C7 ホットパス用フックを追加(`core/box/tiny_c7_hotpath_box.h` + `HAKMEM_TINY_C7_HOT`)。今は既存 Hot/Cold Box をクラス固定で呼ぶ薄いラッパなので挙動は同一。 + - bench プロファイルの perf stat (3 run 平均): + - 16–1024B: cycles≈109.5M, inst≈233.5M (IPC≈2.13, br-miss≈2.90%) + - 16–1024B + `HAKMEM_TINY_C7_HOT=1`: cycles≈111.8M, inst≈242.1M (IPC≈2.16, br-miss≈2.75%) + +### C7 ホットパス平坦化(第1段階)の結果メモ +- `HAKMEM_PROFILE=bench HAKMEM_TINY_PROFILE=full HAKMEM_WARM_TLS_BIND_C7=2`、129–1024B ws=256/1M(Release)で: + - `HAKMEM_TINY_C7_HOT=0`: ≈49.7M ops/s + - `HAKMEM_TINY_C7_HOT=1`: ≈46.7M ops/s(分岐ミスは僅かに改善するがスループットはノイズ〜微減) +- 16–1024B ws=256/1M では: + - hot=0: ops≈47.4M, IPC≈2.13, br-miss≈2.90% + - hot=1: ops≈47.4–47.6M, IPC≈2.16, br-miss≈2.75% +- 現状の C7 ホットパス実装は「ヒット専用 UC + TLS→UC→cold 直線化」の初期版で、大幅な伸びはまだ無い。回帰はなく、分岐ミス率はわずかに改善。今後さらに UC ヒット専用関数の最短化や free 側の直線化を詰める余地あり。 +- 方針: `HAKMEM_TINY_C7_HOT` は実験用スイッチとして残し、デフォルト OFF。perf フェーズは bench プロファイルで ≈50M ops/s / RSS ≈7MB を維持できる現行経路を基準に一旦完了とする。*** diff --git a/Makefile b/Makefile index 6ba2cc32..eaa4fc3b 100644 --- a/Makefile +++ b/Makefile @@ -219,12 +219,12 @@ LDFLAGS += $(EXTRA_LDFLAGS) # Targets TARGET = test_hakmem -OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o core/box/ss_allocation_box.o superslab_stats.o superslab_cache.o superslab_ace.o superslab_slab.o superslab_backend.o core/superslab_head_stub.o hakmem_smallmid.o 
tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_shared_pool_acquire.o hakmem_shared_pool_release.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/ss_addr_map_box.o core/box/slab_recycling_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/box/tiny_env_box.o core/box/tiny_route_box.o core/box/tiny_page_box.o core/box/tiny_class_policy_box.o core/box/tiny_class_stats_box.o core/box/tiny_policy_learner_box.o core/box/wrapper_env_box.o core/box/ptr_trace_box.o core/box/link_missing_stubs.o core/page_arena.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o test_hakmem.o +OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o core/box/ss_allocation_box.o superslab_stats.o superslab_cache.o superslab_ace.o superslab_slab.o superslab_backend.o core/superslab_head_stub.o hakmem_smallmid.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o 
hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_shared_pool_acquire.o hakmem_shared_pool_release.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/ss_addr_map_box.o core/box/slab_recycling_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/box/tiny_env_box.o core/box/tiny_route_box.o core/box/tiny_page_box.o core/box/tiny_class_policy_box.o core/box/tiny_class_stats_box.o core/box/tiny_policy_learner_box.o core/box/ss_budget_box.o core/box/tiny_mem_stats_box.o core/box/wrapper_env_box.o core/box/ptr_trace_box.o core/box/link_missing_stubs.o core/box/super_reg_box.o core/box/shared_pool_box.o core/box/remote_side_box.o core/page_arena.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o test_hakmem.o OBJS = $(OBJS_BASE) # Shared library SHARED_LIB = libhakmem.so -SHARED_OBJS = hakmem_shared.o hakmem_config_shared.o hakmem_tiny_config_shared.o hakmem_ucb1_shared.o hakmem_bigcache_shared.o hakmem_pool_shared.o hakmem_l25_pool_shared.o hakmem_site_rules_shared.o hakmem_tiny_shared.o core/box/ss_allocation_box_shared.o superslab_stats_shared.o superslab_cache_shared.o superslab_ace_shared.o superslab_slab_shared.o superslab_backend_shared.o 
core/superslab_head_stub_shared.o hakmem_smallmid_shared.o core/box/superslab_expansion_box_shared.o core/box/integrity_box_shared.o core/box/mailbox_box_shared.o core/box/front_gate_box_shared.o core/box/front_gate_classifier_shared.o core/box/free_publish_box_shared.o core/box/capacity_box_shared.o core/box/carve_push_box_shared.o core/box/prewarm_box_shared.o core/box/ss_hot_prewarm_box_shared.o core/box/front_metrics_box_shared.o core/box/bench_fast_box_shared.o core/box/ss_addr_map_box_shared.o core/box/slab_recycling_box_shared.o core/box/pagefault_telemetry_box_shared.o core/box/tiny_sizeclass_hist_box_shared.o core/box/tiny_env_box_shared.o core/box/tiny_route_box_shared.o core/box/tiny_page_box_shared.o core/box/tiny_class_policy_box_shared.o core/box/tiny_class_stats_box_shared.o core/box/tiny_policy_learner_box_shared.o core/box/wrapper_env_box_shared.o core/page_arena_shared.o core/front/tiny_unified_cache_shared.o core/tiny_alloc_fast_push_shared.o core/link_stubs_shared.o core/tiny_failfast_shared.o tiny_sticky_shared.o tiny_remote_shared.o tiny_publish_shared.o tiny_debug_ring_shared.o hakmem_tiny_magazine_shared.o hakmem_tiny_stats_shared.o hakmem_tiny_sfc_shared.o hakmem_tiny_query_shared.o hakmem_tiny_rss_shared.o hakmem_tiny_registry_shared.o hakmem_tiny_remote_target_shared.o hakmem_tiny_bg_spill_shared.o tiny_adaptive_sizing_shared.o hakmem_super_registry_shared.o hakmem_shared_pool_shared.o hakmem_shared_pool_acquire_shared.o hakmem_shared_pool_release_shared.o hakmem_elo_shared.o hakmem_batch_shared.o hakmem_p2_shared.o hakmem_sizeclass_dist_shared.o hakmem_evo_shared.o hakmem_debug_shared.o hakmem_sys_shared.o hakmem_whale_shared.o hakmem_policy_shared.o hakmem_ace_shared.o hakmem_ace_stats_shared.o hakmem_ace_controller_shared.o hakmem_ace_metrics_shared.o hakmem_ace_ucb1_shared.o hakmem_prof_shared.o hakmem_learner_shared.o hakmem_size_hist_shared.o hakmem_learn_log_shared.o hakmem_syscall_shared.o tiny_fastcache_shared.o +SHARED_OBJS = 
hakmem_shared.o hakmem_config_shared.o hakmem_tiny_config_shared.o hakmem_ucb1_shared.o hakmem_bigcache_shared.o hakmem_pool_shared.o hakmem_l25_pool_shared.o hakmem_site_rules_shared.o hakmem_tiny_shared.o core/box/ss_allocation_box_shared.o superslab_stats_shared.o superslab_cache_shared.o superslab_ace_shared.o superslab_slab_shared.o superslab_backend_shared.o core/superslab_head_stub_shared.o hakmem_smallmid_shared.o core/box/superslab_expansion_box_shared.o core/box/integrity_box_shared.o core/box/mailbox_box_shared.o core/box/front_gate_box_shared.o core/box/front_gate_classifier_shared.o core/box/free_publish_box_shared.o core/box/capacity_box_shared.o core/box/carve_push_box_shared.o core/box/prewarm_box_shared.o core/box/ss_hot_prewarm_box_shared.o core/box/front_metrics_box_shared.o core/box/bench_fast_box_shared.o core/box/ss_addr_map_box_shared.o core/box/slab_recycling_box_shared.o core/box/pagefault_telemetry_box_shared.o core/box/tiny_sizeclass_hist_box_shared.o core/box/tiny_env_box_shared.o core/box/tiny_route_box_shared.o core/box/tiny_page_box_shared.o core/box/tiny_class_policy_box_shared.o core/box/tiny_class_stats_box_shared.o core/box/tiny_policy_learner_box_shared.o core/box/ss_budget_box_shared.o core/box/tiny_mem_stats_box_shared.o core/box/wrapper_env_box_shared.o core/page_arena_shared.o core/front/tiny_unified_cache_shared.o core/tiny_alloc_fast_push_shared.o core/link_stubs_shared.o core/tiny_failfast_shared.o tiny_sticky_shared.o tiny_remote_shared.o tiny_publish_shared.o tiny_debug_ring_shared.o hakmem_tiny_magazine_shared.o hakmem_tiny_stats_shared.o hakmem_tiny_sfc_shared.o hakmem_tiny_query_shared.o hakmem_tiny_rss_shared.o hakmem_tiny_registry_shared.o hakmem_tiny_remote_target_shared.o hakmem_tiny_bg_spill_shared.o tiny_adaptive_sizing_shared.o hakmem_super_registry_shared.o hakmem_shared_pool_shared.o hakmem_shared_pool_acquire_shared.o hakmem_shared_pool_release_shared.o hakmem_elo_shared.o hakmem_batch_shared.o 
hakmem_p2_shared.o hakmem_sizeclass_dist_shared.o hakmem_evo_shared.o hakmem_debug_shared.o hakmem_sys_shared.o hakmem_whale_shared.o hakmem_policy_shared.o hakmem_ace_shared.o hakmem_ace_stats_shared.o hakmem_ace_controller_shared.o hakmem_ace_metrics_shared.o hakmem_ace_ucb1_shared.o hakmem_prof_shared.o hakmem_learner_shared.o hakmem_size_hist_shared.o hakmem_learn_log_shared.o hakmem_syscall_shared.o tiny_fastcache_shared.o core/box/super_reg_box_shared.o core/box/shared_pool_box_shared.o core/box/remote_side_box_shared.o # Pool TLS Phase 1 (enable with POOL_TLS_PHASE1=1) ifeq ($(POOL_TLS_PHASE1),1) @@ -251,7 +251,7 @@ endif # Benchmark targets BENCH_HAKMEM = bench_allocators_hakmem BENCH_SYSTEM = bench_allocators_system -BENCH_HAKMEM_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o core/box/ss_allocation_box.o superslab_stats.o superslab_cache.o superslab_ace.o superslab_slab.o superslab_backend.o core/superslab_head_stub.o hakmem_smallmid.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_shared_pool_acquire.o hakmem_shared_pool_release.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o 
core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/ss_addr_map_box.o core/box/slab_recycling_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/box/tiny_env_box.o core/box/tiny_route_box.o core/box/tiny_page_box.o core/box/tiny_class_policy_box.o core/box/tiny_class_stats_box.o core/box/tiny_policy_learner_box.o core/box/wrapper_env_box.o core/box/ptr_trace_box.o core/box/link_missing_stubs.o core/page_arena.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o bench_allocators_hakmem.o +BENCH_HAKMEM_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o core/box/ss_allocation_box.o superslab_stats.o superslab_cache.o superslab_ace.o superslab_slab.o superslab_backend.o core/superslab_head_stub.o hakmem_smallmid.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_shared_pool_acquire.o hakmem_shared_pool_release.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/free_publish_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/capacity_box.o core/box/carve_push_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o 
core/box/ss_addr_map_box.o core/box/slab_recycling_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/box/tiny_env_box.o core/box/tiny_route_box.o core/box/tiny_page_box.o core/box/tiny_class_policy_box.o core/box/tiny_class_stats_box.o core/box/tiny_policy_learner_box.o core/box/ss_budget_box.o core/box/tiny_mem_stats_box.o core/box/c7_meta_used_counter_box.o core/box/wrapper_env_box.o core/box/ptr_trace_box.o core/box/link_missing_stubs.o core/box/super_reg_box.o core/box/shared_pool_box.o core/box/remote_side_box.o core/page_arena.o core/front/tiny_unified_cache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o bench_allocators_hakmem.o BENCH_HAKMEM_OBJS = $(BENCH_HAKMEM_OBJS_BASE) ifeq ($(POOL_TLS_PHASE1),1) BENCH_HAKMEM_OBJS += pool_tls.o pool_refill.o pool_tls_arena.o pool_tls_registry.o pool_tls_remote.o @@ -428,7 +428,7 @@ test-box-refactor: box-refactor ./larson_hakmem 10 8 128 1024 1 12345 4 # Phase 4: Tiny Pool benchmarks (properly linked with hakmem) -TINY_BENCH_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o core/box/ss_allocation_box.o superslab_stats.o superslab_cache.o superslab_ace.o superslab_slab.o superslab_backend.o core/superslab_head_stub.o hakmem_smallmid.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/free_publish_box.o core/box/capacity_box.o core/box/carve_push_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/ss_addr_map_box.o core/box/slab_recycling_box.o core/box/tiny_sizeclass_hist_box.o core/box/pagefault_telemetry_box.o core/box/tiny_env_box.o core/box/tiny_route_box.o core/box/tiny_page_box.o core/box/tiny_class_policy_box.o core/box/tiny_class_stats_box.o core/box/tiny_policy_learner_box.o 
core/box/wrapper_env_box.o core/box/ptr_trace_box.o core/box/link_missing_stubs.o core/page_arena.o core/front/tiny_unified_cache.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_shared_pool_acquire.o hakmem_shared_pool_release.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o +TINY_BENCH_OBJS_BASE = hakmem.o hakmem_config.o hakmem_tiny_config.o hakmem_ucb1.o hakmem_bigcache.o hakmem_pool.o hakmem_l25_pool.o hakmem_site_rules.o hakmem_tiny.o core/box/ss_allocation_box.o superslab_stats.o superslab_cache.o superslab_ace.o superslab_slab.o superslab_backend.o core/superslab_head_stub.o hakmem_smallmid.o core/box/superslab_expansion_box.o core/box/integrity_box.o core/box/mailbox_box.o core/box/front_gate_box.o core/box/front_gate_classifier.o core/box/free_publish_box.o core/box/capacity_box.o core/box/carve_push_box.o core/box/prewarm_box.o core/box/ss_hot_prewarm_box.o core/box/front_metrics_box.o core/box/bench_fast_box.o core/box/ss_addr_map_box.o core/box/slab_recycling_box.o core/box/pagefault_telemetry_box.o core/box/tiny_sizeclass_hist_box.o core/box/tiny_env_box.o core/box/tiny_route_box.o core/box/tiny_page_box.o core/box/tiny_class_policy_box.o core/box/tiny_class_stats_box.o core/box/tiny_policy_learner_box.o core/box/ss_budget_box.o core/box/tiny_mem_stats_box.o core/box/c7_meta_used_counter_box.o core/box/wrapper_env_box.o core/box/ptr_trace_box.o 
core/box/link_missing_stubs.o core/box/super_reg_box.o core/box/shared_pool_box.o core/box/remote_side_box.o core/page_arena.o core/front/tiny_unified_cache.o tiny_sticky.o tiny_remote.o tiny_publish.o tiny_debug_ring.o hakmem_tiny_magazine.o hakmem_tiny_stats.o hakmem_tiny_sfc.o hakmem_tiny_query.o hakmem_tiny_rss.o hakmem_tiny_registry.o hakmem_tiny_remote_target.o hakmem_tiny_bg_spill.o tiny_adaptive_sizing.o hakmem_super_registry.o hakmem_shared_pool.o hakmem_shared_pool_acquire.o hakmem_shared_pool_release.o hakmem_elo.o hakmem_batch.o hakmem_p2.o hakmem_sizeclass_dist.o hakmem_evo.o hakmem_debug.o hakmem_sys.o hakmem_whale.o hakmem_policy.o hakmem_ace.o hakmem_ace_stats.o hakmem_prof.o hakmem_learner.o hakmem_size_hist.o hakmem_learn_log.o hakmem_syscall.o hakmem_ace_metrics.o hakmem_ace_ucb1.o hakmem_ace_controller.o tiny_fastcache.o core/tiny_alloc_fast_push.o core/link_stubs.o core/tiny_failfast.o TINY_BENCH_OBJS = $(TINY_BENCH_OBJS_BASE) ifeq ($(POOL_TLS_PHASE1),1) TINY_BENCH_OBJS += pool_tls.o pool_refill.o core/pool_tls_arena.o pool_tls_registry.o pool_tls_remote.o diff --git a/PERF_COMPARISON_ALLOCATORS.md b/PERF_COMPARISON_ALLOCATORS.md new file mode 100644 index 00000000..89aa0ba2 --- /dev/null +++ b/PERF_COMPARISON_ALLOCATORS.md @@ -0,0 +1,27 @@ +# Allocator Throughput / RSS Comparison (Release) + +環境: 1 thread, `HAKMEM_WARM_TLS_BIND_C7=2`, RSS は `ru_maxrss` (KB) を MB 換算。 +Hakmem は `full` と `larson_guard` プロファイルを計測。その他は system / mimalloc の素の挙動。 + +| workload | allocator | ops/s | max RSS (MB) | +|----------------------|------------------------|------------------|--------------| +| C7-only (1024B, ws32, 200k) | hakmem-full | 44,381,807 | 29.6 | +| | hakmem-bench | 44,439,813 | **7.2** | +| | hakmem-larson_guard | 48,455,082 | 28.9 | +| | mimalloc | 74,433,394 | 1.8 | +| | system | 78,514,783 | 1.6 | +| 129–1024B (ws256, 1M) | hakmem-full | 48,895,987 | 29.6 | +| | hakmem-bench | 49,226,419 | **7.2** | +| | hakmem-larson_guard | 52,327,019 | 28.8 | +| 
| mimalloc | 106,310,868 | 1.9 | +| | system | 95,633,188 | 1.6 | +| 16–1024B (ws256, 1M) | hakmem-full | 48,276,749 | 29.7 | +| | hakmem-bench | 48,759,807 | **7.2** | +| | hakmem-larson_guard | 50,494,992 | 28.9 | +| | mimalloc | 126,403,649 | 1.9 | +| | system | 95,361,993 | 1.6 | + +所感 (現時点): +- スループットは system/mimalloc が優勢。Hakmem (full/guard) は C7 特化ワークロードで 44–48M ops/s 帯。 +- bench プロファイルを「実配列縮小」版に切り替えたことで、C7-only/129–1024B/16–1024B いずれも RSS は ~29MB → ~7MB まで低減(ops/s は同レンジ)。 +- RSS は system/mimalloc が圧倒的に小さい (1.6–1.9MB)。Hakmem は full/guard で ~29MB、bench 版は 7MB 前後まで圧縮できた。*** diff --git a/README_PERF_ANALYSIS.md b/README_PERF_ANALYSIS.md index 08f0f16b..1cc7029b 100644 --- a/README_PERF_ANALYSIS.md +++ b/README_PERF_ANALYSIS.md @@ -1,8 +1,35 @@ # HAKMEM Allocator Performance Analysis Results -**最新メモ (2025-12-05)**: C7 Warm/TLS Bind は本番経路を Bind-only (mode=1) に統一。Debug では `HAKMEM_WARM_TLS_BIND_C7=0/1/2` で切替可能だが、Release は常に mode=1 固定。C7-only ワークロードでは mode=1 が legacy (mode=0) 比で ~4–10x 速く、mode=2 は TLS carve 実験として残置。 -**追記 (2025-12-05, Release 修復)**: Release だけ C7 Warm が死んでいた原因は「満杯 C7 slab が Shared Pool に居残り、空スラブが Warm に渡っていなかった」こと。Acquire で C7 は空スラブ限定、Release でメタをリセットするガードを導入し、C7-only Release で ~18.8M ops/s、Random Mixed Release で ~27–28M ops/s まで回復。 -**追記 (2025-12-05, Policy Box)**: `TinyClassPolicyBox` を導入し、`HAKMEM_TINY_POLICY_PROFILE=legacy|c5_7_only|tinyplus_all` で Page/Warm ポリシーを切替可能にした。現状 legacy(PageBox= C5–C7, Warm= 全クラス cap 4/8)でランダム混在 Release は ~4.9M ops/s と低下しており、Warm 道の有効化状態を追加調査中。 +**最新メモ (2025-12-06, Release)** +- 新規比較表: `PERF_COMPARISON_ALLOCATORS.md` に HAKMEM (full/larson_guard) / mimalloc / system の ops/s と RSS を掲載。C7-only/129–1024/full いずれも HAKMEM は ~50M ops/s / ~29MB RSS、system/mimalloc は 75–126M ops/s / 1.6–1.9MB RSS で優位。 +- Random Mixed 129–1024B, ws=256, iters=1M, `HAKMEM_WARM_TLS_BIND_C7=2`: + - policy=legacy ≈ **51.5M ops/s**。TinyClassStats: C7 uc_miss=17196 / warm_hit=8597 / shared_lock=5 / tls_carve_attempt=8597 / success=8597(C5/C6 は 
uc_miss=1〜2)。 + - policy=auto(score=shared_lock*4+uc_miss)≈ **51.9M ops/s**(C7 固定 ON、C5/C6 はほぼ動かず)。 + - policy=c5_7_only ≈ **50.1M ops/s**。 +- C7-only (size=1024, ws=32, iters=200K): + - legacy: 平均 ≈ **46M ops/s**(Warm hit 3329 / tls_carve_success 3329 / shared_lock=5)。 + - auto: 平均 ≈ **44M ops/s**(統計ほぼ同じ、C7 固定 ON)。 +- C7 guard vs full(Superslab 予算+空スラブ限定): + - C7-only: full **42.4M ops/s** vs larson_guard **40.7M ops/s**(-4%)。 + - 129–1024B: full **49.0M ops/s** vs larson_guard **48.4M ops/s**(-1.2%)。 +- C5/C6 固定サイズ (size=256≒C6, ws=512, iters=1M, stats dump ON): + - policy=legacy ≈ **89.9M ops/s**(C6 uc_miss=5 / warm_hit=1 / tls_carve_success=1)。 + - policy=auto ≈ **87.5M ops/s**(統計ほぼ同じ、C5 はほぼゼロ)。 +- WarmPool-STATS を TinyClassStats と統合。`HAKMEM_WARM_POOL_STATS=1` で C7-only 実行時に hits=3329 / misses=1 / prefilled=1 を確認(warm_hit と一致)。 +- ログ抑制 ENV: `HAKMEM_TINY_POLICY_LOG` / `HAKMEM_TINY_WARM_LOG` / `HAKMEM_TINY_PAGEBOX_LOG` を 0 にすると長時間ランのノイズが減る(短時間の C7 デバッグ時だけ 1 にすると便利)。 +- C7-only (mode=2) は Release/Debug ともに ~20M ops/s 帯(ログを多めに出すと 40M 付近まで振れる)。 +- サイズ→クラス: `hak_tiny_size_to_class(size+1)` により 257–512B→C6、513–2048B→C7。512B も C7 が受ける設計で、実負荷の多くが C7 に集中する(C5/C6 は拡張枠)。 +- mimalloc/system 比較(Release, `HAKMEM_TINY_PROFILE=full HAKMEM_TINY_POLICY_PROFILE=legacy HAKMEM_WARM_TLS_BIND_C7=2`, prefault=10% デフォルト, ログOFF) + | workload (cycles/ws/size帯) | HAKMEM | mimalloc | system | 備考 | + | --- | --- | --- | --- | --- | + | C7-only (200K / 32 / 1024) | **48.8M ops/s** | 95.3M | 73.9M | mode=2, Warm+TLS carve | + | Tiny-mixed 129–1024B (1M / 256) | **50.0M** | 128.4M | 97.7M | 513–2048B を C7 が受ける設計 | + | full 16–1024B (1M / 256) | **50.9M** | 123.6M | 83.5M | デフォルト帯 | + | Tiny-only 8–128B (200K / 400) | **93.2M** | 123.7M | 66.3M | Warm/TLS はほぼ踏まれず | + 現状ベスト: mimalloc が全帯域で最速。HAKMEM は C7 専用ワークロードで 50M 付近、Tiny-only では system より高速だが mimalloc には未到達。 +**前回メモ (2025-12-05)**: C7 Warm/TLS Bind を Bind-only (mode=1) を本番経路とし、Release でも mode=2 を実験で有効化可能。C7-only で mode=1 が legacy (mode=0) 比で 
~4–10x。 +**Release 修復メモ (2025-12-05)**: 満杯 C7 slab が Shared Pool に残留していたため Warm が死んでいた。Acquire/Stage3 で空スラブ限定&リセットを入れて C7-only Release ~23.7M ops/s → 20M+ 帯まで回復。 +**Policy/OBSERVE/LEARN (2025-12-05)**: `TinyClassPolicyBox` 追加。`HAKMEM_TINY_POLICY_PROFILE=legacy|c5_7_only|tinyplus_all|auto` で Page/Warm を切替。OBSERVE では C7 がホットスポットで、`auto` プロファイルは C7 固定ON + score 上位2クラス(C5/C6 など)を自動で Tiny-Plus に昇格させる。 **分析実施日**: 2025-11-28 **分析対象**: HAKMEM allocator (commit 0ce20bb83) diff --git a/bench_random_mixed.c b/bench_random_mixed.c index 812ee33f..16c81c0b 100644 --- a/bench_random_mixed.c +++ b/bench_random_mixed.c @@ -15,10 +15,7 @@ #include #include #include -#define C7_META_COUNTER_DEFINE -#include "core/box/c7_meta_used_counter_box.h" -#undef C7_META_COUNTER_DEFINE -#include "core/box/warm_pool_rel_counters_box.h" +#include #ifdef USE_HAKMEM #include "hakmem.h" @@ -26,6 +23,9 @@ #include "core/box/c7_meta_used_counter_box.h" #include "core/box/tiny_class_stats_box.h" #include "core/box/tiny_class_policy_box.h" +#include "core/box/ss_stats_box.h" +#include "core/box/warm_pool_rel_counters_box.h" +#include "core/box/tiny_mem_stats_box.h" // Box BenchMeta: Benchmark metadata management (bypass hakmem wrapper) // Phase 15: Separate BenchMeta (slots array) from CoreAlloc (user workload) @@ -61,10 +61,30 @@ static inline int bench_is_c7_only_mode(void) { return bench_mode_c7_only; } +// C5/C6 専用ベンチモード (ENV: HAKMEM_BENCH_C5_ONLY / HAKMEM_BENCH_C6_ONLY) +static int bench_mode_c5_only = -1; +static int bench_mode_c6_only = -1; +static inline int bench_is_c5_only_mode(void) { + if (bench_mode_c5_only == -1) { + const char* e = getenv("HAKMEM_BENCH_C5_ONLY"); + bench_mode_c5_only = (e && *e && *e != '0') ? 1 : 0; + } + return bench_mode_c5_only; +} +static inline int bench_is_c6_only_mode(void) { + if (bench_mode_c6_only == -1) { + const char* e = getenv("HAKMEM_BENCH_C6_ONLY"); + bench_mode_c6_only = (e && *e && *e != '0') ? 
1 : 0; + } + return bench_mode_c6_only; +} + int main(int argc, char** argv){ int cycles = (argc>1)? atoi(argv[1]) : 10000000; // total ops (10M for steady-state measurement) int ws = (argc>2)? atoi(argv[2]) : 8192; // working-set slots uint32_t seed = (argc>3)? (uint32_t)strtoul(argv[3],NULL,10) : 1234567u; + struct rusage ru0 = {0}, ru1 = {0}; + getrusage(RUSAGE_SELF, &ru0); // サイズレンジ(Tiny-only / Non-Tiny-only の比較用) // 既定: 16..1040 bytes(元の挙動と同等) @@ -97,8 +117,14 @@ int main(int argc, char** argv){ if (min_size < 1) min_size = 1; if (max_size < min_size) max_size = min_size; - // C7 専用モード: サイズを C7 帯に固定(現行 C7 ブロックサイズ ≈ 1024B) - if (bench_is_c7_only_mode()) { + // C5/C6/C7 専用モード: サイズを各クラス帯に固定 + if (bench_is_c5_only_mode()) { + min_size = 256; + max_size = 256; + } else if (bench_is_c6_only_mode()) { + min_size = 512; + max_size = 512; + } else if (bench_is_c7_only_mode()) { min_size = 1024; max_size = 1024; } @@ -238,10 +264,13 @@ int main(int argc, char** argv){ for (int i=0;i0.0?sec:1e-9); // Include params in output to avoid confusion about test conditions printf("Throughput = %9.0f ops/s [iter=%d ws=%d] time=%.3fs\n", tput, cycles, ws, sec); + long rss_kb = ru1.ru_maxrss; + fprintf(stderr, "[RSS] max_kb=%ld\n", rss_kb); (void)allocs; (void)frees; // Box BenchMeta: Use __libc_free to bypass hakmem wrapper @@ -270,6 +299,14 @@ int main(int argc, char** argv){ tiny_class_stats_dump_global(stderr, "[CLASS_STATS_GLOBAL]"); } + const char* tiny_mem_dump_env = getenv("HAKMEM_TINY_MEM_DUMP"); + if (tiny_mem_dump_env && *tiny_mem_dump_env && *tiny_mem_dump_env != '0') { + tiny_mem_stats_dump(); + } + + // Superslab/slab counters (ENV: HAKMEM_SS_STATS_DUMP=1) + ss_stats_dump_if_requested(); + // Warm Pool Stats (ENV-gated: HAKMEM_WARM_POOL_STATS=1) extern void tiny_warm_pool_print_stats_public(void); tiny_warm_pool_print_stats_public(); diff --git a/core/box/c7_hotpath_env_box.h b/core/box/c7_hotpath_env_box.h new file mode 100644 index 00000000..dd3f5817 --- /dev/null 
+++ b/core/box/c7_hotpath_env_box.h @@ -0,0 +1,15 @@ +// c7_hotpath_env_box.h - ENV gate for C7 hotpath +// Purpose: isolate the ENV handling so hotpath code can assume gate済み。 +#pragma once + +#include <stdlib.h> + +// ENV gate: HAKMEM_TINY_C7_HOT=1 で有効化(デフォルト OFF) +static inline int tiny_c7_hot_enabled(void) { + static int g_enable = -1; + if (__builtin_expect(g_enable == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_C7_HOT"); + g_enable = (e && *e && *e != '0') ? 1 : 0; + } + return g_enable; +} diff --git a/core/box/c7_meta_used_counter_box.c b/core/box/c7_meta_used_counter_box.c new file mode 100644 index 00000000..1b4843b9 --- /dev/null +++ b/core/box/c7_meta_used_counter_box.c @@ -0,0 +1,8 @@ +// c7_meta_used_counter_box.c +// Definitions for C7 meta->used increment counters (Release/Debug共通) +#include "c7_meta_used_counter_box.h" + +_Atomic uint64_t g_c7_meta_used_inc_total = 0; +_Atomic uint64_t g_c7_meta_used_inc_backend = 0; +_Atomic uint64_t g_c7_meta_used_inc_tls = 0; +_Atomic uint64_t g_c7_meta_used_inc_front = 0; diff --git a/core/box/carve_push_box.d b/core/box/carve_push_box.d index ca8f9495..551d8ab0 100644 --- a/core/box/carve_push_box.d +++ b/core/box/carve_push_box.d @@ -17,8 +17,9 @@ core/box/carve_push_box.o: core/box/carve_push_box.c \ core/box/../tiny_region_id.h core/box/../tiny_box_geometry.h \ core/box/../ptr_track.h core/box/../hakmem_super_registry.h \ core/box/../box/ss_addr_map_box.h \ - core/box/../box/../hakmem_build_flags.h core/box/../tiny_debug_api.h \ - core/box/carve_push_box.h core/box/capacity_box.h core/box/tls_sll_box.h \ + core/box/../box/../hakmem_build_flags.h core/box/../box/super_reg_box.h \ + core/box/../tiny_debug_api.h core/box/carve_push_box.h \ + core/box/capacity_box.h core/box/tls_sll_box.h \ core/box/../hakmem_internal.h core/box/../hakmem.h \ core/box/../hakmem_config.h core/box/../hakmem_features.h \ core/box/../hakmem_sys.h core/box/../hakmem_whale.h \ @@ -70,6 +71,7 @@ core/box/../ptr_track.h: 
core/box/../hakmem_super_registry.h: core/box/../box/ss_addr_map_box.h: core/box/../box/../hakmem_build_flags.h: +core/box/../box/super_reg_box.h: core/box/../tiny_debug_api.h: core/box/carve_push_box.h: core/box/capacity_box.h: diff --git a/core/box/front_gate_box.d b/core/box/front_gate_box.d index 7dface9b..73939c81 100644 --- a/core/box/front_gate_box.d +++ b/core/box/front_gate_box.d @@ -11,20 +11,21 @@ core/box/front_gate_box.o: core/box/front_gate_box.c \ core/hakmem_tiny_superslab_constants.h core/superslab/superslab_inline.h \ core/superslab/superslab_types.h core/superslab/../tiny_box_geometry.h \ core/tiny_debug_ring.h core/tiny_remote.h core/box/ss_addr_map_box.h \ - core/box/../hakmem_build_flags.h core/tiny_debug_api.h \ - core/box/tiny_layout_box.h core/box/../hakmem_tiny_config.h \ - core/box/tiny_header_box.h core/box/tiny_layout_box.h \ - core/box/../tiny_region_id.h core/box/tls_sll_box.h \ - core/box/../hakmem_internal.h core/box/../hakmem.h \ - core/box/../hakmem_build_flags.h core/box/../hakmem_config.h \ - core/box/../hakmem_features.h core/box/../hakmem_sys.h \ - core/box/../hakmem_whale.h core/box/../box/ptr_type_box.h \ - core/box/../hakmem_debug_master.h core/box/../tiny_remote.h \ - core/box/../hakmem_tiny_integrity.h core/box/../hakmem_tiny.h \ - core/box/../ptr_track.h core/box/../ptr_trace.h \ - core/box/../hakmem_trace_master.h core/box/../hakmem_stats_master.h \ - core/box/../tiny_debug_ring.h core/box/ss_addr_map_box.h \ - core/box/../superslab/superslab_inline.h core/box/tiny_ptr_bridge_box.h \ + core/box/../hakmem_build_flags.h core/box/super_reg_box.h \ + core/tiny_debug_api.h core/box/tiny_layout_box.h \ + core/box/../hakmem_tiny_config.h core/box/tiny_header_box.h \ + core/box/tiny_layout_box.h core/box/../tiny_region_id.h \ + core/box/tls_sll_box.h core/box/../hakmem_internal.h \ + core/box/../hakmem.h core/box/../hakmem_build_flags.h \ + core/box/../hakmem_config.h core/box/../hakmem_features.h \ + core/box/../hakmem_sys.h 
core/box/../hakmem_whale.h \ + core/box/../box/ptr_type_box.h core/box/../hakmem_debug_master.h \ + core/box/../tiny_remote.h core/box/../hakmem_tiny_integrity.h \ + core/box/../hakmem_tiny.h core/box/../ptr_track.h \ + core/box/../ptr_trace.h core/box/../hakmem_trace_master.h \ + core/box/../hakmem_stats_master.h core/box/../tiny_debug_ring.h \ + core/box/ss_addr_map_box.h core/box/../superslab/superslab_inline.h \ + core/box/tiny_ptr_bridge_box.h \ core/box/../hakmem_tiny_superslab_internal.h \ core/box/../hakmem_tiny_superslab.h core/box/../box/ss_hot_cold_box.h \ core/box/../box/../superslab/superslab_types.h \ @@ -63,6 +64,7 @@ core/tiny_debug_ring.h: core/tiny_remote.h: core/box/ss_addr_map_box.h: core/box/../hakmem_build_flags.h: +core/box/super_reg_box.h: core/tiny_debug_api.h: core/box/tiny_layout_box.h: core/box/../hakmem_tiny_config.h: diff --git a/core/box/front_gate_classifier.d b/core/box/front_gate_classifier.d index a5d92954..7f68f8e2 100644 --- a/core/box/front_gate_classifier.d +++ b/core/box/front_gate_classifier.d @@ -11,8 +11,9 @@ core/box/front_gate_classifier.o: core/box/front_gate_classifier.c \ core/box/../superslab/../tiny_box_geometry.h \ core/box/../tiny_debug_ring.h core/box/../tiny_remote.h \ core/box/../box/ss_addr_map_box.h \ - core/box/../box/../hakmem_build_flags.h core/box/../hakmem_tiny.h \ - core/box/../hakmem_trace.h core/box/../hakmem_tiny_mini_mag.h \ + core/box/../box/../hakmem_build_flags.h core/box/../box/super_reg_box.h \ + core/box/../hakmem_tiny.h core/box/../hakmem_trace.h \ + core/box/../hakmem_tiny_mini_mag.h \ core/box/../box/hak_lane_classify.inc.h core/box/../box/ptr_type_box.h \ core/box/../tiny_debug_api.h core/box/../hakmem_tiny_superslab.h \ core/box/../superslab/superslab_inline.h \ @@ -38,6 +39,7 @@ core/box/../tiny_debug_ring.h: core/box/../tiny_remote.h: core/box/../box/ss_addr_map_box.h: core/box/../box/../hakmem_build_flags.h: +core/box/../box/super_reg_box.h: core/box/../hakmem_tiny.h: 
core/box/../hakmem_trace.h: core/box/../hakmem_tiny_mini_mag.h: diff --git a/core/box/remote_side_box.c b/core/box/remote_side_box.c new file mode 100644 index 00000000..ba214431 --- /dev/null +++ b/core/box/remote_side_box.c @@ -0,0 +1,88 @@ +#include "remote_side_box.h" + +#include +#include +#include +#include + +#ifndef REM_SIDE_LOG2 +#define REM_SIDE_LOG2 20 +#endif + +static _Atomic uint32_t g_remote_log2 = REM_SIDE_LOG2; +static _Atomic uint32_t g_remote_size = (1u << REM_SIDE_LOG2); +static _Atomic uint32_t g_remote_mask = (1u << REM_SIDE_LOG2) - 1; +static _Atomic int g_remote_profile_inited = 0; +static rem_side_entry* g_remote_slots = NULL; +static _Atomic int g_remote_allocated = 0; + +static void remote_side_apply_profile(const char* profile) { + if (g_remote_profile_inited) { + return; + } + const char* env_profile = profile ? profile : getenv("HAKMEM_PROFILE"); + int is_bench = (env_profile && strcmp(env_profile, "bench") == 0); + + uint32_t log2 = REM_SIDE_LOG2; + if (is_bench && REM_SIDE_LOG2 > 4) { + // bench 用: ハッシュ幅だけ 1/8〜1/16 程度に論理縮小 + log2 = REM_SIDE_LOG2 - 3; // 1/8 + if (log2 < 12) { + log2 = 12; // 4096 entries までは確保 + } + } + + uint32_t size = (1u << log2); + uint32_t mask = size - 1; + + atomic_store_explicit(&g_remote_log2, log2, memory_order_relaxed); + atomic_store_explicit(&g_remote_size, size, memory_order_relaxed); + atomic_store_explicit(&g_remote_mask, mask, memory_order_relaxed); + atomic_store_explicit(&g_remote_profile_inited, 1, memory_order_release); +} + +void remote_side_init(RemoteSideBox* box, const char* profile) { + (void)box; + remote_side_apply_profile(profile); + + if (atomic_load_explicit(&g_remote_allocated, memory_order_acquire)) { + return; + } + + uint32_t size = remote_side_effective_size(); + g_remote_slots = (rem_side_entry*)calloc(size, sizeof(rem_side_entry)); + if (!g_remote_slots) { + fprintf(stderr, "[REMOTE_SIDE] failed to allocate %zu bytes\n", + (size_t)size * sizeof(rem_side_entry)); + abort(); + } + 
atomic_store_explicit(&g_remote_allocated, 1, memory_order_release); +} + +uint32_t remote_side_effective_log2(void) { + if (!atomic_load_explicit(&g_remote_profile_inited, memory_order_acquire)) { + remote_side_apply_profile(NULL); + } + return atomic_load_explicit(&g_remote_log2, memory_order_relaxed); +} + +uint32_t remote_side_effective_size(void) { + if (!atomic_load_explicit(&g_remote_profile_inited, memory_order_acquire)) { + remote_side_apply_profile(NULL); + } + return atomic_load_explicit(&g_remote_size, memory_order_relaxed); +} + +uint32_t remote_side_effective_mask(void) { + if (!atomic_load_explicit(&g_remote_profile_inited, memory_order_acquire)) { + remote_side_apply_profile(NULL); + } + return atomic_load_explicit(&g_remote_mask, memory_order_relaxed); +} + +rem_side_entry* remote_side_table(void) { + if (!atomic_load_explicit(&g_remote_allocated, memory_order_acquire)) { + remote_side_init(NULL, NULL); + } + return g_remote_slots; +} diff --git a/core/box/remote_side_box.h b/core/box/remote_side_box.h new file mode 100644 index 00000000..9a2ef5a2 --- /dev/null +++ b/core/box/remote_side_box.h @@ -0,0 +1,21 @@ +#pragma once +// RemoteSideBox: tiny_remote の REM_SIDE をプロファイルで論理的に絞るための薄いラッパ + +#include +#include + +typedef struct rem_side_entry { + _Atomic(uintptr_t) key; // node pointer + _Atomic(uintptr_t) val; // next pointer +} rem_side_entry; + +typedef struct RemoteSideBox RemoteSideBox; + +// profile が NULL のときは HAKMEM_PROFILE を見る。 +void remote_side_init(RemoteSideBox* box, const char* profile); + +// 有効サイズ/マスク(配列自体は REM_SIDE_SIZE のまま) +uint32_t remote_side_effective_size(void); +uint32_t remote_side_effective_mask(void); +uint32_t remote_side_effective_log2(void); +rem_side_entry* remote_side_table(void); diff --git a/core/box/shared_pool_box.c b/core/box/shared_pool_box.c new file mode 100644 index 00000000..97070c0b --- /dev/null +++ b/core/box/shared_pool_box.c @@ -0,0 +1,50 @@ +#include "shared_pool_box.h" + +#include +#include +#include + 
+// 既存の g_shared_pool 配列上に「論理的な上限」だけを被せる。 +static _Atomic uint32_t g_sp_total_limit = 0; // 0 = 無制限(現行のまま) +static _Atomic uint32_t g_sp_class_limit = 0; // 0 = 無制限 +static _Atomic int g_sp_profile_inited = 0; + +static void shared_pool_apply_profile(const char* profile) { + if (g_sp_profile_inited) { + return; + } + const char* env_profile = profile ? profile : getenv("HAKMEM_PROFILE"); + int is_bench = (env_profile && strcmp(env_profile, "bench") == 0); + + uint32_t total_limit = 0; + uint32_t class_limit = 0; + if (is_bench) { + // bench 用: ひとまず控えめな論理上限だけ入れる + total_limit = 65536; // 元の 1M よりかなり少ない + class_limit = 2048; // クラスあたりの active slot 上限の目安 + } + + atomic_store_explicit(&g_sp_total_limit, total_limit, memory_order_relaxed); + atomic_store_explicit(&g_sp_class_limit, class_limit, memory_order_relaxed); + atomic_store_explicit(&g_sp_profile_inited, 1, memory_order_release); +} + +void shared_pool_box_init(SharedPoolBox* box, const char* profile) { + (void)box; + shared_pool_apply_profile(profile); +} + +uint32_t shared_pool_effective_total_slots(void) { + if (!atomic_load_explicit(&g_sp_profile_inited, memory_order_acquire)) { + shared_pool_apply_profile(NULL); + } + return atomic_load_explicit(&g_sp_total_limit, memory_order_relaxed); +} + +uint32_t shared_pool_effective_class_slots(int class_idx) { + (void)class_idx; + if (!atomic_load_explicit(&g_sp_profile_inited, memory_order_acquire)) { + shared_pool_apply_profile(NULL); + } + return atomic_load_explicit(&g_sp_class_limit, memory_order_relaxed); +} diff --git a/core/box/shared_pool_box.h b/core/box/shared_pool_box.h new file mode 100644 index 00000000..0f342625 --- /dev/null +++ b/core/box/shared_pool_box.h @@ -0,0 +1,18 @@ +#pragma once +// SharedPoolBox: 既存の g_shared_pool の上に「論理上限」を被せる軽量ラッパ。 +// 目的: +// - HAKMEM_PROFILE=bench などのときに Shared Pool の増殖を論理的に抑える。 +// - 配列サイズ自体は現状のまま(BSS をまだ縮めない)。 + +#include <stdint.h> + +typedef struct SharedPoolBox SharedPoolBox; + +// profile が NULL のときは HAKMEM_PROFILE を読む。 
+void shared_pool_box_init(SharedPoolBox* box, const char* profile); + +// これ以上増やさない総枠。full では元の制限なし、bench では小さめ。 +uint32_t shared_pool_effective_total_slots(void); + +// クラス別の論理上限(active slots がこの値を超えたら新規追加を抑制) +uint32_t shared_pool_effective_class_slots(int class_idx); diff --git a/core/box/ss_ace_box.c b/core/box/ss_ace_box.c index f7ec175b..182c813a 100644 --- a/core/box/ss_ace_box.c +++ b/core/box/ss_ace_box.c @@ -175,8 +175,12 @@ static void ace_observe_and_decide(int k) { int ss_count = 0; uint32_t total_live = 0; - for (int i = 0; i < SUPER_REG_SIZE; i++) { - SuperRegEntry* e = &g_super_reg[i]; + SuperRegEntry* reg = super_reg_entries(); + int reg_cap = super_reg_effective_size(); + if (!reg || reg_cap <= 0) return; + + for (int i = 0; i < reg_cap; i++) { + SuperRegEntry* e = &reg[i]; // Atomic read (thread-safe) uintptr_t base = atomic_load_explicit( diff --git a/core/box/ss_allocation_box.c b/core/box/ss_allocation_box.c index 1bc2676a..13658277 100644 --- a/core/box/ss_allocation_box.c +++ b/core/box/ss_allocation_box.c @@ -284,6 +284,10 @@ SuperSlab* superslab_allocate(uint8_t size_class) { } } while (0); + if (!from_cache) { + ss_stats_on_ss_alloc_class(size_class); + } + return ss; } diff --git a/core/box/ss_budget_box.c b/core/box/ss_budget_box.c new file mode 100644 index 00000000..61e5f171 --- /dev/null +++ b/core/box/ss_budget_box.c @@ -0,0 +1,122 @@ +// ss_budget_box.c - Superslab Budget Box +// Box Theory: Budget/limit guard for Superslab growth. +// - ENV: +// HAKMEM_SS_BUDGET_GLOBAL : global cap (0 = unlimited, default varies) +// HAKMEM_SS_BUDGET_C0..C7 : per-class cap override (0 = unlimited) +// HAKMEM_SS_BUDGET_C7 : shorthand most often used +// - Profile hint: +// HAKMEM_TINY_PROFILE=larson_guard → stricter defaults. 
+ +#include "ss_budget_box.h" + +#include +#include +#include +#include + +#include "ss_stats_box.h" + +static _Atomic int g_budget_init = 0; +static int g_ss_budget_global = 0; +static int g_ss_budget_per_class[8] = {0}; + +static int ss_budget_parse_env(const char* name, int fallback) { + const char* e = getenv(name); + if (e && *e) { + int v = atoi(e); + if (v < 0) v = 0; + return v; + } + return fallback; +} + +static void ss_budget_init_once(void) { + if (atomic_load_explicit(&g_budget_init, memory_order_acquire)) { + return; + } + + // Profile hint: larson_guard uses tighter defaults to cap RSS. + const char* profile = getenv("HAKMEM_TINY_PROFILE"); + int is_larson_guard = (profile && strcasecmp(profile, "larson_guard") == 0); + + // Defaults: unlimited unless larson_guard + int default_global = is_larson_guard ? 512 : 0; + g_ss_budget_global = ss_budget_parse_env("HAKMEM_SS_BUDGET_GLOBAL", default_global); + + for (int i = 0; i < 8; i++) { + int def = 0; + if (is_larson_guard) { + // Larson guard: modest per-class caps, C7 is a bit looser. + def = (i == 7) ? 
192 : 96; + } + g_ss_budget_per_class[i] = def; + } + + // Per-class overrides: HAKMEM_SS_BUDGET_C7 or HAKMEM_SS_BUDGET_C{idx} + for (int i = 0; i < 8; i++) { + char buf[32]; + snprintf(buf, sizeof(buf), "HAKMEM_SS_BUDGET_C%d", i); + int override = ss_budget_parse_env(buf, g_ss_budget_per_class[i]); + g_ss_budget_per_class[i] = override; + } + // Support the legacy shorthand HAKMEM_SS_BUDGET_C7 + g_ss_budget_per_class[7] = + ss_budget_parse_env("HAKMEM_SS_BUDGET_C7", g_ss_budget_per_class[7]); + + atomic_store_explicit(&g_budget_init, 1, memory_order_release); +} + +static inline uint64_t ss_budget_global_live_sum(void) { + uint64_t sum = 0; + for (int i = 0; i < 8; i++) { + sum += atomic_load_explicit(&g_ss_live_by_class[i], memory_order_relaxed); + } + return sum; +} + +bool ss_budget_on_alloc(int class_idx) { + ss_budget_init_once(); + + if (class_idx < 0 || class_idx >= 8) { + return true; // outside Tiny; do not gate here + } + + uint64_t live_cls = atomic_load_explicit(&g_ss_live_by_class[class_idx], + memory_order_relaxed); + int class_cap = g_ss_budget_per_class[class_idx]; + if (class_cap > 0 && live_cls >= (uint64_t)class_cap) { + static _Atomic uint32_t log_once = 0; + if (atomic_fetch_add_explicit(&log_once, 1, memory_order_relaxed) < 4) { + fprintf(stderr, + "[SS_BUDGET_DENY] class=%d live=%llu cap=%d\n", + class_idx, + (unsigned long long)live_cls, + class_cap); + } + return false; + } + + int global_cap = g_ss_budget_global; + if (global_cap > 0) { + uint64_t live_total = ss_budget_global_live_sum(); + if (live_total >= (uint64_t)global_cap) { + static _Atomic uint32_t g_log_once = 0; + if (atomic_fetch_add_explicit(&g_log_once, 1, memory_order_relaxed) < 4) { + fprintf(stderr, + "[SS_BUDGET_DENY_GLOBAL] live_total=%llu cap=%d class=%d\n", + (unsigned long long)live_total, + global_cap, + class_idx); + } + return false; + } + } + + return true; +} + +void ss_budget_on_free(int class_idx) { + (void)class_idx; + ss_budget_init_once(); + // We currently 
rely on ss_stats_on_ss_free_class() to update live counters. +} diff --git a/core/box/ss_budget_box.h b/core/box/ss_budget_box.h new file mode 100644 index 00000000..1000ed79 --- /dev/null +++ b/core/box/ss_budget_box.h @@ -0,0 +1,19 @@ +// ss_budget_box.h - Superslab Budget Box +// Box Theory: centralize budget/limit checks for Superslab allocations. +// Responsibilities: +// - Read budget ENV once (global + per-class override) +// - Provide cheap checks before allocating new Superslabs +// - Allow symmetric free hook for future accounting + +#ifndef HAKMEM_SS_BUDGET_BOX_H +#define HAKMEM_SS_BUDGET_BOX_H + +#include <stdbool.h> + +// Return false when allocation should be denied due to budget exhaustion. +bool ss_budget_on_alloc(int class_idx); + +// Hook for future bookkeeping; currently a no-op placeholder. +void ss_budget_on_free(int class_idx); + +#endif // HAKMEM_SS_BUDGET_BOX_H diff --git a/core/box/ss_slab_reset_box.h b/core/box/ss_slab_reset_box.h index f58c3d07..3f27c3c6 100644 --- a/core/box/ss_slab_reset_box.h +++ b/core/box/ss_slab_reset_box.h @@ -13,12 +13,15 @@ static inline void ss_slab_reset_meta_for_tiny(SuperSlab* ss, if (!ss) return; if (slab_idx < 0 || slab_idx >= ss_slabs_capacity(ss)) return; + // class_idx < 0 means "unassigned" (255). Otherwise keep the requested class. + uint8_t target_class = (class_idx < 0) ? 255u : (uint8_t)class_idx; + TinySlabMeta* meta = &ss->slabs[slab_idx]; meta->used = 0; meta->carved = 0; meta->freelist = NULL; - meta->class_idx = (uint8_t)class_idx; - ss->class_map[slab_idx] = (uint8_t)class_idx; + meta->class_idx = target_class; + ss->class_map[slab_idx] = target_class; // Reset remote queue state to avoid stale pending frees on reuse. 
atomic_store_explicit(&ss->remote_heads[slab_idx], 0, memory_order_relaxed); diff --git a/core/box/ss_stats_box.c b/core/box/ss_stats_box.c index 52d6a1ec..73e109c3 100644 --- a/core/box/ss_stats_box.c +++ b/core/box/ss_stats_box.c @@ -1,8 +1,10 @@ // ss_stats_box.c - SuperSlab Statistics Box Implementation #include "ss_stats_box.h" +#include #include "../superslab/superslab_inline.h" #include #include +#include // ============================================================================ // Global Statistics State @@ -30,6 +32,11 @@ _Atomic uint64_t g_free_ss_enter = 0; // hak_tiny_free_superslab() entr _Atomic uint64_t g_free_local_box_calls = 0; // same-thread freelist pushes _Atomic uint64_t g_free_remote_box_calls = 0; // cross-thread remote pushes +// Superslab/slab observability (Tiny-only; relaxed updates) +_Atomic uint64_t g_ss_live_by_class[8] = {0}; +_Atomic uint64_t g_ss_empty_events[8] = {0}; +_Atomic uint64_t g_slab_live_events[8] = {0}; + // ============================================================================ // Statistics Update Implementation // ============================================================================ @@ -56,6 +63,36 @@ void ss_stats_cache_store(void) { pthread_mutex_unlock(&g_superslab_lock); } +void ss_stats_on_ss_alloc_class(int class_idx) { + if (class_idx >= 0 && class_idx < 8) { + atomic_fetch_add_explicit(&g_ss_live_by_class[class_idx], 1, memory_order_relaxed); + } +} + +void ss_stats_on_ss_free_class(int class_idx) { + if (class_idx >= 0 && class_idx < 8) { + // Saturating-style decrement to avoid underflow from mismatched hooks + uint64_t prev = atomic_load_explicit(&g_ss_live_by_class[class_idx], memory_order_relaxed); + if (prev > 0) { + atomic_fetch_sub_explicit(&g_ss_live_by_class[class_idx], 1, memory_order_relaxed); + } + } +} + +void ss_stats_on_ss_scan(int class_idx, int slab_live, int is_empty) { + if (class_idx < 0 || class_idx >= 8) { + return; + } + if (slab_live > 0) { + 
atomic_fetch_add_explicit(&g_slab_live_events[class_idx], + (uint64_t)slab_live, + memory_order_relaxed); + } + if (is_empty) { + atomic_fetch_add_explicit(&g_ss_empty_events[class_idx], 1, memory_order_relaxed); + } +} + // ============================================================================ // Statistics Reporting Implementation // ============================================================================ @@ -92,3 +129,23 @@ void superslab_print_global_stats(void) { printf("Total bytes allocated: %lu MB\n", g_bytes_allocated / (1024 * 1024)); pthread_mutex_unlock(&g_superslab_lock); } + +void ss_stats_dump_if_requested(void) { + const char* env = getenv("HAKMEM_SS_STATS_DUMP"); + if (!env || !*env || *env == '0') { + return; + } + fprintf(stderr, "[SS_STATS] class live empty_events slab_live_events\n"); + for (int c = 0; c < 8; c++) { + uint64_t live = atomic_load_explicit(&g_ss_live_by_class[c], memory_order_relaxed); + uint64_t empty = atomic_load_explicit(&g_ss_empty_events[c], memory_order_relaxed); + uint64_t slab_live = atomic_load_explicit(&g_slab_live_events[c], memory_order_relaxed); + if (live || empty || slab_live) { + fprintf(stderr, " C%d: live=%llu empty=%llu slab_live=%llu\n", + c, + (unsigned long long)live, + (unsigned long long)empty, + (unsigned long long)slab_live); + } + } +} diff --git a/core/box/ss_stats_box.h b/core/box/ss_stats_box.h index 8a8b4833..11f44693 100644 --- a/core/box/ss_stats_box.h +++ b/core/box/ss_stats_box.h @@ -43,6 +43,16 @@ extern _Atomic uint64_t g_free_ss_enter; extern _Atomic uint64_t g_free_local_box_calls; extern _Atomic uint64_t g_free_remote_box_calls; +// ============================================================================ +// Superslab / Slab live-state observability (Tiny classes 0..7) +// ============================================================================ +// NOTE: These are “event-style” counters updated at key transitions +// (alloc/free/reset) to keep overhead minimal. 
They are intended for +// regression detection and coarse budgeting rather than exact gauges. +extern _Atomic uint64_t g_ss_live_by_class[8]; // +1 on alloc, -1 on free (best-effort) +extern _Atomic uint64_t g_ss_empty_events[8]; // Observations of fully-empty Superslabs +extern _Atomic uint64_t g_slab_live_events[8]; // Observations of live slabs during scans + // ============================================================================ // Statistics Update API // ============================================================================ @@ -59,6 +69,11 @@ void ss_stats_cache_reuse(void); // Thread-safe: mutex protected void ss_stats_cache_store(void); +// Event-style observability helpers (Tiny classes only, relaxed atomics) +void ss_stats_on_ss_alloc_class(int class_idx); +void ss_stats_on_ss_free_class(int class_idx); +void ss_stats_on_ss_scan(int class_idx, int slab_live, int is_empty); + // ============================================================================ // Statistics Reporting API // ============================================================================ @@ -69,4 +84,7 @@ void superslab_print_stats(SuperSlab* ss); // Print global SuperSlab statistics void superslab_print_global_stats(void); +// ENV: HAKMEM_SS_STATS_DUMP=1 → dump coarse Superslab/slab counters once +void ss_stats_dump_if_requested(void); + #endif // HAKMEM_SS_STATS_BOX_H diff --git a/core/box/ss_tls_bind_box.h b/core/box/ss_tls_bind_box.h index 6fa9eafe..be4202c6 100644 --- a/core/box/ss_tls_bind_box.h +++ b/core/box/ss_tls_bind_box.h @@ -119,7 +119,7 @@ static inline int ss_tls_bind_one(int class_idx, tls->slab_base = tiny_slab_base_for(ss, slab_idx); // Notify Tiny Page Box (if enabled for this class) - tiny_page_box_on_new_slab(tls); + tiny_page_box_on_new_slab(class_idx, tls); // Sanity check: TLS must now describe this slab for this class. // On failure, revert TLS to safe state and return 0. 
diff --git a/core/box/super_reg_box.c b/core/box/super_reg_box.c new file mode 100644 index 00000000..98eae18b --- /dev/null +++ b/core/box/super_reg_box.c @@ -0,0 +1,143 @@ +#include "super_reg_box.h" + +#include +#include +#include +#include + +#include "hakmem_super_registry.h" + +// プロファイル別の実容量・論理上限 +static _Atomic int g_super_reg_effective_size = SUPER_REG_SIZE; +static _Atomic int g_super_reg_effective_mask = SUPER_REG_MASK; +static _Atomic int g_super_reg_effective_per_class = SUPER_REG_PER_CLASS; +static _Atomic int g_super_reg_profile_inited = 0; + +// 動的に確保する実配列 +static SuperRegEntry* g_super_reg_entries = NULL; +static SuperSlab** g_super_reg_by_class_slots = NULL; +static int g_super_reg_by_class_stride = SUPER_REG_PER_CLASS; +static _Atomic int g_super_reg_allocated = 0; + +static inline int super_reg_clamp_power_of_two(int requested, int fallback) { + // SUPER_REG_SIZE は 2 のべき乗なので、requested もそれ未満のべき乗に丸める。 + if (requested <= 0 || requested > SUPER_REG_SIZE) { + return fallback; + } + // 丸め: 最上位ビットだけを残す(2 のべき乗に丸め下げ) + int v = requested; + v |= v >> 1; + v |= v >> 2; + v |= v >> 4; + v |= v >> 8; + v |= v >> 16; + v = v - (v >> 1); + // 有効値は最低でも 1024 にしておく + if (v < 1024) { + v = 1024; + } + return v; +} + +static void super_reg_apply_profile(const char* profile) { + if (g_super_reg_profile_inited) { + return; + } + + const char* env_profile = profile ? profile : getenv("HAKMEM_PROFILE"); + const int is_bench = (env_profile && strcmp(env_profile, "bench") == 0); + + int eff_size = SUPER_REG_SIZE; + int eff_per_class = SUPER_REG_PER_CLASS; + + if (is_bench) { + // 論理上の利用範囲だけ縮める(配列は従来サイズのまま) + eff_size = SUPER_REG_SIZE >> 3; // 1/8 に論理制限 + eff_per_class = SUPER_REG_PER_CLASS >> 4; // 1/16 + } + + eff_size = super_reg_clamp_power_of_two(eff_size, SUPER_REG_SIZE); + eff_per_class = eff_per_class > 0 ? 
eff_per_class : SUPER_REG_PER_CLASS; + + atomic_store_explicit(&g_super_reg_effective_size, eff_size, memory_order_relaxed); + atomic_store_explicit(&g_super_reg_effective_mask, eff_size - 1, memory_order_relaxed); + atomic_store_explicit(&g_super_reg_effective_per_class, + eff_per_class, + memory_order_relaxed); + atomic_store_explicit(&g_super_reg_profile_inited, 1, memory_order_release); +} + +void super_reg_init(SuperRegBox* box, const char* profile) { + (void)box; + super_reg_apply_profile(profile); + + if (atomic_load_explicit(&g_super_reg_allocated, memory_order_acquire)) { + return; + } + + int eff_size = super_reg_effective_size(); + int per_class = super_reg_effective_per_class(); + + // Allocate registry table + size_t reg_bytes = (size_t)eff_size * sizeof(SuperRegEntry); + g_super_reg_entries = (SuperRegEntry*)calloc(eff_size, sizeof(SuperRegEntry)); + if (!g_super_reg_entries) { + fprintf(stderr, "[SUPER_REG] failed to allocate %zu bytes for registry\n", reg_bytes); + abort(); + } + + // Allocate per-class table (contiguous 1D block) + size_t per_class_bytes = (size_t)TINY_NUM_CLASSES * (size_t)per_class * sizeof(SuperSlab*); + g_super_reg_by_class_slots = (SuperSlab**)calloc(TINY_NUM_CLASSES * (size_t)per_class, + sizeof(SuperSlab*)); + if (!g_super_reg_by_class_slots) { + fprintf(stderr, "[SUPER_REG] failed to allocate %zu bytes for per-class registry\n", + per_class_bytes); + abort(); + } + g_super_reg_by_class_stride = per_class; + + atomic_store_explicit(&g_super_reg_allocated, 1, memory_order_release); +} + +int super_reg_effective_size(void) { + if (!atomic_load_explicit(&g_super_reg_profile_inited, memory_order_acquire)) { + super_reg_apply_profile(NULL); + } + return atomic_load_explicit(&g_super_reg_effective_size, memory_order_relaxed); +} + +int super_reg_effective_mask(void) { + if (!atomic_load_explicit(&g_super_reg_profile_inited, memory_order_acquire)) { + super_reg_apply_profile(NULL); + } + return 
atomic_load_explicit(&g_super_reg_effective_mask, memory_order_relaxed); +} + +int super_reg_effective_per_class(void) { + if (!atomic_load_explicit(&g_super_reg_profile_inited, memory_order_acquire)) { + super_reg_apply_profile(NULL); + } + return atomic_load_explicit(&g_super_reg_effective_per_class, memory_order_relaxed); +} + +SuperRegEntry* super_reg_entries(void) { + if (!atomic_load_explicit(&g_super_reg_allocated, memory_order_acquire)) { + super_reg_init(NULL, NULL); + } + return g_super_reg_entries; +} + +SuperSlab** super_reg_by_class_slots(void) { + if (!atomic_load_explicit(&g_super_reg_allocated, memory_order_acquire)) { + super_reg_init(NULL, NULL); + } + return g_super_reg_by_class_slots; +} + +int super_reg_by_class_stride(void) { + if (!atomic_load_explicit(&g_super_reg_allocated, memory_order_acquire)) { + super_reg_init(NULL, NULL); + } + return g_super_reg_by_class_stride; +} diff --git a/core/box/super_reg_box.h b/core/box/super_reg_box.h new file mode 100644 index 00000000..102319bd --- /dev/null +++ b/core/box/super_reg_box.h @@ -0,0 +1,77 @@ +#pragma once + +#include +#include +#include +#ifndef TINY_NUM_CLASSES +#define TINY_NUM_CLASSES 8 +#endif +// SuperRegBox (設計メモ / API スタブ) +// ------------------------------------- +// 役割: +// - g_super_reg / g_super_reg_by_class への直接依存を断ち、レジストリ容量を +// プロファイル(full/prod/bench/larson_guard 等)で切り替えられるようにする箱。 +// - Box 内部だけで容量決定・確保・破棄を閉じ、外側は薄い API を呼ぶだけにする。 +// +// プロファイル方針(案): +// - full/prod : 現行の SUPER_REG_SIZE (=1,048,576) と SUPER_REG_PER_CLASS (=16,384) を維持 +// - bench : SUPER_REG_SIZE を 1/16〜1/8 程度 (例: 65,536)、per-class は 1,024 などに縮小 +// - guard : bench 同等かさらに小さくして fail-fast(ENOMEM)を優先 +// +// スレッド安全性: +// - 既存のロック/atomic 公算を流用しつつ、構造体にまとめて「初期化済みか」を判定。 +// +// 想定 API(実装は今後): + +typedef struct SuperSlab SuperSlab; +typedef struct SuperRegBox SuperRegBox; +struct SuperRegEntry; + +// プロファイル/ENV に応じて容量を決定し、内部配列を確保。 +// profile が NULL のときは HAKMEM_PROFILE (bench / full など) を読む。 +void 
super_reg_init(SuperRegBox* box, const char* profile); + +// 現在有効なスロット数/マスク +int super_reg_effective_size(void); +int super_reg_effective_mask(void); +int super_reg_effective_per_class(void); + +// レジストリ実体へのアクセス(Box 内部で動的確保) +struct SuperRegEntry* super_reg_entries(void); +SuperSlab** super_reg_by_class_slots(void); +int super_reg_by_class_stride(void); +static inline SuperSlab* super_reg_by_class_at(int class_idx, int idx) { + SuperSlab** slots = super_reg_by_class_slots(); + int stride = super_reg_by_class_stride(); + if (!slots || stride <= 0 || class_idx < 0 || idx < 0 || + class_idx >= TINY_NUM_CLASSES || idx >= stride) { + return NULL; + } + return slots[class_idx * stride + idx]; +} +static inline void super_reg_by_class_set(int class_idx, int idx, SuperSlab* ss) { + SuperSlab** slots = super_reg_by_class_slots(); + int stride = super_reg_by_class_stride(); + if (!slots || stride <= 0 || class_idx < 0 || idx < 0 || + class_idx >= TINY_NUM_CLASSES || idx >= stride) { + return; + } + slots[class_idx * stride + idx] = ss; +} + +// Superslab 登録/解除(既存の hak_super_register/unregister 相当を箱内に閉じ込める) +bool super_reg_register(SuperRegBox* box, SuperSlab* ss, uint32_t class_idx); +void super_reg_unregister(SuperRegBox* box, SuperSlab* ss, uint32_t class_idx); + +// アドレス検索/クラス別イテレーション(必要最小限の薄い API) +SuperSlab* super_reg_find_by_addr(SuperRegBox* box, void* ptr); +SuperSlab* super_reg_iter_for_class(SuperRegBox* box, uint32_t class_idx, void** cursor); + +// 将来のメモリ削減策(コメントのみ) +// - g_super_reg/g_super_reg_by_class を「malloc/mmap でプロファイル毎に確保」するようにし、 +// BSS から切り離す。 +// - bench プロファイルでは固定長を大幅に縮め、足りなければ ENOMEM を返して fail-fast。 +// - prod では現行サイズを維持しつつ、Box 境界でのみアクセスさせる。*** + +// 前方宣言(実装は既存の superslab に依存) +// typedef struct SuperSlab SuperSlab; // 上で宣言済み diff --git a/core/box/tiny_c7_hotpath_box.h b/core/box/tiny_c7_hotpath_box.h new file mode 100644 index 00000000..2e982235 --- /dev/null +++ b/core/box/tiny_c7_hotpath_box.h @@ -0,0 +1,63 @@ +// C7 
// Per-thread sampling gate for C7 statistics.
// Returns 1 on every 16th call made by the calling thread and 0 otherwise.
static inline int tiny_c7_stats_sample(void) {
    static __thread unsigned calls_seen = 0;
    calls_seen += 1u;
    // A multiple of 16 has its low four bits clear.
    return (calls_seen % 16u == 0u) ? 1 : 0;
}
+#pragma once + +// いまは no-op。将来 Warm spill を挿すときに差し替える。 +static inline int tiny_c7_warm_spill_one(void* base) { + (void)base; + return 0; +} diff --git a/core/box/tiny_class_policy_box.c b/core/box/tiny_class_policy_box.c index 17246df0..0f53e1df 100644 --- a/core/box/tiny_class_policy_box.c +++ b/core/box/tiny_class_policy_box.c @@ -6,17 +6,20 @@ #include #include #include "tiny_policy_learner_box.h" +#include "tiny_mem_stats_box.h" TinyClassPolicy g_tiny_class_policy[TINY_NUM_CLASSES]; static _Atomic int g_tiny_class_policy_init_done = 0; static _Atomic int g_tiny_class_policy_logged = 0; static _Atomic int g_tiny_class_policy_profile_auto = 0; +static _Atomic int g_tiny_class_policy_mem_recorded = 0; static inline TinyClassPolicy tiny_class_policy_default_entry(void) { TinyClassPolicy p = {0}; p.page_box_enabled = 0; p.warm_enabled = 0; p.warm_cap = 0; + p.tls_carve_enabled = 0; return p; } @@ -30,6 +33,7 @@ static void tiny_class_policy_set_legacy(void) { for (int i = 0; i < TINY_NUM_CLASSES; i++) { g_tiny_class_policy[i].warm_enabled = 1; g_tiny_class_policy[i].warm_cap = (i < 5) ? 4 : 8; + g_tiny_class_policy[i].tls_carve_enabled = (i >= 5) ? 1 : 0; } for (int i = 5; i < TINY_NUM_CLASSES; i++) { g_tiny_class_policy[i].page_box_enabled = 1; @@ -45,6 +49,7 @@ static void tiny_class_policy_set_c5_7_only(void) { g_tiny_class_policy[i].page_box_enabled = 1; g_tiny_class_policy[i].warm_enabled = 1; g_tiny_class_policy[i].warm_cap = 8; + g_tiny_class_policy[i].tls_carve_enabled = 1; } } @@ -53,6 +58,18 @@ static void tiny_class_policy_set_tinyplus_all(void) { tiny_class_policy_set_legacy(); } +static void tiny_class_policy_set_larson_guard(void) { + // Start from legacy, then tighten warm caps to reduce RSS for larson-style loads. 
+ tiny_class_policy_set_legacy(); + for (int i = 0; i < TINY_NUM_CLASSES; i++) { + if (i < 5) { + g_tiny_class_policy[i].warm_cap = 2; + } else { + g_tiny_class_policy[i].warm_cap = 4; + } + } +} + static void tiny_class_policy_set_auto(void) { // auto プロファイルは legacy をベースにして、後段の learner に委譲 tiny_class_policy_set_legacy(); @@ -72,6 +89,10 @@ static const char* tiny_class_policy_set_profile(const char* profile) { tiny_class_policy_set_tinyplus_all(); atomic_store_explicit(&g_tiny_class_policy_profile_auto, 0, memory_order_release); return "tinyplus_all"; + } else if (strcasecmp(profile, "larson_guard") == 0) { + tiny_class_policy_set_larson_guard(); + atomic_store_explicit(&g_tiny_class_policy_profile_auto, 0, memory_order_release); + return "larson_guard"; } else if (strcasecmp(profile, "auto") == 0) { tiny_class_policy_set_auto(); return "auto"; @@ -84,16 +105,20 @@ static const char* tiny_class_policy_set_profile(const char* profile) { } void tiny_class_policy_dump(const char* tag) { + if (!tiny_policy_log_enabled()) { + return; + } const char* header = tag ? 
tag : "[POLICY_DUMP]"; fprintf(stderr, "%s\n", header); for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) { TinyClassPolicy* p = &g_tiny_class_policy[cls]; fprintf(stderr, - " C%d: page=%u warm=%u cap=%u\n", + " C%d: page=%u warm=%u cap=%u tls_carve=%u\n", cls, p->page_box_enabled, p->warm_enabled, - p->warm_cap); + p->warm_cap, + p->tls_carve_enabled); } } @@ -105,8 +130,13 @@ void tiny_class_policy_init_once(void) { const char* profile = getenv("HAKMEM_TINY_POLICY_PROFILE"); const char* active_profile = tiny_class_policy_set_profile(profile); + if (atomic_exchange_explicit(&g_tiny_class_policy_mem_recorded, 1, memory_order_acq_rel) == 0) { + tiny_mem_stats_add_policy_stats((ssize_t)sizeof(g_tiny_class_policy)); + } + // 1-shot ダンプでポリシーの内容を可視化(デバッグ用) - if (atomic_exchange_explicit(&g_tiny_class_policy_logged, 1, memory_order_acq_rel) == 0) { + if (tiny_policy_log_enabled() && + atomic_exchange_explicit(&g_tiny_class_policy_logged, 1, memory_order_acq_rel) == 0) { fprintf(stderr, "[POLICY_INIT] profile=%s\n", active_profile); tiny_class_policy_dump(NULL); } @@ -121,3 +151,8 @@ void tiny_class_policy_refresh_auto(void) { } tiny_policy_learner_tick(); } + +int tiny_class_policy_is_auto(void) { + tiny_class_policy_init_once(); + return atomic_load_explicit(&g_tiny_class_policy_profile_auto, memory_order_acquire); +} diff --git a/core/box/tiny_class_policy_box.h b/core/box/tiny_class_policy_box.h index 16618bbd..6e0dc201 100644 --- a/core/box/tiny_class_policy_box.h +++ b/core/box/tiny_class_policy_box.h @@ -15,23 +15,37 @@ #include #include +#include #include "../hakmem_tiny_config.h" typedef struct TinyClassPolicy { uint8_t page_box_enabled; // Enable Tiny Page Box for this class uint8_t warm_enabled; // Enable Warm Pool for this class uint8_t warm_cap; // Max warm SuperSlabs to keep (per-thread) - uint8_t reserved; + uint8_t tls_carve_enabled; // Enable Warm→TLS carve experiment for this class } TinyClassPolicy; extern TinyClassPolicy 
// ENV-gated policy logging.
// Returns 1 only when HAKMEM_TINY_POLICY_LOG is set to a non-empty value whose
// first character is not '0'; returns 0 when it is unset, empty, or starts
// with '0'. The environment is read once and the result cached in a
// function-local static.
// NOTE(review): this was previously described as "default ON; disable with
// HAKMEM_TINY_POLICY_LOG=0", but the code below is opt-in (unset yields 0) —
// confirm which default is intended.
static inline int tiny_policy_log_enabled(void) {
    static int g_policy_log = -1;  // -1 = environment not read yet
    if (__builtin_expect(g_policy_log == -1, 0)) {
        const char* e = getenv("HAKMEM_TINY_POLICY_LOG");
        g_policy_log = (e && *e && *e != '0') ? 1 : 0;
    }
    return g_policy_log;
}
tiny_mem_stats_add_policy_stats((ssize_t)sizeof(g_tiny_class_stats_warm_hit_global)); + tiny_mem_stats_add_policy_stats((ssize_t)sizeof(g_tiny_class_stats_shared_lock_global)); + tiny_mem_stats_add_policy_stats((ssize_t)sizeof(g_tiny_class_stats_tls_carve_attempt_global)); + tiny_mem_stats_add_policy_stats((ssize_t)sizeof(g_tiny_class_stats_tls_carve_success_global)); + } +} void tiny_class_stats_reset_thread(void) { memset(&g_tiny_class_stats, 0, sizeof(g_tiny_class_stats)); @@ -15,11 +30,13 @@ void tiny_class_stats_reset_thread(void) { void tiny_class_stats_snapshot_thread(TinyClassStatsThread* out) { if (!out) return; + tiny_class_stats_record_mem_once(); memcpy(out, &g_tiny_class_stats, sizeof(*out)); } void tiny_class_stats_snapshot_global(TinyClassStatsThread* out) { if (!out) return; + tiny_class_stats_record_mem_once(); for (int i = 0; i < TINY_NUM_CLASSES; i++) { out->uc_miss[i] = atomic_load_explicit(&g_tiny_class_stats_uc_miss_global[i], memory_order_relaxed); @@ -27,6 +44,10 @@ void tiny_class_stats_snapshot_global(TinyClassStatsThread* out) { memory_order_relaxed); out->shared_lock[i] = atomic_load_explicit(&g_tiny_class_stats_shared_lock_global[i], memory_order_relaxed); + out->tls_carve_attempt[i] = atomic_load_explicit( + &g_tiny_class_stats_tls_carve_attempt_global[i], memory_order_relaxed); + out->tls_carve_success[i] = atomic_load_explicit( + &g_tiny_class_stats_tls_carve_success_global[i], memory_order_relaxed); } } @@ -34,14 +55,18 @@ static void tiny_class_stats_dump_common(FILE* out, const char* tag, const TinyClassStatsThread* stats) { if (!(out && stats)) return; - fprintf(out, "%s class uc_miss warm_hit shared_lock\n", tag ? tag : "[STATS]"); + fprintf(out, "%s class uc_miss warm_hit shared_lock tls_carve_attempt tls_carve_success\n", + tag ? 
tag : "[STATS]"); for (int c = 0; c < TINY_NUM_CLASSES; c++) { - if (stats->uc_miss[c] || stats->warm_hit[c] || stats->shared_lock[c]) { - fprintf(out, " C%d: %llu %llu %llu\n", + if (stats->uc_miss[c] || stats->warm_hit[c] || stats->shared_lock[c] || + stats->tls_carve_attempt[c] || stats->tls_carve_success[c]) { + fprintf(out, " C%d: %llu %llu %llu %llu %llu\n", c, (unsigned long long)stats->uc_miss[c], (unsigned long long)stats->warm_hit[c], - (unsigned long long)stats->shared_lock[c]); + (unsigned long long)stats->shared_lock[c], + (unsigned long long)stats->tls_carve_attempt[c], + (unsigned long long)stats->tls_carve_success[c]); } } } diff --git a/core/box/tiny_class_stats_box.h b/core/box/tiny_class_stats_box.h index 91e37205..a39d8107 100644 --- a/core/box/tiny_class_stats_box.h +++ b/core/box/tiny_class_stats_box.h @@ -16,6 +16,8 @@ typedef struct TinyClassStatsThread { uint64_t uc_miss[TINY_NUM_CLASSES]; // unified_cache_refill() hits uint64_t warm_hit[TINY_NUM_CLASSES]; // warm pool successes uint64_t shared_lock[TINY_NUM_CLASSES]; // shared pool lock acquisitions (hook as needed) + uint64_t tls_carve_attempt[TINY_NUM_CLASSES]; // Warm/TLS carve attempts + uint64_t tls_carve_success[TINY_NUM_CLASSES]; // Warm/TLS carve successes } TinyClassStatsThread; extern __thread TinyClassStatsThread g_tiny_class_stats; @@ -24,6 +26,8 @@ extern __thread TinyClassStatsThread g_tiny_class_stats; extern _Atomic uint64_t g_tiny_class_stats_uc_miss_global[TINY_NUM_CLASSES]; extern _Atomic uint64_t g_tiny_class_stats_warm_hit_global[TINY_NUM_CLASSES]; extern _Atomic uint64_t g_tiny_class_stats_shared_lock_global[TINY_NUM_CLASSES]; +extern _Atomic uint64_t g_tiny_class_stats_tls_carve_attempt_global[TINY_NUM_CLASSES]; +extern _Atomic uint64_t g_tiny_class_stats_tls_carve_success_global[TINY_NUM_CLASSES]; static inline void tiny_class_stats_on_uc_miss(int ci) { if (ci >= 0 && ci < TINY_NUM_CLASSES) { @@ -49,6 +53,22 @@ static inline void tiny_class_stats_on_shared_lock(int 
// Byte counters for the Tiny front components (signed so that free paths
// can subtract).
_Atomic long long g_tiny_mem_unified_cache_bytes = 0;
_Atomic long long g_tiny_mem_warm_pool_bytes = 0;
_Atomic long long g_tiny_mem_page_box_bytes = 0;
_Atomic long long g_tiny_mem_tls_magazine_bytes = 0;
_Atomic long long g_tiny_mem_policy_stats_bytes = 0;

// Add `bytes` (may be negative) to *target with a relaxed fetch-add.
// A NULL target or a zero delta is a no-op.
static inline void tiny_mem_stats_add(_Atomic long long* target, ssize_t bytes) {
    if (target && bytes != 0) {
        atomic_fetch_add_explicit(target, (long long)bytes, memory_order_relaxed);
    }
}

// Relaxed read of one byte counter.
static inline long long tiny_mem_stats_read(_Atomic long long* counter) {
    return atomic_load_explicit(counter, memory_order_relaxed);
}

// Adjust the Unified Cache byte counter.
void tiny_mem_stats_add_unified(ssize_t bytes) {
    tiny_mem_stats_add(&g_tiny_mem_unified_cache_bytes, bytes);
}

// Adjust the Warm Pool byte counter.
void tiny_mem_stats_add_warm(ssize_t bytes) {
    tiny_mem_stats_add(&g_tiny_mem_warm_pool_bytes, bytes);
}

// Adjust the Page Box byte counter.
void tiny_mem_stats_add_pagebox(ssize_t bytes) {
    tiny_mem_stats_add(&g_tiny_mem_page_box_bytes, bytes);
}

// Adjust the TLS magazine byte counter.
void tiny_mem_stats_add_tls_magazine(ssize_t bytes) {
    tiny_mem_stats_add(&g_tiny_mem_tls_magazine_bytes, bytes);
}

// Adjust the policy/stats tables byte counter.
void tiny_mem_stats_add_policy_stats(ssize_t bytes) {
    tiny_mem_stats_add(&g_tiny_mem_policy_stats_bytes, bytes);
}

// Write a one-line summary of all counters to stderr, in KB.
// The total is computed from the byte values before dividing, so it can
// differ from the sum of the individually truncated KB figures.
void tiny_mem_stats_dump(void) {
    const long long unified_b = tiny_mem_stats_read(&g_tiny_mem_unified_cache_bytes);
    const long long warm_b = tiny_mem_stats_read(&g_tiny_mem_warm_pool_bytes);
    const long long pagebox_b = tiny_mem_stats_read(&g_tiny_mem_page_box_bytes);
    const long long tls_mag_b = tiny_mem_stats_read(&g_tiny_mem_tls_magazine_bytes);
    const long long policy_b = tiny_mem_stats_read(&g_tiny_mem_policy_stats_bytes);
    const long long total_b = unified_b + warm_b + pagebox_b + tls_mag_b + policy_b;

    fprintf(stderr,
            "[TINY_MEM_STATS] unified_cache=%lldKB warm_pool=%lldKB page_box=%lldKB "
            "tls_mag=%lldKB policy_stats=%lldKB total=%lldKB\n",
            unified_b / 1024,
            warm_b / 1024,
            pagebox_b / 1024,
            tls_mag_b / 1024,
            policy_b / 1024,
            total_b / 1024);
}
+ +#ifndef TINY_MEM_STATS_BOX_H +#define TINY_MEM_STATS_BOX_H + +#include +#include +#include + +// Byte counters (signed to allow subtracting on free paths) +extern _Atomic long long g_tiny_mem_unified_cache_bytes; +extern _Atomic long long g_tiny_mem_warm_pool_bytes; +extern _Atomic long long g_tiny_mem_page_box_bytes; +extern _Atomic long long g_tiny_mem_tls_magazine_bytes; +extern _Atomic long long g_tiny_mem_policy_stats_bytes; + +void tiny_mem_stats_add_unified(ssize_t bytes); +void tiny_mem_stats_add_warm(ssize_t bytes); +void tiny_mem_stats_add_pagebox(ssize_t bytes); +void tiny_mem_stats_add_tls_magazine(ssize_t bytes); +void tiny_mem_stats_add_policy_stats(ssize_t bytes); + +// Dump one line summary (values in KB) if hooked by caller. +void tiny_mem_stats_dump(void); + +#endif // TINY_MEM_STATS_BOX_H diff --git a/core/box/tiny_page_box.c b/core/box/tiny_page_box.c index 2f6d574c..7cba8227 100644 --- a/core/box/tiny_page_box.c +++ b/core/box/tiny_page_box.c @@ -1,6 +1,5 @@ #include "tiny_page_box.h" // TLS state definitions for Tiny Page Box -__thread TinyPageBoxState g_tiny_page_box_state[TINY_NUM_CLASSES]; +__thread TinyPageBoxContext g_tiny_page_box[TINY_NUM_CLASSES]; __thread int g_tiny_page_box_init_done = 0; - diff --git a/core/box/tiny_page_box.h b/core/box/tiny_page_box.h index 6aaf4849..3b06df4e 100644 --- a/core/box/tiny_page_box.h +++ b/core/box/tiny_page_box.h @@ -9,7 +9,7 @@ // - API is generic over class_idx (0-7), but enabled-classes are controlled // by ENV so that we can start with C7 only and later extend to C5/C6. // - When enabled for a class: -// tiny_page_box_refill(class_idx, out, max) will try to supply up to +// tiny_page_box_refill(class_idx, tls, out, max) will try to supply up to // `max` BASE pointers using per-page freelist before falling back. // - When disabled for a class: the box returns 0 and caller uses legacy path. 
// ENV gate for Page Box debug logging.
// Returns 1 only when HAKMEM_TINY_PAGEBOX_LOG is set to a non-empty value
// whose first character is not '0'; unset, empty, or a leading '0' yields 0.
// The environment is read on the first call and the answer is cached.
static inline int tiny_page_box_log_enabled(void) {
    static int cached = -1;  // -1 = environment not read yet

    if (__builtin_expect(cached != -1, 1)) {
        return cached;
    }

    const char* value = getenv("HAKMEM_TINY_PAGEBOX_LOG");
    if (!value || value[0] == '\0' || value[0] == '0') {
        cached = 0;
    } else {
        cached = 1;
    }
    return cached;
}
static inline void tiny_page_box_init_once(void) { @@ -82,13 +92,14 @@ static inline void tiny_page_box_init_once(void) { } // Clear all state - memset(g_tiny_page_box_state, 0, sizeof(g_tiny_page_box_state)); + memset(g_tiny_page_box, 0, sizeof(g_tiny_page_box)); + tiny_mem_stats_add_pagebox((ssize_t)sizeof(g_tiny_page_box)); const char* env = getenv("HAKMEM_TINY_PAGE_BOX_CLASSES"); if (!env || !*env) { // Default: enable mid-size classes (C5–C7) for (int c = 5; c <= 7 && c < TINY_NUM_CLASSES; c++) { - g_tiny_page_box_state[c].enabled = 1; + g_tiny_page_box[c].enabled = 1; } } else { // Parse simple comma-separated list of integers: "5,6,7" @@ -107,7 +118,7 @@ static inline void tiny_page_box_init_once(void) { p++; } if (val >= 0 && val < TINY_NUM_CLASSES) { - g_tiny_page_box_state[val].enabled = 1; + g_tiny_page_box[val].enabled = 1; } } } @@ -123,7 +134,7 @@ static inline int tiny_page_box_is_enabled(int class_idx) { if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) { return 0; } - return g_tiny_page_box_state[class_idx].enabled != 0; + return g_tiny_page_box[class_idx].enabled != 0; } // Forward declaration for TLS slab state(tiny_tls.h から参照) @@ -133,7 +144,7 @@ extern __thread TinyTLSSlab g_tls_slabs[TINY_NUM_CLASSES]; // ここで Page Box が利用可能なページとして登録しておくことで、 // 後続の unified_cache_refill() から Superslab/Warm Pool に落ちる前に // 「既に TLS が掴んでいるページ」を優先的に使えるようにする。 -static inline void tiny_page_box_on_new_slab(TinyTLSSlab* tls) +static inline void tiny_page_box_on_new_slab(int class_idx, TinyTLSSlab* tls) { if (!tls) { return; @@ -143,6 +154,10 @@ static inline void tiny_page_box_on_new_slab(TinyTLSSlab* tls) tiny_page_box_init_once(); } + if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) { + return; + } + SuperSlab* ss = tls->ss; TinySlabMeta* meta = tls->meta; uint8_t* base = tls->slab_base; @@ -152,12 +167,11 @@ static inline void tiny_page_box_on_new_slab(TinyTLSSlab* tls) return; } - int class_idx = (int)meta->class_idx; - if (class_idx < 0 || class_idx >= 
TINY_NUM_CLASSES) { + if (meta->class_idx != (uint8_t)class_idx) { return; } - TinyPageBoxState* st = &g_tiny_page_box_state[class_idx]; + TinyPageBoxContext* st = &g_tiny_page_box[class_idx]; if (!st->enabled) { return; } @@ -200,9 +214,11 @@ static inline void tiny_page_box_on_new_slab(TinyTLSSlab* tls) superslab_ref_inc(ss); #if !HAKMEM_BUILD_RELEASE - // Debug: Track Page Box stats per-class - fprintf(stderr, "[PAGE_BOX_REG] class=%d num_pages=%u capacity=%u carved=%u\n", - class_idx, st->num_pages, meta->capacity, meta->carved); + // Debug: Track Page Box stats per-class(ENV: HAKMEM_TINY_PAGEBOX_LOG=0 で抑制) + if (tiny_page_box_log_enabled()) { + fprintf(stderr, "[PAGE_BOX_REG] class=%d num_pages=%u capacity=%u carved=%u\n", + class_idx, st->num_pages, meta->capacity, meta->carved); + } #endif } @@ -219,9 +235,11 @@ static inline void tiny_page_box_on_new_slab(TinyTLSSlab* tls) // - Superslab/Shared Pool 呼び出し頻度を徐々に観測・調整できる。 static inline int tiny_page_box_refill(int class_idx, + TinyTLSSlab* tls, void** out, int max_out) { + (void)tls; // reserved for future per-TLS hints if (!tiny_page_box_is_enabled(class_idx)) { return 0; } @@ -233,7 +251,7 @@ static inline int tiny_page_box_refill(int class_idx, return 0; } - TinyPageBoxState* st = &g_tiny_page_box_state[class_idx]; + TinyPageBoxContext* st = &g_tiny_page_box[class_idx]; if (st->num_pages == 0) { return 0; } diff --git a/core/box/tiny_policy_learner_box.c b/core/box/tiny_policy_learner_box.c index 0cc60b55..c4c5cd84 100644 --- a/core/box/tiny_policy_learner_box.c +++ b/core/box/tiny_policy_learner_box.c @@ -4,39 +4,78 @@ #include "tiny_class_policy_box.h" #include "tiny_class_stats_box.h" #include +#include -// Simple OBSERVE/LEARN rule: -// - Choose top-2 classes by shared_pool_lock and enable Page Box for them. -// - Always keep existing warm_enabled / warm_cap (policy table is already seeded). 
+// Simple OBSERVE/LEARN rule (auto profile only): +// - C7 は常に ON (page + warm, cap=8) +// - それ以外のクラスから score = shared_lock*4 + uc_miss の上位2つだけ page/warm を ON +// - warm_cap は C5–C7:8, それ以外:4 +// - スコアが 0 なら何も変更しない void tiny_policy_learner_tick(void) { + if (!tiny_class_policy_is_auto()) { + return; + } + TinyClassStatsThread snap = {0}; tiny_class_stats_snapshot_global(&snap); + // 事前に全クラスを OFF ベースに初期化(cap はデフォルト値に) + for (int c = 0; c < TINY_NUM_CLASSES; c++) { + TinyClassPolicy* p = &g_tiny_class_policy[c]; + p->page_box_enabled = 0; + p->warm_enabled = 0; + p->warm_cap = (c >= 5) ? 8 : 4; + p->tls_carve_enabled = 0; + } + + // C7 は常に ON + g_tiny_class_policy[7].page_box_enabled = 1; + g_tiny_class_policy[7].warm_enabled = 1; + g_tiny_class_policy[7].warm_cap = 8; + g_tiny_class_policy[7].tls_carve_enabled = 1; + + // C7 を除く上位2クラスをスコアで選択 int top1 = -1, top2 = -1; uint64_t v1 = 0, v2 = 0; for (int i = 0; i < TINY_NUM_CLASSES; i++) { - uint64_t v = snap.shared_lock[i]; - if (v > v1) { + if (i == 7) continue; + uint64_t score = snap.shared_lock[i] * 4 + snap.uc_miss[i]; + if (score > v1) { top2 = top1; v2 = v1; top1 = i; - v1 = v; - } else if (v > v2) { + v1 = score; + } else if (score > v2) { top2 = i; - v2 = v; + v2 = score; } } - // Nothing observed yet → leave policy untouched + // スコアが全く無い場合は C7 だけ維持 if (v1 == 0) { return; } - for (int c = 0; c < TINY_NUM_CLASSES; c++) { - TinyClassPolicy* p = &g_tiny_class_policy[c]; - if (c == top1 || c == top2) { - p->page_box_enabled = 1; - p->warm_enabled = 1; + if (top1 >= 0) { + TinyClassPolicy* p = &g_tiny_class_policy[top1]; + p->page_box_enabled = 1; + p->warm_enabled = 1; + p->tls_carve_enabled = 1; + } + if (top2 >= 0 && v2 > 0) { + TinyClassPolicy* p = &g_tiny_class_policy[top2]; + p->page_box_enabled = 1; + p->warm_enabled = 1; + p->tls_carve_enabled = 1; + } + + // 1-shot ログ(最多 4 回まで) + static _Atomic uint32_t auto_logs = 0; + if (tiny_policy_log_enabled()) { + uint32_t n = atomic_fetch_add_explicit(&auto_logs, 
1, memory_order_relaxed); + if (n < 4) { + fprintf(stderr, "[POLICY_AUTO_UPDATE] profile=auto (top=%d/%d)\n", top1, top2); + tiny_class_policy_dump(NULL); } } } diff --git a/core/box/tiny_tls_carve_one_block_box.h b/core/box/tiny_tls_carve_one_block_box.h index a3af9f0b..59d31d10 100644 --- a/core/box/tiny_tls_carve_one_block_box.h +++ b/core/box/tiny_tls_carve_one_block_box.h @@ -7,6 +7,7 @@ #include "../tiny_debug_api.h" // tiny_refill_failfast_level(), tiny_failfast_abort_ptr() #include "c7_meta_used_counter_box.h" // C7 meta->used telemetry (Release/Debug共通) #include "tiny_next_ptr_box.h" +#include "tiny_class_stats_box.h" #include "../superslab/superslab_inline.h" #include #include @@ -41,6 +42,8 @@ tiny_tls_carve_one_block(TinyTLSSlab* tls, int class_idx) if (meta->class_idx != (uint8_t)class_idx) return res; if (tls->slab_idx < 0 || tls->slab_idx >= ss_slabs_capacity(tls->ss)) return res; + tiny_class_stats_on_tls_carve_attempt(class_idx); + // Freelist pop if (meta->freelist) { #if !HAKMEM_BUILD_RELEASE @@ -61,6 +64,7 @@ tiny_tls_carve_one_block(TinyTLSSlab* tls, int class_idx) meta->used++; c7_meta_used_note(meta->class_idx, C7_META_USED_SRC_TLS); ss_active_add(tls->ss, 1); + tiny_class_stats_on_tls_carve_success(class_idx); res.block = block; res.path = TINY_TLS_CARVE_PATH_FREELIST; return res; @@ -93,6 +97,7 @@ tiny_tls_carve_one_block(TinyTLSSlab* tls, int class_idx) meta->used++; c7_meta_used_note(meta->class_idx, C7_META_USED_SRC_TLS); ss_active_add(tls->ss, 1); + tiny_class_stats_on_tls_carve_success(class_idx); res.block = block; res.path = TINY_TLS_CARVE_PATH_LINEAR; return res; diff --git a/core/box/warm_pool_prefill_box.h b/core/box/warm_pool_prefill_box.h index 1f6367d6..906466c4 100644 --- a/core/box/warm_pool_prefill_box.h +++ b/core/box/warm_pool_prefill_box.h @@ -9,6 +9,7 @@ #include #include #include +#include #include "../hakmem_tiny_config.h" #include "../hakmem_tiny_superslab.h" #include "../tiny_tls.h" @@ -18,8 +19,18 @@ extern _Atomic 
uintptr_t g_c7_stage3_magic_ss; +static inline int warm_prefill_log_enabled(void) { + static int g_warm_log = -1; + if (__builtin_expect(g_warm_log == -1, 0)) { + const char* e = getenv("HAKMEM_TINY_WARM_LOG"); + g_warm_log = (e && *e && *e != '0') ? 1 : 0; + } + return g_warm_log; +} + static inline void warm_prefill_log_c7_meta(const char* tag, TinyTLSSlab* tls) { if (!tls || !tls->ss) return; + if (!warm_prefill_log_enabled()) return; #if HAKMEM_BUILD_RELEASE static _Atomic uint32_t rel_logs = 0; uint32_t n = atomic_fetch_add_explicit(&rel_logs, 1, memory_order_relaxed); @@ -116,7 +127,7 @@ static inline int warm_pool_do_prefill(int class_idx, TinyTLSSlab* tls, int warm } // C7 safety: prefer only pristine slabs (used=0 carved=0 freelist=NULL) - if (class_idx == 7) { + if (class_idx == 7 && warm_prefill_log_enabled()) { TinySlabMeta* meta = &tls->ss->slabs[tls->slab_idx]; if (meta->class_idx == 7 && (meta->used > 0 || meta->carved > 0 || meta->freelist != NULL)) { @@ -162,7 +173,7 @@ static inline int warm_pool_do_prefill(int class_idx, TinyTLSSlab* tls, int warm warm_pool_rel_c7_prefill_slab(); } #else - if (class_idx == 7) { + if (class_idx == 7 && warm_prefill_log_enabled()) { static __thread int dbg_c7_prefill_logs = 0; if (dbg_c7_prefill_logs < 8) { TinySlabMeta* meta = &tls->ss->slabs[tls->slab_idx]; diff --git a/core/box/warm_pool_stats_box.h b/core/box/warm_pool_stats_box.h index 519e5418..3556e929 100644 --- a/core/box/warm_pool_stats_box.h +++ b/core/box/warm_pool_stats_box.h @@ -23,31 +23,19 @@ extern __thread TinyWarmPoolStats g_warm_pool_stats[TINY_NUM_CLASSES]; // Record a warm pool hit // Called when warm_pool_pop() succeeds and carve produces blocks static inline void warm_pool_record_hit(int class_idx) { -#if HAKMEM_DEBUG_COUNTERS g_warm_pool_stats[class_idx].hits++; -#else - (void)class_idx; -#endif } // Record a warm pool miss // Called when warm_pool_pop() returns NULL (pool empty) static inline void warm_pool_record_miss(int class_idx) { 
-#if HAKMEM_DEBUG_COUNTERS g_warm_pool_stats[class_idx].misses++; -#else - (void)class_idx; -#endif } // Record a warm pool prefill event // Called when pool is empty and we do secondary prefill static inline void warm_pool_record_prefilled(int class_idx) { -#if HAKMEM_DEBUG_COUNTERS g_warm_pool_stats[class_idx].prefilled++; -#else - (void)class_idx; -#endif } #endif // HAK_WARM_POOL_STATS_BOX_H diff --git a/core/front/malloc_tiny_fast.h b/core/front/malloc_tiny_fast.h index 4e6e0593..c470351e 100644 --- a/core/front/malloc_tiny_fast.h +++ b/core/front/malloc_tiny_fast.h @@ -36,6 +36,7 @@ #include "../hakmem_tiny.h" // For hak_tiny_size_to_class #include "../box/tiny_front_hot_box.h" // Phase 4-Step2: Hot Path Box #include "../box/tiny_front_cold_box.h" // Phase 4-Step2: Cold Path Box +#include "../box/tiny_c7_hotpath_box.h" // Optional: C7 専用ホットパス // Helper: current thread id (low 32 bits) for owner check #ifndef TINY_SELF_U32_LOCAL_DEFINED @@ -98,6 +99,11 @@ static inline void* malloc_tiny_fast(size_t size) { // 1. size → class_idx (inline table lookup, 1-2 instructions) int class_idx = hak_tiny_size_to_class(size); + // Optional: C7 専用ホットパス(環境変数 HAKMEM_TINY_C7_HOT でON) + if (__builtin_expect(class_idx == 7 && tiny_c7_hot_enabled(), 0)) { + return tiny_c7_alloc_hot(size); + } + // 2. 
Phase 4-Step2: Hot/Cold Path Box // Try hot path first (cache hit, 1 branch) void* ptr = tiny_hot_alloc_fast(class_idx); @@ -235,6 +241,14 @@ static inline int free_tiny_fast(void* ptr) { } #endif + // Optional: C7 専用ホットパス(キャッシュのみで完了させる) + if (__builtin_expect(class_idx == 7 && tiny_c7_hot_enabled(), 0)) { + if (tiny_c7_free_hot(base)) { + return 1; + } + // fallthrough to unified cache push on failure + } + int pushed = unified_cache_push(class_idx, HAK_BASE_FROM_RAW(base)); if (__builtin_expect(pushed, 1)) { return 1; // Success diff --git a/core/front/tiny_unified_cache.c b/core/front/tiny_unified_cache.c index f29e0c79..e2ab7822 100644 --- a/core/front/tiny_unified_cache.c +++ b/core/front/tiny_unified_cache.c @@ -17,6 +17,7 @@ #undef WARM_POOL_REL_DEFINE #include "../box/c7_meta_used_counter_box.h" // Box: C7 meta->used increment counters #include "../box/warm_pool_prefill_box.h" // Box: Warm Pool Prefill (secondary optimization) +#include "../box/tiny_mem_stats_box.h" // Box: Tiny front memory accounting #include "../hakmem_env_cache.h" // Priority-2: ENV cache (eliminate syscalls) #include "../box/tiny_page_box.h" // Tiny-Plus Page Box (C5–C7 initial hook) #include "../box/ss_tls_bind_box.h" // Box: TLS Bind (SuperSlab -> TLS binding) @@ -205,6 +206,8 @@ void unified_cache_init(void) { continue; // Skip this class, try others } + tiny_mem_stats_add_unified((ssize_t)(cap * sizeof(void*))); + g_unified_cache[cls].capacity = (uint16_t)cap; g_unified_cache[cls].mask = (uint16_t)(cap - 1); g_unified_cache[cls].head = 0; @@ -522,6 +525,7 @@ hak_base_ptr_t unified_cache_refill(int class_idx) { int warm_enabled = policy ? policy->warm_enabled : 0; int warm_cap = policy ? policy->warm_cap : 0; int page_enabled = policy ? 
policy->page_box_enabled : 0; + TinyTLSSlab* tls = &g_tls_slabs[class_idx]; // ✅ Phase 11+: Ensure cache is initialized (lazy init for cold path) if (!cache->slots) { @@ -562,12 +566,15 @@ hak_base_ptr_t unified_cache_refill(int class_idx) { void* out[512]; int produced = 0; int tls_carved = 0; // Debug bookkeeping: track TLS carve experiment hits +#if HAKMEM_BUILD_RELEASE + (void)tls_carved; +#endif // ========== PAGE BOX HOT PATH(Tiny-Plus 層): Try page box FIRST ========== // 将来的に C7 専用の page-level freelist 管理をここに統合する。 // いまは stub 実装で常に 0 を返すが、Box 境界としての接続だけ先に行う。 if (page_enabled && tiny_page_box_is_enabled(class_idx)) { - int page_produced = tiny_page_box_refill(class_idx, out, room); + int page_produced = tiny_page_box_refill(class_idx, tls, out, room); if (page_produced > 0) { // Store blocks into cache and return first void* first = out[0]; @@ -625,45 +632,58 @@ hak_base_ptr_t unified_cache_refill(int class_idx) { #endif SuperSlab* warm_ss = tiny_warm_pool_pop(class_idx); if (warm_ss) { + int allow_tls_bind = policy && policy->tls_carve_enabled; + int allow_tls_carve = allow_tls_bind; + int warm_mode = 0; if (class_idx == 7) { #if !HAKMEM_BUILD_RELEASE warm_pool_dbg_c7_hit(); #endif - int warm_mode = warm_tls_bind_mode_c7(); - if (warm_mode >= 1) { - int cap = ss_slabs_capacity(warm_ss); - int slab_idx = -1; + warm_mode = warm_tls_bind_mode_c7(); + allow_tls_bind = (warm_mode >= 1); + allow_tls_carve = (warm_mode == 2); + } - // Simple heuristic: first slab matching class - for (int i = 0; i < cap; i++) { - if (tiny_get_class_from_ss(warm_ss, i) == class_idx) { - slab_idx = i; - break; - } + if (allow_tls_bind) { + int cap = ss_slabs_capacity(warm_ss); + int slab_idx = -1; + + // Simple heuristic: first slab matching class + for (int i = 0; i < cap; i++) { + if (tiny_get_class_from_ss(warm_ss, i) == class_idx) { + slab_idx = i; + break; } + } - if (slab_idx >= 0) { - TinyTLSSlab* tls = &g_tls_slabs[class_idx]; - uint32_t tid = 
(uint32_t)(uintptr_t)pthread_self(); - if (ss_tls_bind_one(class_idx, tls, warm_ss, slab_idx, tid)) { + if (slab_idx >= 0) { + uint32_t tid = (uint32_t)(uintptr_t)pthread_self(); + if (ss_tls_bind_one(class_idx, tls, warm_ss, slab_idx, tid)) { + if (class_idx == 7) { warm_tls_bind_log_success(warm_ss, slab_idx); + } - // Mode 2: carve a single block via TLS fast path - if (warm_mode == 2) { - #if !HAKMEM_BUILD_RELEASE + // Mode 2: carve a single block via TLS fast path (policy enabled classes) + if (allow_tls_carve) { + #if !HAKMEM_BUILD_RELEASE + if (class_idx == 7) { warm_pool_dbg_c7_tls_attempt(); - #endif - TinyTLSCarveOneResult tls_carve = - tiny_tls_carve_one_block(tls, class_idx); - if (tls_carve.block) { + } + #endif + TinyTLSCarveOneResult tls_carve = + tiny_tls_carve_one_block(tls, class_idx); + if (tls_carve.block) { + if (class_idx == 7) { warm_tls_bind_log_tls_carve(warm_ss, slab_idx, tls_carve.block); #if !HAKMEM_BUILD_RELEASE warm_pool_dbg_c7_tls_success(); #endif - out[0] = tls_carve.block; - produced = 1; - tls_carved = 1; - } else { + } + out[0] = tls_carve.block; + produced = 1; + tls_carved = 1; + } else { + if (class_idx == 7) { warm_tls_bind_log_tls_fail(warm_ss, slab_idx); #if !HAKMEM_BUILD_RELEASE warm_pool_dbg_c7_tls_fail(); @@ -774,8 +794,6 @@ hak_base_ptr_t unified_cache_refill(int class_idx) { warm_pool_record_miss(class_idx); } - TinyTLSSlab* tls = &g_tls_slabs[class_idx]; - // Step 1: Ensure SuperSlab available via normal refill // Enhanced: Use Warm Pool Prefill Box for secondary prefill when pool is empty if (warm_enabled) { diff --git a/core/front/tiny_warm_pool.h b/core/front/tiny_warm_pool.h index f620e501..c9b21a48 100644 --- a/core/front/tiny_warm_pool.h +++ b/core/front/tiny_warm_pool.h @@ -10,6 +10,7 @@ #include #include "../hakmem_tiny_config.h" #include "../superslab/superslab_types.h" +#include "../box/tiny_mem_stats_box.h" // ============================================================================ // Warm Pool Design 
@@ -74,6 +75,7 @@ static inline void tiny_warm_pool_init_once(void) { for (int i = 0; i < TINY_NUM_CLASSES; i++) { g_tiny_warm_pool[i].count = 0; } + tiny_mem_stats_add_warm((ssize_t)(sizeof(g_tiny_warm_pool) + sizeof(g_warm_pool_stats))); initialized = 1; } } diff --git a/core/hakmem_shared_pool.c b/core/hakmem_shared_pool.c index 80a75476..b135c0d1 100644 --- a/core/hakmem_shared_pool.c +++ b/core/hakmem_shared_pool.c @@ -7,6 +7,7 @@ #include "box/tls_sll_drain_box.h" // Box TLS SLL Drain (tiny_tls_sll_drain) #include "box/tls_slab_reuse_guard_box.h" // Box TLS Slab Reuse Guard (P0.3) #include "hakmem_policy.h" // FrozenPolicy (learning layer) +#include "box/shared_pool_box.h" // Logical cap for bench profile #include #include @@ -287,6 +288,7 @@ shared_pool_init(void) { // Idempotent init; safe to call from multiple early paths. // pthread_mutex_t with static initializer is already valid. + shared_pool_box_init(NULL, NULL); pthread_mutex_lock(&g_shared_pool.alloc_lock); if (g_shared_pool.capacity == 0 && g_shared_pool.slabs == NULL) { shared_pool_ensure_capacity_unlocked(16); diff --git a/core/hakmem_shared_pool_acquire.c b/core/hakmem_shared_pool_acquire.c index 0a0c6faa..21167c22 100644 --- a/core/hakmem_shared_pool_acquire.c +++ b/core/hakmem_shared_pool_acquire.c @@ -12,6 +12,10 @@ #include "front/tiny_warm_pool.h" // Warm Pool: Prefill during registry scans #include "box/ss_slab_reset_box.h" // Box: Reset slab metadata on reuse (C7 guard) #include "box/tiny_class_stats_box.h" // OBSERVE: per-class shared lock stats +#include "box/ss_stats_box.h" // OBSERVE: Superslab/slab event counters +#include "box/ss_budget_box.h" // Budget guard for Superslab growth (larson_guard) +#include "box/super_reg_box.h" // Logical limit for registry scan +#include "box/shared_pool_box.h" // Logical cap for shared pool slots (bench profile) #include #include @@ -22,8 +26,8 @@ _Atomic uintptr_t g_c7_stage3_magic_ss = 0; static inline void sp_lock_with_stats(int class_idx) { + 
pthread_mutex_lock(&g_shared_pool.alloc_lock); tiny_class_stats_on_shared_lock(class_idx); - sp_lock_with_stats(class_idx); } static inline void c7_log_meta_state(const char* tag, SuperSlab* ss, int slab_idx) { @@ -159,6 +163,37 @@ static inline int c7_reset_and_log_if_needed(SuperSlab* ss, return 0; } +static inline void sp_reset_superslab_all_slabs(SuperSlab* ss, + int class_idx, + int from_lru) { + if (!ss) { + return; + } + int cap = ss_slabs_capacity(ss); + ss->slab_bitmap = 0; + ss->nonempty_mask = 0; + ss->freelist_mask = 0; + ss->empty_mask = 0; + ss->empty_count = 0; + ss->active_slabs = 0; + ss->hot_count = 0; + ss->cold_count = 0; + for (int s = 0; s < cap; s++) { + ss_slab_reset_meta_for_tiny(ss, s, class_idx); + } + ss_stats_on_ss_scan(class_idx, 0, 1); + static _Atomic uint32_t rel_stage3_reset_logs = 0; + uint32_t n = atomic_fetch_add_explicit(&rel_stage3_reset_logs, 1, memory_order_relaxed); + if (n < 4) { + fprintf(stderr, + "[REL_STAGE3_RESET] class=%d ss=%p from_lru=%d cap=%d\n", + class_idx, + (void*)ss, + from_lru, + cap); + } +} + // ============================================================================ // Performance Measurement: Shared Pool Lock Contention (ENV-gated) // ============================================================================ @@ -208,10 +243,13 @@ sp_acquire_from_empty_scan(int class_idx, SuperSlab** ss_out, int* slab_idx_out, return -1; } - extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS]; extern int g_super_reg_class_size[TINY_NUM_CLASSES]; int reg_size = (class_idx < TINY_NUM_CLASSES) ? 
g_super_reg_class_size[class_idx] : 0; + int reg_cap = super_reg_effective_per_class(); + if (reg_cap > 0 && reg_size > reg_cap) { + reg_size = reg_cap; + } // Priority-2: Use cached ENV int scan_limit = HAK_ENV_SS_EMPTY_SCAN_LIMIT(); if (scan_limit > reg_size) scan_limit = reg_size; @@ -229,7 +267,7 @@ sp_acquire_from_empty_scan(int class_idx, SuperSlab** ss_out, int* slab_idx_out, int primary_slab_idx = -1; for (int i = 0; i < scan_limit; i++) { - SuperSlab* ss = g_super_reg_by_class[class_idx][i]; + SuperSlab* ss = super_reg_by_class_at(class_idx, i); if (!(ss && ss->magic == SUPERSLAB_MAGIC)) continue; // P-Tier: Skip DRAINING tier SuperSlabs if (!ss_tier_is_hot(ss)) continue; @@ -769,6 +807,26 @@ stage2_scan: 1, memory_order_relaxed); } + // bench プロファイルでは Shared Pool の論理上限を軽くかけておく + uint32_t total_limit = shared_pool_effective_total_slots(); + if (total_limit > 0 && g_shared_pool.total_count >= total_limit) { + if (g_lock_stats_enabled == 1) { + atomic_fetch_add(&g_lock_release_count, 1); + } + pthread_mutex_unlock(&g_shared_pool.alloc_lock); + return -1; + } + uint32_t class_limit = shared_pool_effective_class_slots(class_idx); + if (class_limit > 0 && + class_idx < TINY_NUM_CLASSES_SS && + (uint32_t)g_shared_pool.class_active_slots[class_idx] >= class_limit) { + if (g_lock_stats_enabled == 1) { + atomic_fetch_add(&g_lock_release_count, 1); + } + pthread_mutex_unlock(&g_shared_pool.alloc_lock); + return -1; + } + // ========== Stage 3: Get new SuperSlab ========== // Try LRU cache first, then mmap SuperSlab* new_ss = NULL; @@ -786,6 +844,13 @@ stage2_scan: // Stage 3b: If LRU miss, allocate new SuperSlab if (!new_ss) { + if (!ss_budget_on_alloc(class_idx)) { + if (g_lock_stats_enabled == 1) { + atomic_fetch_add(&g_lock_release_count, 1); + } + pthread_mutex_unlock(&g_shared_pool.alloc_lock); + return -1; + } // Release the alloc_lock to avoid deadlock with registry during superslab_allocate if (g_lock_stats_enabled == 1) { 
atomic_fetch_add(&g_lock_release_count, 1); @@ -834,27 +899,10 @@ stage2_scan: g_shared_pool.total_count++; } - // C7: LRU 再利用・新規確保いずれでも、空スラブに完全リセットしてから返す - if (class_idx == 7 && new_ss) { - int cap = ss_slabs_capacity(new_ss); - new_ss->slab_bitmap = 0; - new_ss->nonempty_mask = 0; - new_ss->freelist_mask = 0; - new_ss->empty_mask = 0; - new_ss->empty_count = 0; - new_ss->active_slabs = 0; - new_ss->hot_count = 0; - new_ss->cold_count = 0; - for (int s = 0; s < cap; s++) { - ss_slab_reset_meta_for_tiny(new_ss, s, class_idx); - } - static _Atomic uint32_t rel_stage3_reset_logs = 0; - uint32_t n = atomic_fetch_add_explicit(&rel_stage3_reset_logs, 1, memory_order_relaxed); - if (n < 4) { - fprintf(stderr, - "[REL_C7_STAGE3_RESET] ss=%p from_lru=%d cap=%d\n", - (void*)new_ss, from_lru, cap); - } + // Stage3 から返す前に、LRU 再利用分は必ず空スラブ化する。 + // C7 以外でも from_lru の場合は全スラブをリセットしておく。 + if (new_ss && (from_lru || class_idx == 7)) { + sp_reset_superslab_all_slabs(new_ss, class_idx, from_lru); } #if !HAKMEM_BUILD_RELEASE diff --git a/core/hakmem_shared_pool_release.c b/core/hakmem_shared_pool_release.c index 71a37b6e..26150f19 100644 --- a/core/hakmem_shared_pool_release.c +++ b/core/hakmem_shared_pool_release.c @@ -7,6 +7,8 @@ #include "superslab/superslab_inline.h" // superslab_ref_get guard for TLS pins #include "box/ss_release_guard_box.h" // Box: SuperSlab Release Guard #include "box/ss_slab_reset_box.h" // Box: Reset slab metadata on reuse path +#include "box/ss_stats_box.h" // Observability: Superslab/slab counters +#include "box/ss_budget_box.h" // Budget guard (global/class caps) #include #include @@ -217,6 +219,8 @@ shared_pool_release_slab(SuperSlab* ss, int slab_idx) slab_meta->class_idx = 255; // UNASSIGNED // P1.1: Mark class_map as UNASSIGNED when releasing slab ss->class_map[slab_idx] = 255; + // Reset slab metadata to a pristine state for all classes (C0–C7) + ss_slab_reset_meta_for_tiny(ss, slab_idx, -1); if (ss->active_slabs > 0) { ss->active_slabs--; @@ -284,6 
+288,8 @@ shared_pool_release_slab(SuperSlab* ss, int slab_idx) // Free SuperSlab immediately (bypasses normal active_slots==0 check) extern void superslab_free(SuperSlab* ss); + ss_stats_on_ss_free_class(class_idx); + ss_budget_on_free(class_idx); superslab_free(ss); return; } @@ -321,6 +327,8 @@ shared_pool_release_slab(SuperSlab* ss, int slab_idx) // If so, we must NOT free the SS. if (ss_release_guard_superslab_can_free(ss)) { extern void superslab_free(SuperSlab* ss); + ss_stats_on_ss_free_class(class_idx); + ss_budget_on_free(class_idx); superslab_free(ss); } else { #if !HAKMEM_BUILD_RELEASE diff --git a/core/hakmem_super_registry.c b/core/hakmem_super_registry.c index c2b24a1d..d967d798 100644 --- a/core/hakmem_super_registry.c +++ b/core/hakmem_super_registry.c @@ -4,17 +4,20 @@ #include "box/ss_addr_map_box.h" // Phase 9-1: SuperSlab address map #include "box/ss_cold_start_box.inc.h" // Phase 11+: Cold Start prewarm defaults #include "hakmem_env_cache.h" // Priority-2: ENV cache (eliminate syscalls) +#include #include #include #include // munmap for incompatible SuperSlab eviction -// Global registry storage -SuperRegEntry g_super_reg[SUPER_REG_SIZE]; +// Global registry storage (allocated via SuperRegBox) +static SuperRegEntry* reg_entries(void) { + return super_reg_entries(); +} + pthread_mutex_t g_super_reg_lock = PTHREAD_MUTEX_INITIALIZER; int g_super_reg_initialized = 0; // Per-class registry storage (Phase 6: Registry Optimization) -SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS]; int g_super_reg_class_size[TINY_NUM_CLASSES]; // Phase 9: Lazy Deallocation - LRU Cache Storage @@ -28,11 +31,23 @@ static _Atomic int g_ss_prewarm_bypass = 0; void hak_super_registry_init(void) { if (g_super_reg_initialized) return; + super_reg_init(NULL, NULL); + + SuperRegEntry* entries = reg_entries(); + int reg_cap = super_reg_effective_size(); + if (!entries) { + fprintf(stderr, "[SUPER_REG] init failed: no registry entries\n"); + abort(); + } 
// Zero-initialize all entries (hash table) - memset(g_super_reg, 0, sizeof(g_super_reg)); + memset(entries, 0, (size_t)reg_cap * sizeof(SuperRegEntry)); // Zero-initialize per-class registry (Phase 6: Registry Optimization) - memset(g_super_reg_by_class, 0, sizeof(g_super_reg_by_class)); + SuperSlab** by_class = super_reg_by_class_slots(); + int stride = super_reg_by_class_stride(); + if (by_class && stride > 0) { + memset(by_class, 0, (size_t)TINY_NUM_CLASSES * (size_t)stride * sizeof(SuperSlab*)); + } memset(g_super_reg_class_size, 0, sizeof(g_super_reg_class_size)); // Memory fence to ensure initialization is visible to all threads @@ -62,12 +77,22 @@ int hak_super_register(uintptr_t base, SuperSlab* ss) { const int dbg = 0; #endif + SuperRegEntry* entries = reg_entries(); + if (!entries) { + pthread_mutex_unlock(&g_super_reg_lock); + return 0; + } + int h = hak_super_hash(base, lg); + const int mask = super_reg_effective_mask(); + const int probe_limit = super_reg_effective_size() > SUPER_MAX_PROBE + ? 
SUPER_MAX_PROBE + : super_reg_effective_size(); // Step 1: Register in hash table (for address → SuperSlab lookup) int hash_registered = 0; - for (int i = 0; i < SUPER_MAX_PROBE; i++) { - SuperRegEntry* e = &g_super_reg[(h + i) & SUPER_REG_MASK]; + for (int i = 0; i < probe_limit; i++) { + SuperRegEntry* e = &entries[(h + i) & mask]; if (atomic_load_explicit(&e->base, memory_order_acquire) == 0) { // Found empty slot @@ -84,7 +109,7 @@ int hak_super_register(uintptr_t base, SuperSlab* ss) { hash_registered = 1; if (dbg == 1) { fprintf(stderr, "[SUPER_REG] register base=%p lg=%d slot=%d magic=%llx\n", - (void*)base, lg, (h + i) & SUPER_REG_MASK, + (void*)base, lg, (h + i) & mask, (unsigned long long)ss->magic); } break; @@ -131,12 +156,22 @@ void hak_super_unregister(uintptr_t base) { // Step 1: Find and remove from hash table SuperSlab* ss = NULL; // Save SuperSlab pointer for per-class removal + SuperRegEntry* entries = reg_entries(); + if (!entries) { + pthread_mutex_unlock(&g_super_reg_lock); + return; + } + for (int lg = 20; lg <= 21; lg++) { int h = hak_super_hash(base, lg); + const int mask = super_reg_effective_mask(); + const int probe_limit = super_reg_effective_size() > SUPER_MAX_PROBE + ? 
SUPER_MAX_PROBE + : super_reg_effective_size(); // Linear probing to find matching entry - for (int i = 0; i < SUPER_MAX_PROBE; i++) { - SuperRegEntry* e = &g_super_reg[(h + i) & SUPER_REG_MASK]; + for (int i = 0; i < probe_limit; i++) { + SuperRegEntry* e = &entries[(h + i) & mask]; if (atomic_load_explicit(&e->base, memory_order_acquire) == base && e->lg_size == lg) { // Found entry to remove @@ -775,30 +810,37 @@ void hak_ss_prewarm_init(void) { void hak_super_registry_stats(SuperRegStats* stats) { if (!stats) return; - stats->total_slots = SUPER_REG_SIZE; + int eff_size = super_reg_effective_size(); + int eff_mask = super_reg_effective_mask(); + SuperRegEntry* reg = reg_entries(); + + stats->total_slots = eff_size; stats->used_slots = 0; stats->max_probe_depth = 0; + if (!reg || eff_size <= 0) { + return; + } pthread_mutex_lock(&g_super_reg_lock); // Count used slots - for (int i = 0; i < SUPER_REG_SIZE; i++) { - if (atomic_load_explicit(&g_super_reg[i].base, memory_order_acquire) != 0) { + for (int i = 0; i < eff_size; i++) { + if (atomic_load_explicit(&reg[i].base, memory_order_acquire) != 0) { stats->used_slots++; } } // Calculate max probe depth - for (int i = 0; i < SUPER_REG_SIZE; i++) { - if (atomic_load_explicit(&g_super_reg[i].base, memory_order_acquire) != 0) { - uintptr_t base = atomic_load_explicit(&g_super_reg[i].base, memory_order_acquire); - int lg = g_super_reg[i].lg_size; // Phase 8.3: Use stored lg_size + for (int i = 0; i < eff_size; i++) { + if (atomic_load_explicit(&reg[i].base, memory_order_acquire) != 0) { + uintptr_t base = atomic_load_explicit(&reg[i].base, memory_order_acquire); + int lg = reg[i].lg_size; // Phase 8.3: Use stored lg_size int h = hak_super_hash(base, lg); // Find actual probe depth for this entry for (int j = 0; j < SUPER_MAX_PROBE; j++) { - int idx = (h + j) & SUPER_REG_MASK; - if (atomic_load_explicit(&g_super_reg[idx].base, memory_order_acquire) == base && g_super_reg[idx].lg_size == lg) { + int idx = (h + j) & eff_mask; + if
(atomic_load_explicit(&reg[idx].base, memory_order_acquire) == base && reg[idx].lg_size == lg) { if (j > stats->max_probe_depth) { stats->max_probe_depth = j; } diff --git a/core/hakmem_super_registry.h b/core/hakmem_super_registry.h index 8d86f9e1..68e6bc06 100644 --- a/core/hakmem_super_registry.h +++ b/core/hakmem_super_registry.h @@ -19,6 +19,7 @@ #include #include "hakmem_tiny_superslab.h" // For SuperSlab and SUPERSLAB_MAGIC #include "box/ss_addr_map_box.h" // Phase 9-1: O(1) hash table lookup +#include "box/super_reg_box.h" // Phase X: profile-aware logical registry sizing // Registry configuration // Increased from 4096 to 32768 to avoid registry exhaustion under @@ -36,7 +37,7 @@ #define SUPER_REG_PER_CLASS 16384 // Per-class registry capacity (increased for high-churn workloads) // Registry entry: base address → SuperSlab pointer mapping -typedef struct { +typedef struct SuperRegEntry { _Atomic(uintptr_t) base; // Aligned base address (1MB or 2MB, 0 = empty slot) [atomic for proper sync] _Atomic(SuperSlab*) ss; // Atomic SuperSlab pointer (MT-safe, prevents TOCTOU race) uint8_t lg_size; // Phase 8.3: ACE - SuperSlab size (20=1MB, 21=2MB) @@ -44,7 +45,6 @@ typedef struct { } SuperRegEntry; // Global registry (lock-free reads, mutex-protected writes) -extern SuperRegEntry g_super_reg[SUPER_REG_SIZE]; extern pthread_mutex_t g_super_reg_lock; extern int g_super_reg_initialized; @@ -56,7 +56,6 @@ extern int g_super_reg_initialized; #ifndef TINY_NUM_CLASSES #define TINY_NUM_CLASSES 8 // Fallback if hakmem_tiny.h not included yet #endif -extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS]; extern int g_super_reg_class_size[TINY_NUM_CLASSES]; // ============================================================================ @@ -111,7 +110,7 @@ void hak_super_registry_init(void); // Hash function for aligned addresses (variable size) static inline int hak_super_hash(uintptr_t base, int lg_size) { // Phase 8.3: ACE - Variable size hash (lg_size
= 20 for 1MB, 21 for 2MB) - return (int)((base >> lg_size) & SUPER_REG_MASK); + return (int)((base >> lg_size) & super_reg_effective_mask()); } // Lookup SuperSlab by pointer (lock-free, thread-safe) @@ -127,12 +126,18 @@ static inline SuperSlab* hak_super_lookup(void* ptr) { // Fallback: If hash map misses (e.g., map not populated yet), probe the // legacy registry table to avoid NULL for valid SuperSlabs. if (__builtin_expect(ss == NULL, 0)) { + SuperRegEntry* reg = super_reg_entries(); + if (!reg) return NULL; uintptr_t p = (uintptr_t)ptr; for (int lg = SUPERSLAB_LG_MIN; lg <= SUPERSLAB_LG_MAX; lg++) { uintptr_t base = p & ~(((uintptr_t)1 << lg) - 1); int h = hak_super_hash(base, lg); - for (int i = 0; i < SUPER_MAX_PROBE; i++) { - SuperRegEntry* e = &g_super_reg[(h + i) & SUPER_REG_MASK]; + int eff_mask = super_reg_effective_mask(); + int probe_limit = super_reg_effective_size() > SUPER_MAX_PROBE + ? SUPER_MAX_PROBE + : super_reg_effective_size(); + for (int i = 0; i < probe_limit; i++) { + SuperRegEntry* e = &reg[(h + i) & eff_mask]; uintptr_t reg_base = atomic_load_explicit(&e->base, memory_order_acquire); if (reg_base == 0) { break; // empty slot diff --git a/core/hakmem_tiny.c b/core/hakmem_tiny.c index 248ba875..301b8fc5 100644 --- a/core/hakmem_tiny.c +++ b/core/hakmem_tiny.c @@ -26,6 +26,7 @@ #include "tiny_tls_guard.h" #include "tiny_ready.h" #include "box/c7_meta_used_counter_box.h" +#include "box/super_reg_box.h" #include "hakmem_tiny_tls_list.h" #include "hakmem_tiny_remote_target.h" // Phase 2C-1: Remote target queue #include "hakmem_tiny_bg_spill.h" // Phase 2C-2: Background spill queue @@ -124,7 +125,7 @@ static void* __attribute__((cold, noinline)) hak_tiny_alloc_slow(size_t size, in // Box: adopt_gate_try (implementation moved from header for robust linkage) // --------------------------------------------------------------------------- #include "box/adopt_gate_box.h" -extern SuperSlab* g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS];
+#include "box/super_reg_box.h" extern int g_super_reg_class_size[TINY_NUM_CLASSES]; extern unsigned long long g_adopt_gate_calls[]; extern unsigned long long g_adopt_gate_success[]; @@ -137,6 +138,10 @@ SuperSlab* adopt_gate_try(int class_idx, TinyTLSSlab* tls) { if (ss) { g_adopt_gate_success[class_idx]++; return ss; } g_reg_scan_attempts[class_idx]++; int reg_size = g_super_reg_class_size[class_idx]; + int reg_cap = super_reg_effective_per_class(); + if (reg_cap > 0 && reg_size > reg_cap) { + reg_size = reg_cap; + } int scan_limit = tiny_reg_scan_max(); if (scan_limit > reg_size) scan_limit = reg_size; uint32_t self_tid = tiny_self_u32(); @@ -156,7 +161,7 @@ SuperSlab* adopt_gate_try(int class_idx, TinyTLSSlab* tls) { } for (int i = 0; i < scan_limit; i++) { - SuperSlab* cand = g_super_reg_by_class[class_idx][i]; + SuperSlab* cand = super_reg_by_class_at(class_idx, i); if (!(cand && cand->magic == SUPERSLAB_MAGIC)) continue; // Fast path: use nonempty_mask / freelist_mask to locate candidates in O(1) uint32_t mask = cand->nonempty_mask; diff --git a/core/hakmem_tiny_lifecycle.inc b/core/hakmem_tiny_lifecycle.inc index 1a2cdbf9..df7487ec 100644 --- a/core/hakmem_tiny_lifecycle.inc +++ b/core/hakmem_tiny_lifecycle.inc @@ -12,6 +12,7 @@ // Cold/maintenance path - not performance critical. #include "tiny_tls_guard.h" #include "box/ss_slab_meta_box.h" // Phase 3d-A: SlabMeta Box boundary +#include "hakmem_super_registry.h" // Phase 12: Helper to derive a representative class index for a SuperSlab // from per-slab metadata (all slabs are empty when used in trim). 
@@ -96,8 +97,11 @@ void hak_tiny_trim(void) { } // Walk the registry and collect empty SuperSlabs by class - for (int i = 0; i < SUPER_REG_SIZE; i++) { - SuperRegEntry* e = &g_super_reg[i]; + SuperRegEntry* reg = super_reg_entries(); + int reg_cap = super_reg_effective_size(); + if (!reg || reg_cap <= 0) return; + for (int i = 0; i < reg_cap; i++) { + SuperRegEntry* e = &reg[i]; uintptr_t base = atomic_load_explicit((_Atomic uintptr_t*)&e->base, memory_order_acquire); if (base == 0) continue; SuperSlab* ss = e->ss; diff --git a/core/hakmem_tiny_magazine.c b/core/hakmem_tiny_magazine.c index a9aa4af4..806ac5f0 100644 --- a/core/hakmem_tiny_magazine.c +++ b/core/hakmem_tiny_magazine.c @@ -7,6 +7,7 @@ #include "hakmem_prof.h" #include "hakmem_internal.h" #include "box/tiny_next_ptr_box.h" // Box API: Next pointer read/write +#include "box/tiny_mem_stats_box.h" #include static inline uint32_t tiny_self_u32_guard(void) { @@ -36,6 +37,14 @@ int g_mag_cap_limit = TINY_TLS_MAG_CAP; int g_mag_cap_override[TINY_NUM_CLASSES] = {0}; // HAKMEM_TINY_MAG_CAP_C{0..7} __thread int g_tls_small_mags_inited = 0; +static __thread int g_tls_mag_mem_recorded = 0; + +static inline void tiny_mag_record_mem_once(void) { + if (!g_tls_mag_mem_recorded) { + tiny_mem_stats_add_tls_magazine((ssize_t)sizeof(g_tls_mags)); + g_tls_mag_mem_recorded = 1; + } +} // tiny_default_cap() and tiny_cap_max_for_class() now defined as inline functions // in hakmem_tiny_config.h for centralized configuration @@ -49,6 +58,7 @@ int tiny_effective_cap(int class_idx) { void tiny_small_mags_init_once(void) { if (__builtin_expect(g_tls_small_mags_inited, 1)) return; + tiny_mag_record_mem_once(); for (int k = 0; k <= 3; k++) { TinyTLSMag* m = &g_tls_mags[k]; if (m->cap == 0) { @@ -65,6 +75,7 @@ void tiny_small_mags_init_once(void) { void tiny_mag_init_if_needed(int class_idx) { TinyTLSMag* mag = &g_tls_mags[class_idx]; if (mag->cap == 0) { + tiny_mag_record_mem_once(); int base = tiny_effective_cap(class_idx); int cap =
(base < TINY_TLS_MAG_CAP) ? base : TINY_TLS_MAG_CAP; if (g_mag_cap_limit < cap) cap = g_mag_cap_limit; diff --git a/core/hakmem_tiny_publish_box.inc b/core/hakmem_tiny_publish_box.inc index 5a9e4294..c40ce9f9 100644 --- a/core/hakmem_tiny_publish_box.inc +++ b/core/hakmem_tiny_publish_box.inc @@ -7,7 +7,7 @@ // Tiny Page Box: C5〜C7 用 Tiny-Plus page pool(Superslab/Warm Pool より前段の箱) // tiny_tls_bind_slab() で新しい TLS Slab が bind されたタイミングで -// tiny_page_box_on_new_slab() を呼び出し、Page Box 側の page pool を更新する。 +// tiny_page_box_on_new_slab(class_idx, tls) を呼び出し、Page Box 側の page pool を更新する。 #include "box/tiny_page_box.h" // Mailbox box @@ -369,8 +369,9 @@ static inline void tiny_tls_bind_slab(TinyTLSSlab* tls, SuperSlab* ss, int slab_ tls->meta = &ss->slabs[slab_idx]; tls->slab_base = tiny_slab_base_for(ss, slab_idx); - // Tiny Page Box にも新しい slab を通知しておく(C7 など有効クラスのみ) - tiny_page_box_on_new_slab(tls); + // Tiny Page Box にも新しい slab を通知しておく(有効クラスのみ) + int pb_class = tls->meta ? (int)tls->meta->class_idx : -1; + tiny_page_box_on_new_slab(pb_class, tls); } static inline uint32_t tiny_tls_default_refill(uint32_t cap) { diff --git a/core/superslab_ace.c b/core/superslab_ace.c index 65583150..20cb4119 100644 --- a/core/superslab_ace.c +++ b/core/superslab_ace.c @@ -4,6 +4,7 @@ // Date: 2025-11-28 #include "hakmem_tiny_superslab_internal.h" +#include "hakmem_super_registry.h" // ============================================================================ // ACE (Adaptive Cache Engine) State @@ -140,8 +141,12 @@ void ace_observe_and_decide(int k) { int ss_count = 0; uint32_t total_live = 0; - for (int i = 0; i < SUPER_REG_SIZE; i++) { - SuperRegEntry* e = &g_super_reg[i]; + SuperRegEntry* reg = super_reg_entries(); + int reg_cap = super_reg_effective_size(); + if (!reg || reg_cap <= 0) return; + + for (int i = 0; i < reg_cap; i++) { + SuperRegEntry* e = &reg[i]; // Atomic read (thread-safe) uintptr_t base = atomic_load_explicit( diff --git a/core/superslab_stats.c
b/core/superslab_stats.c index c7a56135..2db4268a 100644 --- a/core/superslab_stats.c +++ b/core/superslab_stats.c @@ -4,6 +4,7 @@ // Date: 2025-11-28 #include "hakmem_tiny_superslab_internal.h" +#include // ============================================================================ // Global Statistics @@ -30,6 +31,11 @@ uint64_t g_ss_freed_by_class[8] = {0}; _Atomic uint64_t g_ss_mmap_count = 0; _Atomic uint64_t g_final_fallback_mmap_count = 0; +// Superslab/slab observability (Tiny-only; relaxed updates) +_Atomic uint64_t g_ss_live_by_class[8] = {0}; +_Atomic uint64_t g_ss_empty_events[8] = {0}; +_Atomic uint64_t g_slab_live_events[8] = {0}; + // ============================================================================ // Statistics Functions // ============================================================================ @@ -56,6 +62,35 @@ void ss_stats_cache_store(void) { pthread_mutex_unlock(&g_superslab_lock); } +void ss_stats_on_ss_alloc_class(int class_idx) { + if (class_idx >= 0 && class_idx < 8) { + atomic_fetch_add_explicit(&g_ss_live_by_class[class_idx], 1, memory_order_relaxed); + } +} + +void ss_stats_on_ss_free_class(int class_idx) { + if (class_idx >= 0 && class_idx < 8) { + uint64_t prev = atomic_load_explicit(&g_ss_live_by_class[class_idx], memory_order_relaxed); + if (prev > 0) { + atomic_fetch_sub_explicit(&g_ss_live_by_class[class_idx], 1, memory_order_relaxed); + } + } +} + +void ss_stats_on_ss_scan(int class_idx, int slab_live, int is_empty) { + if (class_idx < 0 || class_idx >= 8) { + return; + } + if (slab_live > 0) { + atomic_fetch_add_explicit(&g_slab_live_events[class_idx], + (uint64_t)slab_live, + memory_order_relaxed); + } + if (is_empty) { + atomic_fetch_add_explicit(&g_ss_empty_events[class_idx], 1, memory_order_relaxed); + } +} + // ============================================================================ // Diagnostics // ============================================================================ @@ -164,3 +199,23 @@ void 
superslab_print_global_stats(void) { printf("Total bytes allocated: %lu MB\n", g_bytes_allocated / (1024 * 1024)); pthread_mutex_unlock(&g_superslab_lock); } + +void ss_stats_dump_if_requested(void) { + const char* env = getenv("HAKMEM_SS_STATS_DUMP"); + if (!env || !*env || *env == '0') { + return; + } + fprintf(stderr, "[SS_STATS] class live empty_events slab_live_events\n"); + for (int c = 0; c < 8; c++) { + uint64_t live = atomic_load_explicit(&g_ss_live_by_class[c], memory_order_relaxed); + uint64_t empty = atomic_load_explicit(&g_ss_empty_events[c], memory_order_relaxed); + uint64_t slab_live = atomic_load_explicit(&g_slab_live_events[c], memory_order_relaxed); + if (live || empty || slab_live) { + fprintf(stderr, " C%d: live=%llu empty=%llu slab_live=%llu\n", + c, + (unsigned long long)live, + (unsigned long long)empty, + (unsigned long long)slab_live); + } + } +} diff --git a/core/tiny_alloc_fast_push.d b/core/tiny_alloc_fast_push.d index 87004d35..bc161ebc 100644 --- a/core/tiny_alloc_fast_push.d +++ b/core/tiny_alloc_fast_push.d @@ -17,8 +17,9 @@ core/tiny_alloc_fast_push.o: core/tiny_alloc_fast_push.c \ core/box/../superslab/../tiny_box_geometry.h \ core/box/../tiny_debug_ring.h core/box/../tiny_remote.h \ core/box/../box/ss_addr_map_box.h \ - core/box/../box/../hakmem_build_flags.h core/box/../hakmem_tiny.h \ - core/box/../hakmem_trace.h core/box/../hakmem_tiny_mini_mag.h \ + core/box/../box/../hakmem_build_flags.h core/box/../box/super_reg_box.h \ + core/box/../hakmem_tiny.h core/box/../hakmem_trace.h \ + core/box/../hakmem_tiny_mini_mag.h \ core/box/../box/hak_lane_classify.inc.h core/box/../tiny_debug_api.h \ core/box/../hakmem_tiny_integrity.h core/box/../ptr_track.h \ core/box/../ptr_trace.h core/box/../hakmem_trace_master.h \ @@ -68,6 +69,7 @@ core/box/../tiny_debug_ring.h: core/box/../tiny_remote.h: core/box/../box/ss_addr_map_box.h: core/box/../box/../hakmem_build_flags.h: +core/box/../box/super_reg_box.h: core/box/../hakmem_tiny.h: 
core/box/../hakmem_trace.h: core/box/../hakmem_tiny_mini_mag.h: diff --git a/core/tiny_remote.c b/core/tiny_remote.c index 9da0fa88..82342a23 100644 --- a/core/tiny_remote.c +++ b/core/tiny_remote.c @@ -10,18 +10,11 @@ #endif #include #include "tiny_remote.h" +#include "box/remote_side_box.h" #include "hakmem_tiny_superslab.h" #include "tiny_debug_ring.h" -#define REM_SIDE_LOG2 20 -#define REM_SIDE_SIZE (1u<lg_size; - uint32_t i = hmix(k) & (REM_SIDE_SIZE - 1); - for (uint32_t n=0; nwarm_enabled/page_box_enabled` once per thread and reuse. +- Split UC helpers into `*_hit_fast` vs `*_miss` to keep the hit CFG tiny. + +Trade-offs / checks +------------------- +- Keep the Box boundaries (Gate/Route/Policy) but allow an inline “fast lane” for C7. +- Ensure Debug/Policy logging stays in the slow/miss path only. +- Validate with IPC/ops after implementation; target +10–15% for C7-heavy mixes. diff --git a/docs/analysis/CPU_HOTPATH_OVERVIEW.md b/docs/analysis/CPU_HOTPATH_OVERVIEW.md new file mode 100644 index 00000000..e9efd424 --- /dev/null +++ b/docs/analysis/CPU_HOTPATH_OVERVIEW.md @@ -0,0 +1,53 @@ +CPU Hotpath Overview (bench profile) +==================================== + +Context +------- +- Build/profile: `HAKMEM_PROFILE=bench`, `HAKMEM_TINY_PROFILE=full`, `HAKMEM_WARM_TLS_BIND_C7=2`. +- Workloads sampled: + - 16–1024B (`./bench_random_mixed_hakmem 1000000 256 42`) + - 129–1024B (`HAKMEM_BENCH_MIN_SIZE=129 HAKMEM_BENCH_MAX_SIZE=1024 ./bench_random_mixed_hakmem 1000000 256 42`) +- Target: identify user‑space hot spots to guide C7 flattening work. + +Sampling attempt (perf) +----------------------- +- `perf record -g -e cycles:u` and `perf record -g -e cpu-clock:u` both fall back to `page-faults` on this host (likely perf_event_paranoid). The captures show: + - ~97% of page-fault samples in `__memset_avx2_unaligned_erms` during warmup/zeroing. + - Callers were `tiny_tls_sll_drain.part.0.constprop.0` and `adaptive_sizing_init` (warmup path). 
+- No steady‑state cycle profile was available without elevated perf permissions. `perf data` removed after inspection (`rm perf.data`) to keep tree clean. + +What we can infer despite the limitation +---------------------------------------- +- Warmup zeroing dominates page‑fault samples; steady‑state alloc/free is not represented. +- Hot candidates for the next pass (from previous code inspection and bench intuition): + - `tiny_alloc_fast` / `malloc_tiny_fast` (C7 fast path) + - `hak_tiny_free_fast_v2` + - `tiny_unified_cache` hit path helpers + - `tls_sll_pop_impl` / `tiny_tls_sll_drain` + +Next measurement options +------------------------ +- If perf cycles are still blocked: + - Use `perf stat -e cycles,instructions,branches,branch-misses -r 5 -- ...` to get aggregate IPC per workload. + - Add temporary user‑space counters (Box‑guarded) around C7 alloc/free hot sections to estimate per‑op cycles. + - Run perf with elevated permissions or lower `perf_event_paranoid` if available. + +Aggregate perf stat snapshot (bench profile) +-------------------------------------------- +- Env: `HAKMEM_PROFILE=bench HAKMEM_TINY_PROFILE=full HAKMEM_WARM_TLS_BIND_C7=2` +- Workloads (Release, 3× perf stat): + - 16–1024B: cycles≈109.8M, instructions≈233.2M → IPC≈2.12, branches≈49.4M, branch-miss≈2.89% + - 129–1024B: cycles≈109.6M, instructions≈230.3M → IPC≈2.10, branches≈48.8M, branch-miss≈2.90% + - 16–1024B with `HAKMEM_TINY_C7_HOT=1` (UC hit-only + flat TLS→UC→cold): + - cycles≈111.8M, instructions≈242.1M → IPC≈2.16, branches≈52.0M, branch-miss≈2.75% + - RSS≈7.1MB; throughput ≈47.4–47.6M ops/s (hot=1) vs ≈47.2M (hot=0) on the same runset. + +Action items flowing from this note +----------------------------------- +- Proceed with design notes for C7 alloc/free flattening and UC hit simplification based on code structure. +- Keep warmup zeroing out of the steady‑state loop when profiling (consider `HAKMEM_BENCH_FAST_MODE` for future captures). 
+ +Conclusion (current state) +-------------------------- +- `HAKMEM_TINY_C7_HOT` は実験用フラグとして残し、デフォルト OFF のまま運用する。ON にしても branch-miss はわずかに改善する程度で、ops/s は同等〜微減。 +- ひとまず「安全+そこそこ速い」現行経路を基準とし、さらなるフラット化は別途必要性を見て検討する。*** diff --git a/docs/analysis/LARGE_GLOBALS_OVERVIEW.md b/docs/analysis/LARGE_GLOBALS_OVERVIEW.md new file mode 100644 index 00000000..db5155f7 --- /dev/null +++ b/docs/analysis/LARGE_GLOBALS_OVERVIEW.md @@ -0,0 +1,84 @@ +LARGE_GLOBALS_OVERVIEW +====================== + +概要 +---- +- `nm -S --size-sort bench_random_mixed_hakmem` で確認した巨大 BSS/静的領域の上位シンボルをメモ。 +- 目視での役割・要素数イメージと、直近の SS_STATS(短い run, ws=64, iters=10k, HAKMEM_SS_STATS_DUMP=1)のギャップを併記。 +- 目的: 次フェーズの「SuperReg/SharedPool/Remote を動的化 or 縮小」設計の入力にする。 + +コマンド +-------- +```bash +nm -S --size-sort bench_random_mixed_hakmem | tail -n 120 +HAKMEM_SS_STATS_DUMP=1 ./bench_random_mixed_hakmem 10000 64 1 2> /tmp/ss_stats_sample.log +``` + +観測された大きめシンボル +------------------------ + +| Symbol | Size | 役割/箱 | 備考・ギャップ | +| --- | --- | --- | --- | +| `g_super_reg` | 0x1800000 ≈ 24.0 MB | Super Registry 全体 | SS_STATS では C2=1, C7=1 live と極小。大半が未使用の固定配列。 | +| `g_rem_side` | 0x1000000 ≈ 16.0 MB | Remote Queue 側バッファ | スレッド数・ノード数に対してオーバーサイズ。bench ではほぼ未使用。 | +| `g_shared_pool` | 0x238140 ≈ 2.23 MB | Shared Pool テーブル | live SS 2 枚に対し容量が大きい。class 別縮小余地あり。 | +| `g_super_reg_by_class` | 0x100000 ≈ 1.0 MB | クラス別 SuperReg インデックス | クラス数 8 に対し 1MB 固定。動的化で圧縮可能。 | +| `g_free_node_pool` | 0xC0000 ≈ 0.75 MB | Free ノードプール | Remote/Pool 用。小さくはないが上位ほどではない。 | +| `g_mf2_page_registry.lto_priv.0` | 0x82810 ≈ 0.51 MB | MF2 ページレジストリ | MF2 経路用。 | +| `g_tls_mags` | 0x40040 ≈ 0.26 MB | TLS Magazine 配列 | スレッド数ぶん前提。実使用は少数。 | +| `g_site_rules` | 0x40040 ≈ 0.26 MB | Site rule テーブル | 固定長。 | +| `g_mid_desc_mu` | 0x14000 ≈ 80 KB | Mid-size desc | 中規模。 | +| `g_mid_tc_mu` | 0xA000 ≈ 40 KB | Mid-size TC | 中規模。 | +| `g_pool.lto_priv.0` | 0x9680 ≈ 37 KB | Pool 配列 | 中規模。 | +| `g_tiny_page_box` | 0xC40 ≈ 3.1 KB | Tiny Page Box 配列 | Tiny Front 系。微小。 | +| 
`g_tls_hot_mag` | 0x2040 ≈ 8 KB | TLS Hot Magazine | 微小。 | +| `g_fast_cache` | 0x2200 ≈ 8.6 KB | Fast cache | 微小。 | + +補足(SS_STATS サンプル) +------------------------- +- 短い run(ws=64, iters=10k, Release, HAKMEM_SS_STATS_DUMP=1)の結果: + - `[SS_STATS] class live empty_events slab_live_events` + - `C2: live=1 empty=0 slab_live=0` + - `C7: live=1 empty=1 slab_live=0` + - `[RSS] max_kb=29568` +- 「巨大配列の容量」に対し「実際に live の Superslab」は 2 枚のみ。固定長 BSS が RSS を支配していることが確実。 + +定義元と役割(コード位置) +-------------------------- +- Super Registry (`core/hakmem_super_registry.{h,c}`) + - `g_super_reg[SUPER_REG_SIZE]` … ハッシュ登録(デフォルト 1,048,576 エントリ = 24MB、`SUPER_REG_SIZE` で調整可能) + - `g_super_reg_by_class[TINY_NUM_CLASSES][SUPER_REG_PER_CLASS]` … クラス別スキャン用(デフォルト 8×16384 = 128K スロット ≈1MB) + - `g_super_reg_class_size[]` … クラス別 live カウント + - `g_ss_lru_cache` … LRU 再利用キャッシュ(メモリは小さめ) +- Shared Pool (`core/hakmem_shared_pool.{h,c}` + `_acquire.c` + `_release.c`) + - `g_shared_pool` … Superslab 配列、クラス別ヒント/活性/フリーリスト/メタ配列を同居させた大きめ struct(≈2.3MB) + - `g_shared_pool.ss_metadata[]` … Superslab ごとのメタデータ配列 +- Remote Queue (`core/tiny_remote.c`) + - `g_rem_side[REM_SIDE_SIZE]` … cross-thread free のハッシュ(`REM_SIDE_LOG2=20` → 1M エントリ ≈16MB) + - Debug 時の `g_rem_track[]` は release では落ちるのでサイズ影響なし +- Free Node Pool (`core/pool_refill.c` など) + - `g_free_node_pool` … pool refilling 用のノードストック(≈0.75MB) +- TLS / MF2 系 + - `g_tls_mags` (`core/hakmem_tiny_magazine.c`) … TLS マガジン配列(≈0.26MB、スレッド数前提) + - `g_mf2_page_registry` (`core/mf2*`) … MF2 併用時のページレジストリ(≈0.5MB) + - `g_ss_addr_map` (`core/box/ss_addr_map_box.h`) … Superslab アドレス検索ハッシュ(サイズは中程度) + +ベンチ向け縮小の目安(案) +-------------------------- +- SuperReg + - 現状: `SUPER_REG_SIZE=1,048,576`(24MB)、`SUPER_REG_PER_CLASS=16384`(1MB) + - Bench 目安: `SUPER_REG_SIZE_BENCH=65,536`(~1.5MB)、`SUPER_REG_PER_CLASS_BENCH=1024`(~64KB) +- Shared Pool + - 現状: capacity は動的拡張だが初期サイズは大きめ(約 2.3MB) + - Bench 目安: 初期 capacity を 64〜128 に抑え、クラス別スロットも縮小 +- Remote Queue + - 現状: `REM_SIDE_LOG2=20` → 1M エントリ(16MB) 
+ - Bench 目安: `REM_SIDE_LOG2=16`(64K エントリ ≈1MB)程度まで削減 +- Free Node Pool / TLS Mag / MF2 + - Bench ではスレッド数や MF2 オンオフに応じて「初期化を遅延」「固定配列を半減」する余地あり。 + +次の設計ステップ(Box 化の方向性) +----------------------------------- +- SuperReg/SharedPool/Remote を Box 化し、`HAKMEM_PROFILE`(prod/full/bench/larson_guard 等)で容量を切替できるようにする。 +- Bench 用の小型プロファイル(registry/pool/remote を 1/4〜1/8)を追加し、RSS を抑えた状態で mimalloc/system と比較する。 +- Superslab Budget Box と組み合わせ、live 枚数上限(予算)と「空 SS 再利用ポリシー」を分離して管理する。*** diff --git a/docs/analysis/SUPERSLAB_STATS_SNAPSHOT.md b/docs/analysis/SUPERSLAB_STATS_SNAPSHOT.md new file mode 100644 index 00000000..f8f78b31 --- /dev/null +++ b/docs/analysis/SUPERSLAB_STATS_SNAPSHOT.md @@ -0,0 +1,40 @@ +# Superslab Stats Snapshot (larson_guard, 2025-12-06) + +コマンド: +`HAKMEM_TINY_PROFILE=larson_guard HAKMEM_SS_STATS_DUMP=1 ./bench_allocators_hakmem larson 1 10000 1` + +抜粋ログ: +``` +[SS_STATS] class live empty_events slab_live_events + C2: live=1 empty=0 slab_live=0 +``` + +メモ: larson_guard では Superslab 枚数が予算近辺で頭打ちになり、暴走せずに完走することを確認。 + +# Superslab Stats Snapshot (bench profile, 2025-12-06) + +コマンド: +`HAKMEM_PROFILE=bench HAKMEM_TINY_PROFILE=full HAKMEM_WARM_TLS_BIND_C7=2 HAKMEM_SS_STATS_DUMP=1 ./bench_random_mixed_hakmem 1000000 256 42` + +抜粋ログ: +``` +[SS_STATS] class live empty_events slab_live_events + C2: live=1 empty=0 slab_live=0 + C7: live=1 empty=1 slab_live=0 +[RSS] max_kb=7168 +``` + +メモ: bench プロファイル(SuperReg/Remote 実配列縮小版)でも live Superslab は C2=1, C7=1 に収まり、RSS は ~7MB まで低減。*** + +# Tiny Mem Stats Snapshot (bench profile, 2025-12-06) + +コマンド: +`HAKMEM_PROFILE=bench HAKMEM_TINY_PROFILE=full HAKMEM_WARM_TLS_BIND_C7=2 HAKMEM_TINY_MEM_DUMP=1 ./bench_random_mixed_hakmem 1000 8 1` + +抜粋ログ: +``` +[TINY_MEM_STATS] unified_cache=36KB warm_pool=2KB page_box=3KB tls_mag=0KB policy_stats=0KB total=41KB +[RSS] max_kb=7040 +``` + +メモ: Tiny 層(UC/Warm/Page/TLS/Policy)だけなら数十 KB で、 bench プロファイルの RSS 低減は主に SuperReg/Remote の実配列縮小による。*** diff --git a/hakmem.d b/hakmem.d index 0815b81a..c96d990d 
100644 --- a/hakmem.d +++ b/hakmem.d @@ -16,12 +16,13 @@ hakmem.o: core/hakmem.c core/hakmem.h core/hakmem_build_flags.h \ core/box/tiny_next_ptr_box.h core/hakmem_tiny_config.h \ core/tiny_nextptr.h core/tiny_region_id.h core/tiny_box_geometry.h \ core/ptr_track.h core/hakmem_super_registry.h core/box/ss_addr_map_box.h \ - core/box/../hakmem_build_flags.h core/tiny_debug_api.h \ - core/box/tiny_layout_box.h core/box/../hakmem_tiny_config.h \ - core/box/tiny_header_box.h core/box/tiny_layout_box.h \ - core/box/../tiny_region_id.h core/hakmem_elo.h core/hakmem_ace_stats.h \ - core/hakmem_batch.h core/hakmem_evo.h core/hakmem_debug.h \ - core/hakmem_prof.h core/hakmem_syscall.h core/hakmem_ace_controller.h \ + core/box/../hakmem_build_flags.h core/box/super_reg_box.h \ + core/tiny_debug_api.h core/box/tiny_layout_box.h \ + core/box/../hakmem_tiny_config.h core/box/tiny_header_box.h \ + core/box/tiny_layout_box.h core/box/../tiny_region_id.h \ + core/hakmem_elo.h core/hakmem_ace_stats.h core/hakmem_batch.h \ + core/hakmem_evo.h core/hakmem_debug.h core/hakmem_prof.h \ + core/hakmem_syscall.h core/hakmem_ace_controller.h \ core/hakmem_ace_metrics.h core/hakmem_ace_ucb1.h \ core/box/bench_fast_box.h core/ptr_trace.h core/hakmem_trace_master.h \ core/hakmem_stats_master.h core/box/hak_kpi_util.inc.h \ @@ -86,6 +87,16 @@ hakmem.o: core/hakmem.c core/hakmem.h core/hakmem_build_flags.h \ core/box/../front/../box/../front/tiny_unified_cache.h \ core/box/../front/../box/tiny_layout_box.h \ core/box/../front/../box/tiny_front_cold_box.h \ + core/box/../front/../box/tiny_c7_hotpath_box.h \ + core/box/../front/../box/c7_hotpath_env_box.h \ + core/box/../front/../box/tiny_c7_uc_hit_box.h \ + core/box/../front/../box/tiny_c7_warm_spill_box.h \ + core/box/../front/../box/tiny_c7_stats_sample_box.h \ + core/box/../front/../box/tiny_front_hot_box.h \ + core/box/../front/../box/tiny_front_cold_box.h \ + core/box/../front/../box/front_gate_box.h \ + 
core/box/../front/../box/tls_sll_box.h \ + core/box/../front/../box/ptr_conversion_box.h \ core/box/tiny_alloc_gate_box.h core/box/tiny_route_box.h \ core/box/tiny_front_config_box.h core/box/wrapper_env_box.h \ core/box/../hakmem_internal.h core/box/../superslab/superslab_inline.h @@ -131,6 +142,7 @@ core/ptr_track.h: core/hakmem_super_registry.h: core/box/ss_addr_map_box.h: core/box/../hakmem_build_flags.h: +core/box/super_reg_box.h: core/tiny_debug_api.h: core/box/tiny_layout_box.h: core/box/../hakmem_tiny_config.h: @@ -244,6 +256,16 @@ core/box/../front/../box/../tiny_region_id.h: core/box/../front/../box/../front/tiny_unified_cache.h: core/box/../front/../box/tiny_layout_box.h: core/box/../front/../box/tiny_front_cold_box.h: +core/box/../front/../box/tiny_c7_hotpath_box.h: +core/box/../front/../box/c7_hotpath_env_box.h: +core/box/../front/../box/tiny_c7_uc_hit_box.h: +core/box/../front/../box/tiny_c7_warm_spill_box.h: +core/box/../front/../box/tiny_c7_stats_sample_box.h: +core/box/../front/../box/tiny_front_hot_box.h: +core/box/../front/../box/tiny_front_cold_box.h: +core/box/../front/../box/front_gate_box.h: +core/box/../front/../box/tls_sll_box.h: +core/box/../front/../box/ptr_conversion_box.h: core/box/tiny_alloc_gate_box.h: core/box/tiny_route_box.h: core/box/tiny_front_config_box.h: diff --git a/hakmem_shared_pool.d b/hakmem_shared_pool.d index 6fa0eb60..f9c03ed3 100644 --- a/hakmem_shared_pool.d +++ b/hakmem_shared_pool.d @@ -12,9 +12,10 @@ hakmem_shared_pool.o: core/hakmem_shared_pool.c \ core/box/tiny_next_ptr_box.h core/hakmem_tiny_config.h \ core/tiny_nextptr.h core/tiny_region_id.h core/tiny_box_geometry.h \ core/ptr_track.h core/hakmem_super_registry.h core/box/ss_addr_map_box.h \ - core/box/../hakmem_build_flags.h core/hakmem_tiny.h core/hakmem_trace.h \ - core/hakmem_tiny_mini_mag.h core/box/hak_lane_classify.inc.h \ - core/box/ptr_type_box.h core/tiny_debug_api.h core/box/tiny_layout_box.h \ + core/box/../hakmem_build_flags.h 
core/box/super_reg_box.h \ + core/hakmem_tiny.h core/hakmem_trace.h core/hakmem_tiny_mini_mag.h \ + core/box/hak_lane_classify.inc.h core/box/ptr_type_box.h \ + core/tiny_debug_api.h core/box/tiny_layout_box.h \ core/box/../hakmem_tiny_config.h core/box/tiny_header_box.h \ core/box/tiny_layout_box.h core/box/../tiny_region_id.h \ core/box/ss_hot_cold_box.h core/box/pagefault_telemetry_box.h \ @@ -40,7 +41,8 @@ hakmem_shared_pool.o: core/hakmem_shared_pool.c \ core/box/ss_hot_cold_box.h core/box/ss_release_guard_box.h \ core/box/free_local_box.h core/box/ptr_type_box.h \ core/box/free_publish_box.h core/hakmem_tiny.h core/tiny_region_id.h \ - core/box/tls_slab_reuse_guard_box.h core/hakmem_policy.h + core/box/tls_slab_reuse_guard_box.h core/hakmem_policy.h \ + core/box/shared_pool_box.h core/hakmem_shared_pool_internal.h: core/hakmem_shared_pool.h: core/superslab/superslab_types.h: @@ -69,6 +71,7 @@ core/ptr_track.h: core/hakmem_super_registry.h: core/box/ss_addr_map_box.h: core/box/../hakmem_build_flags.h: +core/box/super_reg_box.h: core/hakmem_tiny.h: core/hakmem_trace.h: core/hakmem_tiny_mini_mag.h: @@ -127,3 +130,4 @@ core/hakmem_tiny.h: core/tiny_region_id.h: core/box/tls_slab_reuse_guard_box.h: core/hakmem_policy.h: +core/box/shared_pool_box.h: diff --git a/hakmem_super_registry.d b/hakmem_super_registry.d index d9854085..3f200f96 100644 --- a/hakmem_super_registry.d +++ b/hakmem_super_registry.d @@ -7,9 +7,9 @@ hakmem_super_registry.o: core/hakmem_super_registry.c \ core/superslab/../hakmem_tiny_config.h core/tiny_debug_ring.h \ core/hakmem_build_flags.h core/tiny_remote.h \ core/hakmem_tiny_superslab_constants.h core/box/ss_addr_map_box.h \ - core/box/../hakmem_build_flags.h core/box/ss_allocation_box.h \ - core/hakmem_tiny_superslab.h core/box/ss_cold_start_box.inc.h \ - core/hakmem_env_cache.h + core/box/../hakmem_build_flags.h core/box/super_reg_box.h \ + core/box/ss_allocation_box.h core/hakmem_tiny_superslab.h \ + core/box/ss_cold_start_box.inc.h 
core/hakmem_env_cache.h core/hakmem_super_registry.h: core/hakmem_tiny_superslab.h: core/superslab/superslab_types.h: @@ -25,6 +25,7 @@ core/tiny_remote.h: core/hakmem_tiny_superslab_constants.h: core/box/ss_addr_map_box.h: core/box/../hakmem_build_flags.h: +core/box/super_reg_box.h: core/box/ss_allocation_box.h: core/hakmem_tiny_superslab.h: core/box/ss_cold_start_box.inc.h: diff --git a/hakmem_tiny_bg_spill.d b/hakmem_tiny_bg_spill.d index 23f8f0df..6c284d90 100644 --- a/hakmem_tiny_bg_spill.d +++ b/hakmem_tiny_bg_spill.d @@ -8,9 +8,10 @@ hakmem_tiny_bg_spill.o: core/hakmem_tiny_bg_spill.c \ core/hakmem_tiny_superslab_constants.h core/superslab/superslab_inline.h \ core/superslab/superslab_types.h core/superslab/../tiny_box_geometry.h \ core/tiny_debug_ring.h core/tiny_remote.h core/box/ss_addr_map_box.h \ - core/box/../hakmem_build_flags.h core/hakmem_tiny.h core/hakmem_trace.h \ - core/hakmem_tiny_mini_mag.h core/box/hak_lane_classify.inc.h \ - core/box/ptr_type_box.h core/tiny_debug_api.h core/box/tiny_layout_box.h \ + core/box/../hakmem_build_flags.h core/box/super_reg_box.h \ + core/hakmem_tiny.h core/hakmem_trace.h core/hakmem_tiny_mini_mag.h \ + core/box/hak_lane_classify.inc.h core/box/ptr_type_box.h \ + core/tiny_debug_api.h core/box/tiny_layout_box.h \ core/box/../hakmem_tiny_config.h core/box/tiny_header_box.h \ core/box/tiny_layout_box.h core/box/../tiny_region_id.h core/hakmem_tiny_bg_spill.h: @@ -34,6 +35,7 @@ core/tiny_debug_ring.h: core/tiny_remote.h: core/box/ss_addr_map_box.h: core/box/../hakmem_build_flags.h: +core/box/super_reg_box.h: core/hakmem_tiny.h: core/hakmem_trace.h: core/hakmem_tiny_mini_mag.h: diff --git a/hakmem_tiny_magazine.d b/hakmem_tiny_magazine.d index 02834aa6..7a3cacb6 100644 --- a/hakmem_tiny_magazine.d +++ b/hakmem_tiny_magazine.d @@ -10,14 +10,15 @@ hakmem_tiny_magazine.o: core/hakmem_tiny_magazine.c \ core/superslab/../hakmem_tiny_config.h core/tiny_debug_ring.h \ core/tiny_remote.h core/hakmem_tiny_superslab_constants.h 
\ core/hakmem_super_registry.h core/box/ss_addr_map_box.h \ - core/box/../hakmem_build_flags.h core/hakmem_prof.h \ - core/hakmem_internal.h core/hakmem.h core/hakmem_config.h \ - core/hakmem_features.h core/hakmem_sys.h core/hakmem_whale.h \ - core/box/tiny_next_ptr_box.h core/hakmem_tiny_config.h \ - core/tiny_nextptr.h core/tiny_region_id.h core/tiny_box_geometry.h \ - core/ptr_track.h core/tiny_debug_api.h core/box/tiny_layout_box.h \ - core/box/../hakmem_tiny_config.h core/box/tiny_header_box.h \ - core/box/tiny_layout_box.h core/box/../tiny_region_id.h + core/box/../hakmem_build_flags.h core/box/super_reg_box.h \ + core/hakmem_prof.h core/hakmem_internal.h core/hakmem.h \ + core/hakmem_config.h core/hakmem_features.h core/hakmem_sys.h \ + core/hakmem_whale.h core/box/tiny_next_ptr_box.h \ + core/hakmem_tiny_config.h core/tiny_nextptr.h core/tiny_region_id.h \ + core/tiny_box_geometry.h core/ptr_track.h core/tiny_debug_api.h \ + core/box/tiny_layout_box.h core/box/../hakmem_tiny_config.h \ + core/box/tiny_header_box.h core/box/tiny_layout_box.h \ + core/box/../tiny_region_id.h core/box/tiny_mem_stats_box.h core/hakmem_tiny_magazine.h: core/hakmem_tiny.h: core/hakmem_build_flags.h: @@ -40,6 +41,7 @@ core/hakmem_tiny_superslab_constants.h: core/hakmem_super_registry.h: core/box/ss_addr_map_box.h: core/box/../hakmem_build_flags.h: +core/box/super_reg_box.h: core/hakmem_prof.h: core/hakmem_internal.h: core/hakmem.h: @@ -59,3 +61,4 @@ core/box/../hakmem_tiny_config.h: core/box/tiny_header_box.h: core/box/tiny_layout_box.h: core/box/../tiny_region_id.h: +core/box/tiny_mem_stats_box.h: diff --git a/hakmem_tiny_query.d b/hakmem_tiny_query.d index 273d7a3a..867c2047 100644 --- a/hakmem_tiny_query.d +++ b/hakmem_tiny_query.d @@ -10,8 +10,8 @@ hakmem_tiny_query.o: core/hakmem_tiny_query.c core/hakmem_tiny.h \ core/superslab/../hakmem_tiny_config.h core/tiny_debug_ring.h \ core/tiny_remote.h core/hakmem_tiny_superslab_constants.h \ core/hakmem_super_registry.h 
core/box/ss_addr_map_box.h \ - core/box/../hakmem_build_flags.h core/hakmem_config.h \ - core/hakmem_features.h + core/box/../hakmem_build_flags.h core/box/super_reg_box.h \ + core/hakmem_config.h core/hakmem_features.h core/hakmem_tiny.h: core/hakmem_build_flags.h: core/hakmem_trace.h: @@ -34,5 +34,6 @@ core/hakmem_tiny_superslab_constants.h: core/hakmem_super_registry.h: core/box/ss_addr_map_box.h: core/box/../hakmem_build_flags.h: +core/box/super_reg_box.h: core/hakmem_config.h: core/hakmem_features.h: diff --git a/hakmem_tiny_sfc.d b/hakmem_tiny_sfc.d index 501476c5..6a534718 100644 --- a/hakmem_tiny_sfc.d +++ b/hakmem_tiny_sfc.d @@ -9,21 +9,21 @@ hakmem_tiny_sfc.o: core/hakmem_tiny_sfc.c core/tiny_alloc_fast_sfc.inc.h \ core/hakmem_tiny_superslab_constants.h core/superslab/superslab_inline.h \ core/superslab/superslab_types.h core/superslab/../tiny_box_geometry.h \ core/tiny_debug_ring.h core/tiny_remote.h core/box/ss_addr_map_box.h \ - core/box/../hakmem_build_flags.h core/tiny_debug_api.h \ - core/box/tiny_layout_box.h core/box/../hakmem_tiny_config.h \ - core/box/tiny_header_box.h core/box/tiny_layout_box.h \ - core/box/../tiny_region_id.h core/hakmem_stats_master.h core/tiny_tls.h \ - core/box/tls_sll_box.h core/box/../hakmem_internal.h \ - core/box/../hakmem.h core/box/../hakmem_build_flags.h \ - core/box/../hakmem_config.h core/box/../hakmem_features.h \ - core/box/../hakmem_sys.h core/box/../hakmem_whale.h \ - core/box/../box/ptr_type_box.h core/box/../hakmem_debug_master.h \ - core/box/../tiny_remote.h core/box/../hakmem_tiny_integrity.h \ - core/box/../hakmem_tiny.h core/box/../ptr_track.h \ - core/box/../ptr_trace.h core/box/../hakmem_trace_master.h \ - core/box/../hakmem_stats_master.h core/box/../tiny_debug_ring.h \ - core/box/ss_addr_map_box.h core/box/../superslab/superslab_inline.h \ - core/box/tiny_ptr_bridge_box.h \ + core/box/../hakmem_build_flags.h core/box/super_reg_box.h \ + core/tiny_debug_api.h core/box/tiny_layout_box.h \ + 
core/box/../hakmem_tiny_config.h core/box/tiny_header_box.h \ + core/box/tiny_layout_box.h core/box/../tiny_region_id.h \ + core/hakmem_stats_master.h core/tiny_tls.h core/box/tls_sll_box.h \ + core/box/../hakmem_internal.h core/box/../hakmem.h \ + core/box/../hakmem_build_flags.h core/box/../hakmem_config.h \ + core/box/../hakmem_features.h core/box/../hakmem_sys.h \ + core/box/../hakmem_whale.h core/box/../box/ptr_type_box.h \ + core/box/../hakmem_debug_master.h core/box/../tiny_remote.h \ + core/box/../hakmem_tiny_integrity.h core/box/../hakmem_tiny.h \ + core/box/../ptr_track.h core/box/../ptr_trace.h \ + core/box/../hakmem_trace_master.h core/box/../hakmem_stats_master.h \ + core/box/../tiny_debug_ring.h core/box/ss_addr_map_box.h \ + core/box/../superslab/superslab_inline.h core/box/tiny_ptr_bridge_box.h \ core/box/../hakmem_tiny_superslab_internal.h \ core/box/../hakmem_tiny_superslab.h core/box/../box/ss_hot_cold_box.h \ core/box/../box/../superslab/superslab_types.h \ @@ -60,6 +60,7 @@ core/tiny_debug_ring.h: core/tiny_remote.h: core/box/ss_addr_map_box.h: core/box/../hakmem_build_flags.h: +core/box/super_reg_box.h: core/tiny_debug_api.h: core/box/tiny_layout_box.h: core/box/../hakmem_tiny_config.h: diff --git a/tiny_adaptive_sizing.d b/tiny_adaptive_sizing.d index 74a7024f..c9fde647 100644 --- a/tiny_adaptive_sizing.d +++ b/tiny_adaptive_sizing.d @@ -10,10 +10,10 @@ tiny_adaptive_sizing.o: core/tiny_adaptive_sizing.c \ core/hakmem_tiny_superslab_constants.h core/superslab/superslab_inline.h \ core/superslab/superslab_types.h core/superslab/../tiny_box_geometry.h \ core/tiny_debug_ring.h core/tiny_remote.h core/box/ss_addr_map_box.h \ - core/box/../hakmem_build_flags.h core/tiny_debug_api.h \ - core/box/tiny_layout_box.h core/box/../hakmem_tiny_config.h \ - core/box/tiny_header_box.h core/box/tiny_layout_box.h \ - core/box/../tiny_region_id.h + core/box/../hakmem_build_flags.h core/box/super_reg_box.h \ + core/tiny_debug_api.h core/box/tiny_layout_box.h \ 
+ core/box/../hakmem_tiny_config.h core/box/tiny_header_box.h \ + core/box/tiny_layout_box.h core/box/../tiny_region_id.h core/tiny_adaptive_sizing.h: core/hakmem_tiny.h: core/hakmem_build_flags.h: @@ -40,6 +40,7 @@ core/tiny_debug_ring.h: core/tiny_remote.h: core/box/ss_addr_map_box.h: core/box/../hakmem_build_flags.h: +core/box/super_reg_box.h: core/tiny_debug_api.h: core/box/tiny_layout_box.h: core/box/../hakmem_tiny_config.h: diff --git a/tiny_fastcache.d b/tiny_fastcache.d index 1c9a746d..4535048f 100644 --- a/tiny_fastcache.d +++ b/tiny_fastcache.d @@ -8,9 +8,10 @@ tiny_fastcache.o: core/tiny_fastcache.c core/tiny_fastcache.h \ core/hakmem_tiny_superslab_constants.h core/superslab/superslab_inline.h \ core/superslab/superslab_types.h core/superslab/../tiny_box_geometry.h \ core/tiny_debug_ring.h core/tiny_remote.h core/box/ss_addr_map_box.h \ - core/box/../hakmem_build_flags.h core/hakmem_tiny.h core/hakmem_trace.h \ - core/hakmem_tiny_mini_mag.h core/box/hak_lane_classify.inc.h \ - core/box/ptr_type_box.h core/tiny_debug_api.h core/box/tiny_layout_box.h \ + core/box/../hakmem_build_flags.h core/box/super_reg_box.h \ + core/hakmem_tiny.h core/hakmem_trace.h core/hakmem_tiny_mini_mag.h \ + core/box/hak_lane_classify.inc.h core/box/ptr_type_box.h \ + core/tiny_debug_api.h core/box/tiny_layout_box.h \ core/box/../hakmem_tiny_config.h core/box/tiny_header_box.h \ core/box/tiny_layout_box.h core/box/../tiny_region_id.h core/tiny_fastcache.h: @@ -35,6 +36,7 @@ core/tiny_debug_ring.h: core/tiny_remote.h: core/box/ss_addr_map_box.h: core/box/../hakmem_build_flags.h: +core/box/super_reg_box.h: core/hakmem_tiny.h: core/hakmem_trace.h: core/hakmem_tiny_mini_mag.h: diff --git a/tiny_remote.d b/tiny_remote.d index f2b226d6..bfc7d22f 100644 --- a/tiny_remote.d +++ b/tiny_remote.d @@ -1,11 +1,13 @@ tiny_remote.o: core/tiny_remote.c core/tiny_remote.h \ - core/hakmem_tiny_superslab.h core/superslab/superslab_types.h \ - core/hakmem_tiny_superslab_constants.h 
core/superslab/superslab_inline.h \ - core/superslab/superslab_types.h core/superslab/../tiny_box_geometry.h \ + core/box/remote_side_box.h core/hakmem_tiny_superslab.h \ + core/superslab/superslab_types.h core/hakmem_tiny_superslab_constants.h \ + core/superslab/superslab_inline.h core/superslab/superslab_types.h \ + core/superslab/../tiny_box_geometry.h \ core/superslab/../hakmem_tiny_superslab_constants.h \ core/superslab/../hakmem_tiny_config.h core/tiny_debug_ring.h \ core/hakmem_build_flags.h core/hakmem_tiny_superslab_constants.h core/tiny_remote.h: +core/box/remote_side_box.h: core/hakmem_tiny_superslab.h: core/superslab/superslab_types.h: core/hakmem_tiny_superslab_constants.h: