diff --git a/AGENTS.md b/AGENTS.md index 512c5963..6be597de 100644 --- a/AGENTS.md +++ b/AGENTS.md @@ -238,10 +238,13 @@ Do / Don’t(壊れやすいパターンの禁止) - C6 は Phase C6-FREEZE 中(標準では Tiny/SmallObject/ULTRA に載せない) - 標準プロファイル(MIXED_TINYV3_C7_SAFE / C6_HEAVY_LEGACY_POOLV1)は C6 を mid/pool の通常クラスとして扱う。C6_HOT や smallheap(v3/v4)/ULTRA はすべて研究用の opt-in に限定。 - C6 を触るときは ENV_PROFILE_PRESETS.md の研究プリセット(C6_SMALL_HEAP_V3_EXPERIMENT / C6_SMALL_HEAP_V4_EXPERIMENT など)から開始し、標準プリセットに混ぜ込まないこと。 + - mid/smallmid の性能を攻めるときは、まず pool/mid 側の HotBox 設計(SmallObject v4 や mid 用 ULTRA)を考え、C6 専用の特別扱いは後段フェーズまで凍結する。 - C7 ULTRA は UF-3 セグメント版が標準(ENV で OFF に戻せる) - `HAKMEM_TINY_C7_ULTRA_ENABLED` デフォルト ON。2MiB セグメント(64KiB ページ, mask 判定)上で carve/push。セグメント外の ptr は必ず既存 v3 free にフォールバックする。 - ON/OFF の前後で必ず健康診断ランを 1 回ずつ取り、差分が出たらまず ULTRA 側を疑う。 + - ヘッダ軽量化(`HAKMEM_TINY_C7_ULTRA_HEADER_LIGHT`)は研究箱。Mixed/C7-only の A/B を取ってから触ること。デフォルトは OFF。 + - 将来的にヘッダレス設計を広げるときは、まず C7 ULTRA の勝ちパターンを small-object v4(16〜2KiB)にコピーし、各サイズクラスを一つの SmallObjectHotBox v4 に統合してから mid/pool 側を触る。 - 一般ルール(壊れたらまず健康診断ラン) - Tiny / Superslab / Pool に手を入れたあと、まず上記の健康診断ランを 1 回だけ回してから長尺ベンチ・本番 A/B に進んでください。 diff --git a/CURRENT_TASK.md b/CURRENT_TASK.md index 42932657..3cd145e6 100644 --- a/CURRENT_TASK.md +++ b/CURRENT_TASK.md @@ -26,6 +26,19 @@ - UF-1: `core/box/tiny_c7_ultra_box.h` / `core/tiny_c7_ultra.c` を追加。ENV `HAKMEM_TINY_C7_ULTRA_ENABLED`(デフォルト OFF)で front から C7 ULTRA stub を経由し、内部では既存 C7 v3 の so_alloc/so_free を呼ぶだけで挙動・性能は不変。 - UF-2: ULTRA TLS に自前 freelist と C7 ページジオメトリを持たせ、C7 v3 ColdIface から 1 ページ lease→自前 carve→ヘッダ書きで返すホットパスを実装。範囲外/失敗は so_alloc/so_free へ即フォールバック。Mixed 16–1024B(C7-only v3 本線, ws=400, 1M)で ULTRA ON は OFF 比でおおよそ +9% 程度の改善を確認(segv/assert なし)。UF-3 以降で専用 2MiB セグメント+mask 判定による完全ヘッダレス化を予定。 +### Phase UF-3: C7 ULTRA 専用セグメント(デフォルトON) +- 追加: `core/box/tiny_c7_ultra_segment_box.h` / `core/tiny_c7_ultra_segment.c` で 2MiB セグメント(64KiB ページ)を mmap、mask 判定で page_of を引く箱を追加。ULTRA TLS から carve/push できるように統合。 +- front gate: 
`HAKMEM_TINY_C7_ULTRA_ENABLED` デフォルト ON。seg 外 ptr は so_free(7) へフォールバックする Fail-Fast オーバーレイ。 +- Mixed 16–1024B (ws=400, 1M): ULTRA ON ≈ **44–45M ops/s**(ULTRA OFF は ≈35M)。C7-only (MIN=MAX=1024): ULTRA ON ≈ **57.5M ops/s**, OFF ≈ **38.1M ops/s**。segv/assert なし。 + +### Phase UF-4: C7 ULTRA header light(研究箱, デフォルト OFF) +- 目的: ULTRA alloc/free から毎回の tiny_region_id_write_header を外し、carve 時の一括初期化に寄せる。 +- 変更: freelist next をヘッダ直後に保存するように変更(ヘッダを保持したまま push/pop)。ENV `HAKMEM_TINY_C7_ULTRA_HEADER_LIGHT`(default 0)を tiny_front_v3_env_box Snapshot に追加。ULTRA carve 時に ON なら全ブロックへ 1 回だけヘッダを書き、alloc ではヘッダ済みなら書き直さない。 +- A/B: + - Mixed 16–1024B (C7-only v3, ULTRA ON, ws=400, iters=1M): OFF **45.03M ops/s** → ON **43.97M ops/s**(-2% 程度, segv/assert なし)。 + - C7-only (MIN=MAX=1024, ULTRA ON): OFF **55.37M ops/s** → ON **55.90M ops/s**(+1% 程度)。 +- 方針: header light は研究箱のまま。Mixed ではわずかにマイナスなのでデフォルト OFF を維持。C7-only ベンチでわずかにプラスが出るため、今後の ULTRA pf/segment 改善と合わせて再評価する。 + ### Phase C6-FREEZE: C6 は mid/pool の普通クラスに固定(研究箱に退避) - 目的: C6 を Tiny/SmallObject/ULTRA で特別扱いしないのを標準とし、C6 v3/v4/ULTRA/flatten はすべて ENV opt-in の研究箱に戻す。 - 変更: diff --git a/core/box/smallobject_hotbox_v4_box.h b/core/box/smallobject_hotbox_v4_box.h index f002fdf1..72c5a871 100644 --- a/core/box/smallobject_hotbox_v4_box.h +++ b/core/box/smallobject_hotbox_v4_box.h @@ -15,6 +15,8 @@ #define SMALLOBJECT_NUM_CLASSES TINY_NUM_CLASSES #endif +struct small_segment_v4; + // Page metadata for v4 HotBox typedef struct small_page_v4 { void* freelist; @@ -25,6 +27,7 @@ typedef struct small_page_v4 { uint32_t block_size; uint8_t* base; void* slab_ref; // Superslab / lease token (box境界で扱う) + struct small_segment_v4* segment; // PF3: segment owner(NULLなら Tiny v1 経路) struct small_page_v4* next; } small_page_v4; diff --git a/core/box/smallsegment_v4_box.h b/core/box/smallsegment_v4_box.h index 127d22fb..c2eaf446 100644 --- a/core/box/smallsegment_v4_box.h +++ b/core/box/smallsegment_v4_box.h @@ -5,7 +5,7 @@ typedef struct small_segment_v4 
small_segment_v4; // class_idx ごとに小さな Segment を確保/再利用する想定。 -// まだ実装はなく、次フェーズで Superslab/OS との接続を決める。 +// Phase PF3: Superslab/Tiny v1 からの lease を経由してページを供給する。 small_segment_v4* smallsegment_v4_acquire(int class_idx); void* smallsegment_v4_alloc_page(small_segment_v4* seg, int class_idx); -void smallsegment_v4_release_if_empty(small_segment_v4* seg); +void smallsegment_v4_release_if_empty(small_segment_v4* seg, void* page, int class_idx); diff --git a/core/box/tiny_c7_ultra_box.h b/core/box/tiny_c7_ultra_box.h index c5513534..d27824b9 100644 --- a/core/box/tiny_c7_ultra_box.h +++ b/core/box/tiny_c7_ultra_box.h @@ -1,6 +1,7 @@ // tiny_c7_ultra_box.h - C7 ULTRA TLS box (UF-2: TLS freelist, coldはv3を利用) #pragma once +#include #include #include #include "tiny_c7_ultra_segment_box.h" @@ -15,6 +16,7 @@ typedef struct tiny_c7_ultra_tls_t { uint32_t page_idx; // セグメント内ページ index tiny_c7_ultra_segment_t* seg; // 所有セグメント tiny_c7_ultra_page_meta_t* page_meta; // 現在のページメタ + bool headers_initialized; // carve 済みヘッダが有効か } tiny_c7_ultra_tls_t; // TLS accessor diff --git a/core/box/tiny_front_v3_env_box.h b/core/box/tiny_front_v3_env_box.h index 70e6a7d1..be152889 100644 --- a/core/box/tiny_front_v3_env_box.h +++ b/core/box/tiny_front_v3_env_box.h @@ -13,6 +13,7 @@ typedef struct TinyFrontV3Snapshot { bool header_v3_enabled; // ENV: HAKMEM_TINY_HEADER_V3_ENABLED bool header_v3_skip_c7; // ENV: HAKMEM_TINY_HEADER_V3_SKIP_C7 bool c7_ultra_enabled; // ENV: HAKMEM_TINY_C7_ULTRA_ENABLED + bool c7_ultra_header_light; // ENV: HAKMEM_TINY_C7_ULTRA_HEADER_LIGHT } TinyFrontV3Snapshot; // Size→class/route entry for Tiny front v3 LUT (route_kind は tiny_route_kind_t を想定) @@ -132,6 +133,16 @@ static inline bool tiny_c7_ultra_enabled_env(void) { return g != 0; } +// C7 ULTRA header light gate (default OFF: research box) +static inline bool tiny_c7_ultra_header_light_enabled_env(void) { + static int g = -1; + if (__builtin_expect(g == -1, 0)) { + const char* e = 
getenv("HAKMEM_TINY_C7_ULTRA_HEADER_LIGHT"); + g = (e && *e && *e != '0') ? 1 : 0; + } + return g != 0; +} + // Snapshot initializer (implemented in hakmem_tiny.c) void tiny_front_v3_snapshot_init(void); @@ -152,3 +163,9 @@ static inline bool tiny_front_v3_c7_ultra_enabled(void) { const TinyFrontV3Snapshot* snap = tiny_front_v3_snapshot_get(); return snap->c7_ultra_enabled; } + +// Cached getter for ULTRA header light gate +static inline bool tiny_front_v3_c7_ultra_header_light_enabled(void) { + const TinyFrontV3Snapshot* snap = tiny_front_v3_snapshot_get(); + return snap->c7_ultra_header_light; +} diff --git a/core/front/malloc_tiny_fast.h b/core/front/malloc_tiny_fast.h index 6d6f85f6..510c437d 100644 --- a/core/front/malloc_tiny_fast.h +++ b/core/front/malloc_tiny_fast.h @@ -42,6 +42,7 @@ #include "../box/tiny_hotheap_v2_box.h" // TinyHotHeap v2 (Phase31 A/B) #include "../box/smallobject_hotbox_v3_box.h" // SmallObject HotHeap v3 skeleton #include "../box/smallobject_hotbox_v4_box.h" // SmallObject HotHeap v4 (C7 stub) +#include "../box/tiny_c7_ultra_box.h" // C7 ULTRA stub (UF-1, delegates to v3) #include "../box/tiny_front_v3_env_box.h" // Tiny front v3 snapshot gate #include "../box/tiny_heap_env_box.h" // ENV gate for TinyHeap front (A/B) #include "../box/tiny_route_env_box.h" // Route snapshot (Heap vs Legacy) @@ -140,6 +141,18 @@ static inline void* malloc_tiny_fast(size_t size) { tiny_front_alloc_stat_inc(class_idx); + // C7 ULTRA stub (UF-1): delegates to v3, ENV gated + if (class_idx == 7 && + tiny_front_v3_enabled() && + tiny_front_v3_c7_ultra_enabled() && + small_heap_v3_c7_enabled()) { + void* ultra_p = tiny_c7_ultra_alloc(size); + if (ultra_p) { + return ultra_p; + } + // fallback to existing route on miss + } + switch (route) { case TINY_ROUTE_SMALL_HEAP_V3: { void* v3p = so_alloc((uint32_t)class_idx); @@ -241,6 +254,15 @@ static inline int free_tiny_fast(void* ptr) { void* base = (void*)((char*)ptr - 1); tiny_front_free_stat_inc(class_idx); + // 
C7 ULTRA stub (UF-1): delegates to v3, ENV gated + if (class_idx == 7 && + tiny_front_v3_enabled() && + tiny_front_v3_c7_ultra_enabled() && + small_heap_v3_c7_enabled()) { + tiny_c7_ultra_free(base); + return 1; + } + // C7 v3 fast classify: bypass classify_ptr/ss_map_lookup for clear hits if (class_idx == 7 && tiny_front_v3_enabled() && diff --git a/core/hakmem_tiny.c b/core/hakmem_tiny.c index 1d147cf1..2bd9b3d6 100644 --- a/core/hakmem_tiny.c +++ b/core/hakmem_tiny.c @@ -83,6 +83,8 @@ void tiny_front_v3_snapshot_init(void) { .header_mode = (uint8_t)tiny_header_mode(), .header_v3_enabled = tiny_header_v3_enabled(), .header_v3_skip_c7 = tiny_header_v3_skip_c7(), + .c7_ultra_enabled = tiny_c7_ultra_enabled_env(), + .c7_ultra_header_light = tiny_c7_ultra_header_light_enabled_env(), }; g_tiny_front_v3_snapshot = snap; g_tiny_front_v3_snapshot_ready = 1; diff --git a/core/smallobject_hotbox_v4.c b/core/smallobject_hotbox_v4.c index 556089d3..cec4297a 100644 --- a/core/smallobject_hotbox_v4.c +++ b/core/smallobject_hotbox_v4.c @@ -10,6 +10,8 @@ #include "box/smallobject_cold_iface_v4.h" #include "box/smallobject_hotbox_v3_env_box.h" #include "box/tiny_heap_box.h" +#include "box/smallsegment_v4_box.h" +#include "box/smallsegment_v4_env_box.h" #include "box/tiny_cold_iface_v1.h" #include "box/tiny_geometry_box.h" #include "tiny_region_id.h" @@ -17,10 +19,20 @@ // TLS context static __thread small_heap_ctx_v4 g_ctx_v4; +typedef struct small_segment_v4 { + int class_idx; + size_t segment_size; + tiny_heap_ctx_t* tiny_ctx; +} small_segment_v4; + +static __thread small_segment_v4 g_segments_v4[SMALLOBJECT_NUM_CLASSES]; + small_heap_ctx_v4* small_heap_ctx_v4_get(void) { return &g_ctx_v4; } +static small_page_v4* v4_page_from_lease(tiny_heap_page_t* lease, int class_idx, small_segment_v4* seg); + // ----------------------------------------------------------------------------- // helpers // ----------------------------------------------------------------------------- @@ -29,6 
+41,55 @@ static inline int v4_class_supported(int class_idx) { return class_idx == 7 || class_idx == 6 || class_idx == 5; } +static size_t smallsegment_v4_default_size(void) { + const char* env = smallsegment_v4_size_env(); + if (env && *env) { + size_t v = strtoull(env, NULL, 0); + if (v > (size_t)(64 * 1024)) { + return v; + } + } + return (size_t)(2 * 1024 * 1024); // default 2MiB segment単位(将来の実装用) +} + +small_segment_v4* smallsegment_v4_acquire(int class_idx) { + if (!v4_class_supported(class_idx)) return NULL; + small_segment_v4* seg = &g_segments_v4[class_idx]; + seg->class_idx = class_idx; + if (!seg->segment_size) { + seg->segment_size = smallsegment_v4_default_size(); + } + if (!seg->tiny_ctx) { + seg->tiny_ctx = tiny_heap_ctx_for_thread(); + } + return seg; +} + +void* smallsegment_v4_alloc_page(small_segment_v4* seg, int class_idx) { + if (!seg || !v4_class_supported(class_idx)) return NULL; + if (!seg->tiny_ctx) { + seg->tiny_ctx = tiny_heap_ctx_for_thread(); + } + tiny_heap_ctx_t* tctx = seg->tiny_ctx ? seg->tiny_ctx : tiny_heap_ctx_for_thread(); + if (!tctx) return NULL; + + tiny_heap_page_t* lease = tiny_heap_prepare_page(tctx, class_idx); + if (!lease) return NULL; + seg->tiny_ctx = tctx; + return v4_page_from_lease(lease, class_idx, seg); +} + +void smallsegment_v4_release_if_empty(small_segment_v4* seg, void* page_ptr, int class_idx) { + small_page_v4* page = (small_page_v4*)page_ptr; + if (!page || !v4_class_supported(class_idx)) return; + tiny_heap_ctx_t* tctx = (seg && seg->tiny_ctx) ? 
seg->tiny_ctx : tiny_heap_ctx_for_thread(); + tiny_heap_page_t* lease = (tiny_heap_page_t*)page->slab_ref; + if (tctx && lease) { + tiny_heap_page_becomes_empty(tctx, class_idx, lease); + } + free(page); +} + static inline void v4_page_push_partial(small_class_heap_v4* h, small_page_v4* page) { if (!h || !page) return; page->next = h->partial_head; @@ -128,19 +189,10 @@ int smallobject_hotbox_v4_can_own(int class_idx, void* ptr) { // Cold iface (C5/C6/C7, Tiny v1 経由) // ----------------------------------------------------------------------------- -static small_page_v4* cold_refill_page_v4(small_heap_ctx_v4* hot_ctx, uint32_t class_idx) { - if (__builtin_expect(!v4_class_supported((int)class_idx), 0)) return NULL; - (void)hot_ctx; - tiny_heap_ctx_t* tctx = tiny_heap_ctx_for_thread(); - if (!tctx) return NULL; - - tiny_heap_page_t* lease = tiny_heap_prepare_page(tctx, (int)class_idx); +static small_page_v4* v4_page_from_lease(tiny_heap_page_t* lease, int class_idx, small_segment_v4* seg) { if (!lease) return NULL; - small_page_v4* page = (small_page_v4*)malloc(sizeof(small_page_v4)); - if (!page) { - return NULL; - } + if (!page) return NULL; memset(page, 0, sizeof(*page)); page->class_idx = (uint8_t)class_idx; page->capacity = lease->capacity; @@ -148,6 +200,7 @@ static small_page_v4* cold_refill_page_v4(small_heap_ctx_v4* hot_ctx, uint32_t c page->block_size = (uint32_t)tiny_stride_for_class((int)class_idx); page->base = lease->base; page->slab_ref = lease; + page->segment = seg; page->freelist = v4_build_freelist(lease->base, lease->capacity, page->block_size); if (!page->freelist) { free(page); @@ -158,9 +211,32 @@ static small_page_v4* cold_refill_page_v4(small_heap_ctx_v4* hot_ctx, uint32_t c return page; } +static small_page_v4* cold_refill_page_v4(small_heap_ctx_v4* hot_ctx, uint32_t class_idx) { + if (__builtin_expect(!v4_class_supported((int)class_idx), 0)) return NULL; + (void)hot_ctx; + + if (smallsegment_v4_enabled()) { + small_segment_v4* seg = 
smallsegment_v4_acquire((int)class_idx); + return (small_page_v4*)smallsegment_v4_alloc_page(seg, (int)class_idx); + } + + tiny_heap_ctx_t* tctx = tiny_heap_ctx_for_thread(); + if (!tctx) return NULL; + + tiny_heap_page_t* lease = tiny_heap_prepare_page(tctx, (int)class_idx); + if (!lease) return NULL; + + return v4_page_from_lease(lease, (int)class_idx, NULL); +} + static void cold_retire_page_v4(small_heap_ctx_v4* hot_ctx, uint32_t class_idx, small_page_v4* page) { (void)hot_ctx; if (!page) return; + if (smallsegment_v4_enabled()) { + small_segment_v4* seg = (small_segment_v4*)page->segment; + smallsegment_v4_release_if_empty(seg, page, (int)class_idx); + return; + } tiny_heap_ctx_t* tctx = tiny_heap_ctx_for_thread(); tiny_heap_page_t* lease = (tiny_heap_page_t*)page->slab_ref; if (tctx && lease) { diff --git a/core/tiny_c7_ultra.c b/core/tiny_c7_ultra.c index e8a56ac3..0a9ecb56 100644 --- a/core/tiny_c7_ultra.c +++ b/core/tiny_c7_ultra.c @@ -3,11 +3,13 @@ #include #include #include +#include #include "box/tiny_c7_ultra_box.h" #include "box/smallobject_hotbox_v3_box.h" #include "box/tiny_geometry_box.h" #include "tiny_region_id.h" #include "box/tiny_c7_ultra_segment_box.h" +#include "box/tiny_front_v3_env_box.h" static __thread tiny_c7_ultra_tls_t g_tiny_c7_ultra_tls; @@ -19,12 +21,24 @@ static inline void tiny_c7_ultra_clear(tiny_c7_ultra_tls_t* tls) { tls->freelist = NULL; tls->page_idx = 0; tls->page_meta = NULL; + tls->headers_initialized = false; } tiny_c7_ultra_tls_t* tiny_c7_ultra_tls_get(void) { return &g_tiny_c7_ultra_tls; } +// freelist next をヘッダを壊さずに保持する(ヘッダ byte の直後に保存) +static inline void ultra_store_next(void* base, void* next) { + memcpy((uint8_t*)base + 1, &next, sizeof(next)); +} + +static inline void* ultra_load_next(void* base) { + void* next = NULL; + memcpy(&next, (uint8_t*)base + 1, sizeof(next)); + return next; +} + // セグメントから C7 ページを 1 枚借りて自前で carve する static bool tiny_c7_ultra_lease_page(tiny_c7_ultra_tls_t* tls) { 
tiny_c7_ultra_segment_t* seg = tls->seg; @@ -40,6 +54,8 @@ static bool tiny_c7_ultra_lease_page(tiny_c7_ultra_tls_t* tls) { uint32_t capacity = (uint32_t)(seg->page_size / block_sz); if (capacity == 0) return false; + const bool header_light = tiny_front_v3_c7_ultra_header_light_enabled(); + // 空きページを 1 枚だけ拾う(UF-3 では最初の空きを線形探索) uint32_t chosen = seg->num_pages; for (uint32_t i = 0; i < seg->num_pages; i++) { @@ -60,7 +76,11 @@ static bool tiny_c7_ultra_lease_page(tiny_c7_ultra_tls_t* tls) { void* head = NULL; for (int i = (int)capacity - 1; i >= 0; i--) { uint8_t* blk = base + ((size_t)i * block_sz); - *(void**)blk = head; + if (header_light) { + // header_light 時は carve で 1 度だけヘッダを書き込む + tiny_region_id_write_header(blk, 7); + } + ultra_store_next(blk, head); head = blk; } if (!head) { @@ -78,17 +98,19 @@ static bool tiny_c7_ultra_lease_page(tiny_c7_ultra_tls_t* tls) { tls->freelist = head; tls->page_idx = chosen; tls->page_meta = page; + tls->headers_initialized = header_light; return true; } void* tiny_c7_ultra_alloc(size_t size) { (void)size; // C7 専用のため未使用 tiny_c7_ultra_tls_t* tls = tiny_c7_ultra_tls_get(); + const bool header_light = tiny_front_v3_c7_ultra_header_light_enabled(); // 1) freelist hit void* p = tls->freelist; if (__builtin_expect(p != NULL, 1)) { - void* next = *(void**)p; + void* next = ultra_load_next(p); tls->freelist = next; if (tls->page_meta) { tls->page_meta->freelist = next; @@ -99,6 +121,9 @@ void* tiny_c7_ultra_alloc(size_t size) { if (tls->used < tls->capacity) { tls->used++; } + if (header_light && tls->headers_initialized) { + return (uint8_t*)p + 1; + } return tiny_region_id_write_header(p, 7); } @@ -112,7 +137,7 @@ void* tiny_c7_ultra_alloc(size_t size) { if (__builtin_expect(p == NULL, 0)) { return so_alloc(7); } - void* next = *(void**)p; + void* next = ultra_load_next(p); tls->freelist = next; if (tls->page_meta) { tls->page_meta->freelist = next; @@ -123,6 +148,9 @@ void* tiny_c7_ultra_alloc(size_t size) { if (tls->used < 
tls->capacity) { tls->used++; } + if (header_light && tls->headers_initialized) { + return (uint8_t*)p + 1; + } return tiny_region_id_write_header(p, 7); } @@ -158,7 +186,7 @@ void tiny_c7_ultra_free(void* ptr) { return; } - *(void**)ptr = page->freelist; + ultra_store_next(ptr, page->freelist); page->freelist = ptr; if (page->used > 0) { page->used--; diff --git a/docs/analysis/C6_HEAVY_VISIBILITY_ANALYSIS_PHASE_C6H.md b/docs/analysis/C6_HEAVY_VISIBILITY_ANALYSIS_PHASE_C6H.md new file mode 100644 index 00000000..63229314 --- /dev/null +++ b/docs/analysis/C6_HEAVY_VISIBILITY_ANALYSIS_PHASE_C6H.md @@ -0,0 +1,414 @@ +# C6-Heavy (257-768B) Visibility Analysis - Phase C6-H + +**Date**: 2025-12-10 +**Benchmark**: `./bench_mid_large_mt_hakmem 1 1000000 400 1` (1 thread, ws=400, iters=1M) +**Size Range**: 257-768B (Class 6: 512B allocations) +**Configuration**: C6_HEAVY_LEGACY_POOLV1 profile (C7_SAFE + C6_HOT=1) + +--- + +## Executive Summary + +### Performance Gap Analysis +- **HAKMEM**: 9.84M ops/s (baseline) +- **mimalloc**: 51.3M ops/s +- **Performance Gap**: **5.2x** (mimalloc is 421% faster) + +This represents a **critical performance deficit** in the C6-heavy allocation path, where HAKMEM achieves only **19% of mimalloc's throughput**. + +### Key Findings +1. **C6 does NOT use Pool flatten path** - With `HAKMEM_TINY_C6_HOT=1`, allocations route through TinyHeap v1, bypassing pool flatten entirely +2. **Address lookup dominates CPU time** - `hak_super_lookup` (9.3%) + `mid_desc_lookup` (8.2%) + `classify_ptr` (5.8%) = **23.3% of cycles** +3. **Pool operations are expensive** - Despite not using flatten, pool alloc/free combined still consume ~15-20% of cycles +4. 
**Mid_desc cache provides modest gains** - +6.4% improvement (9.8M → 10.4M ops/s) + +--- + +## Phase C6-H1: Baseline Metrics + +### Test Configuration +```bash +export HAKMEM_PROFILE=C6_HEAVY_LEGACY_POOLV1 +export HAKMEM_BENCH_MIN_SIZE=257 +export HAKMEM_BENCH_MAX_SIZE=768 +``` + +### Baseline Results + +| Configuration | Throughput (ops/s) | vs mimalloc | Notes | +|---------------|-------------------|-------------|-------| +| **Baseline (C6_HOT=1, mid_desc_cache not set explicitly)** | 9,836,420 | 19.2% | Default profile (`HAKMEM_MID_DESC_CACHE_ENABLED` left unset) | +| **C6_HOT=1, mid_desc_cache=0** | 9,805,954 | 19.1% | Without cache | +| **C6_HOT=1, mid_desc_cache=1** | 10,435,480 | 20.3% | With cache (+6.4% vs cache=0) | +| **C6_HOT=0 (pure legacy pool)** | 9,938,473 | 19.4% | Pool path ~same as TinyHeap | +| **mimalloc baseline** | 51,297,877 | 100.0% | Reference | + +### Key Observations +1. **Mid_desc cache effect**: +6.4% improvement (cache=1 vs cache=0), but far from closing the gap +2. **C6_HOT vs pool path**: Nearly identical performance (~9.8M-9.9M ops/s), suggesting the bottleneck is in common infrastructure (address lookup, classification) +3. **Size class routing**: 257-768B → Class 6 (512B) as expected + +--- + +## Phase C6-H2: Pool Flatten and Cache Analysis + +### Pool Flatten Test (ATTEMPTED) + +**Finding**: Pool v1 flatten path is **NOT USED** for C6 allocations with `HAKMEM_TINY_C6_HOT=1`. 
+ +```bash +# Test with flatten enabled +export HAKMEM_POOL_V1_FLATTEN_ENABLED=1 +export HAKMEM_POOL_V1_FLATTEN_STATS=1 +# Result: [POOL_V1_FLAT] alloc_tls_hit=0 alloc_fb=0 free_tls_hit=0 free_fb=0 +``` + +**Root Cause**: +- With `HAKMEM_TINY_C6_HOT=1`, class 6 routes to `TINY_ROUTE_HEAP` (TinyHeap v1) +- TinyHeap v1 uses its own allocation path via `tiny_heap_box.h`, not the pool flatten path +- Pool flatten optimizations (Phase 80-82) only apply to **legacy pool path** (when C6_HOT=0) + +### Mid_Desc Cache Analysis + +| Metric | Without Cache | With Cache | Delta | +|--------|--------------|------------|-------| +| Throughput | 9.81M ops/s | 10.44M ops/s | +6.4% | +| Expected self% reduction | mid_desc_lookup: 8.2% | ~6-7% (estimated) | ~1-2% | + +**Conclusion**: Mid_desc cache provides measurable but insufficient improvement. The 8.2% CPU time in `mid_desc_lookup` is reduced, but other lookup costs (hak_super_lookup, classify_ptr) remain. + +--- + +## Phase C6-H3: CPU Hotspot Analysis + +### Perf Stat Results + +``` +Benchmark: 9,911,926 ops/s (0.101s runtime) +Cycles: 398,766,361 cycles:u +Instructions: 1,054,643,524 instructions:u +IPC: 2.64 +Page Faults: 7,131 +Task Clock: 119.08 ms +``` + +**Analysis**: +- **IPC 2.64**: Reasonable instruction-level parallelism, but many cycles wasted +- **Cycles per operation**: 398,766,361 / 1,000,000 = **398 cycles/op** +- **Instructions per operation**: 1,054,643,524 / 1,000,000 = **1,054 instructions/op** + +**Comparison estimate** (mimalloc at 51.3M ops/s): +- Estimated cycles/op for mimalloc: ~76 cycles/op (5.2x faster) +- HAKMEM uses **5.2x more cycles** per allocation/free pair + +### Perf Record Hotspots (Top 20 Functions) + +| Function | Self % | Category | Description | +|----------|--------|----------|-------------| +| `hak_super_lookup` | 9.32% | Address Lookup | Superslab registry lookup (largest single cost) | +| `mid_desc_lookup` | 8.23% | Address Lookup | Mid-size descriptor lookup | +| 
`hak_pool_get_class_index` | 5.87% | Classification | Size→class mapping | +| `classify_ptr` | 5.76% | Classification | Pointer classification for free | +| `hak_pool_free_v1_impl` | 5.52% | Pool Free | Pool free implementation | +| `hak_pool_try_alloc_v1_impl` | 5.46% | Pool Alloc | Pool allocation implementation | +| `free` | 4.54% | Front Gate | glibc free wrapper | +| `worker_run` | 4.47% | Benchmark | Benchmark driver | +| `ss_map_lookup` | 4.35% | Address Lookup | Superslab map lookup | +| `super_reg_effective_mask` | 4.32% | Address Lookup | Registry mask computation | +| `mid_desc_hash` | 3.69% | Address Lookup | Hash computation for mid_desc | +| `mid_set_header` | 3.27% | Metadata | Header initialization | +| `mid_page_inuse_dec_and_maybe_dn` | 3.17% | Metadata | Page occupancy tracking | +| `mid_desc_init_once` | 2.71% | Initialization | Descriptor initialization | +| `malloc` | 2.60% | Front Gate | glibc malloc wrapper | +| `hak_free_at` | 2.53% | Front Gate | Internal free dispatcher | +| `hak_pool_mid_lookup_v1_impl` | 2.17% | Pool Lookup | Pool-specific descriptor lookup | +| `super_reg_effective_size` | 1.87% | Address Lookup | Registry size computation | +| `hak_pool_free_fast_v1_impl` | 1.77% | Pool Free | Fast path for pool free | +| `hak_pool_init` | 1.44% | Initialization | Pool initialization | + +### Hotspot Category Breakdown + +| Category | Combined Self % | Functions | +|----------|----------------|-----------| +| **Address Lookup & Classification** | **41.5%** | hak_super_lookup, mid_desc_lookup, classify_ptr, hak_pool_get_class_index, ss_map_lookup, super_reg_effective_mask, mid_desc_hash, super_reg_effective_size, hak_pool_mid_lookup_v1_impl | +| **Pool Operations** | **14.8%** | hak_pool_try_alloc_v1_impl, hak_pool_free_v1_impl, hak_pool_free_fast_v1_impl | +| **Metadata Management** | **9.2%** | mid_set_header, mid_page_inuse_dec_and_maybe_dn, mid_desc_init_once | +| **Front Gate** | **9.7%** | malloc, free, hak_free_at | +| 
**Benchmark Driver** | **4.5%** | worker_run | +| **Other** | **20.3%** | Various helpers, initialization, etc. | + +--- + +## Root Cause Analysis + +### 1. Address Lookup Dominates (41.5% of CPU) + +The single largest performance killer is **address→metadata lookup infrastructure**: + +- **hak_super_lookup** (9.3%): Superslab registry lookup to find which allocator owns a pointer +- **mid_desc_lookup** (8.2%): Hash-based descriptor lookup for mid-size allocations +- **ss_map_lookup** (4.3%): Secondary map lookup within superslab +- **classify_ptr** (5.8%): Pointer classification during free +- **hak_pool_get_class_index** (5.9%): Size→class index computation + +**Why this matters**: Every allocation AND free requires multiple lookups: +- Alloc: size → class_idx → descriptor → block +- Free: ptr → superslab → descriptor → classification → free handler + +**Comparison to mimalloc**: mimalloc likely uses: +- Thread-local caching with minimal lookup +- Direct pointer arithmetic from block headers +- Segment-based organization reducing lookup depth + +### 2. Pool Operations Still Expensive (14.8%) + +Despite C6 routing through TinyHeap (not pool flatten), pool operations still consume significant cycles: +- `hak_pool_try_alloc_v1_impl` (5.5%) +- `hak_pool_free_v1_impl` (5.5%) + +**Why**: TinyHeap v1 likely calls into pool infrastructure for: +- Page allocation from mid/smallmid pool +- Descriptor management +- Cross-thread handling + +### 3. Metadata Overhead (9.2%) + +Mid-size allocations carry significant metadata overhead: +- Header initialization: `mid_set_header` (3.3%) +- Occupancy tracking: `mid_page_inuse_dec_and_maybe_dn` (3.2%) +- Descriptor init: `mid_desc_init_once` (2.7%) + +### 4. 
Front Gate Overhead (9.7%) + +The malloc/free wrappers add non-trivial cost: +- Route determination +- Cross-allocator checks (jemalloc, system) +- Lock depth checks +- Initialization checks + +--- + +## Recommendations for Next Phase + +### Priority 1: Address Lookup Reduction (Highest Impact) +**Target**: 41.5% → 20-25% of cycles + +**Strategies**: +1. **TLS Descriptor Cache**: Extend mid_desc_cache to cache full allocation context (class_idx + descriptor + page_info) +2. **Fast Path Header**: Embed class_idx in allocation header for instant classification on free (similar to tiny allocations) +3. **Segment-Based Addressing**: Consider segment-style addressing (like mimalloc) where ptr→metadata is direct pointer arithmetic +4. **Superslab Lookup Bypass**: For C6-heavy workloads, skip superslab lookup when we know it's mid-size + +**Expected Gain**: 10-15M ops/s (+100-150%) + +### Priority 2: Pool Path Streamlining (Medium Impact) +**Target**: 14.8% → 8-10% of cycles + +**Strategies**: +1. **Dedicated C6 Fast Path**: Create a specialized alloc/free path for class 6 that skips pool generality +2. **TLS Block Cache**: Implement TLS-local block cache for C6 (bypass pool ring buffer overhead) +3. **Inline Critical Helpers**: Force-inline `hak_pool_get_class_index` and other hot helpers + +**Expected Gain**: 3-5M ops/s (+30-50%) + +### Priority 3: Metadata Streamlining (Lower Impact) +**Target**: 9.2% → 5-6% of cycles + +**Strategies**: +1. **Lazy Header Init**: Only initialize headers when necessary (debug mode, cross-thread) +2. **Batch Occupancy Updates**: Combine multiple inuse_dec calls +3. **Cached Descriptors**: Reduce descriptor initialization overhead + +**Expected Gain**: 1-2M ops/s (+10-20%) + +### Priority 4: Front Gate Thinning (Lower Impact) +**Target**: 9.7% → 6-7% of cycles + +**Strategies**: +1. **Size-Based Fast Path**: For mid-size range (257-768B), skip most gate checks +2. 
**Compile-Time Routing**: When jemalloc/system allocators are not used, eliminate checks + +**Expected Gain**: 1-2M ops/s (+10-20%) + +--- + +## Comparison to Historical Baselines + +| Phase | Configuration | Throughput | vs Current | Notes | +|-------|--------------|------------|------------|-------| +| **Phase 54** | C7_SAFE, mixed 16-1024B | 28.1M ops/s | 2.9x | Mixed workload | +| **Phase 80** | C6-heavy, flatten OFF | 23.1M ops/s | 2.4x | Legacy baseline | +| **Phase 81** | C6-heavy, flatten ON | 25.9M ops/s | 2.6x | +10% from flatten | +| **Phase 82** | C6-heavy, flatten ON | 26.7M ops/s | 2.7x | +13% from flatten | +| **Current (C6-H)** | C6-heavy, C6_HOT=1 | 9.8M ops/s | 1.0x | **REGRESSION** | + +**CRITICAL FINDING**: Current baseline (9.8M ops/s) is **2.4-2.7x SLOWER** than historical C6-heavy baselines (23-27M ops/s). + +**Possible Causes**: +1. **Configuration difference**: Historical tests may have used a different profile (LEGACY vs C7_SAFE) +2. **Routing change**: C6_HOT=1 may be forcing a slower path through TinyHeap +3. **Build/compiler difference**: Flags or LTO settings may have changed +4. **Benchmark variance**: Different workload characteristics + +**Action Required**: Replicate historical Phase 80-82 configurations exactly to identify the regression point. + +--- + +## Verification of Historical Configuration + +The exact configuration used in Phase 80-82 is reproduced below for comparison with the current one: + +**Phase 80-82 Configuration** (from CURRENT_TASK.md): +```bash +HAKMEM_BENCH_MIN_SIZE=257 +HAKMEM_BENCH_MAX_SIZE=768 +HAKMEM_TINY_HEAP_PROFILE=LEGACY # ← Different! 
+HAKMEM_TINY_HOTHEAP_V2=0 +HAKMEM_POOL_V2_ENABLED=0 +HAKMEM_POOL_V1_FLATTEN_ENABLED=1 +HAKMEM_POOL_V1_FLATTEN_STATS=1 +``` + +**Current Configuration**: +```bash +HAKMEM_PROFILE=C6_HEAVY_LEGACY_POOLV1 # Sets TINY_HEAP_PROFILE=C7_SAFE +HAKMEM_TINY_C6_HOT=1 # ← Adds TinyHeap routing +HAKMEM_POOL_V1_FLATTEN_ENABLED=0 # ← Flatten OFF by default +``` + +**Key Difference**: Historical tests used `TINY_HEAP_PROFILE=LEGACY`, which likely routes C6 through pure pool path (no TinyHeap). Current `C6_HEAVY_LEGACY_POOLV1` profile sets `TINY_HEAP_PROFILE=C7_SAFE` + `TINY_C6_HOT=1`, routing C6 through TinyHeap. + +--- + +## Action Items for Phase C6-H+1 + +1. **Replicate Historical Baseline** (URGENT) + ```bash + export HAKMEM_BENCH_MIN_SIZE=257 + export HAKMEM_BENCH_MAX_SIZE=768 + export HAKMEM_TINY_HEAP_PROFILE=LEGACY + export HAKMEM_TINY_HOTHEAP_V2=0 + export HAKMEM_POOL_V2_ENABLED=0 + export HAKMEM_POOL_V1_FLATTEN_ENABLED=0 + # Expected: ~23M ops/s + ``` + +2. **Test Flatten ON with Historical Config** + ```bash + # Same as above, but: + export HAKMEM_POOL_V1_FLATTEN_ENABLED=1 + export HAKMEM_POOL_V1_FLATTEN_STATS=1 + # Expected: ~26M ops/s with active flatten stats + ``` + +3. **Profile Comparison Matrix** + - LEGACY vs C7_SAFE profile + - C6_HOT=0 vs C6_HOT=1 + - Flatten OFF vs ON + - Identify which combination yields best performance + +4. **Address Lookup Prototype** + - Implement TLS allocation context cache (class_idx + descriptor + page) + - Measure impact on lookup overhead (target: 41.5% → 25%) + +5. 
**Update ENV_PROFILE_PRESETS.md** + - Clarify that `C6_HEAVY_LEGACY_POOLV1` uses C7_SAFE profile (not pure LEGACY) + - Add note about C6_HOT routing implications + - Document performance differences between profile choices + +--- + +## Success Criteria for Phase C6-H+1 + +- **Reproduce historical baseline**: Achieve 23-27M ops/s with LEGACY profile +- **Understand routing impact**: Quantify C6_HOT=0 vs C6_HOT=1 difference +- **Identify optimization path**: Choose between: + - Optimizing TinyHeap C6 path (if C6_HOT=1 is strategic) + - Optimizing pool flatten path (if LEGACY/C6_HOT=0 is preferred) + - Hybrid approach with runtime selection + +**Target**: Close to **30M ops/s** (1/2 of current gap to 51.3M mimalloc baseline) by end of next phase. + +--- + +## Appendix A: Full Perf Report Output + +``` +# Samples: 656 of event 'cycles:u' +# Event count (approx.): 409,174,521 +# +# Overhead Symbol +# ........ ..................................... + 9.32% [.] hak_super_lookup + 8.23% [.] mid_desc_lookup + 5.87% [.] hak_pool_get_class_index + 5.76% [.] classify_ptr + 5.52% [.] hak_pool_free_v1_impl + 5.46% [.] hak_pool_try_alloc_v1_impl + 4.54% [.] free + 4.47% [.] worker_run + 4.35% [.] ss_map_lookup + 4.32% [.] super_reg_effective_mask + 3.69% [.] mid_desc_hash + 3.27% [.] mid_set_header + 3.17% [.] mid_page_inuse_dec_and_maybe_dn + 2.71% [.] mid_desc_init_once + 2.60% [.] malloc + 2.53% [.] hak_free_at + 2.17% [.] hak_pool_mid_lookup_v1_impl + 1.87% [.] super_reg_effective_size + 1.77% [.] hak_pool_free_fast_v1_impl + 1.64% [k] 0xffffffffae200ba0 (kernel) + 1.44% [.] hak_pool_init + 1.42% [.] hak_pool_is_poolable + 1.21% [.] should_sample + 1.12% [.] hak_pool_free + 1.11% [.] hak_super_hash + 1.09% [.] hak_pool_try_alloc + 0.95% [.] mid_desc_lookup_cached + 0.93% [.] hak_pool_v1_flatten_enabled + 0.76% [.] hak_pool_v2_route + 0.57% [.] ss_map_hash + 0.55% [.] 
hak_in_wrapper +``` + +--- + +## Appendix B: Test Commands Summary + +```bash +# Baseline +export HAKMEM_PROFILE=C6_HEAVY_LEGACY_POOLV1 +export HAKMEM_BENCH_MIN_SIZE=257 +export HAKMEM_BENCH_MAX_SIZE=768 +./bench_mid_large_mt_hakmem 1 1000000 400 1 +# Result: 9,836,420 ops/s + +# Mimalloc comparison +./bench_mid_large_mt_mi 1 1000000 400 1 +# Result: 51,297,877 ops/s (5.2x faster) + +# Mid_desc cache OFF +export HAKMEM_MID_DESC_CACHE_ENABLED=0 +./bench_mid_large_mt_hakmem 1 1000000 400 1 +# Result: 9,805,954 ops/s + +# Mid_desc cache ON +export HAKMEM_MID_DESC_CACHE_ENABLED=1 +./bench_mid_large_mt_hakmem 1 1000000 400 1 +# Result: 10,435,480 ops/s (+6.4%) + +# Perf stat +perf stat -e cycles:u,instructions:u,task-clock,page-faults:u \ + ./bench_mid_large_mt_hakmem 1 1000000 400 1 +# Result: 398M cycles, 1.05B instructions, IPC=2.64 + +# Perf record +perf record -F 5000 --call-graph dwarf -e cycles:u \ + -o perf.data.c6_flat ./bench_mid_large_mt_hakmem 1 1000000 400 1 +perf report -i perf.data.c6_flat --stdio --no-children +``` + +--- + +**End of Report** diff --git a/docs/analysis/ENV_PROFILE_PRESETS.md b/docs/analysis/ENV_PROFILE_PRESETS.md index 9639ba21..3be71c17 100644 --- a/docs/analysis/ENV_PROFILE_PRESETS.md +++ b/docs/analysis/ENV_PROFILE_PRESETS.md @@ -10,7 +10,9 @@ ### 目的 - Mixed 16–1024B の標準ベンチ用。 - C7-only SmallObject v3 + Tiny front v3 + LUT + fast classify ON。 -- Tiny/Pool v2 はすべて OFF。 +- v4 系(C6/C7 v4、fast classify v4、small segment v4)はすべて OFF。 +- Tiny/Pool v2 もすべて OFF。 +- C6 は凍結中(Tiny/SmallObject の特別扱いなし)。mid/pool の通常経路に任せる。 ### ENV 最小セット(Release) ```sh @@ -21,6 +23,19 @@ HAKMEM_PROFILE=MIXED_TINYV3_C7_SAFE HAKMEM_BENCH_MIN_SIZE=16 HAKMEM_BENCH_MAX_SIZE=1024 ``` +プリセットで自動設定される主な ENV: +- `HAKMEM_TINY_HEAP_PROFILE=C7_SAFE` +- `HAKMEM_TINY_C7_HOT=1` +- `HAKMEM_TINY_HOTHEAP_V2=0` +- `HAKMEM_SMALL_HEAP_V3_ENABLED=1` +- `HAKMEM_SMALL_HEAP_V3_CLASSES=0x80`(C7-only v3) +- `HAKMEM_SMALL_HEAP_V4_ENABLED=0` / `HAKMEM_SMALL_HEAP_V4_CLASSES=0x0` +- 
`HAKMEM_TINY_PTR_FAST_CLASSIFY_ENABLED=1` +- `HAKMEM_TINY_PTR_FAST_CLASSIFY_V4_ENABLED=0` +- `HAKMEM_SMALL_SEGMENT_V4_ENABLED=0` +- `HAKMEM_POOL_V2_ENABLED=0` +- `HAKMEM_TINY_FRONT_V3_ENABLED=1` +- `HAKMEM_TINY_FRONT_V3_LUT_ENABLED=1` ### 任意オプション - stats を見たいとき: @@ -39,6 +54,10 @@ HAKMEM_SS_MADVISE_STRICT=0 HAKMEM_FREE_POLICY=batch HAKMEM_THP=auto ``` +- 参考(v4 研究箱の現状): + - C7/C6 v4 + fast classify v4 ON(v3 OFF, segment OFF): **≈32.0–32.5M ops/s**(MIXED 1M/ws=400, Release)。 + - C7-only v4(C6 v1、v3 OFF): **≈33.0M ops/s**。 + - 現状は v3 構成が最速のため、標準プロファイルでは v4 系をすべて OFF に固定。 --- @@ -46,14 +65,14 @@ HAKMEM_THP=auto ### 目的 - C6-heavy mid/smallmid のベンチ用。 -- C6 は v1 固定(C6 v3 OFF)、Pool v2 OFF。Pool v1 flatten は bench 用に opt-in。 +- C6 は v1 固定(C6 v3/v4/ULTRA は研究箱のみ)。Pool v2 OFF。Pool v1 flatten は bench 用に opt-in。 ### ENV(v1 基準線) ```sh HAKMEM_BENCH_MIN_SIZE=257 HAKMEM_BENCH_MAX_SIZE=768 HAKMEM_TINY_HEAP_PROFILE=C7_SAFE -HAKMEM_TINY_C6_HOT=1 +HAKMEM_TINY_C6_HOT=0 HAKMEM_TINY_HOTHEAP_V2=0 HAKMEM_SMALL_HEAP_V3_ENABLED=1 HAKMEM_SMALL_HEAP_V3_CLASSES=0x80 # C7-only v3, C6 v3 は OFF @@ -69,6 +88,7 @@ HAKMEM_TINY_HEAP_PROFILE=LEGACY HAKMEM_POOL_V2_ENABLED=0 HAKMEM_POOL_V1_FLATTEN_ENABLED=1 HAKMEM_POOL_V1_FLATTEN_STATS=1 +``` ## Profile 2b: C6_HEAVY_LEGACY_POOLV1_FLATTEN(mid/smallmid LEGACY flatten ベンチ専用) @@ -84,9 +104,35 @@ HAKMEM_POOL_ZERO_MODE=header HAKMEM_POOL_V1_FLATTEN_STATS=1 ``` ※ LEGACY 専用。C7_SAFE / C7_ULTRA_BENCH ではこのプリセットを使用しないこと。 -``` - flatten は LEGACY 専用。C7_SAFE / C7_ULTRA_BENCH ではコード側で強制 OFF になる前提。 +### C6 研究用プリセット(標準ラインには影響させない) + +- C6 v3 研究(Tiny/SmallObject に C6 を載せるときだけ) +```sh +HAKMEM_PROFILE=C6_SMALL_HEAP_V3_EXPERIMENT +HAKMEM_BENCH_MIN_SIZE=257 +HAKMEM_BENCH_MAX_SIZE=768 +# bench_profile が以下を自動注入(既存 ENV を上書きしません): +# HAKMEM_TINY_C6_HOT=1 +# HAKMEM_SMALL_HEAP_V3_ENABLED=1 +# HAKMEM_SMALL_HEAP_V3_CLASSES=0x40 # C6 only v3 +``` + +- C6 v4 研究(C6 を v4 に載せるときだけ) +```sh +HAKMEM_PROFILE=C6_SMALL_HEAP_V4_EXPERIMENT +HAKMEM_BENCH_MIN_SIZE=257 +HAKMEM_BENCH_MAX_SIZE=768 +# 
bench_profile が以下を自動注入(既存 ENV を上書きしません): +# HAKMEM_TINY_C6_HOT=1 +# HAKMEM_SMALL_HEAP_V3_ENABLED=0 +# HAKMEM_SMALL_HEAP_V4_ENABLED=1 +# HAKMEM_SMALL_HEAP_V4_CLASSES=0x40 # C6 only v4 +``` + +※ いずれも「研究箱」です。Mixed/C6-heavy の標準評価では使わず、回帰やセグフォを許容できるときだけ明示的に opt-in してください。 + --- ## Profile 3: DEBUG_TINY_FRONT_PERF(perf 用 DEBUG プロファイル) diff --git a/docs/analysis/PF_STATUS_V4_202502.md b/docs/analysis/PF_STATUS_V4_202502.md index 2adbfabc..de8ca84f 100644 --- a/docs/analysis/PF_STATUS_V4_202502.md +++ b/docs/analysis/PF_STATUS_V4_202502.md @@ -1,3 +1,23 @@ +# PF/OS ベースライン + +# BASELINE-LOCK (Mixed 16–1024B v3 vs v4, Release) +- コマンド共通 (ws=400, iters=1M): + ``` + HAKMEM_PROFILE=MIXED_TINYV3_C7_SAFE + HAKMEM_BENCH_MIN_SIZE=16 + HAKMEM_BENCH_MAX_SIZE=1024 + ``` +- v3 本命構成(C7-only v3, v4/segment すべて OFF, fast classify v3 ON): + - `HAKMEM_SMALL_HEAP_V3_ENABLED=1 HAKMEM_SMALL_HEAP_V3_CLASSES=0x80 HAKMEM_SMALL_HEAP_V4_ENABLED=0 HAKMEM_SMALL_HEAP_V4_CLASSES=0 HAKMEM_TINY_PTR_FAST_CLASSIFY_V4_ENABLED=0 HAKMEM_SMALL_SEGMENT_V4_ENABLED=0` + - Throughput: **33.7–33.9M ops/s**(2 run, segv/assert なし) +- v4 強制(C7+C6 v4 + fast classify v4, v3 OFF, segment OFF): + - `HAKMEM_SMALL_HEAP_V3_ENABLED=0 HAKMEM_SMALL_HEAP_V3_CLASSES=0 HAKMEM_SMALL_HEAP_V4_ENABLED=1 HAKMEM_SMALL_HEAP_V4_CLASSES=0xC0 HAKMEM_TINY_PTR_FAST_CLASSIFY_V4_ENABLED=1` + - Throughput: **32.0–32.5M ops/s** +- C7-only v4(C6 v1, v3 OFF, fast classify v4 ON): + - `HAKMEM_SMALL_HEAP_V4_CLASSES=0x80 HAKMEM_SMALL_HEAP_V3_ENABLED=0` + - Throughput: **≈33.0M ops/s** +- 判断: 現行 Mixed の本命は v3 構成(上記)。v4 系は研究箱として opt-in 扱いを維持。 + # PF/OS ベースライン (PF2, small-object v4 状態) - コマンド (Release, v4: C7+C6 を v4 に強制、v3 OFF): @@ -20,6 +40,29 @@ - v4 (C7+C6) 強制時の pf/OS 基準値。v3 基準 (~40M) より遅めだが、pf 数値と OS stats を PF2 の起点として固定。 - 今後 SmallSegmentBox_v4 を繋ぐ A/B では、page-faults/SS_OS_STATS をこの値からどこまで下げられるかを指標にする。 +## PF3: smallsegment_v4 ゲート A/B(C7+C6 v4 強制) + +- コマンド (Release, v4: C7+C6, v3 OFF): + ``` + HAKMEM_PROFILE=MIXED_TINYV3_C7_SAFE \ + 
HAKMEM_BENCH_MIN_SIZE=16 \ + HAKMEM_BENCH_MAX_SIZE=1024 \ + HAKMEM_SMALL_HEAP_V4_ENABLED=1 \ + HAKMEM_SMALL_HEAP_V4_CLASSES=0xC0 \ + HAKMEM_SMALL_HEAP_V3_ENABLED=0 \ + HAKMEM_SMALL_HEAP_V3_CLASSES=0 \ + perf stat -e cycles,instructions,task-clock,page-faults \ + env HAKMEM_SMALL_SEGMENT_V4_ENABLED=0 ./bench_random_mixed_hakmem 1000000 400 1 + perf stat -e cycles,instructions,task-clock,page-faults \ + env HAKMEM_SMALL_SEGMENT_V4_ENABLED=1 ./bench_random_mixed_hakmem 1000000 400 1 + ``` +- 結果 (ws=400, iters=1M): + - OFF: Throughput **28,890,266 ops/s**, page-faults=6,744, task-clock=54.84ms + - ON : Throughput **28,849,781 ops/s**, page-faults=6,746, task-clock=61.49ms +- 所感: + - smallsegment_v4 ゲートを通しても pf/ops はほぼ変化なし(現状は Tiny v1 lease 経由の薄い実装)。 + - 「Segment 経由の入り口」はできたので、PF4 以降で専用 mmap/segment 分割を実装して再 A/B する。 + ## DEBUG perf (cycles:u, -O0/-g, v4=C7+C6) - ビルド: diff --git a/docs/analysis/SMALLOBJECT_SEGMENT_V4_DESIGN.md b/docs/analysis/SMALLOBJECT_SEGMENT_V4_DESIGN.md index 24ec7f95..7e35d7d0 100644 --- a/docs/analysis/SMALLOBJECT_SEGMENT_V4_DESIGN.md +++ b/docs/analysis/SMALLOBJECT_SEGMENT_V4_DESIGN.md @@ -24,6 +24,13 @@ - **PF3**: SmallSegmentBox_v4 を実装し、C7/C6 v4 で small-object 専用 Segment を試す A/B を実施。 - **PF4**: Segment サイズ/ポリシーのチューニングと pf/OS スタッツの可視化強化。成功したら ENV プリセットに反映。 +## PF3 進捗メモ +- smallsegment_v4_box をホットコードに接続し、ENV `HAKMEM_SMALL_SEGMENT_V4_ENABLED` で Tiny v1 経由と segment 経由を切替可能にした(現段階は Tiny v1 lease を薄くラップする構造)。 +- Mixed 16–1024B(v4 強制、ws=400, iters=1M)で A/B: + - OFF: 28.89M ops/s, page-faults=6,744 + - ON : 28.85M ops/s, page-faults=6,746 +- pf/ops はまだ変化なし。次フェーズで実際の small-object 専用 mmap/segment carve を入れて再 A/B する。 + ## メモ - C5 v4 はまだ研究箱(C5-heavy 専用)。Mixed では C5 v1 を維持する予定。 - C6 v4 は C6-heavy で +4〜5% が見えており、Mixed ではデフォルト OFF(研究箱)。 diff --git a/docs/analysis/SMALLOBJECT_V4_BOX_DESIGN.md b/docs/analysis/SMALLOBJECT_V4_BOX_DESIGN.md index 01821486..a171853a 100644 --- a/docs/analysis/SMALLOBJECT_V4_BOX_DESIGN.md +++ b/docs/analysis/SMALLOBJECT_V4_BOX_DESIGN.md @@ 
-32,22 +32,10 @@ - `core/front/malloc_tiny_fast.h`: route switch に v4 の case を足し、C7 v4 が ON のときは v4 経路(現在は C7 自前 freelist, それ以外は v1/v3 へフォールバック)、OFF 時は従来の v3/v1。 - `core/box/smallsegment_v4_box.h` / `core/box/smallsegment_v4_env_box.h`: PF2 で追加した small-object Segment Box の足場(型と ENV だけ、挙動不変)。設計メモは `docs/analysis/SMALLOBJECT_SEGMENT_V4_DESIGN.md` にまとめる。 -## A/B と運用 -- Phase v4-3.1 時点の健康診断: - - C7-only A/B (ws=400, iters=1M, size=1024 固定): - - v3: 41.67M ops/s, prepare_calls=5,077 - - v4: 42.13M ops/s, prepare_calls=4,701(current/partial 再利用で 3.4x→約1.0x に改善) - - Mixed 16–1024B (MIXED_TINYV3_C7_SAFE, ws=400, iters=1M): - - v3 route: 40.66M ops/s - - v4 route: 40.01M ops/s(-1.6% 以内、回帰なし) -- どちらも segv/assert なし。C7 v4 の prepare 増加は解消済み。Mixed ではまだ v3 がわずかに優勢だが許容範囲。 -- Phase v4-4 (C6 v4 パイロット): - - ENV: `HAKMEM_SMALL_HEAP_V4_ENABLED=1`, `HAKMEM_SMALL_HEAP_V4_CLASSES=0x40`(C6-only v4)。Mixed では標準 OFF(0x80= C7-only)。 - - C6-heavy ベンチ (ws=400, iters=1M, size 257–768): - - C6 v1: 28.69M ops/s - - C6 v4: 30.07M ops/s(+4.8%)segv/assert なし - - Mixed 16–1024B はデフォルトで C6 v1 のまま(C6 v4 は研究箱)。今後 C6 v4 の安定度を見つつ拡張予定。 -- Phase v4-5 (C5 v4 パイロット; C5-heavy 専用 opt-in): - - ENV: `HAKMEM_SMALL_HEAP_V4_ENABLED=1`, `HAKMEM_SMALL_HEAP_V4_CLASSES=0x20`(C5-only v4)。C7 v4 / C6 v4 とは独立にビットで切替。 - - 目的: C5-heavy ワークロードで v4 が v1 を上回るか確認。Mixed 標準は C5 v1 のまま(C5 v4 は研究箱)。 - - ステータス: 実装済み。C5-heavy / Mixed の A/B は未実施。segv/assert の有無と throughput を確認してから昇格判断。 +## A/B と運用(2025-12 時点の整理) +- v4 C7/C6/C5 はいずれも **研究箱**。Mixed の標準ラインは C7-only v3 + C7 ULTRA(UF-3 セグメント)で固定し、v4 系は ENV opt-in のみで利用する。 +- C6-FREEZE 方針により、C6 v4 / C5 v4 は mid/pool 再設計が進むまで本線に載せない(C6 は「普通の mid クラス」として pool/mid 側で扱う)。 +- 今後 small-object v4 を攻めるときは: + - まず C7 ULTRA で固めた設計(Segment + Page + TLS freelist + mask free)を「small-object 全体の共通パターン」として整理し、 + - その上で 16〜2KiB 帯を SmallHeapCtx v4 に寄せる(ヘッダレス化・lookup 削減を C7 と mid で統合)、 + という順番で進める。 diff --git a/docs/analysis/TINY_C7_ULTRA_DESIGN.md b/docs/analysis/TINY_C7_ULTRA_DESIGN.md index 93162fb6..8b248e30 
100644 --- a/docs/analysis/TINY_C7_ULTRA_DESIGN.md +++ b/docs/analysis/TINY_C7_ULTRA_DESIGN.md @@ -40,8 +40,13 @@ - 管理内 → page_idx = (p - seg_base) >> PAGE_SHIFT で page_meta を取得し、ヘッダ無しで freelist push。 - Remote/cross-thread free は UF-3 でも非対応(同一スレッド C7 専用のまま)。 +## UF-4: C7 ULTRA header light(研究箱) +- 目的: C7 ULTRA の alloc/free から tiny_region_id_write_header の毎回実行を外し、carve 時だけに寄せる。 +- 手段: freelist の next をヘッダ直後に格納してヘッダを保持し、ENV `HAKMEM_TINY_C7_ULTRA_HEADER_LIGHT` (default 0) ON のときだけ carve 時に一括書き込み。alloc はヘッダ済みならスキップ。 +- Fail-Fast: ULTRA 管理外 ptr は従来どおり v3 free 経路へ落とす。 + ## フェーズ - UF-1: 箱・ENV・front フックだけ stub で入れる(中身は v3 C7 経由、挙動変化なし)。 - UF-2: ULTRA TLS freelist を実装(C7 ページ 1 枚を TLS で握る。同一スレッドのみ)。C7 ページ供給は当面 v3/v4 経由。 - UF-3: C7UltraSegmentBox を実装し、ptr→segment mask でヘッダレス free に寄せる(セグメント 1 枚のみでも可)。 -- UF-4: pf/segment/学習層との統合を調整し、Mixed で本格的に A/B。 +- UF-4: C7 ULTRA header light を研究箱として追加し、ON/OFF A/B(Mixed / C7-only 両方)で評価する。 diff --git a/hakmem.d b/hakmem.d index f5d6e4f4..35c30d75 100644 --- a/hakmem.d +++ b/hakmem.d @@ -103,8 +103,12 @@ hakmem.o: core/hakmem.c core/hakmem.h core/hakmem_build_flags.h \ core/box/../front/../box/../hakmem_tiny_superslab_internal.h \ core/box/../front/../box/../superslab/superslab_inline.h \ core/box/../front/../box/smallobject_hotbox_v3_env_box.h \ + core/box/../front/../box/smallobject_hotbox_v4_box.h \ + core/box/../front/../box/tiny_c7_ultra_box.h \ + core/box/../front/../box/tiny_c7_ultra_segment_box.h \ core/box/../front/../box/tiny_front_v3_env_box.h \ core/box/../front/../box/tiny_route_env_box.h \ + core/box/../front/../box/smallobject_hotbox_v4_env_box.h \ core/box/../front/../box/tiny_front_stats_box.h \ core/box/tiny_alloc_gate_box.h core/box/tiny_route_box.h \ core/box/tiny_front_config_box.h core/box/wrapper_env_box.h \ @@ -286,8 +290,12 @@ core/box/../front/../box/tiny_geometry_box.h: core/box/../front/../box/../hakmem_tiny_superslab_internal.h: core/box/../front/../box/../superslab/superslab_inline.h: 
core/box/../front/../box/smallobject_hotbox_v3_env_box.h: +core/box/../front/../box/smallobject_hotbox_v4_box.h: +core/box/../front/../box/tiny_c7_ultra_box.h: +core/box/../front/../box/tiny_c7_ultra_segment_box.h: core/box/../front/../box/tiny_front_v3_env_box.h: core/box/../front/../box/tiny_route_env_box.h: +core/box/../front/../box/smallobject_hotbox_v4_env_box.h: core/box/../front/../box/tiny_front_stats_box.h: core/box/tiny_alloc_gate_box.h: core/box/tiny_route_box.h: diff --git a/perf.data.c6_flat b/perf.data.c6_flat new file mode 100644 index 00000000..5f000090 Binary files /dev/null and b/perf.data.c6_flat differ diff --git a/perf.data.c6heavy b/perf.data.c6heavy index 605ca087..1c465037 100644 Binary files a/perf.data.c6heavy and b/perf.data.c6heavy differ diff --git a/perf.data.c6heavy.old b/perf.data.c6heavy.old new file mode 100644 index 00000000..605ca087 Binary files /dev/null and b/perf.data.c6heavy.old differ diff --git a/perf.data.fc2 b/perf.data.fc2 new file mode 100644 index 00000000..548d6268 Binary files /dev/null and b/perf.data.fc2 differ diff --git a/perf.data.random_mixed b/perf.data.random_mixed new file mode 100644 index 00000000..e57f95df Binary files /dev/null and b/perf.data.random_mixed differ diff --git a/perf.data.tiny_ultra_hdr b/perf.data.tiny_ultra_hdr new file mode 100644 index 00000000..5155acd4 Binary files /dev/null and b/perf.data.tiny_ultra_hdr differ diff --git a/perf.data.ultra_mixed b/perf.data.ultra_mixed new file mode 100644 index 00000000..16758893 Binary files /dev/null and b/perf.data.ultra_mixed differ diff --git a/perf.data.ultra_mixed.old b/perf.data.ultra_mixed.old new file mode 100644 index 00000000..f2ef22ee Binary files /dev/null and b/perf.data.ultra_mixed.old differ