2025-12-09 21:50:15 +09:00
|
|
|
// smallobject_hotbox_v3.c - SmallObject HotHeap v3 skeleton (C7-first)
|
|
|
|
|
// Phase A/B: types and stats only. alloc/free are made to fall back to v1.
|
|
|
|
|
|
|
|
|
|
#include <stdlib.h>
|
|
|
|
|
#include <stdio.h>
|
|
|
|
|
#include <string.h>
|
|
|
|
|
#include "box/smallobject_hotbox_v3_box.h"
|
|
|
|
|
#include "box/smallobject_cold_iface_v1.h"
|
|
|
|
|
#include "box/tiny_heap_box.h"
|
|
|
|
|
#include "box/tiny_front_v3_env_box.h"
|
|
|
|
|
#include "hakmem_tiny.h" // TINY_SLAB_SIZE mask for page_of
|
|
|
|
|
#include "tiny_region_id.h"
|
|
|
|
|
|
|
|
|
|
// Per-thread v3 context pointer; remains NULL until so_tls_get() allocates it.
static __thread so_ctx_v3* g_so_ctx_v3;
|
|
|
|
|
// Cached stats switch: -1 = environment not consulted yet, 0 = off, 1 = on.
static int g_so_stats_enabled = -1;
|
|
|
|
|
// One counter record per size class (not thread-local; updated with relaxed atomics).
static so_stats_class_v3 g_so_stats[SMALLOBJECT_NUM_CLASSES];
|
|
|
|
|
|
|
|
|
|
int so_v3_stats_enabled(void) {
|
|
|
|
|
if (__builtin_expect(g_so_stats_enabled == -1, 0)) {
|
|
|
|
|
const char* e = getenv("HAKMEM_SMALL_HEAP_V3_STATS");
|
|
|
|
|
g_so_stats_enabled = (e && *e && *e != '0') ? 1 : 0;
|
|
|
|
|
}
|
|
|
|
|
return g_so_stats_enabled;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
static inline so_stats_class_v3* so_stats_for(uint8_t ci) {
|
|
|
|
|
if (!so_v3_stats_enabled()) return NULL;
|
|
|
|
|
if (ci >= SMALLOBJECT_NUM_CLASSES) return NULL;
|
|
|
|
|
return &g_so_stats[ci];
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void so_v3_record_route_hit(uint8_t ci) {
|
|
|
|
|
so_stats_class_v3* st = so_stats_for(ci);
|
|
|
|
|
if (st) atomic_fetch_add_explicit(&st->route_hits, 1, memory_order_relaxed);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void so_v3_record_alloc_call(uint8_t ci) {
|
|
|
|
|
so_stats_class_v3* st = so_stats_for(ci);
|
|
|
|
|
if (st) atomic_fetch_add_explicit(&st->alloc_calls, 1, memory_order_relaxed);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void so_v3_record_alloc_refill(uint8_t ci) {
|
|
|
|
|
so_stats_class_v3* st = so_stats_for(ci);
|
|
|
|
|
if (st) atomic_fetch_add_explicit(&st->alloc_refill, 1, memory_order_relaxed);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void so_v3_record_alloc_fallback(uint8_t ci) {
|
|
|
|
|
so_stats_class_v3* st = so_stats_for(ci);
|
|
|
|
|
if (st) atomic_fetch_add_explicit(&st->alloc_fallback_v1, 1, memory_order_relaxed);
|
|
|
|
|
}
|
|
|
|
|
|
Phase SO-BACKEND-OPT-1: v3 backend 分解&Tiny/ULTRA 完成世代宣言
=== 実装内容 ===
1. v3 backend 詳細計測
- ENV: HAKMEM_SO_V3_STATS で alloc/free パス内訳計測
- 追加 stats: alloc_current_hit, alloc_partial_hit, free_current, free_partial, free_retire
- so_alloc_fast / so_free_fast に埋め込み
- デストラクタで [ALLOC_DETAIL] / [FREE_DETAIL] 出力
2. v3 backend ボトルネック分析完了
- C7-only: alloc_current_hit=99.99%, alloc_refill=0.9%, free_retire=0.1%, page_of_fail=0
- Mixed: alloc_current_hit=100%, alloc_refill=0.85%, free_retire=0.07%, page_of_fail=0
- 結論: v3 ロジック部分(ページ選択・retire)は完全最適化済み
- 残り 5% overhead は内部コスト(header write, memcpy, 分岐)
3. Tiny/ULTRA 層「完成世代」宣言
- 総括ドキュメント作成: docs/analysis/PERF_EXEC_SUMMARY_ULTRA_PHASE_20251211.md
- CURRENT_TASK.md に Phase ULTRA 総括セクション追加
- AGENTS.md に Tiny/ULTRA 完成世代宣言追加
- 最終成果: Mixed 16–1024B = 43.9M ops/s (baseline 30.6M → +43.5%)
=== ボトルネック地図 ===
| 層 | 関数 | overhead |
|-----|------|----------|
| Front | malloc/free dispatcher | ~40–45% |
| ULTRA | C4–C7 alloc/free/refill | ~12% |
| v3 backend | so_alloc/so_free | ~5% |
| mid/pool | hak_super_lookup | 3–5% |
=== フェーズ履歴(Phase ULTRA cycle) ===
- Phase PERF-ULTRA-FREE-OPT-1: C4–C7 ULTRA統合 → +9.3%
- Phase REFACTOR: Code quality (60行削減)
- Phase PERF-ULTRA-REFILL-OPT-1a/1b: C7 ULTRA refill最適化 → +11.1%
- Phase SO-BACKEND-OPT-1: v3 backend分解 → 設計限界確認
=== 次フェーズ(独立ライン) ===
1. Phase SO-BACKEND-OPT-2: v3 header write削減 (1-2%)
2. Headerless/v6系: out-of-band header (1-2%)
3. mid/pool v3新設計: C6-heavy 10M → 20–25M
本フェーズでTiny/ULTRA層は「完成世代」として基盤固定。
今後の大きい変更はHeaderless/mid系の独立ラインで検討。
🤖 Generated with Claude Code
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 22:45:14 +09:00
|
|
|
void so_v3_record_alloc_current_hit(uint8_t ci) {
|
|
|
|
|
so_stats_class_v3* st = so_stats_for(ci);
|
|
|
|
|
if (st) atomic_fetch_add_explicit(&st->alloc_current_hit, 1, memory_order_relaxed);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void so_v3_record_alloc_partial_hit(uint8_t ci) {
|
|
|
|
|
so_stats_class_v3* st = so_stats_for(ci);
|
|
|
|
|
if (st) atomic_fetch_add_explicit(&st->alloc_partial_hit, 1, memory_order_relaxed);
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-09 21:50:15 +09:00
|
|
|
void so_v3_record_free_call(uint8_t ci) {
|
|
|
|
|
so_stats_class_v3* st = so_stats_for(ci);
|
|
|
|
|
if (st) atomic_fetch_add_explicit(&st->free_calls, 1, memory_order_relaxed);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void so_v3_record_free_fallback(uint8_t ci) {
|
|
|
|
|
so_stats_class_v3* st = so_stats_for(ci);
|
|
|
|
|
if (st) atomic_fetch_add_explicit(&st->free_fallback_v1, 1, memory_order_relaxed);
|
|
|
|
|
}
|
|
|
|
|
|
Phase SO-BACKEND-OPT-1: v3 backend 分解&Tiny/ULTRA 完成世代宣言
=== 実装内容 ===
1. v3 backend 詳細計測
- ENV: HAKMEM_SO_V3_STATS で alloc/free パス内訳計測
- 追加 stats: alloc_current_hit, alloc_partial_hit, free_current, free_partial, free_retire
- so_alloc_fast / so_free_fast に埋め込み
- デストラクタで [ALLOC_DETAIL] / [FREE_DETAIL] 出力
2. v3 backend ボトルネック分析完了
- C7-only: alloc_current_hit=99.99%, alloc_refill=0.9%, free_retire=0.1%, page_of_fail=0
- Mixed: alloc_current_hit=100%, alloc_refill=0.85%, free_retire=0.07%, page_of_fail=0
- 結論: v3 ロジック部分(ページ選択・retire)は完全最適化済み
- 残り 5% overhead は内部コスト(header write, memcpy, 分岐)
3. Tiny/ULTRA 層「完成世代」宣言
- 総括ドキュメント作成: docs/analysis/PERF_EXEC_SUMMARY_ULTRA_PHASE_20251211.md
- CURRENT_TASK.md に Phase ULTRA 総括セクション追加
- AGENTS.md に Tiny/ULTRA 完成世代宣言追加
- 最終成果: Mixed 16–1024B = 43.9M ops/s (baseline 30.6M → +43.5%)
=== ボトルネック地図 ===
| 層 | 関数 | overhead |
|-----|------|----------|
| Front | malloc/free dispatcher | ~40–45% |
| ULTRA | C4–C7 alloc/free/refill | ~12% |
| v3 backend | so_alloc/so_free | ~5% |
| mid/pool | hak_super_lookup | 3–5% |
=== フェーズ履歴(Phase ULTRA cycle) ===
- Phase PERF-ULTRA-FREE-OPT-1: C4–C7 ULTRA統合 → +9.3%
- Phase REFACTOR: Code quality (60行削減)
- Phase PERF-ULTRA-REFILL-OPT-1a/1b: C7 ULTRA refill最適化 → +11.1%
- Phase SO-BACKEND-OPT-1: v3 backend分解 → 設計限界確認
=== 次フェーズ(独立ライン) ===
1. Phase SO-BACKEND-OPT-2: v3 header write削減 (1-2%)
2. Headerless/v6系: out-of-band header (1-2%)
3. mid/pool v3新設計: C6-heavy 10M → 20–25M
本フェーズでTiny/ULTRA層は「完成世代」として基盤固定。
今後の大きい変更はHeaderless/mid系の独立ラインで検討。
🤖 Generated with Claude Code
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 22:45:14 +09:00
|
|
|
void so_v3_record_free_current(uint8_t ci) {
|
|
|
|
|
so_stats_class_v3* st = so_stats_for(ci);
|
|
|
|
|
if (st) atomic_fetch_add_explicit(&st->free_current, 1, memory_order_relaxed);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void so_v3_record_free_partial(uint8_t ci) {
|
|
|
|
|
so_stats_class_v3* st = so_stats_for(ci);
|
|
|
|
|
if (st) atomic_fetch_add_explicit(&st->free_partial, 1, memory_order_relaxed);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void so_v3_record_free_retire(uint8_t ci) {
|
|
|
|
|
so_stats_class_v3* st = so_stats_for(ci);
|
|
|
|
|
if (st) atomic_fetch_add_explicit(&st->free_retire, 1, memory_order_relaxed);
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-10 09:08:18 +09:00
|
|
|
void so_v3_record_page_of_fail(uint8_t ci) {
|
|
|
|
|
so_stats_class_v3* st = so_stats_for(ci);
|
|
|
|
|
if (st) atomic_fetch_add_explicit(&st->page_of_fail, 1, memory_order_relaxed);
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-09 21:50:15 +09:00
|
|
|
so_ctx_v3* so_tls_get(void) {
|
|
|
|
|
so_ctx_v3* ctx = g_so_ctx_v3;
|
|
|
|
|
if (__builtin_expect(ctx == NULL, 0)) {
|
|
|
|
|
ctx = (so_ctx_v3*)calloc(1, sizeof(so_ctx_v3));
|
|
|
|
|
if (!ctx) {
|
|
|
|
|
fprintf(stderr, "[SMALL_HEAP_V3] TLS alloc failed\n");
|
|
|
|
|
abort();
|
|
|
|
|
}
|
|
|
|
|
for (int i = 0; i < SMALLOBJECT_NUM_CLASSES; i++) {
|
|
|
|
|
so_class_v3* hc = &ctx->cls[i];
|
|
|
|
|
hc->block_size = (uint32_t)tiny_stride_for_class(i);
|
|
|
|
|
hc->max_partial_pages = 2;
|
|
|
|
|
}
|
|
|
|
|
g_so_ctx_v3 = ctx;
|
|
|
|
|
}
|
|
|
|
|
return ctx;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Thread every block of `page` into an intrusive singly linked free list.
 *
 * The first pointer-sized word of each block stores the link to the next free
 * block. Blocks are pushed in ascending address order, so the returned head
 * is the highest-addressed block and the lowest-addressed block ends the list.
 *
 * Returns the list head, or NULL when the page is NULL or has no base,
 * a zero block_size, or a zero capacity.
 */
static inline void* so_build_freelist(so_page_v3* page) {
    if (!page) return NULL;
    if (!page->base || page->block_size == 0 || page->capacity == 0) return NULL;

    uint8_t *cursor = (uint8_t*)page->base;
    void *top = NULL;
    for (uint32_t idx = 0; idx < page->capacity; idx++, cursor += page->block_size) {
        *(void**)cursor = top;   /* link this block to the previous head */
        top = cursor;
    }
    return top;
}
|
|
|
|
|
|
|
|
|
|
/*
 * Test whether `ptr` is the start address of a block inside `page`.
 *
 * The pointer must lie in [base, base + block_size * capacity) and sit on a
 * block_size boundary relative to base.
 *
 * Returns 1 on a match, 0 otherwise (including NULL arguments).
 */
static inline int so_ptr_in_page(so_page_v3* page, void* ptr) {
    if (page == NULL || ptr == NULL) {
        return 0;
    }

    const uintptr_t start = (uintptr_t)page->base;
    const uintptr_t addr = (uintptr_t)ptr;
    const uintptr_t bytes = (uintptr_t)page->block_size * (uintptr_t)page->capacity;

    /* A zero block_size gives bytes == 0, so the range test rejects it
     * before the modulo below could divide by zero. */
    if (addr < start || addr >= start + bytes) {
        return 0;
    }
    return ((addr - start) % page->block_size) == 0;
}
|
|
|
|
|
|
|
|
|
|
/*
 * Find the page of class `hc` that owns `ptr`.
 *
 * The current page is checked first, then each page on the partial list in
 * list order. Returns the owning page, or NULL when nothing matches or
 * either argument is NULL.
 */
static inline so_page_v3* so_page_of(so_class_v3* hc, void* ptr) {
    if (hc == NULL || ptr == NULL) {
        return NULL;
    }

    so_page_v3 *cur = hc->current;
    if (cur != NULL && so_ptr_in_page(cur, ptr)) {
        return cur;
    }

    for (so_page_v3 *it = hc->partial; it != NULL; it = it->next) {
        if (so_ptr_in_page(it, ptr)) {
            return it;
        }
    }
    return NULL;
}
|
|
|
|
|
|
|
|
|
|
/*
 * Push `page` onto the head of `hc`'s partial list and bump partial_count.
 * No limit check is made here; callers compare against max_partial_pages.
 * NULL arguments are ignored.
 */
static inline void so_page_push_partial(so_class_v3* hc, so_page_v3* page) {
    if (hc == NULL || page == NULL) {
        return;
    }
    so_page_v3 *old_head = hc->partial;
    hc->partial = page;
    page->next = old_head;
    hc->partial_count += 1;
}
|
|
|
|
|
|
|
|
|
|
static inline void so_page_retire_slow(so_ctx_v3* ctx, uint32_t ci, so_page_v3* page);
|
|
|
|
|
|
|
|
|
|
static inline void* so_alloc_fast(so_ctx_v3* ctx, uint32_t ci) {
|
|
|
|
|
so_class_v3* hc = &ctx->cls[ci];
|
|
|
|
|
const bool skip_header_c7 = (ci == 7) && tiny_header_v3_enabled() && tiny_header_v3_skip_c7();
|
|
|
|
|
so_page_v3* p = hc->current;
|
|
|
|
|
if (p && p->freelist && p->used < p->capacity) {
|
|
|
|
|
void* blk = p->freelist;
|
|
|
|
|
p->freelist = *(void**)blk;
|
|
|
|
|
p->used++;
|
Phase SO-BACKEND-OPT-1: v3 backend 分解&Tiny/ULTRA 完成世代宣言
=== 実装内容 ===
1. v3 backend 詳細計測
- ENV: HAKMEM_SO_V3_STATS で alloc/free パス内訳計測
- 追加 stats: alloc_current_hit, alloc_partial_hit, free_current, free_partial, free_retire
- so_alloc_fast / so_free_fast に埋め込み
- デストラクタで [ALLOC_DETAIL] / [FREE_DETAIL] 出力
2. v3 backend ボトルネック分析完了
- C7-only: alloc_current_hit=99.99%, alloc_refill=0.9%, free_retire=0.1%, page_of_fail=0
- Mixed: alloc_current_hit=100%, alloc_refill=0.85%, free_retire=0.07%, page_of_fail=0
- 結論: v3 ロジック部分(ページ選択・retire)は完全最適化済み
- 残り 5% overhead は内部コスト(header write, memcpy, 分岐)
3. Tiny/ULTRA 層「完成世代」宣言
- 総括ドキュメント作成: docs/analysis/PERF_EXEC_SUMMARY_ULTRA_PHASE_20251211.md
- CURRENT_TASK.md に Phase ULTRA 総括セクション追加
- AGENTS.md に Tiny/ULTRA 完成世代宣言追加
- 最終成果: Mixed 16–1024B = 43.9M ops/s (baseline 30.6M → +43.5%)
=== ボトルネック地図 ===
| 層 | 関数 | overhead |
|-----|------|----------|
| Front | malloc/free dispatcher | ~40–45% |
| ULTRA | C4–C7 alloc/free/refill | ~12% |
| v3 backend | so_alloc/so_free | ~5% |
| mid/pool | hak_super_lookup | 3–5% |
=== フェーズ履歴(Phase ULTRA cycle) ===
- Phase PERF-ULTRA-FREE-OPT-1: C4–C7 ULTRA統合 → +9.3%
- Phase REFACTOR: Code quality (60行削減)
- Phase PERF-ULTRA-REFILL-OPT-1a/1b: C7 ULTRA refill最適化 → +11.1%
- Phase SO-BACKEND-OPT-1: v3 backend分解 → 設計限界確認
=== 次フェーズ(独立ライン) ===
1. Phase SO-BACKEND-OPT-2: v3 header write削減 (1-2%)
2. Headerless/v6系: out-of-band header (1-2%)
3. mid/pool v3新設計: C6-heavy 10M → 20–25M
本フェーズでTiny/ULTRA層は「完成世代」として基盤固定。
今後の大きい変更はHeaderless/mid系の独立ラインで検討。
🤖 Generated with Claude Code
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 22:45:14 +09:00
|
|
|
so_v3_record_alloc_current_hit((uint8_t)ci);
|
2025-12-09 21:50:15 +09:00
|
|
|
if (skip_header_c7) {
|
|
|
|
|
uint8_t* header_ptr = (uint8_t*)blk;
|
|
|
|
|
*header_ptr = (uint8_t)(HEADER_MAGIC | (ci & HEADER_CLASS_MASK));
|
|
|
|
|
return header_ptr + 1; // mirror tiny_region_id_write_header fast path
|
|
|
|
|
}
|
|
|
|
|
return tiny_region_id_write_header(blk, (int)ci);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
if (hc->partial) {
|
|
|
|
|
so_page_v3* old_cur = hc->current;
|
|
|
|
|
p = hc->partial;
|
|
|
|
|
hc->partial = p->next;
|
|
|
|
|
if (hc->partial_count > 0) {
|
|
|
|
|
hc->partial_count--;
|
|
|
|
|
}
|
|
|
|
|
p->next = NULL;
|
|
|
|
|
hc->current = p;
|
|
|
|
|
if (old_cur && old_cur != p) {
|
|
|
|
|
if (hc->partial_count < hc->max_partial_pages) {
|
|
|
|
|
so_page_push_partial(hc, old_cur);
|
|
|
|
|
} else {
|
|
|
|
|
so_page_retire_slow(ctx, ci, old_cur);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
if (p->freelist && p->used < p->capacity) {
|
|
|
|
|
void* blk = p->freelist;
|
|
|
|
|
p->freelist = *(void**)blk;
|
|
|
|
|
p->used++;
|
Phase SO-BACKEND-OPT-1: v3 backend 分解&Tiny/ULTRA 完成世代宣言
=== 実装内容 ===
1. v3 backend 詳細計測
- ENV: HAKMEM_SO_V3_STATS で alloc/free パス内訳計測
- 追加 stats: alloc_current_hit, alloc_partial_hit, free_current, free_partial, free_retire
- so_alloc_fast / so_free_fast に埋め込み
- デストラクタで [ALLOC_DETAIL] / [FREE_DETAIL] 出力
2. v3 backend ボトルネック分析完了
- C7-only: alloc_current_hit=99.99%, alloc_refill=0.9%, free_retire=0.1%, page_of_fail=0
- Mixed: alloc_current_hit=100%, alloc_refill=0.85%, free_retire=0.07%, page_of_fail=0
- 結論: v3 ロジック部分(ページ選択・retire)は完全最適化済み
- 残り 5% overhead は内部コスト(header write, memcpy, 分岐)
3. Tiny/ULTRA 層「完成世代」宣言
- 総括ドキュメント作成: docs/analysis/PERF_EXEC_SUMMARY_ULTRA_PHASE_20251211.md
- CURRENT_TASK.md に Phase ULTRA 総括セクション追加
- AGENTS.md に Tiny/ULTRA 完成世代宣言追加
- 最終成果: Mixed 16–1024B = 43.9M ops/s (baseline 30.6M → +43.5%)
=== ボトルネック地図 ===
| 層 | 関数 | overhead |
|-----|------|----------|
| Front | malloc/free dispatcher | ~40–45% |
| ULTRA | C4–C7 alloc/free/refill | ~12% |
| v3 backend | so_alloc/so_free | ~5% |
| mid/pool | hak_super_lookup | 3–5% |
=== フェーズ履歴(Phase ULTRA cycle) ===
- Phase PERF-ULTRA-FREE-OPT-1: C4–C7 ULTRA統合 → +9.3%
- Phase REFACTOR: Code quality (60行削減)
- Phase PERF-ULTRA-REFILL-OPT-1a/1b: C7 ULTRA refill最適化 → +11.1%
- Phase SO-BACKEND-OPT-1: v3 backend分解 → 設計限界確認
=== 次フェーズ(独立ライン) ===
1. Phase SO-BACKEND-OPT-2: v3 header write削減 (1-2%)
2. Headerless/v6系: out-of-band header (1-2%)
3. mid/pool v3新設計: C6-heavy 10M → 20–25M
本フェーズでTiny/ULTRA層は「完成世代」として基盤固定。
今後の大きい変更はHeaderless/mid系の独立ラインで検討。
🤖 Generated with Claude Code
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 22:45:14 +09:00
|
|
|
so_v3_record_alloc_partial_hit((uint8_t)ci);
|
2025-12-09 21:50:15 +09:00
|
|
|
if (skip_header_c7) {
|
|
|
|
|
uint8_t* header_ptr = (uint8_t*)blk;
|
|
|
|
|
*header_ptr = (uint8_t)(HEADER_MAGIC | (ci & HEADER_CLASS_MASK));
|
|
|
|
|
return header_ptr + 1;
|
|
|
|
|
}
|
|
|
|
|
return tiny_region_id_write_header(blk, (int)ci);
|
|
|
|
|
}
|
|
|
|
|
}
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
/*
 * Remove `target` from `hc`'s partial list if it is linked there.
 *
 * partial_count is decremented (never below zero) when the page is found.
 * The removed page's own `next` field is left untouched.
 *
 * Returns 1 when the page was found and unlinked, 0 otherwise.
 */
static inline int so_unlink_partial(so_class_v3* hc, so_page_v3* target) {
    if (hc == NULL || target == NULL) {
        return 0;
    }

    /* Walk the links themselves so head and interior removal share one path. */
    for (so_page_v3 **link = &hc->partial; *link != NULL; link = &(*link)->next) {
        if (*link != target) {
            continue;
        }
        *link = target->next;
        if (hc->partial_count > 0) {
            hc->partial_count--;
        }
        return 1;
    }
    return 0;
}
|
|
|
|
|
|
|
|
|
|
/*
 * Hand `page` back to the cold interface.
 *
 * Uses the interface's retire_page hook with the thread's tiny-heap context;
 * when the hook is absent the page descriptor is simply free()d.
 * `ctx` is currently unused.
 */
static inline void so_page_retire_slow(so_ctx_v3* ctx, uint32_t ci, so_page_v3* page) {
    (void)ctx;

    SmallObjectColdIface iface = smallobject_cold_iface_v1();
    void *heap_ctx = (void*)tiny_heap_ctx_for_thread();

    if (!iface.retire_page) {
        free(page);
        return;
    }
    iface.retire_page(heap_ctx, ci, page);
}
|
|
|
|
|
|
|
|
|
|
/*
 * Return block `ptr` of class `ci` to the page that owns it.
 *
 * - No owning v3 page: count the miss and forward the free to
 *   tiny_heap_free_class_fast().
 * - Otherwise push the block on the page's free list and decrement `used`.
 *   - Page still has live blocks: adopt it as current if the class has none.
 *   - Page became empty: detach it from the partial list, then either
 *     re-push it (room left) or retire it (list full).
 *
 * NOTE(review): in the re-push case the page can end up being both
 * hc->current and a member of the partial list; code that later displaces
 * the current page has to tolerate that.
 */
static inline void so_free_fast(so_ctx_v3* ctx, uint32_t ci, void* ptr) {
    so_class_v3 *cls = &ctx->cls[ci];
    so_page_v3 *owner = so_page_of(cls, ptr);
    const uint8_t stat_ci = (uint8_t)ci;

    if (owner == NULL) {
        so_v3_record_page_of_fail(stat_ci);
        so_v3_record_free_fallback(stat_ci);
        tiny_heap_free_class_fast(tiny_heap_ctx_for_thread(), (int)ci, ptr);
        return;
    }

    /* Intrusive push: the freed block's first word becomes the list link. */
    *(void**)ptr = owner->freelist;
    owner->freelist = ptr;
    if (owner->used > 0) {
        owner->used--;
    }

    if (owner->used != 0) {
        if (cls->current == NULL) {
            cls->current = owner;
            so_v3_record_free_current(stat_ci);
        }
        return;
    }

    /* Page is now empty. */
    (void)so_unlink_partial(cls, owner);

    if (cls->partial_count >= cls->max_partial_pages) {
        if (cls->current == owner) {
            cls->current = NULL;
        }
        so_v3_record_free_retire(stat_ci);
        so_page_retire_slow(ctx, ci, owner);
        return;
    }

    so_page_push_partial(cls, owner);
    so_v3_record_free_partial(stat_ci);
    if (cls->current == NULL) {
        cls->current = owner;
        so_v3_record_free_current(stat_ci);
    }
}
|
|
|
|
|
|
|
|
|
|
/*
 * Obtain a fresh page for class `ci` from the cold interface and install it
 * as the class's current page.
 *
 * The page is rejected (returned via retire_page, or free()d when that hook
 * is absent) if it has no base, zero capacity, or its free list cannot be
 * built. A zero block_size is filled in from tiny_stride_for_class().
 * The page that was current before is pushed to the partial list when there
 * is room and retired otherwise.
 *
 * Returns the installed page, or NULL on any failure.
 */
static inline so_page_v3* so_alloc_refill_slow(so_ctx_v3* ctx, uint32_t ci) {
    SmallObjectColdIface iface = smallobject_cold_iface_v1();
    void *heap_ctx = (void*)tiny_heap_ctx_for_thread();
    if (!iface.refill_page) {
        return NULL;
    }

    so_page_v3 *fresh = iface.refill_page(heap_ctx, ci);
    if (fresh == NULL) {
        return NULL;
    }

    int usable = (fresh->base && fresh->capacity != 0);
    if (usable) {
        if (fresh->block_size == 0) {
            fresh->block_size = (uint32_t)tiny_stride_for_class((int)ci);
        }
        fresh->class_idx = ci;
        fresh->used = 0;
        fresh->freelist = so_build_freelist(fresh);
        usable = (fresh->freelist != NULL);
    }

    if (!usable) {
        /* Single rejection path shared by both validation failures. */
        if (iface.retire_page) {
            iface.retire_page(heap_ctx, ci, fresh);
        } else {
            free(fresh);
        }
        return NULL;
    }

    fresh->next = NULL;

    so_class_v3 *cls = &ctx->cls[ci];
    so_page_v3 *displaced = cls->current;
    if (displaced != NULL) {
        if (cls->partial_count < cls->max_partial_pages) {
            so_page_push_partial(cls, displaced);
        } else {
            so_page_retire_slow(ctx, ci, displaced);
        }
    }
    cls->current = fresh;
    return fresh;
}
|
|
|
|
|
|
|
|
|
|
void* so_alloc(uint32_t class_idx) {
|
|
|
|
|
if (__builtin_expect(class_idx >= SMALLOBJECT_NUM_CLASSES, 0)) {
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
so_v3_record_route_hit((uint8_t)class_idx);
|
|
|
|
|
so_v3_record_alloc_call((uint8_t)class_idx);
|
|
|
|
|
|
|
|
|
|
so_ctx_v3* ctx = so_tls_get();
|
|
|
|
|
void* blk = so_alloc_fast(ctx, class_idx);
|
|
|
|
|
if (blk) return blk;
|
|
|
|
|
|
|
|
|
|
so_page_v3* page = so_alloc_refill_slow(ctx, class_idx);
|
|
|
|
|
if (!page) {
|
|
|
|
|
so_v3_record_alloc_fallback((uint8_t)class_idx);
|
|
|
|
|
return NULL;
|
|
|
|
|
}
|
|
|
|
|
so_v3_record_alloc_refill((uint8_t)class_idx);
|
|
|
|
|
blk = so_alloc_fast(ctx, class_idx);
|
|
|
|
|
if (!blk) {
|
|
|
|
|
so_v3_record_alloc_fallback((uint8_t)class_idx);
|
|
|
|
|
}
|
|
|
|
|
return blk;
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
void so_free(uint32_t class_idx, void* ptr) {
|
|
|
|
|
if (__builtin_expect(class_idx >= SMALLOBJECT_NUM_CLASSES, 0)) {
|
|
|
|
|
return;
|
|
|
|
|
}
|
|
|
|
|
so_v3_record_free_call((uint8_t)class_idx);
|
|
|
|
|
so_ctx_v3* ctx = so_tls_get();
|
|
|
|
|
so_free_fast(ctx, class_idx, ptr);
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-10 09:08:18 +09:00
|
|
|
int smallobject_hotbox_v3_can_own_c7(void* ptr) {
|
|
|
|
|
if (!ptr) return 0;
|
|
|
|
|
if (!small_heap_v3_c7_enabled()) return 0;
|
|
|
|
|
so_ctx_v3* ctx = g_so_ctx_v3;
|
|
|
|
|
if (!ctx) return 0; // TLS 未初期化なら ownership なし
|
|
|
|
|
so_class_v3* hc = &ctx->cls[7];
|
|
|
|
|
so_page_v3* page = so_page_of(hc, ptr);
|
|
|
|
|
if (!page) return 0;
|
|
|
|
|
if (page->class_idx != 7) return 0;
|
|
|
|
|
return 1;
|
|
|
|
|
}
|
|
|
|
|
|
2025-12-09 21:50:15 +09:00
|
|
|
__attribute__((destructor))
|
|
|
|
|
static void so_v3_stats_dump(void) {
|
|
|
|
|
if (!so_v3_stats_enabled()) return;
|
|
|
|
|
for (int i = 0; i < SMALLOBJECT_NUM_CLASSES; i++) {
|
|
|
|
|
so_stats_class_v3* st = &g_so_stats[i];
|
|
|
|
|
uint64_t rh = atomic_load_explicit(&st->route_hits, memory_order_relaxed);
|
|
|
|
|
uint64_t ac = atomic_load_explicit(&st->alloc_calls, memory_order_relaxed);
|
Phase SO-BACKEND-OPT-1: v3 backend 分解&Tiny/ULTRA 完成世代宣言
=== 実装内容 ===
1. v3 backend 詳細計測
- ENV: HAKMEM_SO_V3_STATS で alloc/free パス内訳計測
- 追加 stats: alloc_current_hit, alloc_partial_hit, free_current, free_partial, free_retire
- so_alloc_fast / so_free_fast に埋め込み
- デストラクタで [ALLOC_DETAIL] / [FREE_DETAIL] 出力
2. v3 backend ボトルネック分析完了
- C7-only: alloc_current_hit=99.99%, alloc_refill=0.9%, free_retire=0.1%, page_of_fail=0
- Mixed: alloc_current_hit=100%, alloc_refill=0.85%, free_retire=0.07%, page_of_fail=0
- 結論: v3 ロジック部分(ページ選択・retire)は完全最適化済み
- 残り 5% overhead は内部コスト(header write, memcpy, 分岐)
3. Tiny/ULTRA 層「完成世代」宣言
- 総括ドキュメント作成: docs/analysis/PERF_EXEC_SUMMARY_ULTRA_PHASE_20251211.md
- CURRENT_TASK.md に Phase ULTRA 総括セクション追加
- AGENTS.md に Tiny/ULTRA 完成世代宣言追加
- 最終成果: Mixed 16–1024B = 43.9M ops/s (baseline 30.6M → +43.5%)
=== ボトルネック地図 ===
| 層 | 関数 | overhead |
|-----|------|----------|
| Front | malloc/free dispatcher | ~40–45% |
| ULTRA | C4–C7 alloc/free/refill | ~12% |
| v3 backend | so_alloc/so_free | ~5% |
| mid/pool | hak_super_lookup | 3–5% |
=== フェーズ履歴(Phase ULTRA cycle) ===
- Phase PERF-ULTRA-FREE-OPT-1: C4–C7 ULTRA統合 → +9.3%
- Phase REFACTOR: Code quality (60行削減)
- Phase PERF-ULTRA-REFILL-OPT-1a/1b: C7 ULTRA refill最適化 → +11.1%
- Phase SO-BACKEND-OPT-1: v3 backend分解 → 設計限界確認
=== 次フェーズ(独立ライン) ===
1. Phase SO-BACKEND-OPT-2: v3 header write削減 (1-2%)
2. Headerless/v6系: out-of-band header (1-2%)
3. mid/pool v3新設計: C6-heavy 10M → 20–25M
本フェーズでTiny/ULTRA層は「完成世代」として基盤固定。
今後の大きい変更はHeaderless/mid系の独立ラインで検討。
🤖 Generated with Claude Code
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 22:45:14 +09:00
|
|
|
uint64_t ach = atomic_load_explicit(&st->alloc_current_hit, memory_order_relaxed);
|
|
|
|
|
uint64_t aph = atomic_load_explicit(&st->alloc_partial_hit, memory_order_relaxed);
|
2025-12-09 21:50:15 +09:00
|
|
|
uint64_t ar = atomic_load_explicit(&st->alloc_refill, memory_order_relaxed);
|
|
|
|
|
uint64_t afb = atomic_load_explicit(&st->alloc_fallback_v1, memory_order_relaxed);
|
|
|
|
|
uint64_t fc = atomic_load_explicit(&st->free_calls, memory_order_relaxed);
|
Phase SO-BACKEND-OPT-1: v3 backend 分解&Tiny/ULTRA 完成世代宣言
=== 実装内容 ===
1. v3 backend 詳細計測
- ENV: HAKMEM_SO_V3_STATS で alloc/free パス内訳計測
- 追加 stats: alloc_current_hit, alloc_partial_hit, free_current, free_partial, free_retire
- so_alloc_fast / so_free_fast に埋め込み
- デストラクタで [ALLOC_DETAIL] / [FREE_DETAIL] 出力
2. v3 backend ボトルネック分析完了
- C7-only: alloc_current_hit=99.99%, alloc_refill=0.9%, free_retire=0.1%, page_of_fail=0
- Mixed: alloc_current_hit=100%, alloc_refill=0.85%, free_retire=0.07%, page_of_fail=0
- 結論: v3 ロジック部分(ページ選択・retire)は完全最適化済み
- 残り 5% overhead は内部コスト(header write, memcpy, 分岐)
3. Tiny/ULTRA 層「完成世代」宣言
- 総括ドキュメント作成: docs/analysis/PERF_EXEC_SUMMARY_ULTRA_PHASE_20251211.md
- CURRENT_TASK.md に Phase ULTRA 総括セクション追加
- AGENTS.md に Tiny/ULTRA 完成世代宣言追加
- 最終成果: Mixed 16–1024B = 43.9M ops/s (baseline 30.6M → +43.5%)
=== ボトルネック地図 ===
| 層 | 関数 | overhead |
|-----|------|----------|
| Front | malloc/free dispatcher | ~40–45% |
| ULTRA | C4–C7 alloc/free/refill | ~12% |
| v3 backend | so_alloc/so_free | ~5% |
| mid/pool | hak_super_lookup | 3–5% |
=== フェーズ履歴(Phase ULTRA cycle) ===
- Phase PERF-ULTRA-FREE-OPT-1: C4–C7 ULTRA統合 → +9.3%
- Phase REFACTOR: Code quality (60行削減)
- Phase PERF-ULTRA-REFILL-OPT-1a/1b: C7 ULTRA refill最適化 → +11.1%
- Phase SO-BACKEND-OPT-1: v3 backend分解 → 設計限界確認
=== 次フェーズ(独立ライン) ===
1. Phase SO-BACKEND-OPT-2: v3 header write削減 (1-2%)
2. Headerless/v6系: out-of-band header (1-2%)
3. mid/pool v3新設計: C6-heavy 10M → 20–25M
本フェーズでTiny/ULTRA層は「完成世代」として基盤固定。
今後の大きい変更はHeaderless/mid系の独立ラインで検討。
🤖 Generated with Claude Code
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 22:45:14 +09:00
|
|
|
uint64_t fcur = atomic_load_explicit(&st->free_current, memory_order_relaxed);
|
|
|
|
|
uint64_t fpar = atomic_load_explicit(&st->free_partial, memory_order_relaxed);
|
|
|
|
|
uint64_t fret = atomic_load_explicit(&st->free_retire, memory_order_relaxed);
|
2025-12-09 21:50:15 +09:00
|
|
|
uint64_t ffb = atomic_load_explicit(&st->free_fallback_v1, memory_order_relaxed);
|
2025-12-10 09:08:18 +09:00
|
|
|
uint64_t pof = atomic_load_explicit(&st->page_of_fail, memory_order_relaxed);
|
Phase SO-BACKEND-OPT-1: v3 backend 分解&Tiny/ULTRA 完成世代宣言
=== 実装内容 ===
1. v3 backend 詳細計測
- ENV: HAKMEM_SO_V3_STATS で alloc/free パス内訳計測
- 追加 stats: alloc_current_hit, alloc_partial_hit, free_current, free_partial, free_retire
- so_alloc_fast / so_free_fast に埋め込み
- デストラクタで [ALLOC_DETAIL] / [FREE_DETAIL] 出力
2. v3 backend ボトルネック分析完了
- C7-only: alloc_current_hit=99.99%, alloc_refill=0.9%, free_retire=0.1%, page_of_fail=0
- Mixed: alloc_current_hit=100%, alloc_refill=0.85%, free_retire=0.07%, page_of_fail=0
- 結論: v3 ロジック部分(ページ選択・retire)は完全最適化済み
- 残り 5% overhead は内部コスト(header write, memcpy, 分岐)
3. Tiny/ULTRA 層「完成世代」宣言
- 総括ドキュメント作成: docs/analysis/PERF_EXEC_SUMMARY_ULTRA_PHASE_20251211.md
- CURRENT_TASK.md に Phase ULTRA 総括セクション追加
- AGENTS.md に Tiny/ULTRA 完成世代宣言追加
- 最終成果: Mixed 16–1024B = 43.9M ops/s (baseline 30.6M → +43.5%)
=== ボトルネック地図 ===
| 層 | 関数 | overhead |
|-----|------|----------|
| Front | malloc/free dispatcher | ~40–45% |
| ULTRA | C4–C7 alloc/free/refill | ~12% |
| v3 backend | so_alloc/so_free | ~5% |
| mid/pool | hak_super_lookup | 3–5% |
=== フェーズ履歴(Phase ULTRA cycle) ===
- Phase PERF-ULTRA-FREE-OPT-1: C4–C7 ULTRA統合 → +9.3%
- Phase REFACTOR: Code quality (60行削減)
- Phase PERF-ULTRA-REFILL-OPT-1a/1b: C7 ULTRA refill最適化 → +11.1%
- Phase SO-BACKEND-OPT-1: v3 backend分解 → 設計限界確認
=== 次フェーズ(独立ライン) ===
1. Phase SO-BACKEND-OPT-2: v3 header write削減 (1-2%)
2. Headerless/v6系: out-of-band header (1-2%)
3. mid/pool v3新設計: C6-heavy 10M → 20–25M
本フェーズでTiny/ULTRA層は「完成世代」として基盤固定。
今後の大きい変更はHeaderless/mid系の独立ラインで検討。
🤖 Generated with Claude Code
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 22:45:14 +09:00
|
|
|
if (rh + ac + afb + fc + ffb + ar + pof + ach + aph + fcur + fpar + fret == 0) continue;
|
|
|
|
|
|
|
|
|
|
// Main stats (basic)
|
2025-12-10 09:08:18 +09:00
|
|
|
fprintf(stderr, "[SMALL_HEAP_V3_STATS] cls=%d route_hits=%llu alloc_calls=%llu alloc_refill=%llu alloc_fb_v1=%llu free_calls=%llu free_fb_v1=%llu page_of_fail=%llu\n",
|
2025-12-09 21:50:15 +09:00
|
|
|
i, (unsigned long long)rh, (unsigned long long)ac,
|
2025-12-10 09:08:18 +09:00
|
|
|
(unsigned long long)ar, (unsigned long long)afb, (unsigned long long)fc,
|
|
|
|
|
(unsigned long long)ffb, (unsigned long long)pof);
|
Phase SO-BACKEND-OPT-1: v3 backend 分解&Tiny/ULTRA 完成世代宣言
=== 実装内容 ===
1. v3 backend 詳細計測
- ENV: HAKMEM_SO_V3_STATS で alloc/free パス内訳計測
- 追加 stats: alloc_current_hit, alloc_partial_hit, free_current, free_partial, free_retire
- so_alloc_fast / so_free_fast に埋め込み
- デストラクタで [ALLOC_DETAIL] / [FREE_DETAIL] 出力
2. v3 backend ボトルネック分析完了
- C7-only: alloc_current_hit=99.99%, alloc_refill=0.9%, free_retire=0.1%, page_of_fail=0
- Mixed: alloc_current_hit=100%, alloc_refill=0.85%, free_retire=0.07%, page_of_fail=0
- 結論: v3 ロジック部分(ページ選択・retire)は完全最適化済み
- 残り 5% overhead は内部コスト(header write, memcpy, 分岐)
3. Tiny/ULTRA 層「完成世代」宣言
- 総括ドキュメント作成: docs/analysis/PERF_EXEC_SUMMARY_ULTRA_PHASE_20251211.md
- CURRENT_TASK.md に Phase ULTRA 総括セクション追加
- AGENTS.md に Tiny/ULTRA 完成世代宣言追加
- 最終成果: Mixed 16–1024B = 43.9M ops/s (baseline 30.6M → +43.5%)
=== ボトルネック地図 ===
| 層 | 関数 | overhead |
|-----|------|----------|
| Front | malloc/free dispatcher | ~40–45% |
| ULTRA | C4–C7 alloc/free/refill | ~12% |
| v3 backend | so_alloc/so_free | ~5% |
| mid/pool | hak_super_lookup | 3–5% |
=== フェーズ履歴(Phase ULTRA cycle) ===
- Phase PERF-ULTRA-FREE-OPT-1: C4–C7 ULTRA統合 → +9.3%
- Phase REFACTOR: Code quality (60行削減)
- Phase PERF-ULTRA-REFILL-OPT-1a/1b: C7 ULTRA refill最適化 → +11.1%
- Phase SO-BACKEND-OPT-1: v3 backend分解 → 設計限界確認
=== 次フェーズ(独立ライン) ===
1. Phase SO-BACKEND-OPT-2: v3 header write削減 (1-2%)
2. Headerless/v6系: out-of-band header (1-2%)
3. mid/pool v3新設計: C6-heavy 10M → 20–25M
本フェーズでTiny/ULTRA層は「完成世代」として基盤固定。
今後の大きい変更はHeaderless/mid系の独立ラインで検討。
🤖 Generated with Claude Code
Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-11 22:45:14 +09:00
|
|
|
|
|
|
|
|
// Detailed alloc path breakdown
|
|
|
|
|
if (ach + aph > 0) {
|
|
|
|
|
fprintf(stderr, " [ALLOC_DETAIL] alloc_current_hit=%llu alloc_partial_hit=%llu\n",
|
|
|
|
|
(unsigned long long)ach, (unsigned long long)aph);
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
|
// Detailed free path breakdown
|
|
|
|
|
if (fcur + fpar + fret > 0) {
|
|
|
|
|
fprintf(stderr, " [FREE_DETAIL] free_current=%llu free_partial=%llu free_retire=%llu\n",
|
|
|
|
|
(unsigned long long)fcur, (unsigned long long)fpar, (unsigned long long)fret);
|
|
|
|
|
}
|
2025-12-09 21:50:15 +09:00
|
|
|
}
|
|
|
|
|
}
|