Phase v4-mid-2, v4-mid-3, v4-mid-5: SmallObject HotBox v4 implementation and docs update

Implementation:
- SmallObject HotBox v4 (core/smallobject_hotbox_v4.c) now fully implements C6-only allocations and frees, including current/partial management and freelist operations.
- The Cold Iface (tiny_heap-based) for page refill/retire is integrated.
- Stats instrumentation (v4-mid-5) added to small_heap_alloc_fast_v4 and small_heap_free_fast_v4, with a new header file core/box/smallobject_hotbox_v4_stats_box.h and an atexit dump function.

Updates:
- CURRENT_TASK.md has been condensed and updated with summaries of Phase v4-mid-2 (C6-only v4), Phase v4-mid-3 (C5-only v4 pilot), and the stats implementation (v4-mid-5).
- docs/analysis/SMALLOBJECT_V4_BOX_DESIGN.md updated with A/B results and conclusions for C6-only and C5-only v4 implementations.
- The previous CURRENT_TASK.md content has been archived to CURRENT_TASK_ARCHIVE_20251210.md.
Author: Moe Charm (CI)
Date: 2025-12-11 01:01:15 +09:00
Parent: 3b4449d773
Commit: dd974b49c5
5 changed files with 1569 additions and 1182 deletions

File diff suppressed because it is too large

File diff suppressed because it is too large

View File: core/box/smallobject_hotbox_v4_stats_box.h (new file)

@@ -0,0 +1,89 @@
// smallobject_hotbox_v4_stats_box.h - Stats instrumentation for SmallObject HotHeap v4
// ENV gated counters for C6 v4 alloc/free path debugging
#pragma once

#include <stdint.h>
#include <stdlib.h>
#include <stdatomic.h>
#include <stdio.h>

// Per-class stats
typedef struct {
    _Atomic uint64_t alloc_calls;
    _Atomic uint64_t alloc_success;
    _Atomic uint64_t alloc_null_page;
    _Atomic uint64_t alloc_fallback_pool;
    _Atomic uint64_t free_calls;
    _Atomic uint64_t free_page_found;
    _Atomic uint64_t free_page_not_found;
} small_heap_v4_class_stats_t;

extern small_heap_v4_class_stats_t g_small_heap_v4_stats[8]; // For classes 0-7

static inline int small_heap_v4_stats_enabled(void) {
    static int g = -1;
    if (__builtin_expect(g == -1, 0)) {
        const char* e = getenv("HAKMEM_SMALL_HEAP_V4_STATS");
        g = (e && *e && *e != '0') ? 1 : 0;
    }
    return g;
}

static inline void small_heap_v4_stat_alloc_call(int class_idx) {
    if (__builtin_expect(small_heap_v4_stats_enabled(), 0)) {
        if (class_idx >= 0 && class_idx < 8) {
            atomic_fetch_add_explicit(&g_small_heap_v4_stats[class_idx].alloc_calls, 1, memory_order_relaxed);
        }
    }
}

static inline void small_heap_v4_stat_alloc_success(int class_idx) {
    if (__builtin_expect(small_heap_v4_stats_enabled(), 0)) {
        if (class_idx >= 0 && class_idx < 8) {
            atomic_fetch_add_explicit(&g_small_heap_v4_stats[class_idx].alloc_success, 1, memory_order_relaxed);
        }
    }
}

static inline void small_heap_v4_stat_alloc_null_page(int class_idx) {
    if (__builtin_expect(small_heap_v4_stats_enabled(), 0)) {
        if (class_idx >= 0 && class_idx < 8) {
            atomic_fetch_add_explicit(&g_small_heap_v4_stats[class_idx].alloc_null_page, 1, memory_order_relaxed);
        }
    }
}

static inline void small_heap_v4_stat_alloc_fallback_pool(int class_idx) {
    if (__builtin_expect(small_heap_v4_stats_enabled(), 0)) {
        if (class_idx >= 0 && class_idx < 8) {
            atomic_fetch_add_explicit(&g_small_heap_v4_stats[class_idx].alloc_fallback_pool, 1, memory_order_relaxed);
        }
    }
}

static inline void small_heap_v4_stat_free_call(int class_idx) {
    if (__builtin_expect(small_heap_v4_stats_enabled(), 0)) {
        if (class_idx >= 0 && class_idx < 8) {
            atomic_fetch_add_explicit(&g_small_heap_v4_stats[class_idx].free_calls, 1, memory_order_relaxed);
        }
    }
}

static inline void small_heap_v4_stat_free_page_found(int class_idx) {
    if (__builtin_expect(small_heap_v4_stats_enabled(), 0)) {
        if (class_idx >= 0 && class_idx < 8) {
            atomic_fetch_add_explicit(&g_small_heap_v4_stats[class_idx].free_page_found, 1, memory_order_relaxed);
        }
    }
}

static inline void small_heap_v4_stat_free_page_not_found(int class_idx) {
    if (__builtin_expect(small_heap_v4_stats_enabled(), 0)) {
        if (class_idx >= 0 && class_idx < 8) {
            atomic_fetch_add_explicit(&g_small_heap_v4_stats[class_idx].free_page_not_found, 1, memory_order_relaxed);
        }
    }
}

// Dump stats at exit (call from atexit)
void small_heap_v4_stats_dump(void);
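Editor's note: as a usage sketch (not part of the commit), a hypothetical smoke test that drives a few counters and dumps them, assuming it is linked against core/smallobject_hotbox_v4.c, which defines both g_small_heap_v4_stats and small_heap_v4_stats_dump():

```c
// Hypothetical smoke test, not part of this commit.
#include <stdlib.h>
#include "box/smallobject_hotbox_v4_stats_box.h"

int main(void) {
    // The gate latches on the first query, so set the ENV before any stat call.
    setenv("HAKMEM_SMALL_HEAP_V4_STATS", "1", 1);
    small_heap_v4_stat_alloc_call(6);      // one C6 alloc attempt...
    small_heap_v4_stat_alloc_success(6);   // ...that succeeded
    small_heap_v4_stat_free_call(6);
    small_heap_v4_stat_free_page_found(6);
    small_heap_v4_stats_dump();            // prints a "Class C6:" block to stderr
    return 0;
}
```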

View File: core/smallobject_hotbox_v4.c

@@ -7,6 +7,7 @@
#include "box/smallobject_hotbox_v4_box.h"
#include "box/smallobject_hotbox_v4_env_box.h"
#include "box/smallobject_hotbox_v4_stats_box.h"
#include "box/smallobject_cold_iface_v4.h"
#include "box/smallobject_hotbox_v3_env_box.h"
#include "box/tiny_heap_box.h"
@@ -16,6 +17,11 @@
#include "box/tiny_geometry_box.h"
#include "tiny_region_id.h"
// ============================================================================
// Stats storage (Phase v4-mid-5)
// ============================================================================
small_heap_v4_class_stats_t g_small_heap_v4_stats[8];
// ============================================================================
// v4 Segment Configuration (Phase v4-mid-0+)
// ============================================================================
@@ -235,9 +241,16 @@ static small_page_v4* cold_refill_page_v4(small_heap_ctx_v4* hot_ctx, uint32_t c
    tiny_heap_ctx_t* tctx = tiny_heap_ctx_for_thread();
    if (!tctx) return NULL;
    // Phase v4-mid-6: Get a fresh page from TinyHeap
    tiny_heap_page_t* lease = tiny_heap_prepare_page(tctx, (int)class_idx);
    if (!lease) return NULL;
    // Clear TinyHeap's current so next call gets fresh page
    tiny_heap_class_t* hcls = tiny_heap_class(tctx, (int)class_idx);
    if (hcls) {
        tiny_heap_class_unlink(hcls, lease);
    }
    return v4_page_from_lease(lease, (int)class_idx, NULL);
}
@@ -350,31 +363,49 @@ static small_page_v4* small_alloc_slow_v4(small_heap_ctx_v4* ctx, int class_idx)
}

void* small_heap_alloc_fast_v4(small_heap_ctx_v4* ctx, int class_idx) {
    // Phase v4-mid-1: C6 stub - fallback to pool v1
    if (__builtin_expect(class_idx == 6, 0)) {
        return NULL; // C6: fallback to pool v1 (no v4 alloc yet)
    }
    // Phase v4-mid-5: Add stats instrumentation
    small_heap_v4_stat_alloc_call(class_idx);
    // Phase v4-mid-2: C6-only full SmallHeapCtx v4 implementation
    if (__builtin_expect(!v4_class_supported(class_idx), 0)) {
        small_heap_v4_stat_alloc_fallback_pool(class_idx);
        return NULL; // classes other than C5/C6/C7 are unsupported
    }
    if (!small_heap_v4_class_enabled((uint8_t)class_idx)) return NULL;
    small_class_heap_v4* h = &ctx->cls[class_idx];
    small_page_v4* page = h->current;
    if (!page || !page->freelist) {
        page = small_alloc_slow_v4(ctx, class_idx);
    }
    if (!page || !page->freelist) {
    if (!small_heap_v4_class_enabled((uint8_t)class_idx)) {
        small_heap_v4_stat_alloc_fallback_pool(class_idx);
        return NULL;
    }
    small_class_heap_v4* h = &ctx->cls[class_idx];
    small_page_v4* page = h->current;
    // Try current page freelist
    if (page && page->freelist) {
        void* blk = page->freelist;
        void* next = NULL;
        memcpy(&next, blk, sizeof(void*));
        page->freelist = next;
        page->used++;
        small_heap_v4_stat_alloc_success(class_idx);
        return tiny_region_id_write_header(blk, class_idx);
    }
    // Current exhausted or NULL, try slow path (partial/refill)
    page = small_alloc_slow_v4(ctx, class_idx);
    if (!page || !page->freelist) {
        small_heap_v4_stat_alloc_null_page(class_idx);
        small_heap_v4_stat_alloc_fallback_pool(class_idx);
        return NULL;
    }
    // Allocate from newly acquired/promoted page
    void* blk = page->freelist;
    void* next = NULL;
    memcpy(&next, blk, sizeof(void*));
    page->freelist = next;
    page->used++;
    small_heap_v4_stat_alloc_success(class_idx);
    return tiny_region_id_write_header(blk, class_idx);
}
@@ -402,33 +433,44 @@ static void v4_unlink_from_list(small_class_heap_v4* h, v4_loc_t loc, small_page
}

void small_heap_free_fast_v4(small_heap_ctx_v4* ctx, int class_idx, void* ptr) {
    // Phase v4-mid-1: C6 stub - test page_meta_of() lookup, fallback to pool v1
    if (__builtin_expect(class_idx == 6, 0)) {
        // C6-only: Test page_meta_of() for Fail-Fast validation
        SmallSegment* dummy_seg = (SmallSegment*)NULL; // Will be retrieved later
        SmallPageMeta* m = smallsegment_v4_page_meta_of(dummy_seg, ptr);
        (void)m; // Unused in v4-mid-1, but confirms function works
        return; // Fallback to pool v1 (handled by front)
    }
    // Phase v4-mid-5: Add stats instrumentation
    small_heap_v4_stat_free_call(class_idx);
    // Phase v4-mid-2: C6-only full SmallHeapCtx v4 implementation
    if (__builtin_expect(!v4_class_supported(class_idx), 0)) {
        return;
    }
    if (!small_heap_v4_class_enabled((uint8_t)class_idx)) return;
    if (!ptr) return;
    // Phase v4-mid-6: ptr is already BASE (caller converts USER→BASE before calling us)
    // See malloc_tiny_fast.h L254: base = ptr - 1, then L354/L282 passes base
    void* base_ptr = ptr;
    small_class_heap_v4* h = &ctx->cls[class_idx];
    small_page_v4* prev = NULL;
    v4_loc_t loc = V4_LOC_NONE;
    small_page_v4* page = v4_find_page(h, (const uint8_t*)ptr, &loc, &prev);
    if (!page) return;
    // Try to find page in current/partial/full lists (using BASE pointer)
    small_page_v4* page = v4_find_page(h, (const uint8_t*)base_ptr, &loc, &prev);
    // Phase v4-mid-2: If page not found in v4 heap, try page_meta_of() for segment lookup
    if (!page) {
        small_heap_v4_stat_free_page_not_found(class_idx);
        // Try to find via segment mask+shift (requires segment to be initialized)
        // For now, this is a fallback for future segment-based allocation
        // Return without freeing (pool v1 will handle)
        return;
    }
    small_heap_v4_stat_free_page_found(class_idx);
    const uint32_t partial_limit = v4_partial_limit(class_idx);
    // freelist push
    // freelist push (use BASE pointer, not USER pointer)
    void* head = page->freelist;
    memcpy(ptr, &head, sizeof(void*));
    page->freelist = ptr;
    memcpy(base_ptr, &head, sizeof(void*));
    page->freelist = base_ptr;
    if (page->used > 0) {
        page->used--;
    }
@@ -472,3 +514,45 @@ void small_heap_free_fast_v4(small_heap_ctx_v4* ctx, int class_idx, void* ptr) {
        }
    }
}
// ============================================================================
// Stats dump (Phase v4-mid-5)
// ============================================================================
void small_heap_v4_stats_dump(void) {
    if (!small_heap_v4_stats_enabled()) {
        return;
    }
    fprintf(stderr, "\n========================================\n");
    fprintf(stderr, "[SMALL_HEAP_V4_STATS] Summary\n");
    fprintf(stderr, "========================================\n");
    for (int c = 0; c < 8; c++) {
        uint64_t alloc_calls = atomic_load_explicit(&g_small_heap_v4_stats[c].alloc_calls, memory_order_relaxed);
        uint64_t alloc_success = atomic_load_explicit(&g_small_heap_v4_stats[c].alloc_success, memory_order_relaxed);
        uint64_t alloc_null_page = atomic_load_explicit(&g_small_heap_v4_stats[c].alloc_null_page, memory_order_relaxed);
        uint64_t alloc_fallback = atomic_load_explicit(&g_small_heap_v4_stats[c].alloc_fallback_pool, memory_order_relaxed);
        uint64_t free_calls = atomic_load_explicit(&g_small_heap_v4_stats[c].free_calls, memory_order_relaxed);
        uint64_t free_found = atomic_load_explicit(&g_small_heap_v4_stats[c].free_page_found, memory_order_relaxed);
        uint64_t free_not_found = atomic_load_explicit(&g_small_heap_v4_stats[c].free_page_not_found, memory_order_relaxed);
        if (alloc_calls > 0 || free_calls > 0) {
            fprintf(stderr, "\nClass C%d:\n", c);
            fprintf(stderr, " Alloc: calls=%lu success=%lu null_page=%lu fallback_pool=%lu\n",
                    (unsigned long)alloc_calls, (unsigned long)alloc_success,
                    (unsigned long)alloc_null_page, (unsigned long)alloc_fallback);
            fprintf(stderr, " Free: calls=%lu page_found=%lu page_not_found=%lu\n",
                    (unsigned long)free_calls, (unsigned long)free_found,
                    (unsigned long)free_not_found);
        }
    }
    fprintf(stderr, "========================================\n\n");
    fflush(stderr);
}

// Automatic dump at program exit
static void small_heap_v4_stats_atexit(void) __attribute__((destructor));
static void small_heap_v4_stats_atexit(void) {
    small_heap_v4_stats_dump();
}
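Editor's note: a sketch of the USER/BASE convention referenced in the free-path comments above. The one-byte offset follows from the "base = ptr - 1" note; that tiny_region_id_write_header stores the class in that header byte is an assumption, not confirmed by this diff:

```c
#include <stdint.h>

/* Assumed layout: one header byte at BASE, USER pointer = BASE + 1. */
static inline void* demo_user_from_base(void* base, int class_idx) {
    ((uint8_t*)base)[0] = (uint8_t)class_idx; /* header byte at BASE (assumption) */
    return (uint8_t*)base + 1;                /* USER pointer handed to callers */
}

static inline void* demo_base_from_user(void* user) {
    return (uint8_t*)user - 1;                /* what the front does before free_fast */
}
```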

View File: docs/analysis/SMALLOBJECT_V4_BOX_DESIGN.md

@@ -141,3 +141,70 @@ static inline SmallPageMeta* small_page_meta_of(void* p) {
- block_size[class]
- max_partial_pages[class]
Only these are updated; the HotBox_v4 side is limited to reading the snapshot.
---
## Phase v4-mid-2: C6-only SmallHeapCtx v4 full implementation complete (2025-12-10)
### Implementation
- The C6 SmallHeapCtx v4 is now fully operational in `core/smallobject_hotbox_v4.c`:
  - `small_heap_alloc_fast_v4()`: searches current freelist → partial head → cold_refill, in that order (freelist pop modeled in the sketch below).
  - `small_heap_free_fast_v4()`: locates the page via v4_find_page → pushes the block onto the freelist → when used==0, either keeps the page on the partial list or retires it.
- The Cold Iface (`small_cold_v4_refill_page` / `small_cold_v4_retire_page`) already works via tiny_heap.
- The segment layer (`smallsegment_v4_*`) implements page lease/retire on top of tiny_heap.
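For readers of this doc without the diff at hand, a minimal self-contained model of the intrusive freelist pop that both fast paths perform (`demo_page_t` is a stand-in; the real structure is `small_page_v4` in `core/smallobject_hotbox_v4.c`):

```c
#include <string.h>

/* Stand-in type; only the freelist pop is modeled here. */
typedef struct {
    void*    freelist; /* singly linked list threaded through the free blocks */
    unsigned used;
} demo_page_t;

static void* demo_pop_block(demo_page_t* page) {
    if (!page || !page->freelist) return NULL; /* real code falls to small_alloc_slow_v4 */
    void* blk = page->freelist;
    void* next = NULL;
    memcpy(&next, blk, sizeof(void*));  /* the next pointer lives inside the free block */
    page->freelist = next;
    page->used++;
    return blk; /* the real path then calls tiny_region_id_write_header(blk, class_idx) */
}
```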
### A/B results (C6-heavy, 257–768B, 1 thread, ws=400, iters=1M)
| Configuration | Throughput (ops/s) | Notes |
|------|-------------------|------|
| v4 OFF (baseline) | 9.13M–9.79M | avg ~9.36M |
| v4 ON (C6-only) | 10.00M–10.30M | avg ~10.15M |
| **Improvement** | **+8–9%** | no segfaults or asserts |
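Sanity check on the averages: 10.15M / 9.36M ≈ 1.084, i.e. roughly +8.4%, consistent with the +8–9% row above.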
### Impact on Mixed (16–1024B, ws=400, iters=1M)
| Configuration | Throughput (ops/s) |
|------|-------------------|
| v4 OFF | 29.66M |
| v4 ON (C6-only) | 29.96M (+1%) |
→ No major regression; safe as a research box.
### Example ENV configuration
```bash
HAKMEM_PROFILE=C6_HEAVY_LEGACY_POOLV1 \
HAKMEM_SMALL_HEAP_V4_ENABLED=1 \
HAKMEM_SMALL_HEAP_V4_CLASSES=0x40 \
./bench_mid_large_mt_hakmem 1 1000000 400 1
```
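The class mask appears to be a per-class bit field; the decode below is an assumption inferred from the 0x40 / 0x20 / 0x60 values used in this document (bit N gates class CN), not taken from smallobject_hotbox_v4_env_box.h:

```c
/* Assumption: bit N of HAKMEM_SMALL_HEAP_V4_CLASSES gates class CN,
 * matching 0x40 = C6-only, 0x20 = C5-only, 0x60 = C5+C6 used here. */
static int v4_class_bit_set(unsigned mask, int class_idx) {
    return (int)((mask >> class_idx) & 1u);
}
/* v4_class_bit_set(0x40, 6) -> 1, v4_class_bit_set(0x40, 5) -> 0 */
```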
### Next steps
- Phase v4-mid-3: a C5 v4 pilot, or promote some Mixed classes to the SMALL_V4 route and A/B the result.
---
## Phase v4-mid-3: C5-only v4 research-box A/B (2025-12-11)
### Implementation
- C5 (the 256B class) is already permitted by `v4_class_supported()`; no code changes were needed.
- C5 v4 is enabled with ENV `HAKMEM_SMALL_HEAP_V4_CLASSES=0x20`.
### A/B results
**C5-heavy (129–256B, ws=400, iters=1M)**
| Configuration | Throughput (ops/s) | Notes |
|------|-------------------|------|
| v4 OFF | 53.6M–55.4M | avg ~54.4M |
| v4 ON (C5-only 0x20) | 47.6M–49.5M | avg ~48.7M |
| **Result** | **10–11% regression** | the existing Tiny/front v3 path is faster |
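Sanity check: 48.7M / 54.4M ≈ 0.895, i.e. roughly 10.5% below baseline, matching the stated 10–11% regression.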
**Mixed 16–1024B (C5+C6 v4)**
| Configuration | Throughput (ops/s) |
|------|-------------------|
| C6-only v4 (0x40) | 27.5M–29.3M (avg ~28.3M) |
| C5+C6 v4 (0x60) | 28.3M–29.4M (avg ~28.9M) |
| **Result** | **+2–3% (noise to slight improvement)** |
### Direction
- Because v4 loses to the existing path on C5-heavy, C5 v4 **stays a research box** and is not added to the standard profiles.
- On Mixed the impact is small, with no regression; C5+C6 v4 (0x60) remains safe to use as a research box.
- The C5 size band is already well optimized by the existing Tiny/front v3 path, so there is little to gain from moving it to v4.
- Future small-object v4 work will focus on the C6-heavy / mid bands.