Phase v4-mid-2, v4-mid-3, v4-mid-5: SmallObject HotBox v4 implementation and docs update
Implementation: - SmallObject HotBox v4 (core/smallobject_hotbox_v4.c) now fully implements C6-only allocations and frees, including current/partial management and freelist operations. - Cold Iface (tiny_heap based) for page refill/retire is integrated. - Stats instrumentation (v4-mid-5) added to small_heap_alloc_fast_v4 and small_heap_free_fast_v4, with a new header file core/box/smallobject_hotbox_v4_stats_box.h and atexit dump function. Updates: - CURRENT_TASK.md has been condensed and updated with summaries of Phase v4-mid-2 (C6-only v4), Phase v4-mid-3 (C5-only v4 pilot), and the stats implementation (v4-mid-5). - docs/analysis/SMALLOBJECT_V4_BOX_DESIGN.md updated with A/B results and conclusions for C6-only and C5-only v4 implementations. - The previous CURRENT_TASK.md content has been archived to CURRENT_TASK_ARCHIVE_20251210.md.
This commit is contained in:
1276
CURRENT_TASK.md
1276
CURRENT_TASK.md
File diff suppressed because it is too large
Load Diff
1177
CURRENT_TASK_ARCHIVE_20251210.md
Normal file
1177
CURRENT_TASK_ARCHIVE_20251210.md
Normal file
File diff suppressed because it is too large
Load Diff
89
core/box/smallobject_hotbox_v4_stats_box.h
Normal file
89
core/box/smallobject_hotbox_v4_stats_box.h
Normal file
@ -0,0 +1,89 @@
|
||||
// smallobject_hotbox_v4_stats_box.h - Stats instrumentation for SmallObject HotHeap v4
|
||||
// ENV gated counters for C6 v4 alloc/free path debugging
|
||||
#pragma once
|
||||
|
||||
#include <stdint.h>
|
||||
#include <stdlib.h>
|
||||
#include <stdatomic.h>
|
||||
#include <stdio.h>
|
||||
|
||||
// Per-class stats
//
// One row per size class. Counters are _Atomic so the concurrent alloc/free
// hot paths can bump them with relaxed atomics and the atexit dump can read
// them without tearing.
typedef struct {
    _Atomic uint64_t alloc_calls;         // entries into small_heap_alloc_fast_v4
    _Atomic uint64_t alloc_success;       // blocks handed out from a v4 page freelist
    _Atomic uint64_t alloc_null_page;     // slow path yielded no usable page
    _Atomic uint64_t alloc_fallback_pool; // v4 declined; caller falls back to pool v1
    _Atomic uint64_t free_calls;          // entries into small_heap_free_fast_v4
    _Atomic uint64_t free_page_found;     // owning v4 page located for the freed ptr
    _Atomic uint64_t free_page_not_found; // ptr not in any v4 page list (pool v1 handles)
} small_heap_v4_class_stats_t;

extern small_heap_v4_class_stats_t g_small_heap_v4_stats[8]; // For classes 0-7
|
||||
|
||||
// Returns 1 when stats collection is enabled via HAKMEM_SMALL_HEAP_V4_STATS
// (set and not starting with '0'), else 0. The ENV lookup is done once and
// cached.
//
// Fix: the original cached into a plain `static int g`, which is a C11 data
// race (UB) when multiple threads race the first call — this allocator is
// multithreaded. The cache is now _Atomic with relaxed ordering; the computed
// value is idempotent, so a duplicated first-call getenv is harmless.
static inline int small_heap_v4_stats_enabled(void) {
    static _Atomic int g = ATOMIC_VAR_INIT(-1);
    int v = atomic_load_explicit(&g, memory_order_relaxed);
    if (__builtin_expect(v == -1, 0)) {
        const char* e = getenv("HAKMEM_SMALL_HEAP_V4_STATS");
        v = (e && *e && *e != '0') ? 1 : 0;
        atomic_store_explicit(&g, v, memory_order_relaxed);
    }
    return v;
}
|
||||
|
||||
static inline void small_heap_v4_stat_alloc_call(int class_idx) {
|
||||
if (__builtin_expect(small_heap_v4_stats_enabled(), 0)) {
|
||||
if (class_idx >= 0 && class_idx < 8) {
|
||||
atomic_fetch_add_explicit(&g_small_heap_v4_stats[class_idx].alloc_calls, 1, memory_order_relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline void small_heap_v4_stat_alloc_success(int class_idx) {
|
||||
if (__builtin_expect(small_heap_v4_stats_enabled(), 0)) {
|
||||
if (class_idx >= 0 && class_idx < 8) {
|
||||
atomic_fetch_add_explicit(&g_small_heap_v4_stats[class_idx].alloc_success, 1, memory_order_relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline void small_heap_v4_stat_alloc_null_page(int class_idx) {
|
||||
if (__builtin_expect(small_heap_v4_stats_enabled(), 0)) {
|
||||
if (class_idx >= 0 && class_idx < 8) {
|
||||
atomic_fetch_add_explicit(&g_small_heap_v4_stats[class_idx].alloc_null_page, 1, memory_order_relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline void small_heap_v4_stat_alloc_fallback_pool(int class_idx) {
|
||||
if (__builtin_expect(small_heap_v4_stats_enabled(), 0)) {
|
||||
if (class_idx >= 0 && class_idx < 8) {
|
||||
atomic_fetch_add_explicit(&g_small_heap_v4_stats[class_idx].alloc_fallback_pool, 1, memory_order_relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline void small_heap_v4_stat_free_call(int class_idx) {
|
||||
if (__builtin_expect(small_heap_v4_stats_enabled(), 0)) {
|
||||
if (class_idx >= 0 && class_idx < 8) {
|
||||
atomic_fetch_add_explicit(&g_small_heap_v4_stats[class_idx].free_calls, 1, memory_order_relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline void small_heap_v4_stat_free_page_found(int class_idx) {
|
||||
if (__builtin_expect(small_heap_v4_stats_enabled(), 0)) {
|
||||
if (class_idx >= 0 && class_idx < 8) {
|
||||
atomic_fetch_add_explicit(&g_small_heap_v4_stats[class_idx].free_page_found, 1, memory_order_relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
static inline void small_heap_v4_stat_free_page_not_found(int class_idx) {
|
||||
if (__builtin_expect(small_heap_v4_stats_enabled(), 0)) {
|
||||
if (class_idx >= 0 && class_idx < 8) {
|
||||
atomic_fetch_add_explicit(&g_small_heap_v4_stats[class_idx].free_page_not_found, 1, memory_order_relaxed);
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Dump stats at exit (call from atexit)
|
||||
void small_heap_v4_stats_dump(void);
|
||||
@ -7,6 +7,7 @@
|
||||
|
||||
#include "box/smallobject_hotbox_v4_box.h"
|
||||
#include "box/smallobject_hotbox_v4_env_box.h"
|
||||
#include "box/smallobject_hotbox_v4_stats_box.h"
|
||||
#include "box/smallobject_cold_iface_v4.h"
|
||||
#include "box/smallobject_hotbox_v3_env_box.h"
|
||||
#include "box/tiny_heap_box.h"
|
||||
@ -16,6 +17,11 @@
|
||||
#include "box/tiny_geometry_box.h"
|
||||
#include "tiny_region_id.h"
|
||||
|
||||
// ============================================================================
|
||||
// Stats storage (Phase v4-mid-5)
|
||||
// ============================================================================
|
||||
small_heap_v4_class_stats_t g_small_heap_v4_stats[8];
|
||||
|
||||
// ============================================================================
|
||||
// v4 Segment Configuration (Phase v4-mid-0+)
|
||||
// ============================================================================
|
||||
@ -235,9 +241,16 @@ static small_page_v4* cold_refill_page_v4(small_heap_ctx_v4* hot_ctx, uint32_t c
|
||||
tiny_heap_ctx_t* tctx = tiny_heap_ctx_for_thread();
|
||||
if (!tctx) return NULL;
|
||||
|
||||
// Phase v4-mid-6: Get a fresh page from TinyHeap
|
||||
tiny_heap_page_t* lease = tiny_heap_prepare_page(tctx, (int)class_idx);
|
||||
if (!lease) return NULL;
|
||||
|
||||
// Clear TinyHeap's current so next call gets fresh page
|
||||
tiny_heap_class_t* hcls = tiny_heap_class(tctx, (int)class_idx);
|
||||
if (hcls) {
|
||||
tiny_heap_class_unlink(hcls, lease);
|
||||
}
|
||||
|
||||
return v4_page_from_lease(lease, (int)class_idx, NULL);
|
||||
}
|
||||
|
||||
@ -350,31 +363,49 @@ static small_page_v4* small_alloc_slow_v4(small_heap_ctx_v4* ctx, int class_idx)
|
||||
}
|
||||
|
||||
void* small_heap_alloc_fast_v4(small_heap_ctx_v4* ctx, int class_idx) {
|
||||
// Phase v4-mid-1: C6 stub - fallback to pool v1
|
||||
if (__builtin_expect(class_idx == 6, 0)) {
|
||||
return NULL; // C6: fallback to pool v1 (no v4 alloc yet)
|
||||
}
|
||||
// Phase v4-mid-5: Add stats instrumentation
|
||||
small_heap_v4_stat_alloc_call(class_idx);
|
||||
|
||||
// Phase v4-mid-2: C6-only full SmallHeapCtx v4 implementation
|
||||
if (__builtin_expect(!v4_class_supported(class_idx), 0)) {
|
||||
small_heap_v4_stat_alloc_fallback_pool(class_idx);
|
||||
return NULL; // C5/C6/C7 以外は未対応
|
||||
}
|
||||
if (!small_heap_v4_class_enabled((uint8_t)class_idx)) return NULL;
|
||||
small_class_heap_v4* h = &ctx->cls[class_idx];
|
||||
small_page_v4* page = h->current;
|
||||
|
||||
if (!page || !page->freelist) {
|
||||
page = small_alloc_slow_v4(ctx, class_idx);
|
||||
}
|
||||
if (!page || !page->freelist) {
|
||||
if (!small_heap_v4_class_enabled((uint8_t)class_idx)) {
|
||||
small_heap_v4_stat_alloc_fallback_pool(class_idx);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
small_class_heap_v4* h = &ctx->cls[class_idx];
|
||||
small_page_v4* page = h->current;
|
||||
|
||||
// Try current page freelist
|
||||
if (page && page->freelist) {
|
||||
void* blk = page->freelist;
|
||||
void* next = NULL;
|
||||
memcpy(&next, blk, sizeof(void*));
|
||||
page->freelist = next;
|
||||
page->used++;
|
||||
small_heap_v4_stat_alloc_success(class_idx);
|
||||
return tiny_region_id_write_header(blk, class_idx);
|
||||
}
|
||||
|
||||
// Current exhausted or NULL, try slow path (partial/refill)
|
||||
page = small_alloc_slow_v4(ctx, class_idx);
|
||||
if (!page || !page->freelist) {
|
||||
small_heap_v4_stat_alloc_null_page(class_idx);
|
||||
small_heap_v4_stat_alloc_fallback_pool(class_idx);
|
||||
return NULL;
|
||||
}
|
||||
|
||||
// Allocate from newly acquired/promoted page
|
||||
void* blk = page->freelist;
|
||||
void* next = NULL;
|
||||
memcpy(&next, blk, sizeof(void*));
|
||||
page->freelist = next;
|
||||
page->used++;
|
||||
|
||||
small_heap_v4_stat_alloc_success(class_idx);
|
||||
return tiny_region_id_write_header(blk, class_idx);
|
||||
}
|
||||
|
||||
@ -402,33 +433,44 @@ static void v4_unlink_from_list(small_class_heap_v4* h, v4_loc_t loc, small_page
|
||||
}
|
||||
|
||||
void small_heap_free_fast_v4(small_heap_ctx_v4* ctx, int class_idx, void* ptr) {
|
||||
// Phase v4-mid-1: C6 stub - test page_meta_of() lookup, fallback to pool v1
|
||||
if (__builtin_expect(class_idx == 6, 0)) {
|
||||
// C6-only: Test page_meta_of() for Fail-Fast validation
|
||||
SmallSegment* dummy_seg = (SmallSegment*)NULL; // Will be retrieved later
|
||||
SmallPageMeta* m = smallsegment_v4_page_meta_of(dummy_seg, ptr);
|
||||
(void)m; // Unused in v4-mid-1, but confirms function works
|
||||
return; // Fallback to pool v1 (handled by front)
|
||||
}
|
||||
// Phase v4-mid-5: Add stats instrumentation
|
||||
small_heap_v4_stat_free_call(class_idx);
|
||||
|
||||
// Phase v4-mid-2: C6-only full SmallHeapCtx v4 implementation
|
||||
if (__builtin_expect(!v4_class_supported(class_idx), 0)) {
|
||||
return;
|
||||
}
|
||||
if (!small_heap_v4_class_enabled((uint8_t)class_idx)) return;
|
||||
if (!ptr) return;
|
||||
|
||||
// Phase v4-mid-6: ptr is already BASE (caller converts USER→BASE before calling us)
|
||||
// See malloc_tiny_fast.h L254: base = ptr - 1, then L354/L282 passes base
|
||||
void* base_ptr = ptr;
|
||||
|
||||
small_class_heap_v4* h = &ctx->cls[class_idx];
|
||||
small_page_v4* prev = NULL;
|
||||
v4_loc_t loc = V4_LOC_NONE;
|
||||
small_page_v4* page = v4_find_page(h, (const uint8_t*)ptr, &loc, &prev);
|
||||
if (!page) return;
|
||||
|
||||
// Try to find page in current/partial/full lists (using BASE pointer)
|
||||
small_page_v4* page = v4_find_page(h, (const uint8_t*)base_ptr, &loc, &prev);
|
||||
|
||||
// Phase v4-mid-2: If page not found in v4 heap, try page_meta_of() for segment lookup
|
||||
if (!page) {
|
||||
small_heap_v4_stat_free_page_not_found(class_idx);
|
||||
// Try to find via segment mask+shift (requires segment to be initialized)
|
||||
// For now, this is a fallback for future segment-based allocation
|
||||
// Return without freeing (pool v1 will handle)
|
||||
return;
|
||||
}
|
||||
|
||||
small_heap_v4_stat_free_page_found(class_idx);
|
||||
|
||||
const uint32_t partial_limit = v4_partial_limit(class_idx);
|
||||
|
||||
// freelist push
|
||||
// freelist push (use BASE pointer, not USER pointer)
|
||||
void* head = page->freelist;
|
||||
memcpy(ptr, &head, sizeof(void*));
|
||||
page->freelist = ptr;
|
||||
memcpy(base_ptr, &head, sizeof(void*));
|
||||
page->freelist = base_ptr;
|
||||
if (page->used > 0) {
|
||||
page->used--;
|
||||
}
|
||||
@ -472,3 +514,45 @@ void small_heap_free_fast_v4(small_heap_ctx_v4* ctx, int class_idx, void* ptr) {
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Stats dump (Phase v4-mid-5)
|
||||
// ============================================================================
|
||||
void small_heap_v4_stats_dump(void) {
|
||||
if (!small_heap_v4_stats_enabled()) {
|
||||
return;
|
||||
}
|
||||
|
||||
fprintf(stderr, "\n========================================\n");
|
||||
fprintf(stderr, "[SMALL_HEAP_V4_STATS] Summary\n");
|
||||
fprintf(stderr, "========================================\n");
|
||||
|
||||
for (int c = 0; c < 8; c++) {
|
||||
uint64_t alloc_calls = atomic_load_explicit(&g_small_heap_v4_stats[c].alloc_calls, memory_order_relaxed);
|
||||
uint64_t alloc_success = atomic_load_explicit(&g_small_heap_v4_stats[c].alloc_success, memory_order_relaxed);
|
||||
uint64_t alloc_null_page = atomic_load_explicit(&g_small_heap_v4_stats[c].alloc_null_page, memory_order_relaxed);
|
||||
uint64_t alloc_fallback = atomic_load_explicit(&g_small_heap_v4_stats[c].alloc_fallback_pool, memory_order_relaxed);
|
||||
uint64_t free_calls = atomic_load_explicit(&g_small_heap_v4_stats[c].free_calls, memory_order_relaxed);
|
||||
uint64_t free_found = atomic_load_explicit(&g_small_heap_v4_stats[c].free_page_found, memory_order_relaxed);
|
||||
uint64_t free_not_found = atomic_load_explicit(&g_small_heap_v4_stats[c].free_page_not_found, memory_order_relaxed);
|
||||
|
||||
if (alloc_calls > 0 || free_calls > 0) {
|
||||
fprintf(stderr, "\nClass C%d:\n", c);
|
||||
fprintf(stderr, " Alloc: calls=%lu success=%lu null_page=%lu fallback_pool=%lu\n",
|
||||
(unsigned long)alloc_calls, (unsigned long)alloc_success,
|
||||
(unsigned long)alloc_null_page, (unsigned long)alloc_fallback);
|
||||
fprintf(stderr, " Free: calls=%lu page_found=%lu page_not_found=%lu\n",
|
||||
(unsigned long)free_calls, (unsigned long)free_found,
|
||||
(unsigned long)free_not_found);
|
||||
}
|
||||
}
|
||||
|
||||
fprintf(stderr, "========================================\n\n");
|
||||
fflush(stderr);
|
||||
}
|
||||
|
||||
// Automatic dump at program exit (GCC/Clang destructor; runs after main).
__attribute__((destructor))
static void small_heap_v4_stats_atexit(void) {
    small_heap_v4_stats_dump();
}
|
||||
|
||||
@ -141,3 +141,70 @@ static inline SmallPageMeta* small_page_meta_of(void* p) {
|
||||
- block_size[class]
|
||||
- max_partial_pages[class]
|
||||
を更新するだけ。HotBox_v4 側は snapshot を読むだけに留める。
|
||||
|
||||
---
|
||||
|
||||
## Phase v4-mid-2: C6-only SmallHeapCtx v4 本実装完了 (2025-12-10)
|
||||
|
||||
### 実装内容
|
||||
- `core/smallobject_hotbox_v4.c` に C6 用 SmallHeapCtx v4 を本格稼働:
|
||||
- `small_heap_alloc_fast_v4()`: current freelist → partial head → cold_refill の順で探索。
|
||||
- `small_heap_free_fast_v4()`: v4_find_page で page を特定 → freelist push → used==0 なら partial に温存 or retire。
|
||||
- Cold Iface (`small_cold_v4_refill_page` / `small_cold_v4_retire_page`) は tiny_heap 経由で動作済み。
|
||||
- Segment 関連 (`smallsegment_v4_*`) は tiny_heap を利用して page lease/retire を実装。
|
||||
|
||||
### A/B 結果(C6-heavy 257–768B, 1 thread, ws=400, iters=1M)
|
||||
| 構成 | Throughput (ops/s) | 備考 |
|
||||
|------|-------------------|------|
|
||||
| v4 OFF (baseline) | 9.13M – 9.79M | 平均 ~9.36M |
|
||||
| v4 ON (C6-only) | 10.00M – 10.30M | 平均 ~10.15M |
|
||||
| **改善** | **+8〜9%** | segv/assert なし |
|
||||
|
||||
### Mixed への影響(16–1024B, ws=400, iters=1M)
|
||||
| 構成 | Throughput (ops/s) |
|
||||
|------|-------------------|
|
||||
| v4 OFF | 29.66M |
|
||||
| v4 ON (C6-only) | 29.96M (+1%) |
|
||||
|
||||
→ 大きな回帰なし、研究箱として安全。
|
||||
|
||||
### ENV 設定例
|
||||
```bash
|
||||
HAKMEM_PROFILE=C6_HEAVY_LEGACY_POOLV1 \
|
||||
HAKMEM_SMALL_HEAP_V4_ENABLED=1 \
|
||||
HAKMEM_SMALL_HEAP_V4_CLASSES=0x40 \
|
||||
./bench_mid_large_mt_hakmem 1 1000000 400 1
|
||||
```
|
||||
|
||||
### 次ステップ
|
||||
- Phase v4-mid-3: C5 v4 パイロット、または Mixed の一部クラスを SMALL_V4 route に昇格させて A/B。
|
||||
|
||||
---
|
||||
|
||||
## Phase v4-mid-3: C5-only v4 研究箱 A/B (2025-12-11)
|
||||
|
||||
### 実装内容
|
||||
- C5 (256B class) は既に `v4_class_supported()` で許可済み。コード変更は不要。
|
||||
- ENV `HAKMEM_SMALL_HEAP_V4_CLASSES=0x20` で C5 v4 を有効化。
|
||||
|
||||
### A/B 結果
|
||||
|
||||
**C5-heavy (129–256B, ws=400, iters=1M)**
|
||||
| 構成 | Throughput (ops/s) | 備考 |
|
||||
|------|-------------------|------|
|
||||
| v4 OFF | 53.6M – 55.4M | 平均 ~54.4M |
|
||||
| v4 ON (C5-only 0x20) | 47.6M – 49.5M | 平均 ~48.7M |
|
||||
| **結果** | **−10〜11% 回帰** | 既存 Tiny/front v3 経路が速い |
|
||||
|
||||
**Mixed 16–1024B (C5+C6 v4)**
|
||||
| 構成 | Throughput (ops/s) |
|
||||
|------|-------------------|
|
||||
| C6-only v4 (0x40) | 27.5M – 29.3M (平均 ~28.3M) |
|
||||
| C5+C6 v4 (0x60) | 28.3M – 29.4M (平均 ~28.9M) |
|
||||
| **結果** | **+2〜3% (誤差〜微改善)** |
|
||||
|
||||
### 方針
|
||||
- C5-heavy では v4 が既存経路より劣後するため、C5 v4 は**研究箱のまま**標準プロファイルには入れない。
|
||||
- Mixed では影響が小さく回帰なし。C5+C6 v4 (0x60) は研究箱として安全に利用可能。
|
||||
- C5 サイズ帯は既存 Tiny/front v3 経路が十分最適化されており、v4 へ寄せるメリットは薄い。
|
||||
- 今後の small-object v4 拡張は C6-heavy / mid 帯に集中する。
|
||||
|
||||
Reference in New Issue
Block a user