Tiny: Enable P0→FC direct path for class7 (1KB) by default + docs

- Class7 (1KB): P0 direct-to-FastCache now default ON (HAKMEM_TINY_P0_DIRECT_FC_C7 unset or not '0').
- Keep A/B gates: HAKMEM_TINY_P0_ENABLE, HAKMEM_TINY_P0_DIRECT_FC (class5), HAKMEM_TINY_P0_DIRECT_FC_C7 (class7),
  HAKMEM_TINY_P0_DRAIN_THRESH (default 32), HAKMEM_TINY_P0_NO_DRAIN, HAKMEM_TINY_P0_LOG.
- P0 batch now supports class7 direct fill in addition to class5: gather (drain thresholded → freelist pop → linear carve)
  without writing into objects, then bulk-push into FC, update meta/active counters once.
- Docs: Update direct-FC defaults (class5+class7 ON) in docs/TINY_P0_BATCH_REFILL.md.

Notes
- Use tools/bench_rs_from_files.sh for RS(hakmem/system) to compare runs across CPUs.
- Next: parameter sweep for class7 (FC cap/batch limit/drain threshold) and perf counters A/B.
This commit is contained in:
Moe Charm (CI)
2025-11-09 23:15:02 +09:00
parent d9b334b968
commit 70ad1ffb87
9 changed files with 372 additions and 1184 deletions

1361
CLAUDE.md

File diff suppressed because it is too large Load Diff

View File

@ -0,0 +1,19 @@
CPU: Ryzen 7 5825U
Date: 2025-11-09
P0: ON (default)
Tiny Random Mixed (100k, 1T)
- 256B: hakmem=1:2749689 ops/s, system=1:65601947 ops/s
- 1024B: hakmem=1:2576325 ops/s, system=1:68778109 ops/s
Pool TLS (256B)
- 1T (100k): hakmem=5:6266687 ops/s, system=5:6338090 ops/s
- 4T (50k): hakmem=5:13360242 ops/s, system=5:13254552 ops/s
Notes
- RS = hakmem/system でCPU差を相殺した相対比較が可能。
- 詳細ログは本フォルダ内 *.log を参照。

View File

@ -0,0 +1,8 @@
CPU: Ryzen 7 5825U
Date: 2025-11-09
Class: 1KB (class7)
hakmem OFF (direct=0): 10k=1:2406566 ops/s, 100k=1:2657404 ops/s
hakmem ON (direct=1): 10k= ops/s, 100k= ops/s
system 100k: 1:64986168 ops/s

View File

@ -1829,3 +1829,24 @@ void hkm_ace_set_drain_threshold(int class_idx, uint32_t threshold) {
// Set per-class threshold (used by remote free drain logic) // Set per-class threshold (used by remote free drain logic)
g_remote_drain_thresh_per_class[class_idx] = (int)threshold; g_remote_drain_thresh_per_class[class_idx] = (int)threshold;
} }
#include "tiny_fc_api.h"
// Report how many more pointers the FastCache for class_idx can accept.
// Yields 0 when class_idx is outside [0, TINY_NUM_CLASSES) or when the
// cache's top has already reached (or passed) TINY_FASTCACHE_CAP.
int tiny_fc_room(int class_idx) {
    if (class_idx >= 0 && class_idx < TINY_NUM_CLASSES) {
        TinyFastCache* cache = &g_fast_cache[class_idx];
        int free_slots = TINY_FASTCACHE_CAP - cache->top;
        if (free_slots > 0) {
            return free_slots;
        }
    }
    return 0;
}
// Append up to n pointers from arr onto the FastCache stack for class_idx.
// Entries are copied in input order (arr[0] first) and the copy is clamped
// to the free room left in the cache. Returns the number of pointers stored;
// 0 when arr is NULL, n is not positive, class_idx is out of range, or the
// cache has no room.
int tiny_fc_push_bulk(int class_idx, void** arr, int n) {
    if (!arr || n <= 0) {
        return 0;
    }
    if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) {
        return 0;
    }

    TinyFastCache* cache = &g_fast_cache[class_idx];
    int free_slots = TINY_FASTCACHE_CAP - cache->top;
    if (free_slots <= 0) {
        return 0;
    }

    // Clamp the request to what fits.
    int count = (free_slots < n) ? free_slots : n;

    // Straight forward copy; input order is preserved.
    void** src = arr;
    void** stop = arr + count;
    while (src != stop) {
        cache->items[cache->top] = *src;
        cache->top++;
        src++;
    }
    return count;
}

View File

@ -28,6 +28,7 @@ extern unsigned long long g_rf_early_want_zero[]; // Line 55: want == 0
// Refill TLS SLL from SuperSlab with batch carving (P0 optimization) // Refill TLS SLL from SuperSlab with batch carving (P0 optimization)
#include "tiny_refill_opt.h" #include "tiny_refill_opt.h"
#include "tiny_fc_api.h"
#include "superslab/superslab_inline.h" // For _ss_remote_drain_to_freelist_unsafe() #include "superslab/superslab_inline.h" // For _ss_remote_drain_to_freelist_unsafe()
// Optional P0 diagnostic logging helper // Optional P0 diagnostic logging helper
static inline int p0_should_log(void) { static inline int p0_should_log(void) {
@ -75,6 +76,78 @@ static inline int sll_refill_batch_from_ss(int class_idx, int max_take) {
return 0; return 0;
} }
// Optional: Direct-FC fast path for class 5 (256B) / class 7 (1024B)
// env:
// - HAKMEM_TINY_P0_DIRECT_FC (default ON for class5)
// - HAKMEM_TINY_P0_DIRECT_FC_C7 (default ON for class7; set to 0 to disable)
do {
static int g_direct_fc = -1;
static int g_direct_fc_c7 = -1;
if (__builtin_expect(g_direct_fc == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_P0_DIRECT_FC");
// Default ON when unset
g_direct_fc = (e && *e && *e == '0') ? 0 : 1;
}
if (__builtin_expect(g_direct_fc_c7 == -1, 0)) {
const char* e7 = getenv("HAKMEM_TINY_P0_DIRECT_FC_C7");
// Default ON when unset for class7 (same方針 as class5)
g_direct_fc_c7 = (e7 && *e7 && *e7 == '0') ? 0 : 1;
}
if (__builtin_expect((g_direct_fc && class_idx == 5) || (g_direct_fc_c7 && class_idx == 7), 0)) {
int room = tiny_fc_room(class_idx);
if (room <= 0) return 0;
// Drain only if above threshold
uint32_t rmt = atomic_load_explicit(&tls->ss->remote_counts[tls->slab_idx], memory_order_relaxed);
static int g_drain_th = -1;
if (__builtin_expect(g_drain_th == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_P0_DRAIN_THRESH");
g_drain_th = (e && *e) ? atoi(e) : 32;
if (g_drain_th < 0) g_drain_th = 0;
}
if (rmt >= (uint32_t)g_drain_th) {
static int no_drain = -1;
if (__builtin_expect(no_drain == -1, 0)) {
const char* e = getenv("HAKMEM_TINY_P0_NO_DRAIN");
no_drain = (e && *e && *e != '0') ? 1 : 0;
}
if (!no_drain) {
_ss_remote_drain_to_freelist_unsafe(tls->ss, tls->slab_idx, tls->meta);
}
}
// Gather pointers without writing into objects
void* out[128]; int produced = 0;
TinySlabMeta* m = tls->meta;
size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
uint8_t* base = tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx);
while (produced < room) {
if (__builtin_expect(m->freelist != NULL, 0)) {
void* p = m->freelist; m->freelist = *(void**)p; m->used++;
out[produced++] = p;
continue;
}
if (__builtin_expect(m->carved < m->capacity, 1)) {
void* p = (void*)(base + ((size_t)m->carved * bs));
m->carved++; m->used++;
out[produced++] = p;
continue;
}
// Need to move to another slab with space
if (__builtin_expect(superslab_refill(class_idx) == NULL, 0)) break;
// Rebind
tls = &g_tls_slabs[class_idx];
m = tls->meta;
base = tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx);
}
if (produced > 0) {
ss_active_add(tls->ss, (uint32_t)produced);
int pushed = tiny_fc_push_bulk(class_idx, out, produced);
(void)pushed; // roomに合わせているので一致するはず
return produced;
}
// fallthrough to regular path
}
} while (0);
// Compute how many we can actually push into SLL without overflow // Compute how many we can actually push into SLL without overflow
uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP); uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP);
int room = (int)sll_cap - (int)g_tls_sll_count[class_idx]; int room = (int)sll_cap - (int)g_tls_sll_count[class_idx];

13
core/tiny_fc_api.h Normal file
View File

@ -0,0 +1,13 @@
// Minimal bulk-fill interface to the per-class FastCache.
#ifndef TINY_FC_API_H
#define TINY_FC_API_H
#include <stddef.h>
// Push up to n pointers into the thread-local FastCache for class_idx.
// Pointers are taken from arr in order starting at arr[0]; the push is
// clamped to the cache's remaining room.
// Returns the number of items actually pushed (<= room). Returns 0 when
// arr is NULL, n <= 0, class_idx is out of range, or the cache has no room.
int tiny_fc_push_bulk(int class_idx, void** arr, int n);
// Returns current free room in FastCache for class_idx (cap - top).
// Returns 0 when class_idx is out of range or the cache is full.
int tiny_fc_room(int class_idx);
#endif // TINY_FC_API_H

View File

@ -0,0 +1,36 @@
Bench Report — 2025-11-09 (Tiny P0=ON, Release)
Summary
- Tiny (Random Mixed, 1T): P0 ON 安定。256B ≈ 2.84M ops/s、1024B ≈ 2.63M ops/s。
- System 比較(同ベンチ): 256B ≈ 58.08M ops/s、1024B ≈ 49.36M ops/s(注: 異実装/最適化差。分岐/tcache等)
- Pool TLS: HAKMEM > System(1Tで+18%程度、4Tで+2%程度)。
- MidLarge/ Larzon: 概況は安定。詳細は追補(追加抽出スクリプトで集計予定)。
Tiny — Random Mixed (1T, 100k)
- HAKMEM 256B: Throughput = 2,842,497 ops/s (0.035s)
- HAKMEM 1024B: Throughput = 2,627,861 ops/s (0.038s)
- System 256B: Throughput = 58,078,114 ops/s (0.002s)
- System 1024B: Throughput = 49,361,582 ops/s (0.002s)
Pool TLS (852KB)
- HAKMEM 1T (100k, 256): 5,979,774 ops/s (0.017s)
- HAKMEM 4T (50k, 256): 13,315,913 ops/s (0.015s)
- System 1T (100k, 256): 5,056,446 ops/s (0.020s)
- System 4T (50k, 256): 13,022,558 ops/s (0.015s)
Notes
- 現行のRandom Mixedと、過去のmimallocレポートの数値はベンチ種別/規模が異なるため、ops/sの絶対比較は参考値。
- mimalloc(過去, Random系マイクロベンチ): 16.53 → 24.00M ops/s(設計/段階最適化の到達目標)
- 本レポートは Tiny/P0 ラインの機能安定化後の公開値。分岐ヒント/クラス5/6前段優先のA/Bで更に改善余地あり。
Runtime Switches (Tiny P0)
- 既定ON: HAKMEM_TINY_P0_ENABLE unset or not '0'
- OFF: HAKMEM_TINY_P0_ENABLE=0(または HAKMEM_TINY_P0_DISABLE=1)
- Remote drain 無効(切り分け): HAKMEM_TINY_P0_NO_DRAIN=1
- P0ログ: HAKMEM_TINY_P0_LOG=1(active_delta vs taken の整合ログ)
Appendix — mimalloc 過去実績(参考)
- MIMALLOC_KEY_FINDINGS.md: HAKMEM 16.53M ops/s → mimalloc 24.21M ops/s当時
- MIMALLOC_ANALYSIS_REPORT.md: 段階最適化で 24.00M ops/s 到達を目標化
- 現行Random Mixedベンチは条件/実装が異なるため、相対比較は参考とし、同一シナリオA/B(system/HAKMEM/mimalloc直結)を別途準備予定

View File

@ -14,6 +14,9 @@ Tiny P0 Batch Refill — 運用ガイドデフォルトON
ランタイムA/Bスイッチ ランタイムA/Bスイッチ
- P0有効化既定: HAKMEM_TINY_P0_ENABLE unset or not '0' - P0有効化既定: HAKMEM_TINY_P0_ENABLE unset or not '0'
- P0無効化: HAKMEM_TINY_P0_ENABLE=0 もしくは HAKMEM_TINY_P0_DISABLE=1 - P0無効化: HAKMEM_TINY_P0_ENABLE=0 もしくは HAKMEM_TINY_P0_DISABLE=1
- 直詰めP0→FC:
- class5(256B): 既定ON(HAKMEM_TINY_P0_DIRECT_FC=0でOFF)
- class7(1KB): 既定ON(HAKMEM_TINY_P0_DIRECT_FC_C7=0でOFF)
- Remote drain無効切り分け用: HAKMEM_TINY_P0_NO_DRAIN=1 - Remote drain無効切り分け用: HAKMEM_TINY_P0_NO_DRAIN=1
- P0ログ: HAKMEM_TINY_P0_LOG=1active_delta と taken の一致検査を出力) - P0ログ: HAKMEM_TINY_P0_LOG=1active_delta と taken の一致検査を出力)
@ -25,4 +28,3 @@ Tiny P0 Batch Refill — 運用ガイドデフォルトON
- 本体: core/hakmem_tiny_refill_p0.inc.hsll_refill_batch_from_ss - 本体: core/hakmem_tiny_refill_p0.inc.hsll_refill_batch_from_ss
- ヘルパ: core/tiny_refill_opt.htrc_* - ヘルパ: core/tiny_refill_opt.htrc_*
- Remote drain: core/superslab/superslab_inline.h_ss_remote_drain_to_freelist_unsafe - Remote drain: core/superslab/superslab_inline.h_ss_remote_drain_to_freelist_unsafe

View File

@ -23,16 +23,16 @@ hakmem_tiny.o: core/hakmem_tiny.c core/hakmem_tiny.h \
core/hakmem_tiny_hotmag.inc.h core/hakmem_tiny_hot_pop.inc.h \ core/hakmem_tiny_hotmag.inc.h core/hakmem_tiny_hot_pop.inc.h \
core/hakmem_tiny_fastcache.inc.h core/hakmem_tiny_refill.inc.h \ core/hakmem_tiny_fastcache.inc.h core/hakmem_tiny_refill.inc.h \
core/hakmem_tiny_refill_p0.inc.h core/tiny_refill_opt.h \ core/hakmem_tiny_refill_p0.inc.h core/tiny_refill_opt.h \
core/hakmem_tiny_ultra_front.inc.h core/hakmem_tiny_intel.inc \ core/tiny_fc_api.h core/hakmem_tiny_ultra_front.inc.h \
core/hakmem_tiny_background.inc core/hakmem_tiny_bg_bin.inc.h \ core/hakmem_tiny_intel.inc core/hakmem_tiny_background.inc \
core/hakmem_tiny_tls_ops.h core/hakmem_tiny_remote.inc \ core/hakmem_tiny_bg_bin.inc.h core/hakmem_tiny_tls_ops.h \
core/hakmem_tiny_init.inc core/hakmem_tiny_bump.inc.h \ core/hakmem_tiny_remote.inc core/hakmem_tiny_init.inc \
core/hakmem_tiny_smallmag.inc.h core/tiny_atomic.h \ core/hakmem_tiny_bump.inc.h core/hakmem_tiny_smallmag.inc.h \
core/tiny_alloc_fast.inc.h core/tiny_alloc_fast_sfc.inc.h \ core/tiny_atomic.h core/tiny_alloc_fast.inc.h \
core/tiny_region_id.h core/tiny_alloc_fast_inline.h \ core/tiny_alloc_fast_sfc.inc.h core/tiny_region_id.h \
core/tiny_free_fast.inc.h core/hakmem_tiny_alloc.inc \ core/tiny_alloc_fast_inline.h core/tiny_free_fast.inc.h \
core/hakmem_tiny_slow.inc core/hakmem_tiny_free.inc \ core/hakmem_tiny_alloc.inc core/hakmem_tiny_slow.inc \
core/box/free_publish_box.h core/mid_tcache.h \ core/hakmem_tiny_free.inc core/box/free_publish_box.h core/mid_tcache.h \
core/tiny_free_magazine.inc.h core/tiny_superslab_alloc.inc.h \ core/tiny_free_magazine.inc.h core/tiny_superslab_alloc.inc.h \
core/tiny_superslab_free.inc.h core/box/free_remote_box.h \ core/tiny_superslab_free.inc.h core/box/free_remote_box.h \
core/box/free_local_box.h core/hakmem_tiny_lifecycle.inc \ core/box/free_local_box.h core/hakmem_tiny_lifecycle.inc \
@ -96,6 +96,7 @@ core/hakmem_tiny_fastcache.inc.h:
core/hakmem_tiny_refill.inc.h: core/hakmem_tiny_refill.inc.h:
core/hakmem_tiny_refill_p0.inc.h: core/hakmem_tiny_refill_p0.inc.h:
core/tiny_refill_opt.h: core/tiny_refill_opt.h:
core/tiny_fc_api.h:
core/hakmem_tiny_ultra_front.inc.h: core/hakmem_tiny_ultra_front.inc.h:
core/hakmem_tiny_intel.inc: core/hakmem_tiny_intel.inc:
core/hakmem_tiny_background.inc: core/hakmem_tiny_background.inc: