Tiny: Enable P0→FC direct path for class7 (1KB) by default + docs
- Class7 (1KB): P0 direct-to-FastCache now default ON (HAKMEM_TINY_P0_DIRECT_FC_C7 unset or not '0'). - Keep A/B gates: HAKMEM_TINY_P0_ENABLE, HAKMEM_TINY_P0_DIRECT_FC (class5), HAKMEM_TINY_P0_DIRECT_FC_C7 (class7), HAKMEM_TINY_P0_DRAIN_THRESH (default 32), HAKMEM_TINY_P0_NO_DRAIN, HAKMEM_TINY_P0_LOG. - P0 batch now supports class7 direct fill in addition to class5: gather (drain thresholded → freelist pop → linear carve) without writing into objects, then bulk-push into FC, update meta/active counters once. - Docs: Update direct-FC defaults (class5+class7 ON) in docs/TINY_P0_BATCH_REFILL.md. Notes - Use tools/bench_rs_from_files.sh for RS(hakmem/system) to compare runs across CPUs. - Next: parameter sweep for class7 (FC cap/batch limit/drain threshold) and perf counters A/B.
This commit is contained in:
19
benchmarks/results/2025-11-09_ryzen7-5825U/SUMMARY.md
Normal file
19
benchmarks/results/2025-11-09_ryzen7-5825U/SUMMARY.md
Normal file
@ -0,0 +1,19 @@
|
|||||||
|
CPU: Ryzen 7 5825U
|
||||||
|
Date: 2025-11-09
|
||||||
|
P0: ON (default)
|
||||||
|
|
||||||
|
Tiny Random Mixed (100k, 1T)
|
||||||
|
- 256B: hakmem=1:2749689 ops/s, system=1:65601947 ops/s
|
||||||
|
- 1024B: hakmem=1:2576325 ops/s, system=1:68778109 ops/s
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Pool TLS (256B)
|
||||||
|
- 1T (100k): hakmem=5:6266687 ops/s, system=5:6338090 ops/s
|
||||||
|
- 4T (50k): hakmem=5:13360242 ops/s, system=5:13254552 ops/s
|
||||||
|
|
||||||
|
|
||||||
|
|
||||||
|
Notes
|
||||||
|
- RS = hakmem/system でCPU差を相殺した相対比較が可能。
|
||||||
|
- 詳細ログは本フォルダ内 *.log を参照。
|
||||||
@ -0,0 +1,8 @@
|
|||||||
|
CPU: Ryzen 7 5825U
|
||||||
|
Date: 2025-11-09
|
||||||
|
Class: 1KB (class7)
|
||||||
|
|
||||||
|
hakmem OFF (direct=0): 10k=1:2406566 ops/s, 100k=1:2657404 ops/s
|
||||||
|
hakmem ON (direct=1): 10k=N/A ops/s, 100k=N/A ops/s (values not recorded in this run)
|
||||||
|
system 100k: 1:64986168 ops/s
|
||||||
|
|
||||||
@ -1829,3 +1829,24 @@ void hkm_ace_set_drain_threshold(int class_idx, uint32_t threshold) {
|
|||||||
// Set per-class threshold (used by remote free drain logic)
|
// Set per-class threshold (used by remote free drain logic)
|
||||||
g_remote_drain_thresh_per_class[class_idx] = (int)threshold;
|
g_remote_drain_thresh_per_class[class_idx] = (int)threshold;
|
||||||
}
|
}
|
||||||
|
#include "tiny_fc_api.h"
|
||||||
|
// Report how many more pointers the FastCache of class_idx can accept.
// Returns TINY_FASTCACHE_CAP - fc->top, clamped to a minimum of 0.
// An out-of-range class_idx yields 0 instead of touching g_fast_cache.
int tiny_fc_room(int class_idx) {
|
||||||
|
if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) return 0;
|
||||||
|
TinyFastCache* fc = &g_fast_cache[class_idx];
|
||||||
|
int room = TINY_FASTCACHE_CAP - fc->top;
|
||||||
|
return room > 0 ? room : 0;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Append up to n pointers from arr to the FastCache of class_idx.
// Parameters:
//   class_idx - index into g_fast_cache; must be in [0, TINY_NUM_CLASSES).
//   arr       - array of at least n pointers to push.
//   n         - number of pointers offered by the caller.
// Returns the number of pointers actually stored, i.e. min(n, free room).
// Returns 0 without modifying the cache when arr is NULL, n <= 0,
// class_idx is out of range, or the cache has no free room.
int tiny_fc_push_bulk(int class_idx, void** arr, int n) {
|
||||||
|
if (!arr || n <= 0) return 0;
|
||||||
|
if (class_idx < 0 || class_idx >= TINY_NUM_CLASSES) return 0;
|
||||||
|
TinyFastCache* fc = &g_fast_cache[class_idx];
|
||||||
|
int room = TINY_FASTCACHE_CAP - fc->top;
|
||||||
|
if (room <= 0) return 0;
|
||||||
|
int take = n < room ? n : room; // never store more than the remaining room
|
||||||
|
// Copy arr[0..take-1] in order onto the top of fc->items, advancing fc->top
|
||||||
|
for (int i = 0; i < take; i++) {
|
||||||
|
fc->items[fc->top++] = arr[i];
|
||||||
|
}
|
||||||
|
return take;
|
||||||
|
}
|
||||||
|
|||||||
@ -28,6 +28,7 @@ extern unsigned long long g_rf_early_want_zero[]; // Line 55: want == 0
|
|||||||
|
|
||||||
// Refill TLS SLL from SuperSlab with batch carving (P0 optimization)
|
// Refill TLS SLL from SuperSlab with batch carving (P0 optimization)
|
||||||
#include "tiny_refill_opt.h"
|
#include "tiny_refill_opt.h"
|
||||||
|
#include "tiny_fc_api.h"
|
||||||
#include "superslab/superslab_inline.h" // For _ss_remote_drain_to_freelist_unsafe()
|
#include "superslab/superslab_inline.h" // For _ss_remote_drain_to_freelist_unsafe()
|
||||||
// Optional P0 diagnostic logging helper
|
// Optional P0 diagnostic logging helper
|
||||||
static inline int p0_should_log(void) {
|
static inline int p0_should_log(void) {
|
||||||
@ -75,6 +76,78 @@ static inline int sll_refill_batch_from_ss(int class_idx, int max_take) {
|
|||||||
return 0;
|
return 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Optional: Direct-FC fast path for class 5 (256B) / class 7 (1024B)
|
||||||
|
// env:
|
||||||
|
// - HAKMEM_TINY_P0_DIRECT_FC (default ON for class5)
|
||||||
|
// - HAKMEM_TINY_P0_DIRECT_FC_C7 (default ON for class7; a value starting with '0' disables it)
|
||||||
|
do {
|
||||||
|
static int g_direct_fc = -1;
|
||||||
|
static int g_direct_fc_c7 = -1;
|
||||||
|
if (__builtin_expect(g_direct_fc == -1, 0)) {
|
||||||
|
const char* e = getenv("HAKMEM_TINY_P0_DIRECT_FC");
|
||||||
|
// Default ON when unset
|
||||||
|
g_direct_fc = (e && *e && *e == '0') ? 0 : 1;
|
||||||
|
}
|
||||||
|
if (__builtin_expect(g_direct_fc_c7 == -1, 0)) {
|
||||||
|
const char* e7 = getenv("HAKMEM_TINY_P0_DIRECT_FC_C7");
|
||||||
|
// Default ON when unset for class7 (same policy as class5)
|
||||||
|
g_direct_fc_c7 = (e7 && *e7 && *e7 == '0') ? 0 : 1;
|
||||||
|
}
|
||||||
|
if (__builtin_expect((g_direct_fc && class_idx == 5) || (g_direct_fc_c7 && class_idx == 7), 0)) {
|
||||||
|
int room = tiny_fc_room(class_idx);
|
||||||
|
if (room <= 0) return 0;
|
||||||
|
// Drain only if above threshold
|
||||||
|
uint32_t rmt = atomic_load_explicit(&tls->ss->remote_counts[tls->slab_idx], memory_order_relaxed);
|
||||||
|
static int g_drain_th = -1;
|
||||||
|
if (__builtin_expect(g_drain_th == -1, 0)) {
|
||||||
|
const char* e = getenv("HAKMEM_TINY_P0_DRAIN_THRESH");
|
||||||
|
g_drain_th = (e && *e) ? atoi(e) : 32;
|
||||||
|
if (g_drain_th < 0) g_drain_th = 0;
|
||||||
|
}
|
||||||
|
if (rmt >= (uint32_t)g_drain_th) {
|
||||||
|
static int no_drain = -1;
|
||||||
|
if (__builtin_expect(no_drain == -1, 0)) {
|
||||||
|
const char* e = getenv("HAKMEM_TINY_P0_NO_DRAIN");
|
||||||
|
no_drain = (e && *e && *e != '0') ? 1 : 0;
|
||||||
|
}
|
||||||
|
if (!no_drain) {
|
||||||
|
_ss_remote_drain_to_freelist_unsafe(tls->ss, tls->slab_idx, tls->meta);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
// Gather pointers without writing into objects
|
||||||
|
void* out[128]; int produced = 0;
|
||||||
|
TinySlabMeta* m = tls->meta;
|
||||||
|
size_t bs = g_tiny_class_sizes[class_idx] + ((class_idx != 7) ? 1 : 0);
|
||||||
|
uint8_t* base = tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx);
|
||||||
|
while (produced < room) {
|
||||||
|
if (__builtin_expect(m->freelist != NULL, 0)) {
|
||||||
|
void* p = m->freelist; m->freelist = *(void**)p; m->used++;
|
||||||
|
out[produced++] = p;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
if (__builtin_expect(m->carved < m->capacity, 1)) {
|
||||||
|
void* p = (void*)(base + ((size_t)m->carved * bs));
|
||||||
|
m->carved++; m->used++;
|
||||||
|
out[produced++] = p;
|
||||||
|
continue;
|
||||||
|
}
|
||||||
|
// Need to move to another slab with space
|
||||||
|
if (__builtin_expect(superslab_refill(class_idx) == NULL, 0)) break;
|
||||||
|
// Rebind
|
||||||
|
tls = &g_tls_slabs[class_idx];
|
||||||
|
m = tls->meta;
|
||||||
|
base = tls->slab_base ? tls->slab_base : tiny_slab_base_for(tls->ss, tls->slab_idx);
|
||||||
|
}
|
||||||
|
if (produced > 0) {
|
||||||
|
ss_active_add(tls->ss, (uint32_t)produced);
|
||||||
|
int pushed = tiny_fc_push_bulk(class_idx, out, produced);
|
||||||
|
(void)pushed; // roomに合わせているので一致するはず
|
||||||
|
return produced;
|
||||||
|
}
|
||||||
|
// fallthrough to regular path
|
||||||
|
}
|
||||||
|
} while (0);
|
||||||
|
|
||||||
// Compute how many we can actually push into SLL without overflow
|
// Compute how many we can actually push into SLL without overflow
|
||||||
uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP);
|
uint32_t sll_cap = sll_cap_for_class(class_idx, (uint32_t)TINY_TLS_MAG_CAP);
|
||||||
int room = (int)sll_cap - (int)g_tls_sll_count[class_idx];
|
int room = (int)sll_cap - (int)g_tls_sll_count[class_idx];
|
||||||
|
|||||||
13
core/tiny_fc_api.h
Normal file
13
core/tiny_fc_api.h
Normal file
@ -0,0 +1,13 @@
|
|||||||
|
#ifndef TINY_FC_API_H
|
||||||
|
#define TINY_FC_API_H
|
||||||
|
#include <stddef.h>
|
||||||
|
|
||||||
|
// Push up to n pointers into the thread-local FastCache for class_idx.
|
||||||
|
// Returns the number of items actually pushed (min(n, room)); 0 if arr is NULL, n <= 0, class_idx is out of range, or there is no room.
|
||||||
|
int tiny_fc_push_bulk(int class_idx, void** arr, int n);
|
||||||
|
|
||||||
|
// Returns current free room in FastCache for class_idx (cap - top, never negative); 0 if class_idx is out of range.
|
||||||
|
int tiny_fc_room(int class_idx);
|
||||||
|
|
||||||
|
#endif
|
||||||
|
|
||||||
36
docs/BENCH_REPORT_2025_11_09.md
Normal file
36
docs/BENCH_REPORT_2025_11_09.md
Normal file
@ -0,0 +1,36 @@
|
|||||||
|
Bench Report — 2025-11-09 (Tiny P0=ON, Release)
|
||||||
|
|
||||||
|
Summary
|
||||||
|
- Tiny (Random Mixed, 1T): P0 ON 安定。256B ≈ 2.84M ops/s、1024B ≈ 2.63M ops/s。
|
||||||
|
- System 比較(同ベンチ): 256B ≈ 58.08M ops/s、1024B ≈ 49.36M ops/s(注: 異実装/最適化差。分岐/tcache等)。
|
||||||
|
- Pool TLS: HAKMEM > System(1Tで+18%程度、4Tで+2%程度)。
|
||||||
|
- Mid‑Large / Larson: 概況は安定。詳細は追補(追加抽出スクリプトで集計予定)。
|
||||||
|
|
||||||
|
Tiny — Random Mixed (1T, 100k)
|
||||||
|
- HAKMEM 256B: Throughput = 2,842,497 ops/s (0.035s)
|
||||||
|
- HAKMEM 1024B: Throughput = 2,627,861 ops/s (0.038s)
|
||||||
|
- System 256B: Throughput = 58,078,114 ops/s (0.002s)
|
||||||
|
- System 1024B: Throughput = 49,361,582 ops/s (0.002s)
|
||||||
|
|
||||||
|
Pool TLS (8–52KB)
|
||||||
|
- HAKMEM 1T (100k, 256): 5,979,774 ops/s (0.017s)
|
||||||
|
- HAKMEM 4T (50k, 256): 13,315,913 ops/s (0.015s)
|
||||||
|
- System 1T (100k, 256): 5,056,446 ops/s (0.020s)
|
||||||
|
- System 4T (50k, 256): 13,022,558 ops/s (0.015s)
|
||||||
|
|
||||||
|
Notes
|
||||||
|
- 現行のRandom Mixedと、過去のmimallocレポートの数値はベンチ種別/規模が異なるため、ops/sの絶対比較は参考値。
|
||||||
|
- mimalloc(過去, Random系マイクロベンチ): 16.53 → 24.00M ops/s(設計/段階最適化の到達目標)
|
||||||
|
- 本レポートは Tiny/P0 ラインの機能安定化後の公開値。分岐ヒント/クラス5/6前段優先のA/Bで更に改善余地あり。
|
||||||
|
|
||||||
|
Runtime Switches (Tiny P0)
|
||||||
|
- 既定ON: HAKMEM_TINY_P0_ENABLE unset or not '0'
|
||||||
|
- OFF: HAKMEM_TINY_P0_ENABLE=0(または HAKMEM_TINY_P0_DISABLE=1)
|
||||||
|
- Remote drain 無効(切り分け):HAKMEM_TINY_P0_NO_DRAIN=1
|
||||||
|
- P0ログ: HAKMEM_TINY_P0_LOG=1(active_delta vs taken の整合ログ)
|
||||||
|
|
||||||
|
Appendix — mimalloc 過去実績(参考)
|
||||||
|
- MIMALLOC_KEY_FINDINGS.md: HAKMEM 16.53M ops/s → mimalloc 24.21M ops/s(当時)
|
||||||
|
- MIMALLOC_ANALYSIS_REPORT.md: 段階最適化で 24.00M ops/s 到達を目標化
|
||||||
|
- 現行Random Mixedベンチは条件/実装が異なるため、相対比較は参考とし、同一シナリオA/B(system/HAKMEM/mimalloc直結)を別途準備予定
|
||||||
|
|
||||||
@ -14,6 +14,9 @@ Tiny P0 Batch Refill — 運用ガイド(デフォルトON)
|
|||||||
ランタイムA/Bスイッチ
|
ランタイムA/Bスイッチ
|
||||||
- P0有効化(既定): HAKMEM_TINY_P0_ENABLE unset or not '0'
|
- P0有効化(既定): HAKMEM_TINY_P0_ENABLE unset or not '0'
|
||||||
- P0無効化: HAKMEM_TINY_P0_ENABLE=0 もしくは HAKMEM_TINY_P0_DISABLE=1
|
- P0無効化: HAKMEM_TINY_P0_ENABLE=0 もしくは HAKMEM_TINY_P0_DISABLE=1
|
||||||
|
- 直詰め(P0→FC):
|
||||||
|
- class5(256B): 既定ON(HAKMEM_TINY_P0_DIRECT_FC=0でOFF)
|
||||||
|
- class7(1KB): 既定ON(HAKMEM_TINY_P0_DIRECT_FC_C7=0でOFF)
|
||||||
- Remote drain無効(切り分け用): HAKMEM_TINY_P0_NO_DRAIN=1
|
- Remote drain無効(切り分け用): HAKMEM_TINY_P0_NO_DRAIN=1
|
||||||
- P0ログ: HAKMEM_TINY_P0_LOG=1(active_delta と taken の一致検査を出力)
|
- P0ログ: HAKMEM_TINY_P0_LOG=1(active_delta と taken の一致検査を出力)
|
||||||
|
|
||||||
@ -25,4 +28,3 @@ Tiny P0 Batch Refill — 運用ガイド(デフォルトON)
|
|||||||
- 本体: core/hakmem_tiny_refill_p0.inc.h(sll_refill_batch_from_ss)
|
- 本体: core/hakmem_tiny_refill_p0.inc.h(sll_refill_batch_from_ss)
|
||||||
- ヘルパ: core/tiny_refill_opt.h(trc_*)
|
- ヘルパ: core/tiny_refill_opt.h(trc_*)
|
||||||
- Remote drain: core/superslab/superslab_inline.h(_ss_remote_drain_to_freelist_unsafe)
|
- Remote drain: core/superslab/superslab_inline.h(_ss_remote_drain_to_freelist_unsafe)
|
||||||
|
|
||||||
|
|||||||
@ -23,16 +23,16 @@ hakmem_tiny.o: core/hakmem_tiny.c core/hakmem_tiny.h \
|
|||||||
core/hakmem_tiny_hotmag.inc.h core/hakmem_tiny_hot_pop.inc.h \
|
core/hakmem_tiny_hotmag.inc.h core/hakmem_tiny_hot_pop.inc.h \
|
||||||
core/hakmem_tiny_fastcache.inc.h core/hakmem_tiny_refill.inc.h \
|
core/hakmem_tiny_fastcache.inc.h core/hakmem_tiny_refill.inc.h \
|
||||||
core/hakmem_tiny_refill_p0.inc.h core/tiny_refill_opt.h \
|
core/hakmem_tiny_refill_p0.inc.h core/tiny_refill_opt.h \
|
||||||
core/hakmem_tiny_ultra_front.inc.h core/hakmem_tiny_intel.inc \
|
core/tiny_fc_api.h core/hakmem_tiny_ultra_front.inc.h \
|
||||||
core/hakmem_tiny_background.inc core/hakmem_tiny_bg_bin.inc.h \
|
core/hakmem_tiny_intel.inc core/hakmem_tiny_background.inc \
|
||||||
core/hakmem_tiny_tls_ops.h core/hakmem_tiny_remote.inc \
|
core/hakmem_tiny_bg_bin.inc.h core/hakmem_tiny_tls_ops.h \
|
||||||
core/hakmem_tiny_init.inc core/hakmem_tiny_bump.inc.h \
|
core/hakmem_tiny_remote.inc core/hakmem_tiny_init.inc \
|
||||||
core/hakmem_tiny_smallmag.inc.h core/tiny_atomic.h \
|
core/hakmem_tiny_bump.inc.h core/hakmem_tiny_smallmag.inc.h \
|
||||||
core/tiny_alloc_fast.inc.h core/tiny_alloc_fast_sfc.inc.h \
|
core/tiny_atomic.h core/tiny_alloc_fast.inc.h \
|
||||||
core/tiny_region_id.h core/tiny_alloc_fast_inline.h \
|
core/tiny_alloc_fast_sfc.inc.h core/tiny_region_id.h \
|
||||||
core/tiny_free_fast.inc.h core/hakmem_tiny_alloc.inc \
|
core/tiny_alloc_fast_inline.h core/tiny_free_fast.inc.h \
|
||||||
core/hakmem_tiny_slow.inc core/hakmem_tiny_free.inc \
|
core/hakmem_tiny_alloc.inc core/hakmem_tiny_slow.inc \
|
||||||
core/box/free_publish_box.h core/mid_tcache.h \
|
core/hakmem_tiny_free.inc core/box/free_publish_box.h core/mid_tcache.h \
|
||||||
core/tiny_free_magazine.inc.h core/tiny_superslab_alloc.inc.h \
|
core/tiny_free_magazine.inc.h core/tiny_superslab_alloc.inc.h \
|
||||||
core/tiny_superslab_free.inc.h core/box/free_remote_box.h \
|
core/tiny_superslab_free.inc.h core/box/free_remote_box.h \
|
||||||
core/box/free_local_box.h core/hakmem_tiny_lifecycle.inc \
|
core/box/free_local_box.h core/hakmem_tiny_lifecycle.inc \
|
||||||
@ -96,6 +96,7 @@ core/hakmem_tiny_fastcache.inc.h:
|
|||||||
core/hakmem_tiny_refill.inc.h:
|
core/hakmem_tiny_refill.inc.h:
|
||||||
core/hakmem_tiny_refill_p0.inc.h:
|
core/hakmem_tiny_refill_p0.inc.h:
|
||||||
core/tiny_refill_opt.h:
|
core/tiny_refill_opt.h:
|
||||||
|
core/tiny_fc_api.h:
|
||||||
core/hakmem_tiny_ultra_front.inc.h:
|
core/hakmem_tiny_ultra_front.inc.h:
|
||||||
core/hakmem_tiny_intel.inc:
|
core/hakmem_tiny_intel.inc:
|
||||||
core/hakmem_tiny_background.inc:
|
core/hakmem_tiny_background.inc:
|
||||||
|
|||||||
Reference in New Issue
Block a user