Add Page Box layer for C7 class optimization

- Implement tiny_page_box.c/h: per-thread page cache between UC and Shared Pool
- Integrate Page Box into Unified Cache refill path
- Remove legacy SuperSlab implementation (merged into smallmid)
- Add HAKMEM_TINY_PAGE_BOX_CLASSES env var for selective class enabling
- Update bench_random_mixed.c with Page Box statistics

Current status: Implementation safe, no regressions.
Page Box ON/OFF shows minimal difference - pool strategy needs tuning.

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Moe Charm (CI)
2025-12-05 15:31:44 +09:00
parent 2b2b607957
commit 093f362231
16 changed files with 651 additions and 1347 deletions

View File

@ -24,12 +24,22 @@ _Atomic uint64_t g_sp_stage2_lock_acquired_global = 0;
_Atomic uint64_t g_sp_stage3_lock_acquired_global = 0;
_Atomic uint64_t g_sp_alloc_lock_contention_global = 0;
// Per-class lock acquisition statistics (for observing per-Tiny-class lock load)
_Atomic uint64_t g_sp_stage2_lock_acquired_by_class[TINY_NUM_CLASSES_SS] = {0};
_Atomic uint64_t g_sp_stage3_lock_acquired_by_class[TINY_NUM_CLASSES_SS] = {0};
// Check if measurement is enabled (cached)
// Report whether Unified Cache measurement is enabled.
// Reads HAKMEM_MEASURE_UNIFIED_CACHE once (non-empty and not starting
// with '0' means enabled) and caches the answer for all later calls.
// Returns 1 when measurement is on, 0 otherwise.
static inline int sp_measure_enabled(void) {
    static int cached_state = -1; // -1 = env var not yet inspected
    if (__builtin_expect(cached_state != -1, 1)) {
        return cached_state;
    }
    const char* env_val = getenv("HAKMEM_MEASURE_UNIFIED_CACHE");
    cached_state = (env_val != NULL && env_val[0] != '\0' && env_val[0] != '0');
    if (cached_state) {
        // When measurement is ON, also enable per-class stage statistics
        // (Stage1/2/3 hit counts are accumulated into g_sp_stage*_hits).
        extern int g_sp_stage_stats_enabled;
        g_sp_stage_stats_enabled = 1;
    }
    return cached_state;
}
@ -319,8 +329,13 @@ stage2_fallback:
// Performance measurement: count Stage 2 lock acquisitions
if (__builtin_expect(sp_measure_enabled(), 0)) {
atomic_fetch_add_explicit(&g_sp_stage2_lock_acquired_global, 1, memory_order_relaxed);
atomic_fetch_add_explicit(&g_sp_alloc_lock_contention_global, 1, memory_order_relaxed);
atomic_fetch_add_explicit(&g_sp_stage2_lock_acquired_global,
1, memory_order_relaxed);
atomic_fetch_add_explicit(&g_sp_alloc_lock_contention_global,
1, memory_order_relaxed);
atomic_fetch_add_explicit(
&g_sp_stage2_lock_acquired_by_class[class_idx],
1, memory_order_relaxed);
}
// Update SuperSlab metadata under mutex
@ -408,8 +423,13 @@ stage2_scan:
// Performance measurement: count Stage 2 scan lock acquisitions
if (__builtin_expect(sp_measure_enabled(), 0)) {
atomic_fetch_add_explicit(&g_sp_stage2_lock_acquired_global, 1, memory_order_relaxed);
atomic_fetch_add_explicit(&g_sp_alloc_lock_contention_global, 1, memory_order_relaxed);
atomic_fetch_add_explicit(&g_sp_stage2_lock_acquired_global,
1, memory_order_relaxed);
atomic_fetch_add_explicit(&g_sp_alloc_lock_contention_global,
1, memory_order_relaxed);
atomic_fetch_add_explicit(
&g_sp_stage2_lock_acquired_by_class[class_idx],
1, memory_order_relaxed);
}
// Update SuperSlab metadata under mutex
@ -486,8 +506,12 @@ stage2_scan:
// Performance measurement: count Stage 3 lock acquisitions
if (__builtin_expect(sp_measure_enabled(), 0)) {
atomic_fetch_add_explicit(&g_sp_stage3_lock_acquired_global, 1, memory_order_relaxed);
atomic_fetch_add_explicit(&g_sp_alloc_lock_contention_global, 1, memory_order_relaxed);
atomic_fetch_add_explicit(&g_sp_stage3_lock_acquired_global,
1, memory_order_relaxed);
atomic_fetch_add_explicit(&g_sp_alloc_lock_contention_global,
1, memory_order_relaxed);
atomic_fetch_add_explicit(&g_sp_stage3_lock_acquired_by_class[class_idx],
1, memory_order_relaxed);
}
// ========== Stage 3: Get new SuperSlab ==========
@ -619,9 +643,12 @@ void shared_pool_print_measurements(void) {
return; // Measurement disabled
}
uint64_t stage2 = atomic_load_explicit(&g_sp_stage2_lock_acquired_global, memory_order_relaxed);
uint64_t stage3 = atomic_load_explicit(&g_sp_stage3_lock_acquired_global, memory_order_relaxed);
uint64_t total_locks = atomic_load_explicit(&g_sp_alloc_lock_contention_global, memory_order_relaxed);
uint64_t stage2 = atomic_load_explicit(&g_sp_stage2_lock_acquired_global,
memory_order_relaxed);
uint64_t stage3 = atomic_load_explicit(&g_sp_stage3_lock_acquired_global,
memory_order_relaxed);
uint64_t total_locks = atomic_load_explicit(&g_sp_alloc_lock_contention_global,
memory_order_relaxed);
if (total_locks == 0) {
fprintf(stderr, "\n========================================\n");
@ -644,5 +671,27 @@ void shared_pool_print_measurements(void) {
(unsigned long long)stage3, stage3_pct);
fprintf(stderr, "Total Contention: %llu lock acquisitions\n",
(unsigned long long)total_locks);
// Per-class breakdown (Tiny classes 0-7; C5-C7 are of particular interest)
fprintf(stderr, "\nPer-class Shared Pool Locks (Stage2/Stage3):\n");
for (int cls = 0; cls < TINY_NUM_CLASSES_SS; cls++) {
uint64_t s2c = atomic_load_explicit(
&g_sp_stage2_lock_acquired_by_class[cls],
memory_order_relaxed);
uint64_t s3c = atomic_load_explicit(
&g_sp_stage3_lock_acquired_by_class[cls],
memory_order_relaxed);
uint64_t tc = s2c + s3c;
if (tc == 0) {
continue; // ロック取得のないクラスは省略
}
fprintf(stderr,
" C%d: Stage2=%llu Stage3=%llu Total=%llu\n",
cls,
(unsigned long long)s2c,
(unsigned long long)s3c,
(unsigned long long)tc);
}
fprintf(stderr, "========================================\n\n");
}