Major Features: - Debug counter infrastructure for Refill Stage tracking - Free Pipeline counters (ss_local, ss_remote, tls_sll) - Diagnostic counters for early return analysis - Unified larson.sh benchmark runner with profiles - Phase 6-3 regression analysis documentation Bug Fixes: - Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB) - Fix profile variable naming consistency - Add .gitignore patterns for large files Performance: - Phase 6-3: 4.79 M ops/s (has OOM risk) - With SuperSlab: 3.13 M ops/s (+19% improvement) This is a clean repository without large log files. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
12 KiB
12 KiB
ACE Learning Layer - Implementation Plan
Goal: 断片化・巨大WS・reallocの弱点を学習で潰して"つよつよ"にする
優先順位(効果 × 実装容易性)
🔥 Phase 1: 最小実装(1日、最大効果)
Target: 断片化ストレス 3.87 → 10-20 M ops/s
1.1 メトリクス収集基盤(2-3時間)
// core/hakmem_ace_metrics.h
// Snapshot of the allocator metrics consumed by the ACE control loops.
// Fields are grouped by how often they are meant to be refreshed.
typedef struct ACEMetrics {
// Fast metrics (1Hz collection)
uint64_t throughput_ops; // ops/sec
uint64_t p50_lat_ns; // median latency
uint64_t p90_lat_ns; // p90 latency; NOTE(review): ace_check_guards compares this against max_p99_lat_ns and there is no p99 field here
double llc_miss_rate; // LLC misses / loads
uint64_t mutex_wait_ns; // mutex contention time
uint64_t remote_free_backlog[8]; // per size-class
// Slow metrics (30-60s collection)
uint64_t rss_mb; // resident set size
double fragmentation_ratio; // allocated / held; NOTE(review): ace_slow_loop_tick triggers a release when this is > 0.6, i.e. treats higher as worse - confirm the definition
uint32_t partial_empty_slabs; // partially empty slabs
uint32_t arena_contention[4]; // per-arena contention
} ACEMetrics;
// Lightweight collection (no syscalls in fast path)
// Collection entry points writing into *m. Going by the names, the first is
// the 1 Hz tick and the second the 60 s tick; implementations are not shown
// here, so which fields each one fills is to be confirmed.
void ace_metrics_tick_1hz(ACEMetrics* m);
void ace_metrics_tick_60s(ACEMetrics* m);
実装:
- throughput_ops: 既存のTLS countersから集計
- llc_miss_rate: rdpmc使用(軽量)
- mutex_wait_ns: 既存のlockタイミング統計
- remote_free_backlog: Tiny/Mid MTの既存カウンタ
- fragmentation_ratio: Superslab/Mid MTの使用率から計算
- rss_mb: /proc/self/statm(60秒に1回のみ)
1.2 Fast Loop骨格(2-3時間)
// core/hakmem_ace_controller.c
// Tunable parameters adjusted by the ACE loops. The bracketed lists are the
// candidate values intended for each knob.
typedef struct ACEKnobs {
// TLS/Magazine capacity (per class)
uint16_t tls_cap[8]; // [4, 8, 16, 32, 64]
// Bundle width (central freelist transfer)
uint16_t bundle_width; // [16, 32, 64, 128]
// Remote free drain threshold
uint16_t drain_threshold[8]; // [32, 64, 128, 256, 512] (as in DRAIN_THRESHOLD_CANDIDATES)
// Partial release threshold
uint8_t partial_release_pages; // [1, 2, 4, 8]; NOTE(review): ace_slow_loop_tick moves this by +/-1 within 1..8
// Diet factor (applied when a huge working set is detected)
float diet_factor; // [0.5, 0.66, 0.75, 0.9, 1.0] (as in DIET_FACTOR_CANDIDATES)
} ACEKnobs;
// Fast loop (every 0.5-1s)
/*
 * Fast control loop tick.
 *
 * Computes a scalar reward from the fast metrics in *m, applies three
 * reactive rules to the knobs in *k, then hands the reward to the learner.
 *
 * NOTE(review): `threshold` is not declared in this sketch; it has to come
 * from configuration or a file-scope constant before this compiles.
 */
void ace_fast_loop_tick(ACEMetrics* m, ACEKnobs* k) {
    /* Reward = throughput gain minus weighted cache, lock and backlog penalties. */
    const double throughput_gain = 1.0 * normalize(m->throughput_ops);
    const double cache_penalty = 0.3 * m->llc_miss_rate;
    const double lock_penalty = 0.2 * normalize(m->mutex_wait_ns);
    const double backlog_penalty = 0.2 * normalize_backlog(m->remote_free_backlog);
    const double reward =
        throughput_gain - cache_penalty - lock_penalty - backlog_penalty;

    /* Rule 1: a growing remote-free backlog lowers the drain threshold. */
    const int backlog_is_high = max_backlog(m->remote_free_backlog) > 128;
    if (backlog_is_high) {
        lower_drain_threshold(k);
    }

    /* Rule 2: a high LLC miss rate shrinks the TLS caches (diet). */
    const int cache_is_thrashing = m->llc_miss_rate > 0.15;
    if (cache_is_thrashing) {
        apply_diet(k, k->diet_factor);
    }

    /* Rule 3: long mutex waits widen the transfer bundle. */
    const int locks_are_contended = m->mutex_wait_ns > threshold;
    if (locks_are_contended) {
        increase_bundle_width(k);
    }

    /* Feed the observed reward to the learner (UCB1 or simple hill-climb). */
    ucb1_update_knobs(reward, k);
}
実装優先度:
- ✅ TLS capacity調整(最も効果的、既存TINY_TLS_MAG_CAPを動的化)
- ✅ Drain threshold調整(既存POOL_REMOTE_DRAIN_THRESHOLDを動的化)
- ⚠️ Bundle width調整(新規実装必要、後回しOK)
1.3 学習アルゴリズム(1-2時間)
// Simple UCB1 for discrete knobs
typedef struct UCB1Arm {
uint32_t pulls;
double total_reward;
} UCB1Arm;
typedef struct UCB1Bandit {
UCB1Arm arms[5]; // 5 candidates per knob
uint32_t total_pulls;
} UCB1Bandit;
// Select next knob value
int ucb1_select_arm(UCB1Bandit* b) {
double best_ucb = -INFINITY;
int best_arm = 0;
for (int i = 0; i < 5; i++) {
double avg = b->arms[i].total_reward / (b->arms[i].pulls + 1e-6);
double ucb = avg + sqrt(2 * log(b->total_pulls + 1) / (b->arms[i].pulls + 1e-6));
if (ucb > best_ucb) {
best_ucb = ucb;
best_arm = i;
}
}
return best_arm;
}
候補値:
// Candidate values per learned knob. Each table has five entries, the same
// count as UCB1Bandit.arms; presumably the selected arm index is used to pick
// an entry - confirm against the code that applies the knobs.
static const uint16_t TLS_CAP_CANDIDATES[] = {4, 8, 16, 32, 64};
static const uint16_t DRAIN_THRESHOLD_CANDIDATES[] = {32, 64, 128, 256, 512};
// The last entry, 1.0, leaves capacities unchanged when used as a multiplier.
static const float DIET_FACTOR_CANDIDATES[] = {0.5, 0.66, 0.75, 0.9, 1.0};
🚀 Phase 2: 断片化対策(半日)
Target: Fragmentation stress 3.87 → 10-20 M ops/s
2.1 Slow Loop(2-3時間)
// Slow loop (every 30-60s)
/*
 * Slow control loop tick.
 *
 * Under memory pressure (fragmentation ratio above 0.6 or RSS above
 * `threshold`) it lowers the partial-release page threshold by one, down to
 * a floor of 1, and runs a scavenge pass with a 5 ms budget. When the miss
 * rate is below 0.10 and fragmentation below 0.4 it raises the threshold by
 * one, up to a ceiling of 8. In between, nothing changes (hysteresis band).
 *
 * NOTE(review): `threshold` is not declared in this sketch.
 */
void ace_slow_loop_tick(ACEMetrics* m, ACEKnobs* k) {
    const int under_pressure =
        (m->fragmentation_ratio > 0.6) || (m->rss_mb > threshold);

    if (under_pressure) {
        /* A smaller threshold makes more slabs eligible for release. */
        if (k->partial_release_pages > 1) {
            k->partial_release_pages -= 1;
        }
        run_partial_scavenge_budgeted(5);
        return;
    }

    const int calm =
        (m->llc_miss_rate < 0.10) && (m->fragmentation_ratio < 0.4);
    if (!calm) {
        return;
    }

    /* Back off gradually once the system is healthy again. */
    if (k->partial_release_pages < 8) {
        k->partial_release_pages += 1;
    }
}
2.2 Budgeted Scavenge(2-3時間)
// Partial release with time budget
// Walks slabs/superslabs and releases runs of empty pages until the time
// budget is used up.
// max_ms: time budget in milliseconds.
// NOTE(review): the loop header below is pseudo-code, not valid C; the actual
// iteration over slabs is not shown in this sketch.
void run_partial_scavenge_budgeted(uint32_t max_ms) {
uint64_t start_ns = get_ns();
uint64_t budget_ns = max_ms * 1000000ULL; // ms -> ns; the ULL constant keeps the multiply in unsigned long long
// Iterate slabs/superslabs
for (each slab/superslab) {
// Check consecutive empty pages
uint32_t empty_pages = count_consecutive_empty_pages(slab);
// Release only when the empty run reaches the current knob value
if (empty_pages >= g_ace_knobs.partial_release_pages) {
// madvise DONTNEED or munmap
partial_release_pages(slab, empty_pages);
}
// Check budget (tested after the slab is processed, so at least one slab
// is always handled and the overrun is bounded by one slab's work)
if (get_ns() - start_ns > budget_ns) {
break; // Stop, resume next cycle
}
}
}
実装ポイント:
- Superslab: 既存のビットマップから空きページ検出
- Mid MT: セグメントの使用率追跡が必要(新規フィールド追加)
- madvise(DONTNEED)で物理メモリ返却(munmapより軽量)
🎯 Phase 3: 巨大WS対策(半日)
Target: 巨大WS 22 → 30-45 M ops/s
3.1 自動ダイエット(実装済みのknobを動的化)
/*
 * Shrink the per-class TLS capacities in k->tls_cap.
 *
 * Classes 0-2 are scaled by `factor`; classes 3-7 are scaled by the midpoint
 * of `factor` and 1.0, i.e. less aggressively. The scaled value is truncated
 * to an integer and never set below 4.
 */
void apply_diet(ACEKnobs* k, float factor) {
    const float hot_scale = factor;
    const float mild_scale = (factor + 1.0) / 2.0;

    for (int cls = 0; cls < 8; ++cls) {
        const float scale = (cls < 3) ? hot_scale : mild_scale;
        const uint16_t shrunk = (uint16_t)(k->tls_cap[cls] * scale);
        k->tls_cap[cls] = MAX(4, shrunk); // Min capacity = 4
    }
}
/*
 * Gradually undo a diet: grow every per-class TLS capacity in k->tls_cap by
 * about 10% per call, capped at 64.
 *
 * The 10% step is truncated to an integer, which is a no-op for any capacity
 * below 10 (4 * 1.1 = 4.4 -> 4). Since apply_diet can push capacities down
 * to 4, a plain multiply would leave them stuck at the minimum, so each call
 * grows a capacity by at least 1.
 */
void restore_diet(ACEKnobs* k) {
    for (int i = 0; i < 8; i++) {
        const uint32_t current = k->tls_cap[i];

        /* Gradual restore (10% per cycle), truncated toward zero. */
        uint32_t grown = (uint32_t)(current * 1.1);
        if (grown <= current) {
            grown = current + 1; /* guarantee forward progress for small caps */
        }

        /* Max capacity = 64; this also clamps values already above 64. */
        k->tls_cap[i] = (uint16_t)(grown > 64 ? 64 : grown);
    }
}
3.2 LLC Miss監視
// Using rdpmc (lightweight)
/*
 * Returns LLC misses per LLC reference over the interval since the previous
 * call.
 *
 * The previous counter readings live in function-local statics, so every
 * caller shares one baseline. On the first call the baseline is 0, so the
 * result covers everything the counters have accumulated so far. The +1 in
 * the denominator avoids a division by zero when no references were counted.
 */
double read_llc_miss_rate() {
    static uint64_t last_miss_count = 0;
    static uint64_t last_ref_count = 0;

    const uint64_t miss_count = rdpmc(LLC_MISS_EVENT);
    const uint64_t ref_count = rdpmc(LLC_REF_EVENT);

    const uint64_t miss_delta = miss_count - last_miss_count;
    const uint64_t ref_delta_plus_one = ref_count - last_ref_count + 1;

    last_miss_count = miss_count;
    last_ref_count = ref_count;

    return (double)miss_delta / ref_delta_plus_one;
}
🔧 Phase 4: realloc最適化(1日)
Target: 6.6-277ns ブレを 1.3-2x短縮
4.1 In-place拡張
/*
 * Try to satisfy a realloc without moving the block.
 *
 * ptr:      start of the existing block.
 * old_size: current size of the block in bytes.
 * new_size: requested size in bytes.
 *
 * Returns ptr when the request is satisfied in place, or NULL when the
 * caller must fall back to allocate-and-copy.
 *
 * A request that does not grow the block returns ptr unchanged. Without that
 * check `new_size - old_size` wraps around on shrink, and an equal-size
 * request would merge the neighbouring free block for no reason.
 * NOTE(review): a shrink keeps the block at its old size; releasing the tail
 * is not attempted here.
 */
void* hak_realloc_inplace(void* ptr, size_t old_size, size_t new_size) {
    if (ptr == NULL) {
        return NULL; /* nothing to extend */
    }
    if (new_size <= old_size) {
        return ptr; /* already large enough */
    }

    /* Check adjacent free space */
    void* next_block = (char*)ptr + old_size;
    if (!is_free(next_block)) {
        return NULL; /* Fall back to copy */
    }

    const size_t available = get_free_size(next_block);
    const size_t needed = new_size - old_size; /* > 0, cannot wrap here */
    if (available < needed) {
        return NULL; /* Fall back to copy */
    }

    /* Merge and extend in-place */
    merge_free_block(ptr, next_block);
    return ptr; /* No memcpy! */
}
4.2 コピー最適化
/*
 * Realloc by allocating a new block, copying, and freeing the old one.
 *
 * ptr:      existing block.
 * old_size: size of the existing block in bytes.
 * new_size: requested size in bytes.
 *
 * Returns the new block, or NULL if the allocation failed; on failure ptr is
 * left untouched and still owned by the caller, as with realloc.
 *
 * Only min(old_size, new_size) bytes are copied. Copying old_size bytes
 * unconditionally would write past the end of the new block on a shrink.
 *
 * NOTE(review): `k` is not declared in this sketch, and ACEKnobs as shown
 * has no nt_threshold field; the threshold has to be wired in.
 */
void* hak_realloc_copy(void* ptr, size_t old_size, size_t new_size) {
    void* new_ptr = hak_malloc(new_size);
    if (new_ptr == NULL) {
        return NULL; /* keep the original block valid for the caller */
    }

    const size_t copy_size = (old_size < new_size) ? old_size : new_size;

    /* Size-dependent strategy */
    if (copy_size == 0) {
        /* nothing to copy */
    }
    else if (copy_size < 4096) {
        /* Small: direct memcpy */
        memcpy(new_ptr, ptr, copy_size);
    }
    else if (copy_size < k->nt_threshold) {
        /* Medium: prefetch + memcpy */
        __builtin_prefetch(ptr, 0, 3);
        __builtin_prefetch((char*)ptr + 64, 0, 3);
        memcpy(new_ptr, ptr, copy_size);
    }
    else {
        /* Large: non-temporal store */
        memcpy_nt(new_ptr, ptr, copy_size);
    }

    hak_free(ptr);
    return new_ptr;
}
実装順序(推奨)
Day 1: 最小ACE(最大効果)
- ✅ メトリクス収集(throughput, llc_miss, backlog)
- ✅ Fast loop骨格
- ✅ TLS capacity調整(動的化)
- ✅ UCB1学習
- ✅ ON/OFF切替環境変数
Expected: 断片化ケースで即効果(3.9 → 8-12 M ops/s)
Day 2: 断片化対策
- ✅ Slow loop
- ✅ Fragmentation metrics
- ✅ Budgeted scavenge
- ✅ Partial release
Expected: 断片化ケースで 10-20 M ops/s
Day 3: 巨大WS対策
- ✅ LLC miss monitoring
- ✅ Diet adjustment
- ✅ Gradual restore
Expected: 巨大WSで 30-45 M ops/s
Day 4: realloc最適化(オプション)
- ✅ In-place拡張
- ✅ コピー最適化
- ✅ NT threshold学習
Expected: realloc 1.3-2x改善
安全弁(必須)
// Safety limits for ACE.
// NOTE(review): ace_check_guards as written checks only the latency and RSS
// limits; max_cpu_percent is not read there.
typedef struct ACEGuards {
uint64_t max_p99_lat_ns; // upper bound on p99 latency
uint64_t max_rss_mb; // upper bound on RSS
uint32_t max_cpu_percent; // upper bound on CPU share used by ACE
bool enabled; // ON/OFF switch
} ACEGuards;
/*
 * Trigger an emergency stop when a safety limit in *g is exceeded.
 *
 * Does nothing when g->enabled is false. Otherwise the latency limit is
 * checked first and the RSS limit second; at most one emergency stop is
 * raised per call.
 *
 * NOTE(review): the latency check compares the p90 metric against the limit
 * named max_p99_lat_ns; ACEMetrics has no p99 field. Confirm which
 * percentile is intended.
 */
void ace_check_guards(ACEMetrics* m, ACEGuards* g) {
    if (g->enabled) {
        if (m->p90_lat_ns > g->max_p99_lat_ns) {
            /* Latency guard */
            ace_emergency_stop("Latency spike");
        } else if (m->rss_mb > g->max_rss_mb) {
            /* RSS guard */
            ace_emergency_stop("RSS overflow");
        }
    }
}
環境変数制御
# ACE runtime configuration through environment variables.
# NOTE(review): the values below are examples from the plan; whether they are
# also the built-in defaults is not stated.
# Enable/disable ACE
export HAKMEM_ACE_ENABLED=1
# Fast loop interval (ms)
export HAKMEM_ACE_FAST_INTERVAL_MS=500
# Slow loop interval (s)
export HAKMEM_ACE_SLOW_INTERVAL_S=30
# Guards
export HAKMEM_ACE_MAX_P99_LAT_NS=10000000 # 10ms
export HAKMEM_ACE_MAX_RSS_MB=16384 # 16GB
# Debug logging
export HAKMEM_ACE_LOG_LEVEL=1 # 0=off, 1=info, 2=debug
A/B計測
# A/B run: same benchmark with ACE off and on, then compare the throughput lines.
# Baseline (ACE OFF)
HAKMEM_ACE_ENABLED=0 ./bench_fragment_stress > baseline.txt
# ACE ON
HAKMEM_ACE_ENABLED=1 ./bench_fragment_stress > ace_on.txt
# Compare (the <(...) process substitution needs bash/zsh/ksh, not plain POSIX sh)
diff <(grep "ops/sec" baseline.txt) <(grep "ops/sec" ace_on.txt)
期待効果(保守的見積り)
| ワークロード | Before | After (ACE) | 改善率 |
|---|---|---|---|
| 断片化ストレス | 3.87 M ops/s | 10-20 M ops/s | 2.6-5.2x |
| 巨大WS | 22.15 M ops/s | 30-45 M ops/s | 1.4-2.0x |
| realloc重め | 277ns (worst) | 140-210ns | 1.3-2.0x |
| Mid MT (維持) | 111.6 M ops/s | 110-115 M ops/s | ±5% |
ファイル構成
core/
├── hakmem_ace_metrics.h ← NEW (metrics collection)
├── hakmem_ace_metrics.c ← NEW
├── hakmem_ace_controller.h ← NEW (fast/slow loops)
├── hakmem_ace_controller.c ← NEW
├── hakmem_ace_ucb1.h ← NEW (learning)
├── hakmem_ace_ucb1.c ← NEW
├── hakmem_ace_scavenge.h ← NEW (partial release)
├── hakmem_ace_scavenge.c ← NEW
└── hakmem_ace_realloc.h ← NEW (realloc opt, Phase 4)
# Existing files to modify:
core/hakmem_tiny_magazine.c ← Add dynamic TLS capacity
core/hakmem_pool.c ← Add dynamic drain threshold
core/hakmem.c ← Integrate ace_tick() calls
Next Step
どのPhaseから始めますか?
推奨: Phase 1 (Day 1) - 最小実装で最大効果
- メトリクス収集
- Fast loop
- TLS capacity調整
- UCB1学習
このPhase 1のコード(hakmem_ace_metrics.h/.c + hakmem_ace_controller.h/.c)を 今すぐ書きましょうか? 🚀