Major Features: - Debug counter infrastructure for Refill Stage tracking - Free Pipeline counters (ss_local, ss_remote, tls_sll) - Diagnostic counters for early return analysis - Unified larson.sh benchmark runner with profiles - Phase 6-3 regression analysis documentation Bug Fixes: - Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB) - Fix profile variable naming consistency - Add .gitignore patterns for large files Performance: - Phase 6-3: 4.79 M ops/s (has OOM risk) - With SuperSlab: 3.13 M ops/s (+19% improvement) This is a clean repository without large log files. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
436 lines
12 KiB
Markdown
436 lines
12 KiB
Markdown
# ACE Learning Layer - Implementation Plan
|
||
**Goal**: 断片化・巨大WS・reallocの弱点を学習で潰して"つよつよ"にする
|
||
|
||
## 優先順位(効果 × 実装容易性)
|
||
|
||
### 🔥 Phase 1: 最小実装(1日、最大効果)
|
||
**Target**: 断片化ストレス 3.87 → 8-12 M ops/s(Phase 2 完了時に 10-20 M ops/s)
|
||
|
||
#### 1.1 メトリクス収集基盤(2-3時間)
|
||
```c
|
||
// core/hakmem_ace_metrics.h
#ifndef HAKMEM_ACE_METRICS_H
#define HAKMEM_ACE_METRICS_H

#include <stdint.h>

// Array dimensions of the metrics snapshot.
enum {
    ACE_METRICS_NUM_CLASSES = 8,  // entries in remote_free_backlog (per size-class)
    ACE_METRICS_NUM_ARENAS  = 4   // entries in arena_contention (per arena)
};

// Snapshot of the runtime measurements that the ACE control loops read.
typedef struct ACEMetrics {
    // Fast metrics (1Hz collection)
    uint64_t throughput_ops;        // ops/sec
    uint64_t p50_lat_ns;            // median latency
    uint64_t p90_lat_ns;            // p90 latency
    double   llc_miss_rate;         // LLC misses / loads
    uint64_t mutex_wait_ns;         // mutex contention time
    uint64_t remote_free_backlog[ACE_METRICS_NUM_CLASSES];  // per size-class

    // Slow metrics (30-60s collection)
    uint64_t rss_mb;                // resident set size
    // NOTE(review): documented as allocated / held, but ace_slow_loop_tick
    // treats values above 0.6 as "too fragmented" and below 0.4 as healthy,
    // which reads like the opposite direction -- confirm the definition.
    double   fragmentation_ratio;   // allocated / held
    uint32_t partial_empty_slabs;   // partially empty slabs
    uint32_t arena_contention[ACE_METRICS_NUM_ARENAS];      // per-arena contention
} ACEMetrics;

// Lightweight collection (no syscalls in fast path).
// Presumably the 1 Hz tick refreshes the fast metrics and the 60 s tick the
// slow ones; only the prototypes are given here -- confirm in the .c file.
void ace_metrics_tick_1hz(ACEMetrics *m);
void ace_metrics_tick_60s(ACEMetrics *m);

#endif // HAKMEM_ACE_METRICS_H
|
||
```
|
||
|
||
**実装**:
|
||
- `throughput_ops`: 既存のTLS countersから集計
|
||
- `llc_miss_rate`: rdpmc使用(軽量)
|
||
- `mutex_wait_ns`: 既存のlockタイミング統計
|
||
- `remote_free_backlog`: Tiny/Mid MTの既存カウンタ
|
||
- `fragmentation_ratio`: Superslab/Mid MTの使用率から計算
|
||
- `rss_mb`: /proc/self/statm(60秒に1回のみ)
|
||
|
||
#### 1.2 Fast Loop骨格(2-3時間)
|
||
```c
|
||
// core/hakmem_ace_controller.c
#include <stddef.h>
#include <stdint.h>

// Tunable parameters that the ACE loops adjust at runtime.
// The bracketed lists are the candidate values explored by the learner and
// mirror the *_CANDIDATES tables of this plan.
typedef struct ACEKnobs {
    // TLS/Magazine capacity (per class)
    uint16_t tls_cap[8];              // [4, 8, 16, 32, 64]

    // Bundle width (central freelist transfer)
    uint16_t bundle_width;            // [16, 32, 64, 128]

    // Remote free drain threshold (per class)
    uint16_t drain_threshold[8];      // [32, 64, 128, 256, 512]

    // Partial release threshold
    uint8_t  partial_release_pages;   // [1, 2, 4, 8]

    // Diet factor (applied when a huge working set is detected)
    float    diet_factor;             // [0.5, 0.66, 0.75, 0.9, 1.0]

    // Size in bytes from which hak_realloc_copy switches to the
    // non-temporal copy. Appended last so existing positional initialisers
    // keep their meaning. NOTE(review): no candidate values are listed in
    // this plan -- choose them when the NT-threshold learning is specified.
    size_t   nt_threshold;
} ACEKnobs;
|
||
|
||
// Fast loop (every 0.5-1s)
|
||
void ace_fast_loop_tick(ACEMetrics* m, ACEKnobs* k) {
|
||
// Compute reward
|
||
double reward =
|
||
1.0 * normalize(m->throughput_ops)
|
||
- 0.3 * m->llc_miss_rate
|
||
- 0.2 * normalize(m->mutex_wait_ns)
|
||
- 0.2 * normalize_backlog(m->remote_free_backlog);
|
||
|
||
// 1) Remote backlog ↑ → drain threshold ↓
|
||
if (max_backlog(m->remote_free_backlog) > 128) {
|
||
lower_drain_threshold(k);
|
||
}
|
||
|
||
// 2) LLC miss ↑ → TLS capacity ↓ (diet)
|
||
if (m->llc_miss_rate > 0.15) {
|
||
apply_diet(k, k->diet_factor);
|
||
}
|
||
|
||
// 3) Mutex wait ↑ → bundle width ↑
|
||
if (m->mutex_wait_ns > threshold) {
|
||
increase_bundle_width(k);
|
||
}
|
||
|
||
// Update learning (UCB1 or simple hill-climb)
|
||
ucb1_update_knobs(reward, k);
|
||
}
|
||
```
|
||
|
||
**実装優先度**:
|
||
1. ✅ **TLS capacity調整**(最も効果的、既存TINY_TLS_MAG_CAPを動的化)
|
||
2. ✅ **Drain threshold調整**(既存POOL_REMOTE_DRAIN_THRESHOLDを動的化)
|
||
3. ⚠️ **Bundle width調整**(新規実装必要、後回しOK)
|
||
|
||
#### 1.3 学習アルゴリズム(1-2時間)
|
||
```c
|
||
// Simple UCB1 for discrete knobs
|
||
typedef struct UCB1Arm {
|
||
uint32_t pulls;
|
||
double total_reward;
|
||
} UCB1Arm;
|
||
|
||
typedef struct UCB1Bandit {
|
||
UCB1Arm arms[5]; // 5 candidates per knob
|
||
uint32_t total_pulls;
|
||
} UCB1Bandit;
|
||
|
||
// Select next knob value
|
||
int ucb1_select_arm(UCB1Bandit* b) {
|
||
double best_ucb = -INFINITY;
|
||
int best_arm = 0;
|
||
|
||
for (int i = 0; i < 5; i++) {
|
||
double avg = b->arms[i].total_reward / (b->arms[i].pulls + 1e-6);
|
||
double ucb = avg + sqrt(2 * log(b->total_pulls + 1) / (b->arms[i].pulls + 1e-6));
|
||
if (ucb > best_ucb) {
|
||
best_ucb = ucb;
|
||
best_arm = i;
|
||
}
|
||
}
|
||
return best_arm;
|
||
}
|
||
```
|
||
|
||
**候補値**:
|
||
```c
|
||
// Candidate values per knob. Each table has five entries, matching the
// five arms of UCB1Bandit, so an arm index selects one entry.
static const uint16_t TLS_CAP_CANDIDATES[]         = {4, 8, 16, 32, 64};
static const uint16_t DRAIN_THRESHOLD_CANDIDATES[] = {32, 64, 128, 256, 512};
// Float literals carry the f suffix so no double-to-float conversion is implied.
static const float    DIET_FACTOR_CANDIDATES[]     = {0.5f, 0.66f, 0.75f, 0.9f, 1.0f};
|
||
```
|
||
|
||
---
|
||
|
||
### 🚀 Phase 2: 断片化対策(半日)
|
||
**Target**: Fragmentation stress 3.87 → 10-20 M ops/s
|
||
|
||
#### 2.1 Slow Loop(2-3時間)
|
||
```c
|
||
// Slow loop (every 30-60s)
|
||
void ace_slow_loop_tick(ACEMetrics* m, ACEKnobs* k) {
|
||
// Fragmentation ↑ or RSS ↑ → partial release
|
||
if (m->fragmentation_ratio > 0.6 || m->rss_mb > threshold) {
|
||
// Lower threshold to trigger more scavenging
|
||
if (k->partial_release_pages > 1) {
|
||
k->partial_release_pages--;
|
||
}
|
||
|
||
// Run budgeted scavenge (max 5ms)
|
||
run_partial_scavenge_budgeted(5);
|
||
}
|
||
|
||
// RSS stable & low miss → restore thresholds
|
||
else if (m->llc_miss_rate < 0.10 && m->fragmentation_ratio < 0.4) {
|
||
// Raise threshold (hysteresis)
|
||
if (k->partial_release_pages < 8) {
|
||
k->partial_release_pages++;
|
||
}
|
||
}
|
||
}
|
||
```
|
||
|
||
#### 2.2 Budgeted Scavenge(2-3時間)
|
||
```c
|
||
// Partial release with time budget
|
||
void run_partial_scavenge_budgeted(uint32_t max_ms) {
|
||
uint64_t start_ns = get_ns();
|
||
uint64_t budget_ns = max_ms * 1000000ULL;
|
||
|
||
// Iterate slabs/superslabs
|
||
for (each slab/superslab) {
|
||
// Check consecutive empty pages
|
||
uint32_t empty_pages = count_consecutive_empty_pages(slab);
|
||
|
||
if (empty_pages >= g_ace_knobs.partial_release_pages) {
|
||
// madvise DONTNEED or munmap
|
||
partial_release_pages(slab, empty_pages);
|
||
}
|
||
|
||
// Check budget
|
||
if (get_ns() - start_ns > budget_ns) {
|
||
break; // Stop, resume next cycle
|
||
}
|
||
}
|
||
}
|
||
```
|
||
|
||
**実装ポイント**:
|
||
- Superslab: 既存のビットマップから空きページ検出
|
||
- Mid MT: セグメントの使用率追跡が必要(新規フィールド追加)
|
||
- madvise(DONTNEED)で物理メモリ返却(munmapより軽量)
|
||
|
||
---
|
||
|
||
### 🎯 Phase 3: 巨大WS対策(半日)
|
||
**Target**: 巨大WS 22 → 30-45 M ops/s
|
||
|
||
#### 3.1 自動ダイエット(実装済みのknobを動的化)
|
||
```c
|
||
// Scales down all eight entries of k->tls_cap.
// Classes 0-2 (the hot ones) are multiplied by `factor`; the remaining
// classes use the midpoint between `factor` and 1.0, a gentler cut.
// The product is truncated to uint16_t and then passed through
// MAX(4, ...) (minimum capacity 4).
void apply_diet(ACEKnobs* k, float factor) {
    const float hot_scale  = factor;
    const float cold_scale = (factor + 1.0) / 2.0;

    int cls = 0;
    while (cls < 8) {
        float scale = cold_scale;
        if (cls < 3) {
            scale = hot_scale;
        }

        uint16_t shrunk = (uint16_t)(k->tls_cap[cls] * scale);
        k->tls_cap[cls] = MAX(4, shrunk);
        cls++;
    }
}
|
||
|
||
// Gradually undoes a diet: every entry of k->tls_cap grows by 10% per call
// and is capped at 64.
//
// The step is computed in integers and is at least 1. Multiplying by 1.1
// and truncating returns the same value for capacities below 10
// (e.g. 4 * 1.1 = 4.4 -> 4), so a class dieted down to the minimum would
// never recover. For capacities of 10 and above the result equals the
// truncated 10% growth.
void restore_diet(ACEKnobs* k) {
    for (int i = 0; i < 8; i++) {
        unsigned cap = k->tls_cap[i];

        unsigned step = cap / 10;
        if (step == 0) {
            step = 1;   // guarantee progress for small capacities
        }

        cap += step;
        k->tls_cap[i] = (uint16_t)(cap > 64u ? 64u : cap);
    }
}
|
||
```
|
||
|
||
#### 3.2 LLC Miss監視
|
||
```c
|
||
// Using rdpmc (lightweight)
|
||
double read_llc_miss_rate() {
|
||
static uint64_t prev_misses = 0;
|
||
static uint64_t prev_refs = 0;
|
||
|
||
uint64_t misses = rdpmc(LLC_MISS_EVENT);
|
||
uint64_t refs = rdpmc(LLC_REF_EVENT);
|
||
|
||
double rate = (double)(misses - prev_misses) / (refs - prev_refs + 1);
|
||
|
||
prev_misses = misses;
|
||
prev_refs = refs;
|
||
|
||
return rate;
|
||
}
|
||
```
|
||
|
||
---
|
||
|
||
### 🔧 Phase 4: realloc最適化(1日)
|
||
**Target**: 6.6-277ns ブレを 1.3-2x短縮
|
||
|
||
#### 4.1 In-place拡張
|
||
```c
|
||
// Tries to resize a block without moving it.
//
// Returns `ptr` when the block can stay where it is, NULL when the caller
// has to fall back to the copying path.
//   - ptr == NULL: NULL (nothing to extend).
//   - new_size <= old_size: `ptr`; the block is already large enough.
//     Without this case `new_size - old_size` wraps around in size_t and
//     every shrink is pushed onto the copy path.
//   - growing: succeeds only if the block directly behind `ptr` is free and
//     large enough to cover the difference.
void* hak_realloc_inplace(void* ptr, size_t old_size, size_t new_size) {
    if (ptr == NULL) {
        return NULL;
    }
    if (new_size <= old_size) {
        return ptr;
    }

    // Check adjacent free space
    void* next_block = (char*)ptr + old_size;

    if (is_free(next_block)) {
        size_t available = get_free_size(next_block);
        size_t needed = new_size - old_size;   // > 0, checked above

        if (available >= needed) {
            // Merge and extend in-place
            merge_free_block(ptr, next_block);
            return ptr;  // No memcpy!
        }
    }

    return NULL;  // Fall back to copy
}
|
||
```
|
||
|
||
#### 4.2 コピー最適化
|
||
```c
|
||
void* hak_realloc_copy(void* ptr, size_t old_size, size_t new_size) {
|
||
void* new_ptr = hak_malloc(new_size);
|
||
|
||
// Size-dependent strategy
|
||
if (old_size < 4096) {
|
||
// Small: direct memcpy
|
||
memcpy(new_ptr, ptr, old_size);
|
||
}
|
||
else if (old_size < k->nt_threshold) {
|
||
// Medium: prefetch + memcpy
|
||
__builtin_prefetch(ptr, 0, 3);
|
||
__builtin_prefetch((char*)ptr + 64, 0, 3);
|
||
memcpy(new_ptr, ptr, old_size);
|
||
}
|
||
else {
|
||
// Large: non-temporal store
|
||
memcpy_nt(new_ptr, ptr, old_size);
|
||
}
|
||
|
||
hak_free(ptr);
|
||
return new_ptr;
|
||
}
|
||
```
|
||
|
||
---
|
||
|
||
## 実装順序(推奨)
|
||
|
||
### Day 1: 最小ACE(最大効果)
|
||
1. ✅ メトリクス収集(throughput, llc_miss, backlog)
|
||
2. ✅ Fast loop骨格
|
||
3. ✅ TLS capacity調整(動的化)
|
||
4. ✅ UCB1学習
|
||
5. ✅ ON/OFF切替環境変数
|
||
|
||
**Expected**: 断片化ケースで即効果(3.87 → 8-12 M ops/s)
|
||
|
||
### Day 2: 断片化対策
|
||
1. ✅ Slow loop
|
||
2. ✅ Fragmentation metrics
|
||
3. ✅ Budgeted scavenge
|
||
4. ✅ Partial release
|
||
|
||
**Expected**: 断片化ケースで 10-20 M ops/s
|
||
|
||
### Day 3: 巨大WS対策
|
||
1. ✅ LLC miss monitoring
|
||
2. ✅ Diet adjustment
|
||
3. ✅ Gradual restore
|
||
|
||
**Expected**: 巨大WSで 30-45 M ops/s
|
||
|
||
### Day 4: realloc最適化(オプション)
|
||
1. ✅ In-place拡張
|
||
2. ✅ コピー最適化
|
||
3. ✅ NT threshold学習
|
||
|
||
**Expected**: realloc 1.3-2x改善
|
||
|
||
---
|
||
|
||
## 安全弁(必須)
|
||
|
||
```c
|
||
typedef struct ACEGuards {
|
||
uint64_t max_p99_lat_ns; // p99 latency上限
|
||
uint64_t max_rss_mb; // RSS上限
|
||
uint32_t max_cpu_percent; // ACE CPU占有上限
|
||
bool enabled; // ON/OFF switch
|
||
} ACEGuards;
|
||
|
||
void ace_check_guards(ACEMetrics* m, ACEGuards* g) {
|
||
if (!g->enabled) return;
|
||
|
||
// Latency guard
|
||
if (m->p90_lat_ns > g->max_p99_lat_ns) {
|
||
ace_emergency_stop("Latency spike");
|
||
return;
|
||
}
|
||
|
||
// RSS guard
|
||
if (m->rss_mb > g->max_rss_mb) {
|
||
ace_emergency_stop("RSS overflow");
|
||
return;
|
||
}
|
||
}
|
||
```
|
||
|
||
---
|
||
|
||
## 環境変数制御
|
||
|
||
```bash
|
||
# Enable/disable ACE
|
||
export HAKMEM_ACE_ENABLED=1
|
||
|
||
# Fast loop interval (ms)
|
||
export HAKMEM_ACE_FAST_INTERVAL_MS=500
|
||
|
||
# Slow loop interval (s)
|
||
export HAKMEM_ACE_SLOW_INTERVAL_S=30
|
||
|
||
# Guards
|
||
export HAKMEM_ACE_MAX_P99_LAT_NS=10000000 # 10ms
|
||
export HAKMEM_ACE_MAX_RSS_MB=16384 # 16GB
|
||
|
||
# Debug logging
|
||
export HAKMEM_ACE_LOG_LEVEL=1 # 0=off, 1=info, 2=debug
|
||
```
|
||
|
||
---
|
||
|
||
## A/B計測
|
||
|
||
```bash
|
||
# Baseline (ACE OFF)
|
||
HAKMEM_ACE_ENABLED=0 ./bench_fragment_stress > baseline.txt
|
||
|
||
# ACE ON
|
||
HAKMEM_ACE_ENABLED=1 ./bench_fragment_stress > ace_on.txt
|
||
|
||
# Compare
|
||
diff <(grep "ops/sec" baseline.txt) <(grep "ops/sec" ace_on.txt)
|
||
```
|
||
|
||
---
|
||
|
||
## 期待効果(保守的見積り)
|
||
|
||
| ワークロード | Before | After (ACE) | 改善率 |
|
||
|------------|--------|-------------|--------|
|
||
| 断片化ストレス | 3.87 M ops/s | 10-20 M ops/s | **2.6-5.2x** |
|
||
| 巨大WS | 22.15 M ops/s | 30-45 M ops/s | **1.4-2.0x** |
|
||
| realloc重め | 277ns (worst) | 140-210ns | **1.3-2.0x** |
|
||
| Mid MT (維持) | 111.6 M ops/s | 110-115 M ops/s | **±5%** |
|
||
|
||
---
|
||
|
||
## ファイル構成
|
||
|
||
```
|
||
core/
|
||
├── hakmem_ace_metrics.h ← NEW (metrics collection)
|
||
├── hakmem_ace_metrics.c ← NEW
|
||
├── hakmem_ace_controller.h ← NEW (fast/slow loops)
|
||
├── hakmem_ace_controller.c ← NEW
|
||
├── hakmem_ace_ucb1.h ← NEW (learning)
|
||
├── hakmem_ace_ucb1.c ← NEW
|
||
├── hakmem_ace_scavenge.h ← NEW (partial release)
|
||
├── hakmem_ace_scavenge.c ← NEW
|
||
└── hakmem_ace_realloc.h ← NEW (realloc opt, Phase 4)
|
||
|
||
# Existing files to modify:
|
||
core/hakmem_tiny_magazine.c ← Add dynamic TLS capacity
|
||
core/hakmem_pool.c ← Add dynamic drain threshold
|
||
core/hakmem.c                  ← Integrate ace_fast_loop_tick() / ace_slow_loop_tick() calls
|
||
```
|
||
|
||
---
|
||
|
||
## Next Step
|
||
|
||
**どのPhaseから始めますか?**
|
||
|
||
推奨: **Phase 1 (Day 1)** - 最小実装で最大効果
|
||
- メトリクス収集
|
||
- Fast loop
|
||
- TLS capacity調整
|
||
- UCB1学習
|
||
|
||
このPhase 1のコード(hakmem_ace_metrics.h/.c + hakmem_ace_controller.h/.c)を
|
||
今すぐ書きましょうか? 🚀
|