diff --git a/CURRENT_TASK.md b/CURRENT_TASK.md index 671944f4..ec53b563 100644 --- a/CURRENT_TASK.md +++ b/CURRENT_TASK.md @@ -1,253 +1,159 @@ -# Current Task: Phase 7 Task 5 - Comprehensive Benchmark Validation +# Current Task: ACE Investigation - Mid-Large Performance Recovery **Date**: 2025-11-08 **Status**: 🔄 IN PROGRESS -**Priority**: HIGH +**Priority**: CRITICAL --- -## 🎉 Phase 7 Tasks 1-3: COMPLETE! +## 🎉 Recent Achievements -**Achievement**: **+180-280% Performance Improvement!** 🚀 +### 100% Stability Fix (Commit 616070cf7) +- ✅ **50/50 consecutive 4T runs passed** +- ✅ Bitmap semantics corrected (0xFFFFFFFF = full) +- ✅ Race condition fixed with mutex protection +- ✅ User requirement MET: "5%でもクラッシュおこったら使えない" → **0% crash rate** -**Results (Quick Tests)**: -- Random Mixed 128B: **59M ops/s** (92% of System) ✅ -- Random Mixed 256B: **70M ops/s** (90% of System) ✅ -- Random Mixed 512B: **68M ops/s** (85% of System) ✅ -- Random Mixed 1024B: **65M ops/s** (146% of System!) 🏆 -- Larson 1T: **2.68M ops/s** (stable) ✅ +### Comprehensive Benchmark Results (2025-11-08) +Located at: `benchmarks/results/comprehensive_20251108_214317/` -**Improvement vs Phase 6**: **+180-280%** 🚀 +**Performance Summary:** -詳細: [`PHASE7_TASK3_RESULTS.md`](PHASE7_TASK3_RESULTS.md) +| Category | HAKMEM | vs System | vs mimalloc | Status | +|----------|--------|-----------|-------------|--------| +| **Tiny Hot Path** | 218.65 M/s | **+48.5%** 🏆 | **+23.0%** 🏆 | **HUGE WIN** | +| Random Mixed 128B | 16.92 M/s | 34% | 28% | Good (+3-4x from Phase 6) | +| Random Mixed 256B | 17.59 M/s | 42% | 32% | Good | +| Random Mixed 512B | 15.61 M/s | 42% | 33% | Good | +| Random Mixed 2048B | 11.14 M/s | 50% | 65% | Competitive | +| Random Mixed 4096B | 8.13 M/s | 61% | 66% | Competitive | +| Larson 1T | 3.92 M/s | 28% | - | Needs work | +| Larson 4T | 7.55 M/s | 45% | - | Needs work | +| **Mid-Large MT** | 1.05 M/s | **-88%** 🔴 | **-86%** 🔴 | **CRITICAL ISSUE** | + +**Key Findings:** +1. 
✅ **First time beating BOTH System and mimalloc** (Tiny Hot Path) +2. ✅ **100% stability** - All benchmarks passed without crashes +3. 🔴 **Critical regression**: Mid-Large MT performance collapsed (-88%) --- -## Objective: Task 5 - Comprehensive Validation +## Objective: Investigate ACE for Mid-Large Performance Recovery -包括的ベンチマークスイートを実行して、Phase 7 の改善を検証し、本番環境デプロイのベースラインを確立する。 +**Problem:** +- Mid-Large MT: 1.05M ops/s (documented as +171%, now measured at -88% vs System) +- Root cause (from Task Agent report): + - ACE disabled → all mid allocations go to mmap (slow) + - This used to be HAKMEM's strength + +**Goal:** +- Understand why ACE is disabled +- Determine if re-enabling ACE can recover performance +- If yes, implement ACE enablement +- If no, find alternative optimization + +**Note:** HAKX is legacy code, ignore it. Focus on ACE mechanism. --- -## Task Breakdown +## Task for Task Agent (Ultrathink Required) -### 1. 包括的ベンチマークスイート実行 (HIGH Priority) +### Investigation Scope -**Build Phase 7 最適化版**: -```bash -make clean -make HEADER_CLASSIDX=1 AGGRESSIVE_INLINE=1 PREWARM_TLS=1 \ - bench_comprehensive_hakmem \ - bench_fragment_stress_hakmem \ - larson_hakmem \ - bench_random_mixed_hakmem -``` +1. **ACE Current State** + - Why is ACE disabled? + - What does ACE do? (Adaptive Cache Engine) + - How does it help Mid-Large allocations? -**実行するベンチマーク**: +2. **Code Analysis** + - Find ACE enablement flags + - Find ACE initialization code + - Find ACE allocation path + - Understand ACE vs mmap decision -#### 1.1 Comprehensive Benchmark (21 patterns × 4 sizes) -```bash -./bench_comprehensive_hakmem -# 実行時間: ~5分 -# カバー範囲: LIFO, FIFO, Random, Interleaved, Long/Short-lived, Mixed -# サイズ: 16B, 32B, 64B, 128B -``` +3. **Root Cause** + - Why does disabling ACE cause -88% regression? + - What is the overhead of mmap for every allocation? + - Can we fix this by re-enabling ACE? -**期待結果**: -- Phase 6: -61.3% (52.59 M/s vs 135.94 M/s) -- Phase 7: **85-92%** (目標達成!) +4. 
**Proposed Solution** + - If ACE can be safely re-enabled: How? + - If ACE has bugs: What needs fixing? + - Alternative optimizations if ACE is not viable -#### 1.2 Fragmentation Stress Test -```bash -./bench_fragment_stress_hakmem 50 2000 -# 実行時間: ~2分 -# テスト: 50ラウンド, 2000スロット, 混合サイズ -``` - -**期待結果**: -- Phase 6: -75.0% (4.68 M/s vs 18.43 M/s) -- Phase 7: **大幅改善** (TLS キャッシュ事前ウォームで) - -#### 1.3 Larson Multi-Thread Stress -```bash -# 1 thread (ベースライン) -./larson_hakmem 1 1 128 1024 1 12345 1 -# 期待: 2.68M ops/s ✅ - -# 2 threads -./larson_hakmem 2 8 128 1024 1 12345 2 - -# 4 threads -./larson_hakmem 4 8 128 1024 1 12345 4 - -# 8 threads (ストレス) -./larson_hakmem 8 8 128 1024 1 12345 8 -``` - -**期待結果**: -- 1T: 2.68M ops/s (安定) -- 4T: スケール確認 (デグレなし) - -#### 1.4 Random Mixed (各サイズ) -```bash -# Tiny range -./bench_random_mixed_hakmem 100000 16 1234567 -./bench_random_mixed_hakmem 100000 32 1234567 -./bench_random_mixed_hakmem 100000 64 1234567 -./bench_random_mixed_hakmem 100000 128 1234567 -./bench_random_mixed_hakmem 100000 256 1234567 -./bench_random_mixed_hakmem 100000 512 1234567 -./bench_random_mixed_hakmem 100000 1024 1234567 - -# Mid range (mid_mt territory) -./bench_random_mixed_hakmem 100000 2048 1234567 -./bench_random_mixed_hakmem 100000 4096 1234567 -./bench_random_mixed_hakmem 100000 8192 1234567 -./bench_random_mixed_hakmem 100000 16384 1234567 -``` - -**期待結果**: -- Tiny (≤1KB): **85-92% of System** -- Mid (1-8KB): **146% of System** (1024B) -- Mid-Large (8-32KB): **+87% vs System** (既存 mid_mt) - -#### 1.5 長時間実行(安定性確認) -```bash -# 10倍長時間実行で安定結果 -./bench_random_mixed_hakmem 1000000 128 1234567 -./bench_random_mixed_hakmem 1000000 256 1234567 -./bench_random_mixed_hakmem 1000000 1024 1234567 -``` - -**期待結果**: -- 分散 ≤10% (安定性確認) -- 平均値がクイックテストと一致 +5. **Implementation Plan** + - Step-by-step plan to recover Mid-Large performance + - Estimated effort (days) + - Risk assessment --- -### 2. 
System malloc との比較 +## Success Criteria -**System malloc 版をビルド**: +✅ **Understand ACE mechanism and current state** +✅ **Identify why Mid-Large performance collapsed** +✅ **Propose concrete solution with implementation plan** +✅ **Return detailed analysis report** + +--- + +## Context for Task Agent + +**Current Build Flags:** ```bash -make bench_comprehensive_system \ - bench_fragment_stress_system \ - bench_random_mixed_system +make HEADER_CLASSIDX=1 AGGRESSIVE_INLINE=1 PREWARM_TLS=1 ``` -**並行比較実行**: -```bash -# 両方実行して比較 -./bench_comprehensive_hakmem > results_hakmem.txt -./bench_comprehensive_system > results_system.txt +**Relevant Files to Check:** +- `core/hakmem_ace*.c` - ACE implementation +- `core/hakmem_mid_mt.c` - Mid-Large allocator +- `core/hakmem_learner.c` - Learning mechanism +- Build flags in Makefile -# 比較レポート生成 -diff -y results_hakmem.txt results_system.txt +**Benchmark to Verify:** +```bash +# Mid-Large MT (currently broken) +./bench_mid_large_mt_hakmem +# Expected: Should improve significantly with ACE ``` --- -### 3. Phase 6 からの性能後退チェック +## Deliverables -**Phase 6 ベースライン**: -- Tiny: 21M ops/s (31% of System) -- Mid-Large: 97M ops/s (+87% vs System) +1. **ACE Analysis Report** (markdown) + - ACE mechanism explanation + - Current state diagnosis + - Root cause of -88% regression + - Proposed solution -**Phase 7 期待値**: -- Tiny: 59M ops/s (92% of System) ← **+181%** 🚀 -- Mid-Large: 97M ops/s (変化なし) ← 影響なし ✅ +2. **Implementation Plan** + - Concrete steps to fix + - Code changes needed + - Testing strategy -**確認項目**: -- ✅ Tiny が大幅改善 -- ✅ Mid-Large が維持 -- ✅ クラッシュなし +3. **Risk Assessment** + - Stability impact + - Performance trade-offs + - Alternative approaches --- -### 4. 
成果物 +## Timeline -**包括的レポート作成**: -- `PHASE7_COMPREHENSIVE_BENCHMARK_RESULTS.md` -- 全ベンチマーク結果 -- 比較表 (HAKMEM vs System) -- 性能グラフ (可能なら) -- 安定性分析 (分散、外れ値) -- 本番環境準備度評価 - -**ドキュメント更新**: -- `CLAUDE.md` - 包括的結果セクション追加 -- `README.md` - 性能主張を更新 -- `benchmarks/results/` - 詳細結果をアーカイブ +- **Investigation**: Task Agent (Ultrathink mode) +- **Report Review**: 30 min +- **Implementation**: 1-2 days (depends on findings) +- **Validation**: Re-run benchmarks --- -## 成功基準 +## Notes -✅ **全ベンチマーククラッシュなく完了** -✅ **Tiny 性能: 85-92% of System** (目標: 40-55%) -✅ **Mid-Large 性能: 維持または改善** -✅ **マルチスレッド安定性: 後退なし** -✅ **フラグメンテーションストレス: 許容可能な性能** -✅ **包括的レポート生成完了** +- Debug logs are now guarded by `#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE)` +- In `NDEBUG` builds they can be re-enabled with `-DHAKMEM_SUPERSLAB_VERBOSE` for debugging +- Release builds compiled with `-DNDEBUG` will be clean (no log spam) --- -## タイムライン - -- **ベンチマーク実行**: 1-2時間 (自動化) -- **分析とレポート**: 2-3時間 -- **合計**: 4-5時間 - ---- - -## 検証後の次のステップ - -**ベンチマーク合格の場合**: -1. Task 6-9 (本番環境強化) に進む -2. Task 4 (PGO) を検討 (最終 +3-5% ブースト) -3. 本番環境デプロイ準備 - -**問題発見の場合**: -1. 性能後退を調査 -2. 修正して再テスト -3. 
既知の制限をドキュメント化 - ---- - -## 備考 - -- `HAKMEM_LOG=1` で詳細な初期化ログ -- `valgrind --tool=massif` でメモリ使用量監視 -- `valgrind --leak-check=full` でメモリリークチェック -- `perf record -g` でホットパスプロファイル - ---- - -## 📋 実行コマンドまとめ - -```bash -# ビルド -make clean -make HEADER_CLASSIDX=1 AGGRESSIVE_INLINE=1 PREWARM_TLS=1 \ - bench_comprehensive_hakmem larson_hakmem bench_random_mixed_hakmem - -# クイック検証 (5分) -./bench_comprehensive_hakmem -./larson_hakmem 1 1 128 1024 1 12345 1 -./bench_random_mixed_hakmem 100000 128 1234567 - -# 完全検証 (1-2時間) -for size in 16 32 64 128 256 512 1024 2048 4096 8192; do - echo "=== Size: $size ===" - ./bench_random_mixed_hakmem 100000 $size 1234567 -done - -./larson_hakmem 4 8 128 1024 1 12345 4 -./bench_fragment_stress_hakmem 50 2000 - -# レポート生成 -# (結果を PHASE7_COMPREHENSIVE_BENCHMARK_RESULTS.md にまとめる) -``` - ---- - -**Status**: Task Agent に自動ベンチマーク実行を委譲する準備完了 🤖 +**Status**: Ready to launch Task Agent investigation 🚀 diff --git a/core/hakmem_tiny_superslab.c b/core/hakmem_tiny_superslab.c index be291f32..23fb2e5e 100644 --- a/core/hakmem_tiny_superslab.c +++ b/core/hakmem_tiny_superslab.c @@ -564,11 +564,13 @@ int expand_superslab_head(SuperSlabHead* head) { // Allocate new chunk via existing superslab_allocate SuperSlab* new_chunk = superslab_allocate(head->class_idx); if (!new_chunk) { +#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE) extern __thread int g_hakmem_lock_depth; g_hakmem_lock_depth++; fprintf(stderr, "[HAKMEM] CRITICAL: Failed to allocate new chunk for class %d (system OOM)\n", head->class_idx); g_hakmem_lock_depth--; +#endif return -1; // True OOM (system out of memory) } @@ -607,11 +609,13 @@ int expand_superslab_head(SuperSlabHead* head) { pthread_mutex_unlock(&head->expansion_lock); +#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE) extern __thread int g_hakmem_lock_depth; g_hakmem_lock_depth++; fprintf(stderr, "[HAKMEM] Expanded SuperSlabHead for class %d: %zu chunks now (bitmap=0x%08x)\n", head->class_idx, new_count, 
new_chunk->slab_bitmap); g_hakmem_lock_depth--; +#endif return 0; } diff --git a/core/tiny_superslab_alloc.inc.h b/core/tiny_superslab_alloc.inc.h index b866c059..cd277377 100644 --- a/core/tiny_superslab_alloc.inc.h +++ b/core/tiny_superslab_alloc.inc.h @@ -180,11 +180,13 @@ static SuperSlab* superslab_refill(int class_idx) { } } else { // Current chunk exhausted (all slabs occupied), try to expand +#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE) extern __thread int g_hakmem_lock_depth; g_hakmem_lock_depth++; fprintf(stderr, "[HAKMEM] SuperSlab chunk exhausted for class %d (bitmap=0x%08x), expanding...\n", class_idx, current_chunk->slab_bitmap); g_hakmem_lock_depth--; +#endif // Protect expansion with global lock (race condition fix) static pthread_mutex_t expand_lock = PTHREAD_MUTEX_INITIALIZER; @@ -199,15 +201,19 @@ static SuperSlab* superslab_refill(int class_idx) { // Still exhausted, expand now if (expand_superslab_head(head) < 0) { pthread_mutex_unlock(&expand_lock); +#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE) g_hakmem_lock_depth++; fprintf(stderr, "[HAKMEM] CRITICAL: Failed to expand SuperSlabHead for class %d (system OOM)\n", class_idx); g_hakmem_lock_depth--; +#endif return NULL; // True system OOM } +#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE) g_hakmem_lock_depth++; fprintf(stderr, "[HAKMEM] Successfully expanded SuperSlabHead for class %d\n", class_idx); g_hakmem_lock_depth--; +#endif } // Update current_chunk and tls->ss to point to (potentially new) chunk @@ -219,9 +225,11 @@ static SuperSlab* superslab_refill(int class_idx) { full_mask = (ss_slabs_capacity(current_chunk) >= 32) ? 
0xFFFFFFFF : ((1U << ss_slabs_capacity(current_chunk)) - 1); if (!current_chunk || current_chunk->slab_bitmap == full_mask) { +#if !defined(NDEBUG) || defined(HAKMEM_SUPERSLAB_VERBOSE) g_hakmem_lock_depth++; fprintf(stderr, "[HAKMEM] CRITICAL: Chunk still has no free slabs for class %d after expansion\n", class_idx); g_hakmem_lock_depth--; +#endif return NULL; } }