Files

Moe Charm (CI) 52386401b3 Debug Counters Implementation - Clean History

Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>

2025-11-05 12:31:14 +09:00

12 KiB

Raw Blame History

ACE Learning Layer - Implementation Plan

Goal: 断片化・巨大WS・reallocの弱点を学習で潰して"つよつよ"にする

優先順位（効果 × 実装容易性）

🔥 Phase 1: 最小実装（1日、最大効果）

Target: 断片化ストレス 3.87 → 8-12 M ops/s（Phase 2 完了時点で 10-20 M ops/s）

1.1 メトリクス収集基盤（2-3時間）

// core/hakmem_ace_metrics.h
/*
 * Snapshot of the runtime metrics consumed by the ACE control loops.
 * Fields are grouped by how often they are meant to be refreshed.
 */
typedef struct ACEMetrics {
    // Fast metrics (1Hz collection)
    uint64_t throughput_ops;       // ops/sec
    uint64_t p50_lat_ns;           // median latency, in nanoseconds
    uint64_t p90_lat_ns;           // p90 latency, in nanoseconds
                                   // NOTE(review): ace_check_guards() compares this against
                                   // ACEGuards.max_p99_lat_ns; there is no p99 field here.
    double   llc_miss_rate;        // LLC misses / loads
    uint64_t mutex_wait_ns;        // mutex contention time, in nanoseconds
    uint64_t remote_free_backlog[8]; // per size-class (8 classes)
    
    // Slow metrics (30-60s collection)
    uint64_t rss_mb;               // resident set size, in MB
    double   fragmentation_ratio;  // allocated / held
                                   // NOTE(review): ace_slow_loop_tick() treats values above 0.6
                                   // as HIGH fragmentation, which contradicts "allocated / held"
                                   // (higher would mean better utilisation) — confirm definition.
    uint32_t partial_empty_slabs;  // partially empty slabs
    uint32_t arena_contention[4];  // per-arena contention (4 arenas)
} ACEMetrics;

// Lightweight collection (no syscalls in fast path)
// Refresh hooks that fill the ACEMetrics snapshot passed in `m`.
// Presumably the 1hz variant updates the "fast" fields and the 60s variant
// the "slow" fields of ACEMetrics — confirm against the implementation.
void ace_metrics_tick_1hz(ACEMetrics* m);
void ace_metrics_tick_60s(ACEMetrics* m);

実装:

throughput_ops: 既存のTLS countersから集計
llc_miss_rate: rdpmc使用（軽量）
mutex_wait_ns: 既存のlockタイミング統計
remote_free_backlog: Tiny/Mid MTの既存カウンタ
fragmentation_ratio: Superslab/Mid MTの使用率から計算
rss_mb: /proc/self/statm（60秒に1回のみ）

1.2 Fast Loop骨格（2-3時間）

// core/hakmem_ace_controller.c
/*
 * Tunable allocator parameters adjusted by the ACE control loops.
 * The bracketed lists are the discrete candidate values for each knob.
 */
typedef struct ACEKnobs {
    // TLS/Magazine capacity (per class)
    uint16_t tls_cap[8];           // [4, 8, 16, 32, 64] (see TLS_CAP_CANDIDATES)
    
    // Bundle width (central freelist transfer)
    uint16_t bundle_width;         // [16, 32, 64, 128]
    
    // Remote free drain threshold (per class)
    uint16_t drain_threshold[8];   // [32, 64, 128, 256, 512] (see DRAIN_THRESHOLD_CANDIDATES)
    
    // Partial release threshold: the scavenger releases a run of empty pages
    // once it is at least this long; the slow loop steps it by 1 within 1..8.
    uint8_t  partial_release_pages; // [1, 2, 4, 8]
    
    // Diet factor (applied when a huge working set is detected)
    float    diet_factor;          // [0.5, 0.66, 0.75, 0.9, 1.0] (see DIET_FACTOR_CANDIDATES)
} ACEKnobs;

// Fast loop (every 0.5-1s)
/*
 * Fast control loop (every 0.5-1s).
 *
 * Derives a scalar reward from the current metrics, applies three reactive
 * rules to the knobs, then hands the reward to the learner.
 *
 *   m  current metrics snapshot (read only here)
 *   k  knob set, adjusted through the helper calls below
 *
 * NOTE(review): `threshold` is not declared in this snippet; it must be
 * provided by the surrounding file.
 */
void ace_fast_loop_tick(ACEMetrics* m, ACEKnobs* k) {
    // Reward: throughput counts in favour; LLC misses, mutex wait and
    // remote-free backlog are weighted penalties.
    double gain         = 1.0 * normalize(m->throughput_ops);
    double miss_cost    = 0.3 * m->llc_miss_rate;
    double lock_cost    = 0.2 * normalize(m->mutex_wait_ns);
    double backlog_cost = 0.2 * normalize_backlog(m->remote_free_backlog);
    double score = gain - miss_cost - lock_cost - backlog_cost;

    // Rule 1: remote backlog is high -> drain earlier.
    if (max_backlog(m->remote_free_backlog) > 128) {
        lower_drain_threshold(k);
    }

    // Rule 2: LLC miss rate is high -> shrink TLS capacity (diet).
    if (m->llc_miss_rate > 0.15) {
        apply_diet(k, k->diet_factor);
    }

    // Rule 3: mutex wait is high -> move more objects per transfer.
    if (m->mutex_wait_ns > threshold) {
        increase_bundle_width(k);
    }

    // Feed the reward to the learner (UCB1 or simple hill-climb).
    ucb1_update_knobs(score, k);
}

実装優先度:

✅ TLS capacity調整（最も効果的、既存TINY_TLS_MAG_CAPを動的化）
✅ Drain threshold調整（既存POOL_REMOTE_DRAIN_THRESHOLDを動的化）
⚠️ Bundle width調整（新規実装必要、後回しOK）

1.3 学習アルゴリズム（1-2時間）

// Simple UCB1 for discrete knobs
typedef struct UCB1Arm {
    uint32_t pulls;
    double   total_reward;
} UCB1Arm;

typedef struct UCB1Bandit {
    UCB1Arm arms[5];  // 5 candidates per knob
    uint32_t total_pulls;
} UCB1Bandit;

// Select next knob value
int ucb1_select_arm(UCB1Bandit* b) {
    double best_ucb = -INFINITY;
    int best_arm = 0;
    
    for (int i = 0; i < 5; i++) {
        double avg = b->arms[i].total_reward / (b->arms[i].pulls + 1e-6);
        double ucb = avg + sqrt(2 * log(b->total_pulls + 1) / (b->arms[i].pulls + 1e-6));
        if (ucb > best_ucb) {
            best_ucb = ucb;
            best_arm = i;
        }
    }
    return best_arm;
}

候補値:

// Discrete candidate values per knob. Each table has 5 entries, matching the
// 5 arms of UCB1Bandit; presumably the arm index returned by
// ucb1_select_arm() is used to index these tables — confirm at the call site.
static const uint16_t TLS_CAP_CANDIDATES[] = {4, 8, 16, 32, 64};
static const uint16_t DRAIN_THRESHOLD_CANDIDATES[] = {32, 64, 128, 256, 512};
static const float DIET_FACTOR_CANDIDATES[] = {0.5, 0.66, 0.75, 0.9, 1.0};

🚀 Phase 2: 断片化対策（半日）

Target: Fragmentation stress 3.87 → 10-20 M ops/s

2.1 Slow Loop（2-3時間）

// Slow loop (every 30-60s)
/*
 * Slow control loop (every 30-60s).
 *
 * Under memory pressure (fragmentation_ratio above 0.6 or RSS above
 * `threshold`) the partial-release threshold is lowered by one page, down
 * to a minimum of 1, and a scavenge pass with a 5 ms budget is run.
 * Otherwise, when both the LLC miss rate (< 0.10) and fragmentation (< 0.4)
 * are low, the threshold is raised by one page, up to a maximum of 8.
 * The gap between 0.4 and 0.6 acts as hysteresis.
 *
 * NOTE(review): `threshold` is not declared in this snippet; it must be
 * provided by the surrounding file.
 */
void ace_slow_loop_tick(ACEMetrics* m, ACEKnobs* k) {
    const int under_pressure =
        (m->fragmentation_ratio > 0.6) || (m->rss_mb > threshold);

    if (under_pressure) {
        // Make the scavenger more eager before running it.
        if (k->partial_release_pages > 1) {
            k->partial_release_pages -= 1;
        }
        run_partial_scavenge_budgeted(5);  // budget: at most 5 ms
        return;
    }

    const int calm =
        (m->llc_miss_rate < 0.10) && (m->fragmentation_ratio < 0.4);

    // Relax the threshold again, one page per cycle.
    if (calm && k->partial_release_pages < 8) {
        k->partial_release_pages += 1;
    }
}

2.2 Budgeted Scavenge（2-3時間）

// Partial release with time budget
/*
 * Partial release with a time budget (pseudo-code: the loop header below is
 * a placeholder for the real slab/superslab iteration).
 *
 *   max_ms  wall-clock budget for this pass, in milliseconds
 *
 * The release threshold is read from the global g_ace_knobs, not from a
 * parameter. The budget is checked after each slab has been processed, so
 * at least one slab is always visited and the pass can overrun the budget
 * by the cost of one slab.
 */
void run_partial_scavenge_budgeted(uint32_t max_ms) {
    uint64_t start_ns = get_ns();
    uint64_t budget_ns = max_ms * 1000000ULL;  // ms -> ns, computed in 64 bits
    
    // Iterate slabs/superslabs
    for (each slab/superslab) {
        // Check consecutive empty pages
        uint32_t empty_pages = count_consecutive_empty_pages(slab);
        
        // Release only runs that are at least as long as the current knob value
        if (empty_pages >= g_ace_knobs.partial_release_pages) {
            // madvise DONTNEED or munmap
            partial_release_pages(slab, empty_pages);
        }
        
        // Check budget
        if (get_ns() - start_ns > budget_ns) {
            break;  // Stop, resume next cycle
        }
    }
}

実装ポイント:

Superslab: 既存のビットマップから空きページ検出
Mid MT: セグメントの使用率追跡が必要（新規フィールド追加）
madvise(DONTNEED)で物理メモリ返却（munmapより軽量）

🎯 Phase 3: 巨大WS対策（半日）

Target: 巨大WS 22 → 30-45 M ops/s

3.1 自動ダイエット（実装済みのknobを動的化）

/*
 * Shrink the per-class TLS capacities.
 *
 *   k       knob set whose tls_cap[] entries are scaled in place
 *   factor  scale applied to classes 0-2 (the hot classes)
 *
 * Classes 3-7 are scaled by the midpoint between `factor` and 1.0, i.e. a
 * milder diet. Results are truncated to an integer and never drop below 4.
 */
void apply_diet(ACEKnobs* k, float factor) {
    // Milder scale used for the non-hot classes.
    const float gentle = (factor + 1.0) / 2.0;

    for (int cls = 0; cls < 8; cls++) {
        const float scale = (cls >= 3) ? gentle : factor;
        const uint16_t shrunk = (uint16_t)(k->tls_cap[cls] * scale);

        k->tls_cap[cls] = MAX(4, shrunk);  // Min capacity = 4
    }
}

/*
 * Gradually undo a diet: grow every per-class TLS capacity by 10% per call,
 * capped at 64.
 *
 * The growth step is at least 1. A plain `cap * 1.1` truncates back to `cap`
 * for every capacity below 10, so a class that apply_diet() had pushed down
 * to its floor of 4 would never recover.
 *
 *   k  knob set whose tls_cap[] entries are updated in place
 */
void restore_diet(ACEKnobs* k) {
    for (int i = 0; i < 8; i++) {
        uint32_t cap  = k->tls_cap[i];
        uint32_t step = cap / 10;   // 10% per cycle (integer part)

        if (step == 0) {
            step = 1;               // guarantee forward progress for small caps
        }
        cap += step;

        k->tls_cap[i] = (uint16_t)(cap > 64 ? 64 : cap);
    }
}

3.2 LLC Miss監視

// Using rdpmc (lightweight)
/*
 * Return the LLC miss rate observed since the previous call, computed as
 *   (misses delta) / (references delta + 1)
 * where the +1 keeps the divisor non-zero when no references were counted.
 *
 * The previous counter values are kept in function-local statics that start
 * at 0, so the first call reports the rate over everything counted so far.
 */
double read_llc_miss_rate() {
    static uint64_t last_misses = 0;
    static uint64_t last_refs = 0;

    const uint64_t misses_now = rdpmc(LLC_MISS_EVENT);
    const uint64_t refs_now   = rdpmc(LLC_REF_EVENT);

    const uint64_t miss_delta = misses_now - last_misses;
    const uint64_t ref_delta  = refs_now - last_refs;

    // Remember the current readings for the next interval.
    last_misses = misses_now;
    last_refs   = refs_now;

    return (double)miss_delta / (ref_delta + 1);
}

🔧 Phase 4: realloc最適化（1日）

Target: realloc レイテンシのブレ（6.6-277ns）を 1.3-2x 短縮

4.1 In-place拡張

/*
 * Try to resize a block without moving it.
 *
 *   ptr       start of the existing block
 *   old_size  current size of the block in bytes
 *   new_size  requested size in bytes
 *
 * Returns `ptr` when the block can serve `new_size` in place, or NULL when
 * the caller has to fall back to allocate-copy-free.
 *
 * A request that does not grow the block succeeds immediately. Without this
 * check `new_size - old_size` would wrap around to a huge value and every
 * shrink would be reported as "does not fit".
 */
void* hak_realloc_inplace(void* ptr, size_t old_size, size_t new_size) {
    if (ptr == NULL) {
        return NULL;  // nothing to extend; let the caller allocate
    }
    if (new_size <= old_size) {
        return ptr;   // shrink or same size: the block already fits
    }

    // Check adjacent free space
    // NOTE(review): assumes the address right after the block is still inside
    // the same slab/segment — confirm is_free() rejects out-of-range addresses.
    void* next_block = (char*)ptr + old_size;

    if (is_free(next_block)) {
        size_t available = get_free_size(next_block);
        size_t needed = new_size - old_size;  // > 0 thanks to the guard above

        if (available >= needed) {
            // Merge and extend in-place
            merge_free_block(ptr, next_block);
            return ptr;  // No memcpy!
        }
    }

    return NULL;  // Fall back to copy
}

4.2 コピー最適化

/*
 * Resize a block by allocating a new one, copying the payload and freeing
 * the old one.
 *
 *   ptr       existing block
 *   old_size  size of the existing block in bytes
 *   new_size  requested size in bytes
 *
 * Returns the new block, or NULL when allocation fails; in that case `ptr`
 * is left untouched and still owned by the caller, as with realloc().
 *
 * Only min(old_size, new_size) bytes are copied, so shrinking cannot write
 * past the end of the new block.
 *
 * NOTE(review): `k` (for k->nt_threshold) is not declared in this snippet
 * and ACEKnobs as shown has no nt_threshold field — confirm where the
 * non-temporal threshold is meant to live.
 */
void* hak_realloc_copy(void* ptr, size_t old_size, size_t new_size) {
    void* new_ptr = hak_malloc(new_size);
    if (new_ptr == NULL) {
        return NULL;  // keep the original block valid on failure
    }

    const size_t copy_len = (old_size < new_size) ? old_size : new_size;

    // Size-dependent strategy
    if (copy_len < 4096) {
        // Small: direct memcpy
        memcpy(new_ptr, ptr, copy_len);
    }
    else if (copy_len < k->nt_threshold) {
        // Medium: prefetch + memcpy
        __builtin_prefetch(ptr, 0, 3);
        __builtin_prefetch((char*)ptr + 64, 0, 3);
        memcpy(new_ptr, ptr, copy_len);
    }
    else {
        // Large: non-temporal store
        memcpy_nt(new_ptr, ptr, copy_len);
    }

    hak_free(ptr);
    return new_ptr;
}

実装順序（推奨）

Day 1: 最小ACE（最大効果）

✅ メトリクス収集（throughput, llc_miss, backlog）
✅ Fast loop骨格
✅ TLS capacity調整（動的化）
✅ UCB1学習
✅ ON/OFF切替環境変数

Expected: 断片化ケースで即効果（3.9 → 8-12 M ops/s）

Day 2: 断片化対策

✅ Slow loop
✅ Fragmentation metrics
✅ Budgeted scavenge
✅ Partial release

Expected: 断片化ケースで 10-20 M ops/s

Day 3: 巨大WS対策

✅ LLC miss monitoring
✅ Diet adjustment
✅ Gradual restore

Expected: 巨大WSで 30-45 M ops/s

Day 4: realloc最適化（オプション）

✅ In-place拡張
✅ コピー最適化
✅ NT threshold学習

Expected: realloc 1.3-2x改善

安全弁（必須）

/* Safety limits checked by ace_check_guards(). */
typedef struct ACEGuards {
    uint64_t max_p99_lat_ns;     // upper bound on p99 latency, in nanoseconds
    uint64_t max_rss_mb;          // upper bound on RSS, in MB
    uint32_t max_cpu_percent;     // upper bound on CPU share used by ACE
                                  // NOTE(review): not checked by ace_check_guards() as shown
    bool     enabled;             // ON/OFF switch
} ACEGuards;

/*
 * Compare the current metrics against the configured limits and trigger an
 * emergency stop on the first violation. Does nothing when the guards are
 * disabled.
 *
 *   m  current metrics snapshot
 *   g  guard limits
 *
 * Latency is checked before RSS; at most one emergency stop is raised per
 * call.
 *
 * NOTE(review): the latency check compares the p90 sample against the limit
 * named max_p99_lat_ns. ACEMetrics has no p99 field, so either a p99 metric
 * is missing or the limit is misnamed — confirm.
 */
void ace_check_guards(ACEMetrics* m, ACEGuards* g) {
    if (g->enabled) {
        if (m->p90_lat_ns > g->max_p99_lat_ns) {
            // Latency guard
            ace_emergency_stop("Latency spike");
        } else if (m->rss_mb > g->max_rss_mb) {
            // RSS guard
            ace_emergency_stop("RSS overflow");
        }
    }
}

環境変数制御

# Enable/disable ACE
export HAKMEM_ACE_ENABLED=1

# Fast loop interval (ms)
export HAKMEM_ACE_FAST_INTERVAL_MS=500

# Slow loop interval (s)
export HAKMEM_ACE_SLOW_INTERVAL_S=30

# Guards
export HAKMEM_ACE_MAX_P99_LAT_NS=10000000  # 10ms
export HAKMEM_ACE_MAX_RSS_MB=16384         # 16GB

# Debug logging
export HAKMEM_ACE_LOG_LEVEL=1  # 0=off, 1=info, 2=debug

A/B計測

# Baseline (ACE OFF)
HAKMEM_ACE_ENABLED=0 ./bench_fragment_stress > baseline.txt

# ACE ON
HAKMEM_ACE_ENABLED=1 ./bench_fragment_stress > ace_on.txt

# Compare
diff <(grep "ops/sec" baseline.txt) <(grep "ops/sec" ace_on.txt)

期待効果（保守的見積り）

ワークロード	Before	After (ACE)	改善率
断片化ストレス	3.87 M ops/s	10-20 M ops/s	2.6-5.2x
巨大WS	22.15 M ops/s	30-45 M ops/s	1.4-2.0x
realloc重め	277ns (worst)	140-210ns	1.3-2.0x
Mid MT (維持)	111.6 M ops/s	110-115 M ops/s	±5%

ファイル構成

core/
├── hakmem_ace_metrics.h       ← NEW (metrics collection)
├── hakmem_ace_metrics.c       ← NEW
├── hakmem_ace_controller.h    ← NEW (fast/slow loops)
├── hakmem_ace_controller.c    ← NEW
├── hakmem_ace_ucb1.h          ← NEW (learning)
├── hakmem_ace_ucb1.c          ← NEW
├── hakmem_ace_scavenge.h      ← NEW (partial release)
├── hakmem_ace_scavenge.c      ← NEW
└── hakmem_ace_realloc.h       ← NEW (realloc opt, Phase 4)

# Existing files to modify:
core/hakmem_tiny_magazine.c    ← Add dynamic TLS capacity
core/hakmem_pool.c             ← Add dynamic drain threshold
core/hakmem.c                  ← Integrate ace_fast_loop_tick() / ace_slow_loop_tick() calls

Next Step

どのPhaseから始めますか？

推奨: Phase 1 (Day 1) - 最小実装で最大効果

メトリクス収集
Fast loop
TLS capacity調整
UCB1学習

このPhase 1のコード（hakmem_ace_metrics.h/.c + hakmem_ace_controller.h/.c）を今すぐ書きましょうか？ 🚀

12 KiB Raw Blame History Unescape Escape