// ============================================================================
// hakmem_learner.c - Background Learner (automatic CAP / W_MAX tuning)
// ============================================================================
//
// Function: a background thread monitors hit rates and pool statistics and
// adjusts the allocation policy automatically.
//
// Learning algorithm:
//
//   1. CAP learning (hit-rate based)
//      - Sample hit rates periodically (default: every 1 second)
//      - Compare against the target hit rates (Mid: 0.65, Large: 0.55)
//      - Below target: raise the CAP; above target: lower the CAP
//      - A dwell (stability) period suppresses oscillation
//
//   2. Budget enforcement + Water-filling
//      - An optional upper bound (budget) on the total CAP can be set
//      - Over budget: shrink the lowest-demand classes first
//      - Under budget: grow the highest-demand classes
//        (when water-filling is enabled)
//
//   3. W_MAX learning (UCB1 + canary deployment)
//      - Explore discrete W_MAX candidates with a UCB1 multi-armed bandit
//      - Canary: apply a candidate temporarily and measure its effect
//      - Roll back to the best-known value if there is no improvement
//
//   4. DYN1/DYN2 auto-assignment
//      - Detect peaks in the size histogram
//      - Place the dynamic classes so they do not overlap the fixed classes
//
// Environment variables (main ones):
//
//   HAKMEM_LEARN                  (default 0)        Enable learning mode
//   HAKMEM_LEARN_WINDOW_MS        (default 1000)     Sampling interval (ms)
//   HAKMEM_TARGET_HIT_MID         (default 0.65)     Mid target hit rate
//   HAKMEM_TARGET_HIT_LARGE       (default 0.55)     Large target hit rate
//   HAKMEM_CAP_STEP_MID           (default 4)        Mid CAP step (pages)
//   HAKMEM_CAP_STEP_LARGE         (default 1)        Large CAP step (bundles)
//   HAKMEM_CAP_DWELL_SEC_MID      (default 3)        Mid dwell period (seconds)
//   HAKMEM_CAP_DWELL_SEC_LG       (default 5)        Large dwell period (seconds)
//   HAKMEM_BUDGET_MID             (default 0)        Mid total CAP budget (0 = off)
//   HAKMEM_BUDGET_LARGE           (default 0)        Large total CAP budget (0 = off)
//   HAKMEM_WF                     (default 0)        Enable water-filling
//   HAKMEM_DYN1_AUTO              (default 0)        DYN1 auto-assignment
//   HAKMEM_WMAX_LEARN             (default 0)        Enable W_MAX learning
//   HAKMEM_WMAX_CANDIDATES_MID    (default 1.4,...)  Mid candidate list
//   HAKMEM_WMAX_CANDIDATES_LARGE  (default 1.25,...) Large candidate list
//   HAKMEM_WMAX_CANARY            (default 1)        Enable canary deployment
//   HAKMEM_THP_LEARN              (default 0)        THP threshold learning
//
// Usage examples:
// # Basic learning mode (CAP auto-tuning only)
// HAKMEM_LEARN=1 ./app
//
// # Adjust the target hit rates (Mid: 70%, Large: 60%)
// HAKMEM_LEARN=1 HAKMEM_TARGET_HIT_MID=0.7 HAKMEM_TARGET_HIT_LARGE=0.6 ./app
//
// # Budget constraints (Mid: 300 pages, Large: 50 bundles) + water-filling
// HAKMEM_LEARN=1 HAKMEM_BUDGET_MID=300 HAKMEM_BUDGET_LARGE=50 HAKMEM_WF=1 ./app
//
// # DYN1 auto-assignment (fill the 8-16 KB gap with the histogram peak)
// HAKMEM_LEARN=1 HAKMEM_DYN1_AUTO=1 HAKMEM_CAP_MID_DYN1=64 ./app
//
// # W_MAX learning (safe exploration via canary)
// HAKMEM_LEARN=1 HAKMEM_WMAX_LEARN=1 \
//   HAKMEM_WMAX_CANDIDATES_MID=1.4,1.6,1.8 \
//   HAKMEM_WMAX_CANDIDATES_LARGE=1.3,1.6,2.0 ./app
//
// Notes:
// - Learning mode is most effective under high-load workloads
// - Low traffic can cause mis-tuning (tune HAKMEM_LEARN_MIN_SAMPLES)
// - W_MAX learning carries risk (more internal fragmentation) -> canary recommended
// - FrozenPolicy updates are RCU-like (no grace period implemented)
// ============================================================================
#include "hakmem_learner.h"
#include "hakmem_internal.h"
#include "hakmem_syscall.h" // Phase 6.X P0 Fix: Box 3 syscall layer (bypasses LD_PRELOAD)
#include "hakmem_policy.h"
#include "hakmem_pool.h"
#include "hakmem_l25_pool.h"
#include "hakmem_ace_stats.h"
#include "hakmem_size_hist.h"
#include "hakmem_learn_log.h"
#include "hakmem_tiny_superslab.h" // Phase 8.4: ACE Observer
#include "box/learner_env_box.h" // Box: Learner ENV decision
#include <pthread.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <math.h>
#include <time.h> // struct timespec / clock_gettime (metrics timestamps)
static pthread_t g_thr;
static int g_run = 0;
// Previous snapshots for delta computation
static uint64_t prev_mid_hits[POOL_NUM_CLASSES];
static uint64_t prev_mid_misses[POOL_NUM_CLASSES];
static uint64_t prev_lg_hits[L25_NUM_CLASSES];
static uint64_t prev_lg_misses[L25_NUM_CLASSES];
static uint64_t prev_try_attempts = 0;
static uint64_t prev_try_success = 0;
static uint64_t prev_ring_underflow = 0;
static inline int get_env_int(const char* name, int defv) {
const char* e = getenv(name);
return (e ? atoi(e) : defv);
}
static int parse_float_list(const char* s, double* out, int maxn) {
if (!s) return 0;
int n = 0; const char* p = s;
char buf[64];
while (*p && n < maxn) {
int i = 0;
while (*p && *p != ',' && i < (int)sizeof(buf)-1) buf[i++] = *p++;
buf[i] = '\0';
if (i > 0) out[n++] = atof(buf);
if (*p == ',') p++;
}
return n;
}
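// Example (illustrative values only): parse_float_list("1.4,1.6,1.8", out, 8)
// fills out[0..2] with {1.4, 1.6, 1.8} and returns 3; a NULL or empty string
// returns 0. The real inputs come from HAKMEM_WMAX_CANDIDATES_* /
// HAKMEM_THP_CANDIDATES.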
// --------------------- UCB1 helpers (discrete candidates) ------------------
typedef struct {
double values[16];
int pulls[16];
double sum_score[16];
int n;
int cur;
int dwell_ticks;
int dwell_sec;
} ucb1_t;
static void ucb1_init(ucb1_t* u, const double* vals, int n, int dwell_sec) {
memset(u, 0, sizeof(*u));
if (n > 16) n = 16;
u->n = n;
for (int i=0;i<n;i++){ u->values[i]=vals[i]; u->pulls[i]=0; u->sum_score[i]=0.0; }
u->cur = 0; u->dwell_ticks = 0; u->dwell_sec = dwell_sec;
}
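// Selection rule: any arm that has never been pulled is tried first;
// otherwise pick argmax_i [ mean_i + 1.5 * sqrt(ln(total_pulls) / pulls_i) ].
// The exploration constant 1.5 is slightly wider than the canonical sqrt(2)
// of UCB1, favouring a bit more exploration between candidates.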
static int ucb1_select(ucb1_t* u) {
// if any never pulled, pick it first
for (int i=0;i<u->n;i++) if (u->pulls[i]==0) return i;
// otherwise compute UCB
double total = 0.0; for (int i=0;i<u->n;i++) total += (double)u->pulls[i];
double best_ucb = -1e100; int best_i = 0;
for (int i=0;i<u->n;i++) {
double mean = u->sum_score[i] / (double)u->pulls[i];
double bonus = 1.5 * sqrt(log(total) / (double)u->pulls[i]);
double ucb = mean + bonus;
if (ucb > best_ucb) { best_ucb = ucb; best_i = i; }
}
return best_i;
}
static void ucb1_update(ucb1_t* u, int arm, double score) {
if (arm < 0 || arm >= u->n) return;
u->pulls[arm] += 1;
u->sum_score[arm] += score;
}
static void* learner_main(void* arg) {
(void)arg;
// Config
int window_ms = get_env_int("HAKMEM_LEARN_WINDOW_MS", 1000);
double tgt_mid = (getenv("HAKMEM_TARGET_HIT_MID") ? atof(getenv("HAKMEM_TARGET_HIT_MID")) : 0.65);
double tgt_lg = (getenv("HAKMEM_TARGET_HIT_LARGE") ? atof(getenv("HAKMEM_TARGET_HIT_LARGE")) : 0.55);
double eps = 0.03; // hysteresis band
int step_mid = get_env_int("HAKMEM_CAP_STEP_MID", 4); // pages per update
int step_lg = get_env_int("HAKMEM_CAP_STEP_LARGE", 1); // bundles per update
int min_mid = get_env_int("HAKMEM_CAP_MIN_MID", 8);
int max_mid = get_env_int("HAKMEM_CAP_MAX_MID", 2048);
int min_lg = get_env_int("HAKMEM_CAP_MIN_LARGE", 1);
int max_lg = get_env_int("HAKMEM_CAP_MAX_LARGE", 512);
int budget_mid = get_env_int("HAKMEM_BUDGET_MID", 0); // 0=disabled
int budget_lg = get_env_int("HAKMEM_BUDGET_LARGE", 0); // 0=disabled
int min_samples = get_env_int("HAKMEM_LEARN_MIN_SAMPLES", 256);
// Dwell (stability) for CAP updates
int cap_dwell_mid = get_env_int("HAKMEM_CAP_DWELL_SEC_MID", 3);
int cap_dwell_lg = get_env_int("HAKMEM_CAP_DWELL_SEC_LG", 5);
int wf_enabled = get_env_int("HAKMEM_WF", 0);
// weights for simple gain proxy (if needed later)
double w_miss = (getenv("HAKMEM_GAIN_W_MISS") ? atof(getenv("HAKMEM_GAIN_W_MISS")) : 1.0);
(void)w_miss;
// Initialize prev counters
memset(prev_mid_hits, 0, sizeof(prev_mid_hits));
memset(prev_mid_misses, 0, sizeof(prev_mid_misses));
memset(prev_lg_hits, 0, sizeof(prev_lg_hits));
memset(prev_lg_misses, 0, sizeof(prev_lg_misses));
// per-class dwell counters (seconds since last change)
static int mid_dwell_ticks[POOL_NUM_CLASSES] = {0};
static int lg_dwell_ticks[L25_NUM_CLASSES] = {0};
// Phase 8.4: Check ACE Observer setting once (outside loop)
const char* ace_observe = getenv("HAKMEM_ACE_OBSERVE");
int ace_enabled = (ace_observe && atoi(ace_observe) != 0);
const char* ace_debug = getenv("HAKMEM_ACE_DEBUG");
int ace_debug_enabled = (ace_debug && atoi(ace_debug) != 0);
if (ace_enabled && ace_debug_enabled) {
fprintf(stderr, "[Learner] ACE Observer enabled (debug on)\n");
}
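// Main loop: once per window (HAKMEM_LEARN_WINDOW_MS) the learner
//   1) optionally runs the ACE observer and flushes sampled logs,
//   2) snapshots Mid/Large pool stats and computes per-window deltas,
//   3) nudges CAPs toward the target hit rates (with dwell gating),
//   4) applies budget / water-filling constraints,
//   5) publishes a new FrozenPolicy snapshot,
//   6) optionally runs the W_MAX and THP bandits (UCB1 + canary).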
while (g_run) {
usleep(window_ms * 1000);
if (!g_run) break;
// Phase 8.4: ACE Observer (Tiny SuperSlab sizing)
if (ace_enabled) {
if (ace_debug_enabled) {
fprintf(stderr, "[Learner] Calling ACE Observer...\n");
}
hak_tiny_superslab_ace_observe_all();
}
// Optional: flush sampled logs to file
const char* logf = getenv("HAKMEM_LOG_FILE");
if (logf && *logf) {
hkm_log_flush_file(logf);
// Append metrics line: M,timestamp_ns,try_attempts_delta,try_success_delta,ring_underflow_delta,trylock_rate
uint64_t cur_try=0, cur_succ=0, cur_uf=0;
hak_pool_extra_metrics_snapshot(&cur_try, &cur_succ, &cur_uf);
uint64_t d_try = (cur_try >= prev_try_attempts) ? (cur_try - prev_try_attempts) : 0;
uint64_t d_suc = (cur_succ >= prev_try_success) ? (cur_succ - prev_try_success) : 0;
uint64_t d_uf = (cur_uf >= prev_ring_underflow) ? (cur_uf - prev_ring_underflow) : 0;
prev_try_attempts = cur_try; prev_try_success = cur_succ; prev_ring_underflow = cur_uf;
double rate = (d_try > 0) ? ((double)d_suc / (double)d_try) : 0.0;
// get timestamp
struct timespec ts; clock_gettime(CLOCK_REALTIME, &ts);
unsigned long long ts_ns = (unsigned long long)ts.tv_sec*1000000000ull + (unsigned long long)ts.tv_nsec;
FILE* fp = fopen(logf, "a");
if (fp) {
fprintf(fp, "M,%llu,%llu,%llu,%llu,%.6f\n", ts_ns,
(unsigned long long)d_try, (unsigned long long)d_suc, (unsigned long long)d_uf, rate);
fclose(fp);
}
}
// Snapshot current stats
uint64_t mid_hits[POOL_NUM_CLASSES], mid_misses[POOL_NUM_CLASSES], mid_refills[POOL_NUM_CLASSES], mid_frees[POOL_NUM_CLASSES];
uint64_t lg_hits[L25_NUM_CLASSES], lg_misses[L25_NUM_CLASSES], lg_refills[L25_NUM_CLASSES], lg_frees[L25_NUM_CLASSES];
hak_pool_stats_snapshot(mid_hits, mid_misses, mid_refills, mid_frees);
hak_l25_pool_stats_snapshot(lg_hits, lg_misses, lg_refills, lg_frees);
const FrozenPolicy* cur = hkm_policy_get();
if (!cur) continue;
FrozenPolicy* np = (FrozenPolicy*)hkm_libc_malloc(sizeof(FrozenPolicy)); // Phase 6.X P0
if (!np) continue;
memcpy(np, cur, sizeof(FrozenPolicy));
// Adjust Mid caps by hit rate vs target (delta over window) with dwell
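// Illustration (with the defaults): tgt_mid=0.65 and eps=0.03 give a dead
// band of [0.62, 0.68]. A window hit rate of 0.58 raises the class CAP by
// step_mid pages, 0.71 lowers it by step_mid, and anything inside the band
// leaves it unchanged; any change is further gated by the per-class dwell timer.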
int mid_classes = 5;
if (cur->mid_dyn1_bytes != 0 && cur->mid_dyn2_bytes != 0) mid_classes = 7;
else if (cur->mid_dyn1_bytes != 0 || cur->mid_dyn2_bytes != 0) mid_classes = 6;
for (int i = 0; i < mid_classes; i++) {
uint64_t dh = mid_hits[i] - prev_mid_hits[i];
uint64_t dm = mid_misses[i] - prev_mid_misses[i];
uint64_t dt = dh + dm;
if (dt < (uint64_t)min_samples) continue;
double hit = (dt > 0) ? ((double)dh / (double)dt) : 1.0;
int cap;
if (i < 5) cap = np->mid_cap[i];
else if (i == 5) cap = np->mid_cap_dyn1;
else cap = np->mid_cap_dyn2;
// dwell gate per class
mid_dwell_ticks[i] += window_ms/1000;
if (mid_dwell_ticks[i] < cap_dwell_mid) {
// skip change this window
} else {
if (hit < (tgt_mid - eps)) {
cap += step_mid;
} else if (hit > (tgt_mid + eps)) {
cap -= step_mid;
}
if (cap < min_mid) cap = min_mid;
if (cap > max_mid) cap = max_mid;
// reset dwell only if actual change happens
int old = (i < 5) ? np->mid_cap[i] : (i==5 ? np->mid_cap_dyn1 : np->mid_cap_dyn2);
if (cap != old) mid_dwell_ticks[i] = 0;
}
if (i < 5) np->mid_cap[i] = (uint16_t)cap;
else if (i == 5) np->mid_cap_dyn1 = (uint16_t)cap;
else np->mid_cap_dyn2 = (uint16_t)cap;
}
// Optional: auto-assign DYN1/DYN2 to histogram peaks in the 2-32 KiB range when enabled
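// Example (sizes are illustrative, not measured): if the size histogram peaks
// at 12 KiB, mid_dyn1_bytes becomes 12288, since 12 KiB is none of the fixed
// 2/4/8/16/32 KiB classes; a peak at exactly 8 KiB is skipped because it
// collides with a fixed class.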
const char* dyn_auto = getenv("HAKMEM_DYN1_AUTO");
const char* dyn2_auto = getenv("HAKMEM_DYN2_AUTO");
if ((dyn_auto && atoi(dyn_auto) != 0) || (dyn2_auto && atoi(dyn2_auto) != 0)) {
// Snapshot first 40 KiB (bins up to 40), reset after read
uint64_t bins[41];
hkm_size_hist_snapshot(bins, 41, 1);
int best_kb = 0, second_kb = 0; uint64_t best_cnt = 0, second_cnt = 0;
for (int kb = 2; kb <= 32; kb++) {
uint64_t c = bins[kb];
if (c > best_cnt) { second_cnt = best_cnt; second_kb = best_kb; best_cnt = c; best_kb = kb; }
else if (c > second_cnt && kb != best_kb) { second_cnt = c; second_kb = kb; }
}
// Choose DYN1 around peak if not conflicting with fixed classes
if (dyn_auto && atoi(dyn_auto) != 0 && best_kb >= 2 && best_kb <= 32) {
size_t dyn_bytes = (size_t)best_kb * 1024;
if (dyn_bytes != 2048 && dyn_bytes != 4096 && dyn_bytes != 8192 && dyn_bytes != 16384 && dyn_bytes != 32768) {
np->mid_dyn1_bytes = (uint32_t)dyn_bytes;
}
}
if (dyn2_auto && atoi(dyn2_auto) != 0 && second_kb >= 2 && second_kb <= 32) {
size_t dyn_bytes = (size_t)second_kb * 1024;
if (dyn_bytes != 2048 && dyn_bytes != 4096 && dyn_bytes != 8192 && dyn_bytes != 16384 && dyn_bytes != 32768 && dyn_bytes != np->mid_dyn1_bytes) {
np->mid_dyn2_bytes = (uint32_t)dyn_bytes;
}
}
}
// Adjust Large caps similarly with dwell
for (int i = 0; i < L25_NUM_CLASSES; i++) {
uint64_t dh = lg_hits[i] - prev_lg_hits[i];
uint64_t dm = lg_misses[i] - prev_lg_misses[i];
uint64_t dt = dh + dm;
if (dt < (uint64_t)min_samples) continue;
double hit = (dt > 0) ? ((double)dh / (double)dt) : 1.0;
int cap = np->large_cap[i];
lg_dwell_ticks[i] += window_ms/1000;
if (lg_dwell_ticks[i] >= cap_dwell_lg) {
if (hit < (tgt_lg - eps)) {
cap += step_lg;
} else if (hit > (tgt_lg + eps)) {
cap -= step_lg;
}
int old = np->large_cap[i];
if (cap != old) lg_dwell_ticks[i] = 0;
}
if (cap < min_lg) cap = min_lg;
if (cap > max_lg) cap = max_lg;
np->large_cap[i] = (uint16_t)cap;
}
// Budget enforcement / Water-filling for Mid
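// Worked example (hypothetical numbers): with HAKMEM_BUDGET_MID=300 and caps
// summing to 320, the class with the lowest miss ratio in this window is
// shrunk by step_mid repeatedly (never below min_mid) until the sum fits the
// budget. With water-filling on and a sum of, say, 280, the class with the
// highest miss ratio is grown by step_mid until the budget is reached.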
if (budget_mid > 0) {
// Assemble class arrays
int idx_map[7]; int m=0; for (int i=0;i<5;i++) idx_map[m++]=i; if (cur->mid_dyn1_bytes) idx_map[m++]=5; if (cur->mid_dyn2_bytes) idx_map[m++]=6;
// compute sum caps
#define GET_MID_CAP(npX, slotX) ((slotX)<5 ? (npX)->mid_cap[(slotX)] : ((slotX)==5 ? (npX)->mid_cap_dyn1 : (npX)->mid_cap_dyn2))
#define SET_MID_CAP(npX, slotX, valX) do { if ((slotX)<5) (npX)->mid_cap[(slotX)] = (uint16_t)(valX); else if ((slotX)==5) (npX)->mid_cap_dyn1 = (uint16_t)(valX); else (npX)->mid_cap_dyn2 = (uint16_t)(valX); } while(0)
int sum = 0; for (int k=0;k<m;k++){ int i=idx_map[k]; int v=GET_MID_CAP(np, i); sum += v; }
// need score = miss ratio in the window (higher = more demand)
double need[7]; for (int k=0;k<m;k++){ int i=idx_map[k]; uint64_t dh = (i<5? (mid_hits[i]-prev_mid_hits[i]) : 0); uint64_t dm = (i<5? (mid_misses[i]-prev_mid_misses[i]) : 0); double dt=(double)(dh+dm); need[k] = (dt>0.0)? ((double)dm/dt) : 0.0; }
// If sum > budget: remove from lowest-need first
if (sum > budget_mid) {
while (sum > budget_mid) {
// find min need with cap>min_mid
int best_k = -1; double best_need = 1e9; int best_cap=0;
for (int k=0;k<m;k++){ int slot=idx_map[k]; int cap=GET_MID_CAP(np, slot); if (cap<=min_mid) continue; if (need[k] < best_need){ best_need=need[k]; best_k=k; best_cap=cap; } }
if (best_k < 0) break;
int slot = idx_map[best_k]; int nv = GET_MID_CAP(np, slot) - step_mid; if (nv < min_mid) nv = min_mid; SET_MID_CAP(np, slot, nv); sum = 0; for (int k=0;k<m;k++){ int sl=idx_map[k]; sum += GET_MID_CAP(np, sl); }
}
} else if (wf_enabled && sum < budget_mid) {
// sum < budget: allocate to highest-need first
while (sum < budget_mid) {
int best_k = -1; double best_need = -1e9;
for (int k=0;k<m;k++){ if (need[k] > best_need){ best_need=need[k]; best_k=k; } }
if (best_k < 0) break;
int slot = idx_map[best_k]; int nv = GET_MID_CAP(np, slot) + step_mid; SET_MID_CAP(np, slot, nv); sum += step_mid;
}
}
}
if (budget_lg > 0) {
int sum = 0; for (int i=0;i<L25_NUM_CLASSES;i++) sum += np->large_cap[i];
// need score = miss ratio (Large)
double need_lg[L25_NUM_CLASSES]; for (int i=0;i<L25_NUM_CLASSES;i++){ uint64_t dh=lg_hits[i]-prev_lg_hits[i]; uint64_t dm=lg_misses[i]-prev_lg_misses[i]; double dt=(double)(dh+dm); need_lg[i]=(dt>0.0)?((double)dm/dt):0.0; }
if (sum > budget_lg) {
while (sum > budget_lg) {
int best=-1; double best_need=1e9;
for (int i=0;i<L25_NUM_CLASSES;i++){ if (np->large_cap[i] <= min_lg) continue; if (need_lg[i] < best_need){ best_need=need_lg[i]; best=i; } }
if (best<0) break; int nv=np->large_cap[best]-step_lg; if (nv<min_lg) nv=min_lg; np->large_cap[best]=nv; sum=0; for (int i=0;i<L25_NUM_CLASSES;i++) sum += np->large_cap[i];
}
} else if (wf_enabled && sum < budget_lg) {
while (sum < budget_lg) {
int best=-1; double best_need=-1e9; for (int i=0;i<L25_NUM_CLASSES;i++){ if (need_lg[i] > best_need){ best_need=need_lg[i]; best=i; } }
if (best<0) break; np->large_cap[best]+=step_lg; sum += step_lg;
}
}
}
// Publish new policy
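// hkm_policy_publish() swaps in the new FrozenPolicy snapshot; per the header
// note the update is RCU-like with no grace period, so concurrent readers may
// briefly keep using the previous snapshot.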
hkm_policy_publish(np);
if (cur->mid_dyn1_bytes != 0 || cur->mid_dyn2_bytes != 0) {
fprintf(stderr, "[Learner] Published caps: Mid={%u,%u,%u,%u,%u,D1:%u,D2:%u} Large={%u,%u,%u,%u,%u}\n",
(unsigned)np->mid_cap[0], (unsigned)np->mid_cap[1], (unsigned)np->mid_cap[2], (unsigned)np->mid_cap[3], (unsigned)np->mid_cap[4], (unsigned)np->mid_cap_dyn1, (unsigned)np->mid_cap_dyn2,
(unsigned)np->large_cap[0], (unsigned)np->large_cap[1], (unsigned)np->large_cap[2], (unsigned)np->large_cap[3], (unsigned)np->large_cap[4]);
} else {
fprintf(stderr, "[Learner] Published caps: Mid={%u,%u,%u,%u,%u} Large={%u,%u,%u,%u,%u}\n",
(unsigned)np->mid_cap[0], (unsigned)np->mid_cap[1], (unsigned)np->mid_cap[2], (unsigned)np->mid_cap[3], (unsigned)np->mid_cap[4],
(unsigned)np->large_cap[0], (unsigned)np->large_cap[1], (unsigned)np->large_cap[2], (unsigned)np->large_cap[3], (unsigned)np->large_cap[4]);
}
// Update prev snapshots
for (int i=0;i<mid_classes;i++){ prev_mid_hits[i]=mid_hits[i]; prev_mid_misses[i]=mid_misses[i]; }
for (int i=0;i<L25_NUM_CLASSES;i++){ prev_lg_hits[i]=lg_hits[i]; prev_lg_misses[i]=lg_misses[i]; }
// Optional: W_MAX learning (discrete candidates + dwell + canary)
const char* wlearn = getenv("HAKMEM_WMAX_LEARN");
if (wlearn && atoi(wlearn) != 0) {
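// W_MAX bandit + canary flow (Mid and Large follow the same pattern):
//   - after dwell_sec of accumulation, UCB1 proposes the next candidate arm;
//   - with canary on, the candidate is published temporarily and its score is
//     accumulated for HAKMEM_WMAX_TRIAL_SEC seconds;
//   - if the trial mean beats the previous arm's mean by HAKMEM_WMAX_ADOPT_PCT
//     the candidate is kept, otherwise the best-scoring arm so far is restored.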
static ucb1_t u_mid, u_lg; static int inited=0;
static double vals_mid[8], vals_lg[8]; static int n_mid=0, n_lg=0;
// Canary control
static int canary_mid=0, canary_lg=0; // active flags
static double base_mid_mean=0.0, base_lg_mean=0.0; // baselines
static double trial_mid_sum=0.0, trial_lg_sum=0.0; static int trial_mid_n=0, trial_lg_n=0; static int trial_mid_sec=0, trial_lg_sec=0;
static int trial_sec = 0; static double adopt_pct = 0.01; static int canary_on = 0;
if (!inited) {
n_mid = parse_float_list(getenv("HAKMEM_WMAX_CANDIDATES_MID"), vals_mid, 8);
if (n_mid <= 0) { vals_mid[0]=1.4; vals_mid[1]=1.6; vals_mid[2]=1.7; n_mid=3; }
n_lg = parse_float_list(getenv("HAKMEM_WMAX_CANDIDATES_LARGE"), vals_lg, 8);
if (n_lg <= 0) { vals_lg[0]=1.25; vals_lg[1]=1.30; vals_lg[2]=1.40; n_lg=3; }
ucb1_init(&u_mid, vals_mid, n_mid, get_env_int("HAKMEM_WMAX_DWELL_SEC", 10));
ucb1_init(&u_lg, vals_lg, n_lg, get_env_int("HAKMEM_WMAX_DWELL_SEC", 10));
trial_sec = get_env_int("HAKMEM_WMAX_TRIAL_SEC", 5);
adopt_pct = (getenv("HAKMEM_WMAX_ADOPT_PCT") ? atof(getenv("HAKMEM_WMAX_ADOPT_PCT")) : 0.01);
canary_on = get_env_int("HAKMEM_WMAX_CANARY", 1);
inited = 1;
}
u_mid.dwell_ticks += window_ms/1000; u_lg.dwell_ticks += window_ms/1000;
// score (simple proxy): mid_hit + large_hit - (mid_miss + large_miss) - 2*l1_fallback
hkm_ace_stats_snapshot_t ace;
hkm_ace_stats_snapshot(&ace, 1);
double score = (double)(ace.mid_hit + ace.large_hit) - (double)(ace.mid_miss + ace.large_miss) - 2.0*(double)ace.l1_fallback;
// update current arms with window score
ucb1_update(&u_mid, u_mid.cur, score);
ucb1_update(&u_lg, u_lg.cur, score);
// accumulate trial score if canary active
if (canary_mid) { trial_mid_sum += score; trial_mid_n++; trial_mid_sec += window_ms/1000; }
if (canary_lg) { trial_lg_sum += score; trial_lg_n++; trial_lg_sec += window_ms/1000; }
// maybe switch arm if dwell passed
if (u_mid.dwell_ticks >= u_mid.dwell_sec && !canary_mid) {
u_mid.dwell_ticks = 0; int next = ucb1_select(&u_mid);
if (next != u_mid.cur) {
if (canary_on) {
// start canary: publish candidate temporarily and measure
FrozenPolicy* np3 = (FrozenPolicy*)hkm_libc_malloc(sizeof(FrozenPolicy)); // Phase 6.X P0
if (np3) {
const FrozenPolicy* cur2 = hkm_policy_get();
double prev_mean = (u_mid.pulls[u_mid.cur] > 0) ? (u_mid.sum_score[u_mid.cur] / (double)u_mid.pulls[u_mid.cur]) : 0.0;
base_mid_mean = prev_mean; trial_mid_sum = 0.0; trial_mid_n = 0; trial_mid_sec = 0;
memcpy(np3, cur2, sizeof(FrozenPolicy));
np3->w_max_mid = u_mid.values[next]; hkm_policy_publish(np3);
canary_mid = 1; // set active; do not change u_mid.cur yet
fprintf(stderr, "[Learner] W_MAX mid canary start: %.2f (base=%.3f)\n", np3->w_max_mid, base_mid_mean);
// store candidate index into u_mid.cur temporarily for scoring but keep canary flag
u_mid.cur = next;
}
} else {
FrozenPolicy* np3 = (FrozenPolicy*)hkm_libc_malloc(sizeof(FrozenPolicy)); // Phase 6.X P0
if (np3) {
memcpy(np3, hkm_policy_get(), sizeof(FrozenPolicy));
np3->w_max_mid = u_mid.values[next]; hkm_policy_publish(np3);
u_mid.cur = next; fprintf(stderr, "[Learner] W_MAX mid=%.2f (UCB1)\n", np3->w_max_mid);
}
}
}
}
if (u_lg.dwell_ticks >= u_lg.dwell_sec && !canary_lg) {
u_lg.dwell_ticks = 0; int next = ucb1_select(&u_lg);
if (next != u_lg.cur) {
if (canary_on) {
FrozenPolicy* np4 = (FrozenPolicy*)hkm_libc_malloc(sizeof(FrozenPolicy)); // Phase 6.X P0
if (np4) {
const FrozenPolicy* cur2 = hkm_policy_get();
double prev_mean = (u_lg.pulls[u_lg.cur] > 0) ? (u_lg.sum_score[u_lg.cur] / (double)u_lg.pulls[u_lg.cur]) : 0.0;
base_lg_mean = prev_mean; trial_lg_sum = 0.0; trial_lg_n = 0; trial_lg_sec = 0;
memcpy(np4, cur2, sizeof(FrozenPolicy));
np4->w_max_large = u_lg.values[next]; hkm_policy_publish(np4);
canary_lg = 1; u_lg.cur = next;
fprintf(stderr, "[Learner] W_MAX large canary start: %.2f (base=%.3f)\n", np4->w_max_large, base_lg_mean);
}
} else {
FrozenPolicy* np4 = (FrozenPolicy*)hkm_libc_malloc(sizeof(FrozenPolicy)); // Phase 6.X P0
if (np4) {
memcpy(np4, hkm_policy_get(), sizeof(FrozenPolicy));
np4->w_max_large = u_lg.values[next]; hkm_policy_publish(np4);
u_lg.cur = next; fprintf(stderr, "[Learner] W_MAX large=%.2f (UCB1)\n", np4->w_max_large);
}
}
}
}
// finish canary trials if time elapsed
if (canary_mid && trial_mid_sec >= trial_sec && trial_mid_n > 0) {
double trial_mean = trial_mid_sum / (double)trial_mid_n;
double thresh = base_mid_mean * (1.0 + adopt_pct);
if (!(trial_mean >= thresh)) {
// revert to best baseline arm (approx: pick best mean so far)
int besti = 0; double bestm=-1e100; for (int i=0;i<u_mid.n;i++){ if (u_mid.pulls[i]>0){ double m=u_mid.sum_score[i]/(double)u_mid.pulls[i]; if (m>bestm){bestm=m;besti=i;} } }
FrozenPolicy* npR = (FrozenPolicy*)hkm_libc_malloc(sizeof(FrozenPolicy)); // Phase 6.X P0
if (npR) { memcpy(npR, hkm_policy_get(), sizeof(FrozenPolicy)); npR->w_max_mid = u_mid.values[besti]; hkm_policy_publish(npR); u_mid.cur = besti; fprintf(stderr, "[Learner] W_MAX mid canary revert to %.2f (trial=%.3f base=%.3f)\n", npR->w_max_mid, trial_mean, base_mid_mean); }
} else {
fprintf(stderr, "[Learner] W_MAX mid canary adopt (trial=%.3f base=%.3f)\n", trial_mean, base_mid_mean);
}
canary_mid = 0; trial_mid_sum=0.0; trial_mid_n=0; trial_mid_sec=0;
}
if (canary_lg && trial_lg_sec >= trial_sec && trial_lg_n > 0) {
double trial_mean = trial_lg_sum / (double)trial_lg_n;
double thresh = base_lg_mean * (1.0 + adopt_pct);
if (!(trial_mean >= thresh)) {
int besti = 0; double bestm=-1e100; for (int i=0;i<u_lg.n;i++){ if (u_lg.pulls[i]>0){ double m=u_lg.sum_score[i]/(double)u_lg.pulls[i]; if (m>bestm){bestm=m;besti=i;} } }
FrozenPolicy* npR = (FrozenPolicy*)hkm_libc_malloc(sizeof(FrozenPolicy)); // Phase 6.X P0
if (npR) { memcpy(npR, hkm_policy_get(), sizeof(FrozenPolicy)); npR->w_max_large = u_lg.values[besti]; hkm_policy_publish(npR); u_lg.cur = besti; fprintf(stderr, "[Learner] W_MAX large canary revert to %.2f (trial=%.3f base=%.3f)\n", npR->w_max_large, trial_mean, base_lg_mean); }
} else {
fprintf(stderr, "[Learner] W_MAX large canary adopt (trial=%.3f base=%.3f)\n", trial_mean, base_lg_mean);
}
canary_lg = 0; trial_lg_sum=0.0; trial_lg_n=0; trial_lg_sec=0;
}
}
// Optional: THP threshold learning (discrete + canary)
const char* thp_learn = getenv("HAKMEM_THP_LEARN");
if (thp_learn && atoi(thp_learn) != 0) {
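// THP threshold learning reuses the same UCB1 + canary machinery as W_MAX
// above; candidates are byte thresholds (defaults 512 KiB .. 2 MiB) and the
// adopted value is written to FrozenPolicy.thp_threshold.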
static ucb1_t u_thp; static int inited=0;
static double thp_vals[8]; int n=0;
static int canary=0; static double base_mean=0.0; static double trial_sum=0.0; static int trial_n=0; static int trial_sec=0; static int trial_elapsed=0; static double adopt_pct=0.01; static int canary_on=1;
if (!inited) {
const char* s = getenv("HAKMEM_THP_CANDIDATES");
n = parse_float_list(s, thp_vals, 8);
if (n <= 0) { thp_vals[0]=524288; thp_vals[1]=786432; thp_vals[2]=1048576; thp_vals[3]=1572864; thp_vals[4]=2097152; n=5; }
ucb1_init(&u_thp, thp_vals, n, get_env_int("HAKMEM_THP_DWELL_SEC", 15));
trial_sec = get_env_int("HAKMEM_THP_TRIAL_SEC", 6);
adopt_pct = (getenv("HAKMEM_THP_ADOPT_PCT") ? atof(getenv("HAKMEM_THP_ADOPT_PCT")) : 0.015);
canary_on = get_env_int("HAKMEM_THP_CANARY", 1);
inited=1;
}
u_thp.dwell_ticks += window_ms/1000;
// reuse same score proxy
hkm_ace_stats_snapshot_t ace;
hkm_ace_stats_snapshot(&ace, 1);
double score = (double)(ace.mid_hit + ace.large_hit) - (double)(ace.mid_miss + ace.large_miss) - 2.0*(double)ace.l1_fallback;
ucb1_update(&u_thp, u_thp.cur, score);
if (canary) { trial_sum += score; trial_n++; trial_elapsed += window_ms/1000; }
if (u_thp.dwell_ticks >= u_thp.dwell_sec && !canary) {
u_thp.dwell_ticks = 0; int next = ucb1_select(&u_thp);
if (next != u_thp.cur) {
if (canary_on) {
FrozenPolicy* np5 = (FrozenPolicy*)hkm_libc_malloc(sizeof(FrozenPolicy)); // Phase 6.X P0
if (np5) {
const FrozenPolicy* cur2 = hkm_policy_get();
double prev_mean = (u_thp.pulls[u_thp.cur] > 0) ? (u_thp.sum_score[u_thp.cur] / (double)u_thp.pulls[u_thp.cur]) : 0.0;
base_mean = prev_mean; trial_sum = 0.0; trial_n = 0; trial_elapsed = 0;
memcpy(np5, cur2, sizeof(FrozenPolicy));
np5->thp_threshold = (size_t)u_thp.values[next]; hkm_policy_publish(np5);
canary = 1; u_thp.cur = next;
fprintf(stderr, "[Learner] THP canary start: %zu (base=%.3f)\n", np5->thp_threshold, base_mean);
}
} else {
FrozenPolicy* np5 = (FrozenPolicy*)hkm_libc_malloc(sizeof(FrozenPolicy)); // Phase 6.X P0
if (np5) {
memcpy(np5, hkm_policy_get(), sizeof(FrozenPolicy));
np5->thp_threshold = (size_t)u_thp.values[next]; hkm_policy_publish(np5);
u_thp.cur = next; fprintf(stderr, "[Learner] THP threshold=%zu (UCB1)\n", np5->thp_threshold);
}
}
}
}
if (canary && trial_elapsed >= trial_sec && trial_n > 0) {
double trial_mean = trial_sum / (double)trial_n;
double thresh = base_mean * (1.0 + adopt_pct);
if (!(trial_mean >= thresh)) {
int besti = 0; double bestm=-1e100; for (int i=0;i<u_thp.n;i++){ if (u_thp.pulls[i]>0){ double m=u_thp.sum_score[i]/(double)u_thp.pulls[i]; if (m>bestm){bestm=m;besti=i;} } }
FrozenPolicy* npR = (FrozenPolicy*)hkm_libc_malloc(sizeof(FrozenPolicy)); // Phase 6.X P0
if (npR) { memcpy(npR, hkm_policy_get(), sizeof(FrozenPolicy)); npR->thp_threshold = (size_t)u_thp.values[besti]; hkm_policy_publish(npR); u_thp.cur = besti; fprintf(stderr, "[Learner] THP canary revert to %zu (trial=%.3f base=%.3f)\n", npR->thp_threshold, trial_mean, base_mean); }
} else {
fprintf(stderr, "[Learner] THP canary adopt (trial=%.3f base=%.3f)\n", trial_mean, base_mean);
}
canary = 0; trial_sum = 0.0; trial_n = 0; trial_elapsed = 0;
}
}
}
return NULL;
}
void hkm_learner_init(void) {
if (!hak_learner_env_should_run()) {
return;
}
if (g_run) return;
g_run = 1;
if (pthread_create(&g_thr, NULL, learner_main, NULL) != 0) {
g_run = 0;
fprintf(stderr, "[Learner] pthread_create failed\n");
} else {
fprintf(stderr, "[Learner] Started (CAP auto-tuner)\n");
}
}
void hkm_learner_shutdown(void) {
if (!g_run) return;
g_run = 0;
pthread_join(g_thr, NULL);
fprintf(stderr, "[Learner] Stopped\n");
}