Files
hakmem/core/hakmem_tiny_intel.inc
Moe Charm (CI) acc64f2438 Phase ML1: Pool v1 memset 89.73% overhead 軽量化 (+15.34% improvement)
## Summary
- ChatGPT により bench_profile.h の setenv segfault を修正(RTLD_NEXT 経由に切り替え)
- core/box/pool_zero_mode_box.h 新設:ENV キャッシュ経由で ZERO_MODE を統一管理
- core/hakmem_pool.c で zero mode に応じた memset 制御(FULL/header/off)
- A/B テスト結果:ZERO_MODE=header で +15.34% improvement(1M iterations, C6-heavy)

## Files Modified
- core/box/pool_api.inc.h: pool_zero_mode_box.h include
- core/bench_profile.h: glibc setenv → malloc+putenv(segfault 回避)
- core/hakmem_pool.c: zero mode 参照・制御ロジック
- core/box/pool_zero_mode_box.h (新設): enum/getter
- CURRENT_TASK.md: Phase ML1 結果記載

## Test Results
| Iterations | ZERO_MODE=full | ZERO_MODE=header | Improvement |
|-----------|----------------|-----------------|------------|
| 10K       | 3.06 M ops/s   | 3.17 M ops/s    | +3.65%     |
| 1M        | 23.71 M ops/s  | 27.34 M ops/s   | **+15.34%** |

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Haiku 4.5 <noreply@anthropic.com>
2025-12-10 09:08:18 +09:00

528 lines
19 KiB
C++

// One allocation event record; g_ev_ring is an array of these.
// Field order and widths are kept as-is so the in-memory layout is unchanged.
typedef struct {
    // Timestamp in nanoseconds (monotonic clock).
    uint64_t ts_ns;
    // Requested/served size.
    uint32_t size;
    // Callsite id (optional; 0 if unknown).
    uint32_t site_id;
    // Latency bucket (optional; 0 if unknown).
    uint16_t latency_bucket;
    // Which tier handled/refilled the request (SLL/MAG/SLAB/SUPER/FRONT).
    uint8_t  tier_hit;
    // Bit flags: burst/sequential/random (0 = unused).
    uint8_t  flags;
    // Tiny size-class index.
    uint16_t class_idx;
    // Low bits of the thread id (best-effort).
    uint16_t thread_id;
} AllocEvent;
// Forward declaration of the guarded madvise wrapper (defined in ss_os_acquire_box.h).
// In this file it is called by superslab_partial_release(), which treats a return
// value of 0 as success.
extern int ss_os_madvise_guarded(void* ptr, size_t len, int advice, const char* where);
// Event ring capacity. 65536 is a power of two, so EVENTQ_MASK (capacity - 1)
// can be used to wrap an index with a bitwise AND.
#define EVENTQ_CAP 65536u
#define EVENTQ_MASK (EVENTQ_CAP - 1u)
// Atomic tail/head counters and the backing storage for AllocEvent records.
// NOTE(review): the producer/consumer roles of tail vs. head are not visible in
// this part of the file — confirm against the code that reads/writes them.
static _Atomic uint32_t g_ev_tail = 0;
static _Atomic uint32_t g_ev_head = 0;
static AllocEvent g_ev_ring[EVENTQ_CAP];
// Intelligence-engine switches and background-thread bookkeeping.
static int g_int_engine = 0; // HAKMEM_INT_ENGINE=1
static pthread_t g_int_thread; // thread handle; its start/join code is outside this view
// NOTE(review): `volatile` gives no inter-thread ordering guarantees; if this flag
// is written by one thread and polled by another, an _Atomic type may be intended.
static volatile int g_int_stop = 0;
static int g_int_started = 0;
// The OBS (observation) feature is disabled. If it is ever needed again,
// restore it from git history.
// Event-kind codes accepted by tiny_obs_record(); kept so existing call sites
// still compile against the stubs below.
#define TINY_OBS_TLS_HIT 1
#define TINY_OBS_TLS_MISS 2
#define TINY_OBS_SPILL_SS 3
#define TINY_OBS_SPILL_OWNER 4
#define TINY_OBS_SPILL_MAG 5
#define TINY_OBS_SPILL_REQUEUE 6
// No-op stub: takes no arguments and does nothing.
static inline void tiny_obs_update_interval(void) {}
// No-op stub: both arguments are ignored.
static inline void tiny_obs_record(uint8_t kind, int class_idx) { (void)kind; (void)class_idx; }
// No-op stub: the event pointer is ignored.
static inline void tiny_obs_process(const void* ev_unused) { (void)ev_unused; }
// ---------------------------------------------------------------------------
// Tiny ACE (Adaptive Cache Engine) state machine
// ---------------------------------------------------------------------------
// Per-class ACE state. The numeric values are part of the interface and are
// spelled out explicitly so they cannot drift if an entry is inserted.
typedef enum {
    // Default state; caps drift back toward their base values.
    ACE_STATE_STEADY       = 0,
    // Spill/miss pressure detected; caps and batch size are raised.
    ACE_STATE_BURST        = 1,
    // Remote-spill EMA dominates; the SLL cap is grown.
    ACE_STATE_REMOTE_HEAVY = 2,
    // g_ace_mem_tight_flag is set; caps are cut and a trim is requested.
    ACE_STATE_MEM_TIGHT    = 3
} TinyAceStateId;
// Running statistics and current state for one tiny class (one entry of g_ace_state).
typedef struct {
    // Exponential moving averages maintained by tiny_ace_collect_stats().
    uint64_t       ema_ops;        // operations per tick
    uint64_t       ema_spill;      // total spills per tick
    uint64_t       ema_remote;     // remote spills per tick
    uint64_t       ema_miss;       // misses per tick
    // Current state of the class.
    TinyAceStateId state;
    // Time (ns) of the last state change; compared against ACE_COOLDOWN_NS
    // before another change is allowed.
    uint64_t       last_switch_ns;
} TinyAceState;
// Tuning decisions for one tiny class (one entry of g_ace_policy).
// Written by tiny_ace_collect_stats() / tiny_ace_refresh_hot_ranks() and pushed
// into the live globals by tiny_ace_apply_policies().
typedef struct {
    // Magazine cap; applied to g_mag_cap_override[].
    uint16_t       mag_cap;
    // SLL cap; kept only in the policy (never below mag_cap after apply).
    uint16_t       sll_cap;
    // Fast-tier cap; applied to g_fast_cap[] unless that class is locked.
    uint16_t       fast_cap;
    // Batch size chosen per state (32..96 in the visible code).
    // NOTE(review): the consumer of this field is outside this view.
    uint16_t       batch;
    // Hot-magazine capacity and refill target; applied to g_hotmag_*_current[].
    uint16_t       hotmag_cap;
    uint16_t       hotmag_refill;
    // Takes the values 0, 1 or 2 depending on state.
    // NOTE(review): bit meaning is not visible here — confirm with the reader.
    uint8_t        drain_mask;
    // Set to 20 or 21 — presumably log2 of a slab size; TODO confirm.
    uint8_t        slab_lg;
    // Mirror of TinyAceState.state at the time the policy was computed.
    TinyAceStateId state;
    // 0 = not hot; 2 = busiest class, 1 = second, 3 = third (by ema_ops).
    uint8_t        hot_rank;
    // Non-zero asks tiny_ace_apply_policies() to call tiny_tls_request_trim().
    uint8_t        request_trim;
    // Copy of ema_ops taken when the policy was last computed.
    uint64_t       ema_ops_snapshot;
} TinyAcePolicy;
// Per-class ACE statistics and the policies derived from them.
static TinyAceState g_ace_state[TINY_NUM_CLASSES];
static TinyAcePolicy g_ace_policy[TINY_NUM_CLASSES];
// Timestamp (ns) used as "now" by tiny_ace_collect_stats(); its writer is outside this view.
static uint64_t g_ace_tick_now_ns = 0;
// 1 while RSS is considered tight; maintained by tiny_ace_update_mem_tight()
// with hysteresis (set at >= 95% of budget, cleared at <= 85%).
static int g_ace_mem_tight_flag = 0;
// Time (ns) of the last RSS sample taken by tiny_ace_update_mem_tight().
static uint64_t g_ace_last_rss_check_ns = 0;
static int g_tiny_rss_budget_kb = 0; // HAKMEM_TINY_RSS_BUDGET_KB (0=disabled)
static int g_tiny_int_tight = 0; // HAKMEM_TINY_INT_TIGHT=1 → bias caps downward
static int g_tiny_diet_step = 16; // HAKMEM_TINY_DIET_STEP (cap decrement step)
// The initializer lists 8 values, so TINY_NUM_CLASSES must be at least 8 for this to compile.
static int g_tiny_cap_floor[TINY_NUM_CLASSES] = { 64, 64, 64, 128, 64, 64, 64, 64 }; // min MAG cap per class
// Minimum time between two state changes of the same class.
#define ACE_COOLDOWN_NS (800ULL * 1000 * 1000) // 0.8s
// Minimum time between two RSS samples.
#define ACE_RSS_CHECK_NS (500ULL * 1000 * 1000) // 0.5s
// EMA divisor: each update keeps (W-1)/W of the old value and adds 1/W of the sample.
#define ACE_EMA_WEIGHT 8
// Default per-tick adjustment steps for the magazine and SLL caps.
#define ACE_MAG_STEP_DEFAULT 8
#define ACE_SLL_STEP_DEFAULT 16
// Read CLOCK_MONOTONIC and return it as a single nanosecond count.
// The return value of clock_gettime() is not checked.
static inline uint64_t tiny_ace_now_ns(void) {
    struct timespec now;
    clock_gettime(CLOCK_MONOTONIC, &now);

    const uint64_t whole_seconds_ns = (uint64_t)now.tv_sec * 1000000000ULL;
    const uint64_t fraction_ns = (uint64_t)now.tv_nsec;
    return whole_seconds_ns + fraction_ns;
}
// Integer exponential moving average with divisor ACE_EMA_WEIGHT.
//   prev   - previous average; 0 means "no history yet"
//   sample - new observation
// Returns `sample` unchanged when there is no history, otherwise
// (prev * (W - 1) + sample) / W using truncating unsigned division.
static inline uint64_t tiny_ace_ema(uint64_t prev, uint64_t sample) {
    if (prev != 0) {
        const uint64_t weighted_sum = (prev * (ACE_EMA_WEIGHT - 1)) + sample;
        return weighted_sum / ACE_EMA_WEIGHT;
    }
    // First sample seeds the average directly.
    return sample;
}
// EXTRACTED: static int get_rss_kb_self(void);
// Refresh g_ace_mem_tight_flag from the process RSS, at most once per ACE_RSS_CHECK_NS.
//   now_ns - current time in nanoseconds (same clock as g_ace_last_rss_check_ns)
// Behaviour:
//   - budget <= 0 (feature disabled): the flag is cleared and nothing is sampled.
//   - RSS >= 95% of budget: flag set; RSS <= 85% of budget: flag cleared;
//     in between the flag keeps its previous value (hysteresis).
//   - get_rss_kb_self() returning <= 0 leaves the flag untouched.
static __attribute__((unused)) void tiny_ace_update_mem_tight(uint64_t now_ns) {
    if (g_tiny_rss_budget_kb <= 0) {
        g_ace_mem_tight_flag = 0;
        return;
    }
    // Rate limit: RSS sampling is not free, so skip if the last sample is recent.
    if (now_ns - g_ace_last_rss_check_ns < ACE_RSS_CHECK_NS) {
        return;
    }
    g_ace_last_rss_check_ns = now_ns;

    int rss_kb = get_rss_kb_self();
    if (rss_kb <= 0) {
        return;
    }

    // Compute the thresholds in 64-bit: `budget * 95` in plain int overflows
    // (undefined behaviour) once the budget exceeds roughly 22.6M KB (~21.5 GiB).
    const int64_t budget_kb = (int64_t)g_tiny_rss_budget_kb;
    const int64_t high_kb = (budget_kb * 95) / 100;
    const int64_t low_kb = (budget_kb * 85) / 100;

    if ((int64_t)rss_kb >= high_kb) {
        g_ace_mem_tight_flag = 1;
    } else if ((int64_t)rss_kb <= low_kb) {
        g_ace_mem_tight_flag = 0;
    }
}
// Recompute the adaptive policy for tiny class `idx` and store it in g_ace_policy[idx].
//   idx       - tiny class index; used to index per-class arrays without a bounds check
//   st_unused - ignored
// Steps: fold this tick's samples into the EMAs of g_ace_state[idx], choose the
// next state (state changes are rate-limited by ACE_COOLDOWN_NS), then derive
// magazine / SLL / fast / hot-magazine caps for the resulting state.
// NOTE(review): the four samples below are hard-coded to 0 in this revision, so
// the early return after the EMA update is always taken and everything after it
// is currently unreachable.
static __attribute__((unused)) void tiny_ace_collect_stats(int idx, const void* st_unused) {
TinyAceState* cs = &g_ace_state[idx];
// Work on a copy of the policy; it is written back on both exit paths.
TinyAcePolicy pol = g_ace_policy[idx];
uint64_t now = g_ace_tick_now_ns;
(void)st_unused;
// Per-tick samples (all fixed at 0 here).
uint64_t ops = 0;
uint64_t spills_total = 0;
uint64_t remote_spill = 0;
uint64_t miss = 0;
// With a zero sample, an EMA that is 0 stays 0 and a non-zero EMA decays
// by (ACE_EMA_WEIGHT - 1) / ACE_EMA_WEIGHT.
cs->ema_ops = tiny_ace_ema(cs->ema_ops, ops);
cs->ema_spill = tiny_ace_ema(cs->ema_spill, spills_total);
cs->ema_remote = tiny_ace_ema(cs->ema_remote, remote_spill);
cs->ema_miss = tiny_ace_ema(cs->ema_miss, miss);
// Idle tick: only refresh the ops snapshot and leave the rest of the policy alone.
if (ops == 0 && spills_total == 0) {
pol.ema_ops_snapshot = cs->ema_ops;
g_ace_policy[idx] = pol;
return;
}
// Pick the candidate state; the checks are ordered by priority.
TinyAceStateId next_state;
if (g_ace_mem_tight_flag) {
next_state = ACE_STATE_MEM_TIGHT;
} else if (spills_total > 0) {
next_state = ACE_STATE_BURST;
} else if (cs->ema_remote > 16 && cs->ema_remote >= (cs->ema_spill / 3 + 1)) {
next_state = ACE_STATE_REMOTE_HEAVY;
} else if (cs->ema_spill > 32 || cs->ema_miss > 16 || miss > 16) {
next_state = ACE_STATE_BURST;
} else {
next_state = ACE_STATE_STEADY;
}
// Only switch if the cooldown since the last switch has elapsed.
if (next_state != cs->state) {
if (now - cs->last_switch_ns >= ACE_COOLDOWN_NS) {
cs->state = next_state;
cs->last_switch_ns = now;
} else {
// Cooldown still active: stay in the current state (next_state is not read again).
next_state = cs->state;
}
}
pol.state = cs->state;
pol.hot_rank = 0;
pol.request_trim = 0;
// Magazine cap bounds: [mag_min, mag_max], both limited by g_mag_cap_limit,
// with the lower bound never below 16.
int base_mag = tiny_default_cap(idx);
if (base_mag > g_mag_cap_limit) base_mag = g_mag_cap_limit;
if (base_mag < 16) base_mag = 16;
int mag_min = base_mag;
int mag_max = tiny_cap_max_for_class(idx);
if (mag_max > g_mag_cap_limit) mag_max = g_mag_cap_limit;
if (mag_max < mag_min) mag_max = mag_min;
// Current magazine cap: the live override if set, otherwise the base, clamped to the bounds.
int current_mag = g_mag_cap_override[idx];
if (current_mag <= 0) current_mag = base_mag;
if (current_mag < mag_min) current_mag = mag_min;
if (current_mag > mag_max) current_mag = mag_max;
int mag_step = ACE_MAG_STEP_DEFAULT;
if (mag_step < 1) mag_step = 1;
// Current SLL cap: never below the magazine cap and never below 32.
int current_sll = pol.sll_cap;
if (current_sll < current_mag) current_sll = current_mag;
if (current_sll < 32) current_sll = 32;
int sll_step = ACE_SLL_STEP_DEFAULT;
if (sll_step < 1) sll_step = 1;
int sll_max = TINY_TLS_MAG_CAP;
// Fast-tier cap: fall back to the per-class default when the live value is 0.
uint16_t base_fast = g_fast_cap_defaults[idx];
uint16_t current_fast = g_fast_cap[idx];
if (current_fast == 0 && base_fast > 0) current_fast = base_fast;
// Defaults for the new policy; the per-state switch below overrides some of them.
uint16_t new_fast = current_fast;
uint16_t new_batch = (idx <= 3) ? 64 : 48;
uint8_t new_drain = 2;
uint8_t new_slab_lg = 20;
int new_mag = current_mag;
int new_sll = current_sll;
// Hot-magazine cap/refill start from the current effective values and are
// later clamped to [hot_cap_floor, hot_cap_limit] (limit itself kept in 32..512).
int hot_cap_new = (int)hotmag_effective_cap(idx);
int hot_refill_new = (int)hotmag_refill_target(idx);
int hot_cap_limit = g_hotmag_cap_default + 64;
if (hot_cap_limit < 32) hot_cap_limit = 32;
if (hot_cap_limit > 512) hot_cap_limit = 512;
int hot_cap_floor = 24;
if (hot_cap_floor > hot_cap_limit) hot_cap_floor = hot_cap_limit;
switch (cs->state) {
// STEADY: decay magazine and SLL caps by half a step toward their targets,
// shrink the hot magazine, and reset the fast cap to its default.
case ACE_STATE_STEADY: {
if (new_mag > mag_min) {
int dec = mag_step / 2;
if (dec < 1) dec = 1;
new_mag -= dec;
if (new_mag < mag_min) new_mag = mag_min;
}
int target_sll = new_mag * ((g_sll_multiplier > 0) ? g_sll_multiplier : 2);
if (target_sll < new_mag) target_sll = new_mag;
if (new_sll > target_sll) {
int dec = sll_step / 2;
if (dec < 1) dec = 1;
new_sll -= dec;
if (new_sll < target_sll) new_sll = target_sll;
}
if (g_hotmag_enable && idx <= 3) {
if (!g_hotmag_cap_locked[idx]) hot_cap_new -= 16;
if (!g_hotmag_refill_locked[idx]) hot_refill_new -= 8;
}
if (g_fast_enable) new_fast = base_fast;
new_drain = 2;
break;
}
// BURST: grow every cap, raise the batch size, and use slab_lg 21 for classes 0..3.
case ACE_STATE_BURST: {
if (g_hotmag_enable && idx <= 3) {
if (!g_hotmag_cap_locked[idx]) hot_cap_new += 32;
if (!g_hotmag_refill_locked[idx]) hot_refill_new += 16;
}
new_mag += mag_step;
if (new_mag > mag_max) new_mag = mag_max;
int target_sll = new_mag * ((g_sll_multiplier > 0) ? g_sll_multiplier : 2) + sll_step;
if (target_sll > sll_max) target_sll = sll_max;
if (target_sll > new_sll) new_sll = target_sll;
if (g_fast_enable) {
// Computed in 32-bit so the +64 cannot wrap a uint16_t before the clamp.
uint32_t f = (uint32_t)((base_fast > 0) ? base_fast : current_fast) + 64u;
if (f > TINY_TLS_MAG_CAP) f = TINY_TLS_MAG_CAP;
new_fast = (uint16_t)f;
}
new_batch = (idx <= 3) ? 96 : 64;
new_drain = 1;
if (idx <= 3) new_slab_lg = 21;
break;
}
// REMOTE_HEAVY: grow the SLL by a full step and the magazine by half a step;
// drain mask becomes 0 and the fast cap returns to its default.
case ACE_STATE_REMOTE_HEAVY: {
if (g_hotmag_enable && idx <= 3) {
if (!g_hotmag_cap_locked[idx]) hot_cap_new += 16;
if (!g_hotmag_refill_locked[idx]) hot_refill_new += 8;
}
int target_sll = new_sll + sll_step;
if (target_sll > sll_max) target_sll = sll_max;
new_sll = target_sll;
if (new_mag < mag_max) {
int inc = mag_step / 2;
if (inc < 1) inc = 1;
new_mag += inc;
if (new_mag > mag_max) new_mag = mag_max;
}
new_drain = 0;
if (g_fast_enable) new_fast = base_fast;
break;
}
// MEM_TIGHT: cut every cap, halve the fast cap (not below 16 when a default
// exists), reduce the batch size, and request a trim.
case ACE_STATE_MEM_TIGHT: {
if (g_hotmag_enable && idx <= 3) {
if (!g_hotmag_cap_locked[idx]) hot_cap_new -= 24;
if (!g_hotmag_refill_locked[idx]) hot_refill_new /= 2;
}
new_mag -= mag_step * 2;
if (new_mag < mag_min) new_mag = mag_min;
new_sll -= sll_step;
if (new_sll < new_mag) new_sll = new_mag;
pol.request_trim = 1;
if (g_fast_enable) {
if (base_fast > 0) {
uint32_t f = base_fast / 2;
if (f < 16) f = 16;
if (f > TINY_TLS_MAG_CAP) f = TINY_TLS_MAG_CAP;
new_fast = (uint16_t)f;
} else {
new_fast = 0;
}
}
new_batch = (idx <= 3) ? 48 : 32;
new_drain = 2;
new_slab_lg = 20;
break;
}
}
// Final hot-magazine clamp. Locked values, and classes where the hot magazine
// does not apply, are re-read from the live getters so the adjustments above are discarded.
if (g_hotmag_enable && idx <= 3) {
if (!g_hotmag_cap_locked[idx]) {
if (hot_cap_new > hot_cap_limit) hot_cap_new = hot_cap_limit;
if (hot_cap_new < hot_cap_floor) hot_cap_new = hot_cap_floor;
} else {
hot_cap_new = (int)hotmag_effective_cap(idx);
}
if (!g_hotmag_refill_locked[idx]) {
// Refill target: 0, or within [8, hot_cap_new].
if (hot_refill_new < 0) hot_refill_new = 0;
if (hot_refill_new > hot_cap_new) hot_refill_new = hot_cap_new;
if (hot_refill_new > 0 && hot_refill_new < 8) hot_refill_new = 8;
} else {
hot_refill_new = (int)hotmag_refill_target(idx);
}
} else {
hot_cap_new = (int)hotmag_effective_cap(idx);
hot_refill_new = (int)hotmag_refill_target(idx);
}
// Final clamps on magazine/SLL caps; any shrink of the magazine also requests a trim.
if (new_mag > mag_max) new_mag = mag_max;
if (new_mag < mag_min) new_mag = mag_min;
if (new_sll > sll_max) new_sll = sll_max;
if (new_sll < new_mag) new_sll = new_mag;
if (new_mag < current_mag) pol.request_trim = 1;
if (!g_fast_enable) new_fast = 0;
if (new_fast > TINY_TLS_MAG_CAP) new_fast = TINY_TLS_MAG_CAP;
// Publish the computed values into the policy slot for this class.
pol.mag_cap = (uint16_t)new_mag;
pol.sll_cap = (uint16_t)new_sll;
pol.fast_cap = new_fast;
pol.batch = new_batch;
pol.drain_mask = new_drain;
pol.slab_lg = new_slab_lg;
pol.hotmag_cap = (uint16_t)hot_cap_new;
pol.hotmag_refill = (uint16_t)hot_refill_new;
pol.ema_ops_snapshot = cs->ema_ops;
g_ace_policy[idx] = pol;
}
// Rank the tiny classes by EMA op count, then retune fast caps and hot-pop hooks.
//
// The (up to) three classes with the largest non-zero g_ace_state[].ema_ops get
// a hot_rank tag: busiest = 2, second = 1, third = 3. All other classes get 0.
// On a tie the lower class index keeps the better position.
//
// If g_fast_enable is off, every policy fast_cap and every g_hot_alloc_fn slot
// is cleared and the function returns. Otherwise each policy fast_cap is
// adjusted (bonus for ranked classes, relaxation toward the default for the
// rest, untouched in MEM_TIGHT) and hot-pop hooks are installed for ranked
// classes 0..3.
static __attribute__((unused)) void tiny_ace_refresh_hot_ranks(void) {
    // best_idx[0] is the busiest class; -1 marks an empty position.
    int best_idx[3] = { -1, -1, -1 };
    uint64_t best_ops[3] = { 0, 0, 0 };

    for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
        const uint64_t ops = g_ace_state[cls].ema_ops;

        // Find the first position this class strictly beats.
        int pos = 0;
        while (pos < 3 && ops <= best_ops[pos]) {
            pos++;
        }
        if (pos == 3) {
            continue;
        }
        // Shift the lower positions down by one and insert.
        for (int k = 2; k > pos; k--) {
            best_ops[k] = best_ops[k - 1];
            best_idx[k] = best_idx[k - 1];
        }
        best_ops[pos] = ops;
        best_idx[pos] = cls;
    }

    // Reset all ranks (and fast caps when the fast tier is disabled).
    for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
        g_ace_policy[cls].hot_rank = 0;
        if (!g_fast_enable) {
            g_ace_policy[cls].fast_cap = 0;
        }
    }

    // Rank encoding: 2 = busiest, 1 = second, 3 = third.
    if (best_idx[0] >= 0) g_ace_policy[best_idx[0]].hot_rank = 2;
    if (best_idx[1] >= 0) g_ace_policy[best_idx[1]].hot_rank = 1;
    if (best_idx[2] >= 0) g_ace_policy[best_idx[2]].hot_rank = 3;

    if (!g_fast_enable) {
        for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
            g_hot_alloc_fn[cls] = NULL;
        }
        return;
    }

    // Adjust the fast cap of every class.
    for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
        TinyAcePolicy* policy = &g_ace_policy[cls];
        uint32_t cap = policy->fast_cap;

        // In MEM_TIGHT the already-tightened cap is kept as it is.
        if (policy->state != ACE_STATE_MEM_TIGHT) {
            switch (policy->hot_rank) {
            case 2:
                cap += 48u;
                break;
            case 1:
                cap += 24u;
                break;
            case 3:
                cap += 16u;
                break;
            default: {
                // Unranked: step down toward the default by at most 32 per call.
                uint32_t base = g_fast_cap_defaults[cls];
                if (cap > base) {
                    uint32_t step = cap - base;
                    if (step > 32u) step = 32u;
                    cap = (cap > step) ? (cap - step) : base;
                } else {
                    cap = base;
                }
                break;
            }
            }
        }

        if (cap > TINY_TLS_MAG_CAP) cap = TINY_TLS_MAG_CAP;
        policy->fast_cap = (uint16_t)cap;
    }

    // Install hot-pop hooks: only ranked classes 0..3 have one.
    for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
        TinyHotAllocFn hook = NULL;
        if (g_ace_policy[cls].hot_rank != 0) {
            if (cls == 0) {
                hook = tiny_hot_pop_class0;
            } else if (cls == 1) {
                hook = tiny_hot_pop_class1;
            } else if (cls == 2) {
                hook = tiny_hot_pop_class2;
            } else if (cls == 3) {
                hook = tiny_hot_pop_class3;
            } else {
                hook = NULL;
            }
        }
        g_hot_alloc_fn[cls] = hook;
    }
}
// Push every class's policy from g_ace_policy[] into the live tuning globals.
// Per class:
//   - magazine cap: clamped (min 16, then max TINY_TLS_MAG_CAP), stored in
//     g_mag_cap_override[] and published via tiny_tls_publish_targets() when it changed;
//   - trim: requested when the policy asks for it or the magazine cap shrank;
//   - SLL cap: clamped to [magazine cap, TINY_TLS_MAG_CAP] and written back to the policy only;
//   - fast cap: copied to g_fast_cap[] when the fast tier is on and the class is not locked;
//   - hot magazine: cap/refill copied to g_hotmag_*_current[] for hot classes
//     (unless locked), followed by hotmag_init_if_needed().
static __attribute__((unused)) void tiny_ace_apply_policies(void) {
    for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
        TinyAcePolicy* policy = &g_ace_policy[cls];

        // Magazine cap in effect before this call (override, else class default).
        int old_mag = g_mag_cap_override[cls];
        if (old_mag <= 0) {
            old_mag = tiny_default_cap(cls);
        }

        int mag = policy->mag_cap;
        if (mag < 16) mag = 16;
        if (mag > TINY_TLS_MAG_CAP) mag = TINY_TLS_MAG_CAP;

        if (g_mag_cap_override[cls] != mag) {
            g_mag_cap_override[cls] = mag;
            tiny_tls_publish_targets(cls, (uint32_t)mag);
        }

        if (policy->request_trim || mag < old_mag) {
            tiny_tls_request_trim(cls, 0);
        }

        // SLL cap is published only into the policy (no global override).
        int sll = policy->sll_cap;
        if (sll < mag) sll = mag;
        if (sll > TINY_TLS_MAG_CAP) sll = TINY_TLS_MAG_CAP;
        policy->sll_cap = (uint16_t)sll;

        if (g_fast_enable && !g_fast_cap_locked[cls]) {
            uint16_t fast = policy->fast_cap;
            if (fast > TINY_TLS_MAG_CAP) fast = TINY_TLS_MAG_CAP;
            g_fast_cap[cls] = fast;
        }

        // Everything below concerns the hot magazine only.
        if (!(g_hotmag_enable && hkm_is_hot_class(cls))) {
            continue;
        }

        if (!g_hotmag_cap_locked[cls]) {
            uint16_t cap = policy->hotmag_cap;
            if (cap < 16) cap = 16;
            if (cap > 512) cap = 512;
            // Store only on change.
            if (g_hotmag_cap_current[cls] != cap) {
                g_hotmag_cap_current[cls] = cap;
            }
        }

        if (!g_hotmag_refill_locked[cls]) {
            uint16_t refill = policy->hotmag_refill;
            // The refill target may not exceed the current capacity.
            if (refill > g_hotmag_cap_current[cls]) refill = g_hotmag_cap_current[cls];
            g_hotmag_refill_current[cls] = refill;
        }

        hotmag_init_if_needed(cls);
    }
}
// Reset ACE state and policy for every tiny class to the STEADY defaults.
// For each class: EMAs are zeroed, last_switch_ns is set to the current time,
// caps are derived from tiny_default_cap() / g_fast_cap_defaults[] / the
// hot-magazine getters, and g_mag_cap_override[] is seeded if it was unset (<= 0).
// Finally the magazine targets of all classes are published.
// NOTE(review): only classes 0..2 get a hot-pop hook here, while
// tiny_ace_refresh_hot_ranks() also maps class 3 — confirm this is intended.
static __attribute__((unused)) void tiny_ace_init_defaults(void) {
    const uint64_t start_ns = tiny_ace_now_ns();
    const int sll_mult = (g_sll_multiplier > 0) ? g_sll_multiplier : 2;

    for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
        // Fresh statistics, starting in STEADY with the cooldown clock at "now".
        TinyAceState* stats = &g_ace_state[cls];
        stats->state = ACE_STATE_STEADY;
        stats->last_switch_ns = start_ns;
        stats->ema_ops = 0;
        stats->ema_spill = 0;
        stats->ema_remote = 0;
        stats->ema_miss = 0;

        TinyAcePolicy* policy = &g_ace_policy[cls];
        policy->state = ACE_STATE_STEADY;
        policy->hot_rank = 0;
        policy->request_trim = 0;
        policy->ema_ops_snapshot = 0;

        // Magazine cap: class default, limited by g_mag_cap_limit, at least 16.
        int mag = tiny_default_cap(cls);
        if (mag > g_mag_cap_limit) mag = g_mag_cap_limit;
        if (mag < 16) mag = 16;
        policy->mag_cap = (uint16_t)mag;

        // SLL cap: a multiple of the magazine cap, limited by TINY_TLS_MAG_CAP.
        int sll_cap = mag * sll_mult;
        if (sll_cap > TINY_TLS_MAG_CAP) sll_cap = TINY_TLS_MAG_CAP;
        policy->sll_cap = (uint16_t)sll_cap;

        policy->fast_cap = g_fast_cap_defaults[cls];
        policy->batch = (cls <= 3) ? 64 : 48;
        policy->drain_mask = 2;
        policy->slab_lg = 20;
        policy->hotmag_cap = hotmag_effective_cap(cls);
        policy->hotmag_refill = hotmag_refill_target(cls);

        // Keep an override that is already set; seed it otherwise.
        if (g_mag_cap_override[cls] <= 0) {
            g_mag_cap_override[cls] = policy->mag_cap;
        }

        // Initial hot-pop hooks: classes 0..2 only.
        if (cls == 0) {
            g_hot_alloc_fn[cls] = tiny_hot_pop_class0;
        } else if (cls == 1) {
            g_hot_alloc_fn[cls] = tiny_hot_pop_class1;
        } else if (cls == 2) {
            g_hot_alloc_fn[cls] = tiny_hot_pop_class2;
        } else {
            g_hot_alloc_fn[cls] = NULL;
        }
    }

    // Publish after all classes are initialised.
    for (int cls = 0; cls < TINY_NUM_CLASSES; cls++) {
        tiny_tls_publish_targets(cls, (uint32_t)g_mag_cap_override[cls]);
    }
}
// Advise the OS that the memory of `ss` is not needed (MADV_DONTNEED) and, on
// success, record `epoch` in ss->partial_epoch.
//   ss    - superslab to release; ignored if NULL or if its magic does not match
//   epoch - current epoch; 0 bypasses the interval check below
// Nothing is done when g_ss_partial_enable is off, or when epoch is non-zero
// and fewer than g_ss_partial_interval epochs have passed since the last
// recorded release. The advised length is 1 << ss->lg_size bytes starting at `ss`.
// Compiles to a no-op on platforms that do not define MADV_DONTNEED.
static inline void superslab_partial_release(SuperSlab* ss, uint32_t epoch) {
#if !defined(MADV_DONTNEED)
    (void)ss; (void)epoch;
#else
    if (!g_ss_partial_enable) {
        return;
    }
    if (!ss || ss->magic != SUPERSLAB_MAGIC) {
        return;
    }

    uint32_t last_epoch = ss->partial_epoch;
    if (epoch != 0 && (epoch - last_epoch) < g_ss_partial_interval) {
        return;
    }

    size_t span_bytes = (size_t)1 << ss->lg_size;
    int rc = ss_os_madvise_guarded(ss, span_bytes, MADV_DONTNEED, "tiny_ss_partial");
    if (rc == 0) {
        ss->partial_epoch = epoch;
    }
#endif
}
// NOTE(review): a "Tiny diet (memory-tight) controls" heading used to sit here,
// but no diet control follows it; the g_tiny_* diet knobs are declared next to
// the ACE globals earlier in this file.
// Event logging options: default minimal (no timestamp, no thread id)
static int g_int_event_ts = 0; // HAKMEM_INT_EVENT_TS=1 to include timestamp
static unsigned g_int_sample_mask = 0; // HAKMEM_INT_SAMPLE=(N) → mask=(1<<N)-1; 0=log all
static __thread unsigned g_tls_ev_seq = 0; // per-thread event sequence counter