Debug Counters Implementation - Clean History

Major Features:
- Debug counter infrastructure for Refill Stage tracking
- Free Pipeline counters (ss_local, ss_remote, tls_sll)
- Diagnostic counters for early return analysis
- Unified larson.sh benchmark runner with profiles
- Phase 6-3 regression analysis documentation

Bug Fixes:
- Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB)
- Fix profile variable naming consistency
- Add .gitignore patterns for large files

Performance:
- Phase 6-3: 4.79 M ops/s (has OOM risk)
- With SuperSlab: 3.13 M ops/s (+19% improvement)

This is a clean repository without large log files.

🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude <noreply@anthropic.com>
This commit is contained in:
Moe Charm (CI)
2025-11-05 12:31:14 +09:00
commit 52386401b3
27144 changed files with 124451 additions and 0 deletions

View File

@ -0,0 +1,46 @@
#!/usr/bin/env bash
set -euo pipefail
# Head-to-head for Large (64KB-1MB) sizes, 10s, system/mimalloc/hakmem (P1/P2 profiles)
ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
LARSON="$ROOT_DIR/mimalloc-bench/bench/larson/larson"
LIB_HAK="$ROOT_DIR/libhakmem.so"
LIB_MI="/lib/x86_64-linux-gnu/libmimalloc.so.2"
if [[ ! -x "$LARSON" ]]; then
  echo "[ERR] larson not found: $LARSON" >&2; exit 1
fi
if [[ ! -f "$LIB_HAK" ]]; then
  echo "[ERR] libhakmem.so not found: $LIB_HAK" >&2; exit 1
fi
TS=$(date +%Y%m%d_%H%M%S)
OUT="$ROOT_DIR/docs/benchmarks/${TS}_HEAD2HEAD_LARGE"
mkdir -p "$OUT"
echo "[OUT] $OUT"
cd "$ROOT_DIR/mimalloc-bench/bench/larson"
# Shared larson argument list (was duplicated inline on every invocation):
# runtime min_size max_size chunks/thread rounds seed threads
ARGS=(10 65536 1048576 10000 1 12345 4)
echo "System malloc LARGE_4T" | tee "$OUT/system_large_4t.log"
timeout "${BENCH_TIMEOUT:-13}s" "$LARSON" "${ARGS[@]}" 2>&1 | tee -a "$OUT/system_large_4t.log"
echo "mimalloc LARGE_4T" | tee "$OUT/mimalloc_large_4t.log"
timeout "${BENCH_TIMEOUT:-13}s" env LD_PRELOAD="$LIB_MI" "$LARSON" "${ARGS[@]}" 2>&1 | tee -a "$OUT/mimalloc_large_4t.log"
# P1 best (alloc-priority profile)
echo "hakmem P1 LARGE_4T (remote, factor=4, HDR=1)" | tee "$OUT/hakmem_p1_large_4t.log"
timeout "${BENCH_TIMEOUT:-13}s" env LD_PRELOAD="$LIB_HAK" HAKMEM_WRAP_L25=1 HAKMEM_L25_PREF=remote HAKMEM_L25_RUN_FACTOR=4 \
  HAKMEM_HDR_LIGHT=1 HAKMEM_SHARD_MIX=1 HAKMEM_TLS_LO_MAX=512 \
  "$LARSON" "${ARGS[@]}" 2>&1 | tee -a "$OUT/hakmem_p1_large_4t.log"
# P2+TC best (free-priority profile)
echo "hakmem P2+TC LARGE_4T (remote, factor=4, HDR=2, TC_SPILL=16)" | tee "$OUT/hakmem_p2_large_4t.log"
timeout "${BENCH_TIMEOUT:-13}s" env LD_PRELOAD="$LIB_HAK" HAKMEM_WRAP_L25=1 HAKMEM_L25_PREF=remote HAKMEM_L25_RUN_FACTOR=4 \
  HAKMEM_HDR_LIGHT=2 HAKMEM_L25_TC_SPILL=16 HAKMEM_SHARD_MIX=1 HAKMEM_TLS_LO_MAX=512 \
  "$LARSON" "${ARGS[@]}" 2>&1 | tee -a "$OUT/hakmem_p2_large_4t.log"
cd - >/dev/null
# Summarize Throughput lines; fall back to grep when ripgrep is unavailable.
if command -v rg >/dev/null 2>&1; then
  rg -n "Throughput" "$OUT"/*.log | tee "$OUT/summary.txt" || true
else
  grep -n "Throughput" "$OUT"/*.log | tee "$OUT/summary.txt" || true
fi
echo "[DONE] Logs at $OUT"

View File

@ -0,0 +1,41 @@
#!/usr/bin/env bash
set -euo pipefail
# Kill any lingering mimalloc-bench/larson runs and our bench runner scripts.
# Usage: scripts/kill_bench.sh
# Command-line substrings identifying the processes we own.
patterns=(
  "mimalloc-bench/bench/larson/larson"
  "scripts/run_bench_suite.sh"
  "scripts/save_prof_sweep.sh"
  "scripts/ab_sweep_mid.sh"
)
# Pass 1: report what is running (and remember whether anything matched).
any_found=0
for p in "${patterns[@]}"; do
  if pgrep -fa "$p" >/dev/null 2>&1; then
    echo "[kill_bench] Found processes for: $p"
    pgrep -fa "$p" || true
    any_found=1
  fi
done
if (( any_found == 0 )); then
  echo "[kill_bench] No matching bench processes found."
  exit 0
fi
# Pass 2: polite termination first.
echo "[kill_bench] Sending SIGTERM..."
for p in "${patterns[@]}"; do
  if pgrep -f "$p" >/dev/null 2>&1; then
    pkill -f "$p" || true
  fi
done
sleep 1
# Pass 3: anything still alive gets SIGKILL.
echo "[kill_bench] Forcing SIGKILL for leftovers..."
for p in "${patterns[@]}"; do
  if pgrep -f "$p" >/dev/null 2>&1; then
    pkill -9 -f "$p" || true
  fi
done
echo "[kill_bench] Done."

View File

@ -0,0 +1,25 @@
-- lua_workload.lua - mixed string builder + table churn
-- Allocator stress workload. Work size N comes from env var LUA_WORK_N
-- (default 500000); tonumber(nil) yields nil, so the `or` fallback is safe.
local N = tonumber(os.getenv("LUA_WORK_N")) or 500000
-- String builder (amortized)
local t = {}
for i = 1, N do
  t[#t+1] = tostring(i)
  -- every 5th entry, append a separator fragment
  if (i % 5) == 0 then t[#t+1] = "-" end
end
local s = table.concat(t)
-- Table churn (insert/remove)
local arr = {}
for i = 1, N do
  arr[i] = i * 3
end
-- Sum then delete every 3rd slot, leaving holes behind for the allocator/GC.
local sum = 0
for i = 1, N, 3 do
  sum = sum + (arr[i] or 0)
  arr[i] = nil
end
-- Print derived values so the work cannot be dead-code-eliminated.
print("len(s)=", #s, " sum=", sum)

View File

@ -0,0 +1,54 @@
#!/usr/bin/env python3
import sys, os, re
# Map from human-readable test headings (as printed in the logs) to short ids.
TEST_MAP = {
    'Sequential LIFO': 'lifo',
    'Sequential FIFO': 'fifo',
    'Random Order Free': 'random',
    'Interleaved': 'interleave',
    'Long-lived vs Short-lived': 'longshort',
    'Mixed Sizes': 'mixed',
}
def parse_file(path, allocator):
    """Parse one comprehensive-bench log.

    Tracks the current size class and test name as lines stream by, and
    records a (allocator, size, test, throughput_mops) tuple for every
    Throughput line. The 'Mixed Sizes' section uses the literal size 'mixed'.
    """
    records = []
    cur_size = None
    cur_test = None
    with open(path, 'r', errors='ignore') as fh:
        for raw in fh:
            size_match = re.search(r'^SIZE CLASS:\s*(\d+) Bytes', raw)
            if size_match:
                # New size section: reset the test tracker.
                cur_size = int(size_match.group(1))
                cur_test = None
                continue
            # Detect test headings (Mixed Sizes is handled specially below).
            for heading, short in TEST_MAP.items():
                if heading != 'Mixed Sizes' and heading in raw:
                    cur_test = short
                    break
            if 'Mixed Sizes ---' in raw or 'Test 5: Mixed Sizes' in raw:
                cur_size = 'mixed'
                cur_test = 'mixed'
            thr_match = re.search(r'^Throughput:\s*([0-9.]+) M ops/sec', raw)
            if thr_match:
                records.append((allocator, cur_size, cur_test, float(thr_match.group(1))))
    return records
def main():
    """CLI entry: parse the three known log files in a directory, emit CSV."""
    if len(sys.argv) != 2:
        print('usage: parse_comprehensive_logs.py <dir>', file=sys.stderr)
        sys.exit(1)
    log_dir = sys.argv[1]
    records = []
    # Missing logs are simply skipped; order fixes the allocator column order.
    for fname, alloc in (('hakmem.log', 'hakmem'),
                         ('mimalloc.log', 'mimalloc'),
                         ('system.log', 'system')):
        full_path = os.path.join(log_dir, fname)
        if os.path.exists(full_path):
            records.extend(parse_file(full_path, alloc))
    print('allocator,size,test,throughput_mops')
    for allocator, size, test, thr in records:
        print('{},{},{},{}'.format(allocator, size, test, thr))
if __name__ == '__main__':
    main()

View File

@ -0,0 +1,49 @@
#!/usr/bin/env python3
import sys, re, os, glob
# Parse mimalloc-bench logs and print CSV: file,allocator,procs,test,throughput
# Assumes lines like: "Throughput = 1234567 operations per second, ..."
def infer_allocator_from_file(path):
    """Guess the allocator from a log filename: prefix first, substring fallback."""
    base = os.path.basename(path)
    known = ('hakmem', 'mimalloc', 'jemalloc', 'system')
    for alloc in known:
        if base.startswith(alloc + '_'):
            return alloc
    # fallback: any occurrence anywhere in the name
    for alloc in known:
        if alloc in base:
            return alloc
    return 'unknown'
def infer_procs_from_file(path):
    """Extract the 'procs=...' CSV embedded in the filename ('' when absent)."""
    match = re.search(r'procs=([0-9,]+)', os.path.basename(path))
    if match:
        return match.group(1)
    return ''
def parse_file(path):
    """Yield (path, allocator, procs, test, throughput) per Throughput line.

    The most recent 'Test : NAME' line labels subsequent throughput results
    ('' until the first one is seen).
    """
    alloc = infer_allocator_from_file(path)
    procs = infer_procs_from_file(path)
    test_name = ''
    with open(path, 'r', errors='ignore') as fh:
        for raw in fh:
            stripped = raw.strip()
            test_match = re.search(r'^Test\s*:\s*(\S+)', stripped)
            if test_match:
                test_name = test_match.group(1)
            thr_match = re.search(r'Throughput\s*=\s*([0-9]+)\s+operations per second', stripped)
            if thr_match:
                yield (path, alloc, procs, test_name, int(thr_match.group(1)))
def main():
    """CLI entry: scan *.log under a directory and print one CSV row per result."""
    if len(sys.argv) != 2:
        print(f"usage: {sys.argv[0]} <log_dir>")
        sys.exit(1)
    log_dir = sys.argv[1]
    print('file,allocator,procs,test,throughput_ops_per_sec')
    for log_path in sorted(glob.glob(os.path.join(log_dir, '*.log'))):
        for record in parse_file(log_path):
            print(','.join(str(field) for field in record))
if __name__ == '__main__':
    main()

View File

@ -0,0 +1,128 @@
#!/usr/bin/env python3
import sys
import os
import re
import csv
# Accept PMU events with or without user-only suffix (":u")
# Maps raw `perf stat` event name -> short column name used in summary.csv.
# Both the bare and the ':u' spelling of each PMU event map to the same
# column, so either perf configuration produces the same CSV schema.
PMU_EVENTS = {
    'cycles': 'cycles',
    'cycles:u': 'cycles',
    'instructions': 'instructions',
    'instructions:u': 'instructions',
    'L1-dcache-load-misses': 'l1_miss',
    'L1-dcache-load-misses:u': 'l1_miss',
    'branch-misses': 'br_miss',
    'branch-misses:u': 'br_miss',
}
# USDT (sdt:hakmem:*) tracepoints -> short column names.
# parse_stat_file sums values when several raw events share a short name.
USDT_EVENTS = {
    'sdt:hakmem:sll_pop': 'sll_pop',
    'sdt:hakmem:mag_pop': 'mag_pop',
    'sdt:hakmem:front_pop': 'front_pop',
    'sdt:hakmem:bump_hit': 'bump_hit',
    'sdt:hakmem:slow_alloc': 'slow_alloc',
    'sdt:hakmem:sll_push': 'sll_push',
    'sdt:hakmem:mag_push': 'mag_push',
    'sdt:hakmem:spill_super': 'spill_super',
    'sdt:hakmem:spill_tiny': 'spill_tiny',
    'sdt:hakmem:remote_drain': 'remote_drain',
    'sdt:hakmem:superslab_alloc': 'super_alloc',
    'sdt:hakmem:superslab_fail': 'super_fail',
    'sdt:hakmem:quick_pop': 'quick_pop',
    'sdt:hakmem:quick_refill_sll': 'quick_refill_sll',
    'sdt:hakmem:quick_refill_mag': 'quick_refill_mag',
    'sdt:hakmem:bitmap_burst': 'bitmap_burst',
    'sdt:hakmem:mag_refill': 'mag_refill',
    'sdt:hakmem:bitmap_scan': 'bitmap_scan',
}
def parse_value(s):
    """Best-effort numeric parse of a perf-stat CSV field.

    Returns an int (floats are truncated toward zero via int(float(...))),
    or None when the field is not numeric (e.g. '<not counted>').
    """
    text = s.strip()
    try:
        return int(text)
    except ValueError:
        pass
    # try float to int
    try:
        return int(float(text))
    except Exception:
        return None
def parse_stat_file(path):
    """Read one `perf stat -x ,` CSV file into {short_event_name: value}.

    PMU values overwrite (last wins); USDT values accumulate; unknown events
    and non-numeric rows are ignored.
    """
    counters = {}
    with open(path, 'r', errors='ignore') as fh:
        for raw in fh:
            fields = [f.strip() for f in raw.strip().split(',')]
            if len(fields) < 3:
                continue
            value = parse_value(fields[0])
            event = fields[2]
            if value is None:
                continue
            # Normalize PMU names: strip an optional ':u' (user-only) suffix.
            if not event.startswith('sdt:'):
                bare = event.split(':')[0]
                if event not in PMU_EVENTS and bare in PMU_EVENTS:
                    event = bare
            if event in PMU_EVENTS:
                counters[PMU_EVENTS[event]] = value
            elif event in USDT_EVENTS:
                key = USDT_EVENTS[event]
                counters[key] = counters.get(key, 0) + value
            # anything else is ignored
    return counters
def main():
    """Aggregate per-run .stat.csv files in a directory into summary.csv.

    Expects filenames shaped like '<alloc>_s<size>_b<batch>_c<cycles>.stat.csv'
    where alloc is 'hakmem' or 'system'; any other file is skipped.
    Writes <dir>/summary.csv and prints its path.
    """
    if len(sys.argv) != 2:
        print("Usage: parse_usdt_stat.py <usdt_results_dir>")
        sys.exit(1)
    root = sys.argv[1]
    rows = []
    for fn in sorted(os.listdir(root)):
        if not fn.endswith('.stat.csv'):
            continue
        # Pull run metadata (allocator/size/batch/cycles) out of the filename.
        m = re.match(r'(?P<alloc>hakmem|system)_s(?P<size>\d+)_b(?P<batch>\d+)_c(?P<cycles>\d+)\.stat\.csv', fn)
        if not m:
            continue
        meta = m.groupdict()
        path = os.path.join(root, fn)
        stats = parse_stat_file(path)
        row = {
            'allocator': meta['alloc'],
            'size': int(meta['size']),
            'batch': int(meta['batch']),
            'cycles_param': int(meta['cycles']),
        }
        row.update(stats)
        # derived: fraction of pops served by each tier (front / sll / mag)
        total_pops = sum(row.get(k, 0) for k in ('sll_pop','mag_pop','front_pop'))
        if total_pops > 0:
            row['front_rate'] = row.get('front_pop',0)/total_pops
            row['sll_rate'] = row.get('sll_pop',0)/total_pops
            row['mag_rate'] = row.get('mag_pop',0)/total_pops
        else:
            row['front_rate'] = row['sll_rate'] = row['mag_rate'] = 0.0
        rows.append(row)
    # sort for readability
    rows.sort(key=lambda r: (r['allocator'], r['size'], r['batch']))
    out = os.path.join(root, 'summary.csv')
    # collect headers: fixed meta columns, then every known event, then rates
    headers = ['allocator','size','batch','cycles_param'] + list(PMU_EVENTS.values()) + list(USDT_EVENTS.values()) + ['front_rate','sll_rate','mag_rate']
    # remove duplicates but keep order (PMU short names appear twice due to
    # the ':u' aliases in PMU_EVENTS)
    seen = set()
    hdr_final = []
    for h in headers:
        if h not in seen:
            hdr_final.append(h)
            seen.add(h)
    with open(out, 'w', newline='') as f:
        # Rows missing some counters get DictWriter's default empty fields.
        w = csv.DictWriter(f, fieldnames=hdr_final)
        w.writeheader()
        for r in rows:
            w.writerow(r)
    print(out)
if __name__ == '__main__':
    main()

View File

@ -0,0 +1,37 @@
#!/usr/bin/env bash
set -euo pipefail
# Sweep bench-fastpath refill sizes (8/12/16) and run Tiny-Hot triad each.
# Usage: scripts/run_benchfast_sweep.sh [cycles]
ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
cd "$ROOT_DIR"
# Optional first arg: cycle count passed through to the triad run (default 60000).
cycles=${1:-60000}
echo "[build] system/mimalloc (fast + mi)"
make -s bench_fast bench_tiny_hot_mi >/dev/null
TS=$(date +%Y%m%d_%H%M%S)
OUTDIR="bench_results/tiny_benchfast_sweep_${TS}"
mkdir -p "$OUTDIR"
# Build the tagged fastpath variant, run the triad, archive its results CSV.
# Arguments: $1 - refill-size tag (r8|r12|r16), matching a make target suffix.
run_case() {
  local tag="$1"; shift
  echo "[build] HAKMEM bench-fastpath (${tag})"
  make -s "bench_fastpath_${tag}" >/dev/null
  echo "[run] triad (${tag})"
  SKIP_BUILD=1 bash scripts/run_tiny_hot_triad.sh "$cycles"
  # pick the latest triad CSV and copy with tag
  # NOTE(review): parses `ls -t` output — OK while triad dir names have no
  # whitespace; presumably set -e aborts if no tiny_hot_triad_* dir exists.
  local latest_csv
  latest_csv=$(ls -1dt bench_results/tiny_hot_triad_* | head -1)/results.csv
  cp "$latest_csv" "$OUTDIR/results_${tag}.csv"
  echo "[saved] $OUTDIR/results_${tag}.csv"
}
run_case r8
run_case r12
run_case r16
echo "[done] sweep outputs in: $OUTDIR"

View File

@ -0,0 +1,170 @@
#!/usr/bin/env bash
set -euo pipefail
# Reproducible larson runner for hakmem/system/mimalloc.
#
# Usage:
#   scripts/run_larson.sh [runtime_sec] [threads]
# Examples:
#   scripts/run_larson.sh        # default: 10s, threads=1 4
#   scripts/run_larson.sh 10 1   # 10s, 1 thread
# (Run with -h for the full flag list; flags take precedence over positionals.)
#
# Optional env vars:
#   HAKMEM_WRAP_TINY=0|1
#   HAKMEM_WRAP_TINY_REFILL=0|1
#   HAKMEM_TINY_MAG_CAP=INT
#   HAKMEM_SAFE_FREE=0|1
#   HAKMEM_EVO_SAMPLE=INT (0 disables evo recording; default 0)
#   MIMALLOC_SO=/path/to/libmimalloc.so.2 (optional; if not set, auto-detect)
# Print CLI help to stdout.
usage() {
  # Quoted delimiter: the help text contains no expansions, emit it verbatim.
  cat <<'USAGE'
Usage: scripts/run_larson.sh [options] [runtime_sec] [threads_csv]
Options:
-d SECONDS Runtime seconds (default: 10)
-t CSV Threads CSV, e.g. 1,4 (default: 1,4)
-c NUM Chunks per thread (default: 10000)
-r NUM Rounds (default: 1)
-m BYTES Min size (default: 8)
-M BYTES Max size (default: 1024)
-s SEED Random seed (default: 12345)
-p PRESET Preset: burst|loop (sets -c/-r)
-w Include WRAP_TINY=1 runs (default: off)
-h Show this help
Env overrides (alternative to flags):
MIN, MAX, CHUNK_PER_THREAD, ROUNDS, SEED
HAKMEM_* toggles per README
USAGE
}
# Defaults
RUNTIME="10"
THREADS_ARG="1,4"
# Workload defaults (burst preset); env vars seed these, flags override below.
MIN="${MIN:-8}"
MAX="${MAX:-1024}"
CHUNK_PER_THREAD="${CHUNK_PER_THREAD:-10000}"
ROUNDS="${ROUNDS:-1}"
SEED="${SEED:-12345}"
PRESET=""
INCLUDE_WRAP=0
# Parse short flags; the leading ':' in the optstring enables silent error
# handling via the ':' (missing arg) and '*' (unknown flag) arms.
while getopts ":d:t:c:r:m:M:s:p:wh" opt; do
  case $opt in
    d) RUNTIME="$OPTARG" ;;
    t) THREADS_ARG="$OPTARG" ;;
    c) CHUNK_PER_THREAD="$OPTARG" ;;
    r) ROUNDS="$OPTARG" ;;
    m) MIN="$OPTARG" ;;
    M) MAX="$OPTARG" ;;
    s) SEED="$OPTARG" ;;
    p) PRESET="$OPTARG" ;;
    w) INCLUDE_WRAP=1 ;;
    h) usage; exit 0 ;;
    :) echo "Missing argument for -$OPTARG" >&2; usage; exit 2 ;;
    *) usage; exit 2 ;;
  esac
done
shift $((OPTIND-1))
# Backward-compatible positional args (after any flags): runtime, threads CSV.
if [[ $# -ge 1 ]]; then RUNTIME="$1"; fi
if [[ $# -ge 2 ]]; then THREADS_ARG="$2"; fi
# Presets overwrite chunk/round counts, including values given via -c/-r.
case "$PRESET" in
  burst|BURST)
    CHUNK_PER_THREAD=10000; ROUNDS=1 ;;
  loop|LOOP)
    CHUNK_PER_THREAD=100; ROUNDS=100 ;;
  "" ) : ;;
  *) echo "Unknown preset: $PRESET" >&2; exit 2 ;;
esac
# Params matching our standard runs (larson reads: runtime, min, max, chunks/thread, rounds, seed, threads)
# Show resolved parameters for reproducibility
echo "[CFG] runtime=${RUNTIME}s threads={${THREADS_ARG}} min=${MIN} max=${MAX} chunks/thread=${CHUNK_PER_THREAD} rounds=${ROUNDS} seed=${SEED}"
ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
LIB_HAKMEM="$ROOT_DIR/libhakmem.so"
LARSON_BIN="$ROOT_DIR/mimalloc-bench/bench/larson/larson"
if [[ ! -x "$LARSON_BIN" ]]; then
  echo "[ERR] Larson binary not found at: $LARSON_BIN" >&2
  echo " Did you sync submodule/build bench?" >&2
  exit 1
fi
# Build libhakmem.so on demand so a fresh checkout still works.
if [[ ! -f "$LIB_HAKMEM" ]]; then
  echo "[INFO] libhakmem.so not found; building..."
  (cd "$ROOT_DIR" && make -j4 shared >/dev/null)
fi
abs_hakmem="$(readlink -f "$LIB_HAKMEM")"
# Locate a libmimalloc.so.2 to LD_PRELOAD. Order: $MIMALLOC_SO override,
# well-known distro paths, then ldconfig. Prints the path and returns 0 on
# success; returns 1 when nothing usable is found.
detect_mimalloc() {
  local candidate
  if [[ -n "${MIMALLOC_SO:-}" && -f "$MIMALLOC_SO" ]]; then
    echo "$MIMALLOC_SO"
    return 0
  fi
  # try common paths or ldconfig
  for candidate in \
    /usr/lib/x86_64-linux-gnu/libmimalloc.so.2 \
    /lib/x86_64-linux-gnu/libmimalloc.so.2; do
    if [[ -f "$candidate" ]]; then
      echo "$candidate"
      return 0
    fi
  done
  if command -v ldconfig >/dev/null 2>&1; then
    candidate="$(ldconfig -p | awk '/libmimalloc.so/ {print $4; exit}')"
    if [[ -n "$candidate" && -f "$candidate" ]]; then
      echo "$candidate"
      return 0
    fi
  fi
  return 1
}
# Resolve mimalloc once; empty string means "skip the mimalloc runs".
mimalloc_so=""
if ! mimalloc_so=$(detect_mimalloc); then
  mimalloc_so=""
fi
# Run one larson configuration and show the last 3 lines of its output.
# Arguments: $1 label, $2 LD_PRELOAD path ("" = plain system malloc), $3 threads.
run_case() {
  local label="$1"; shift
  local preload="$1"; shift
  local threads="$1"; shift
  # BUG FIX: bash `echo "\n..."` prints a literal backslash-n (no -e flag);
  # use printf so each banner is preceded by a real blank line.
  printf '\n== %s | %sT | %ss ==\n' "$label" "$threads" "$RUNTIME"
  if [[ -n "$preload" ]]; then
    env LD_PRELOAD="$preload" "$LARSON_BIN" "$RUNTIME" "$MIN" "$MAX" "$CHUNK_PER_THREAD" "$ROUNDS" "$SEED" "$threads" 2>&1 | tail -n 3
  else
    "$LARSON_BIN" "$RUNTIME" "$MIN" "$MAX" "$CHUNK_PER_THREAD" "$ROUNDS" "$SEED" "$threads" 2>&1 | tail -n 3
  fi
}
IFS=',' read -r -a THREADS <<< "$THREADS_ARG"
for t in "${THREADS[@]}"; do
  # system malloc
  run_case "system malloc" "" "$t"
  # mimalloc (optional)
  if [[ -n "$mimalloc_so" ]]; then
    run_case "mimalloc" "$mimalloc_so" "$t"
  else
    printf '\n== mimalloc | %sT | %ss ==\n' "$t" "$RUNTIME"
    echo "[SKIP] libmimalloc not found"
  fi
  # hakmem default
  run_case "hakmem (default)" "$abs_hakmem" "$t"
  # hakmem wrap tiny (optional)
  if [[ "$INCLUDE_WRAP" -eq 1 ]]; then
    printf '\n== hakmem (HAKMEM_WRAP_TINY=1) | %sT | %ss ==\n' "$t" "$RUNTIME"
    HAKMEM_WRAP_TINY=1 LD_PRELOAD="$abs_hakmem" "$LARSON_BIN" "$RUNTIME" "$MIN" "$MAX" "$CHUNK_PER_THREAD" "$ROUNDS" "$SEED" "$t" 2>&1 | tail -n 3
  fi
done
printf '\nDone.\n'

View File

@ -0,0 +1,79 @@
#!/usr/bin/env bash
set -euo pipefail
# Compare memory efficiency (Max RSS) between HAKMEM and System on tiny-hot bench.
# - Runs selected sizes/batches with /usr/bin/time -v and parses Maximum resident set size (KB).
# - Optionally toggles HAKMEM_TINY_FLUSH_ON_EXIT to evaluate exit-time trimming.
# Output: bench_results/memory_eff_YYYYMMDD_HHMMSS/results.csv
# Usage: scripts/run_memory_efficiency.sh [cycles]
ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
cd "$ROOT_DIR"
# Optional first arg: benchmark cycle count (default 60000).
cycles=${1:-60000}
# The external time(1) binary is required; the shell builtin has no -v.
if [[ ! -x /usr/bin/time ]]; then
  echo "[error] /usr/bin/time not found. Install 'time' package." >&2
  exit 1
fi
echo "[build] perf_main benches (no bench-only macros)"
make -s perf_main >/dev/null
TS=$(date +%Y%m%d_%H%M%S)
OUTDIR="bench_results/memory_eff_${TS}"
mkdir -p "$OUTDIR"
CSV="$OUTDIR/results.csv"
# CSV header; run_case below appends one data row per invocation.
echo "allocator,size,batch,cycles,flush_on_exit,max_rss_kb,elapsed_ms" > "$CSV"
# Size/batch matrix swept by the driver loop at the bottom of the script.
sizes=(32 64 128)
batches=(100)
# Run one measurement under /usr/bin/time -v and append a CSV row.
# Arguments: $1 allocator (hakmem|system), $2 size, $3 batch, $4 cycles,
#            $5 HAKMEM_TINY_FLUSH_ON_EXIT value (only used for hakmem).
run_case() {
  local alloc="$1"; shift
  local size="$1"; shift
  local batch="$1"; shift
  local cyc="$1"; shift
  local flush="$1"; shift
  local bin
  if [[ "$alloc" == "hakmem" ]]; then bin=./bench_tiny_hot_hakmem; else bin=./bench_tiny_hot_system; fi
  local tmp_log="$OUTDIR/tmp_${alloc}_${size}_${batch}_${cyc}_${flush}.log"
  local tmp_out="$OUTDIR/tmp_${alloc}_${size}_${batch}_${cyc}_${flush}.out"
  # '|| true': even a crashing bench leaves a time(1) log we can parse.
  if [[ "$alloc" == "hakmem" ]]; then
    HAKMEM_TINY_FLUSH_ON_EXIT="$flush" /usr/bin/time -v "$bin" "$size" "$batch" "$cyc" >"$tmp_out" 2>"$tmp_log" || true
  else
    /usr/bin/time -v "$bin" "$size" "$batch" "$cyc" >"$tmp_out" 2>"$tmp_log" || true
  fi
  # FIX: declare and assign separately so `local var=$(cmd)` does not mask the
  # substitution's exit status (SC2155). NOTE: '\s' in sed is a GNU extension.
  local rss elapsed
  rss=$(sed -n 's/^\s*Maximum resident set size (kbytes): \([0-9]\+\).*/\1/p' "$tmp_log" | tail -1)
  elapsed=$(sed -n 's/^\s*Elapsed (wall clock) time (h:mm:ss or m:ss): \(.*\)/\1/p' "$tmp_log" | tail -1)
  # convert elapsed to ms (best-effort; handles m:ss or h:mm:ss; fractional
  # seconds are truncated by the ${...%.*} strip)
  local ms=0
  if [[ -n "$elapsed" ]]; then
    local e1="" e2="" e3=""
    IFS=: read -r e1 e2 e3 <<<"$elapsed" || true
    if [[ -n "$e3" ]]; then
      # h:mm:ss
      ms=$(( (10#${e1}*3600 + 10#${e2}*60) * 1000 ))
      ms=$(( ms + (10#${e3%.*})*1000 ))
    else
      # m:ss
      ms=$(( (10#${e1}*60) * 1000 ))
      ms=$(( ms + (10#${e2%.*})*1000 ))
    fi
  fi
  echo "$alloc,$size,$batch,$cyc,$flush,${rss:-},${ms:-}" >> "$CSV"
}
# Sweep the matrix: one system baseline plus HAKMEM with and without
# exit-time flush for every size/batch combination.
for sz in "${sizes[@]}"; do
  for bt in "${batches[@]}"; do
    echo "[run] SYSTEM size=$sz batch=$bt cycles=$cycles"
    run_case system "$sz" "$bt" "$cycles" 0
    echo "[run] HAKMEM (flush=0) size=$sz batch=$bt cycles=$cycles"
    run_case hakmem "$sz" "$bt" "$cycles" 0
    echo "[run] HAKMEM (flush=1) size=$sz batch=$bt cycles=$cycles"
    run_case hakmem "$sz" "$bt" "$cycles" 1
  done
done
echo "[done] CSV: $CSV"
head -n 40 "$CSV" || true

View File

@ -0,0 +1,61 @@
#!/usr/bin/env bash
set -euo pipefail
# Sweep Ultra Tiny (SLL-only) with debug counters and output CSV
# Usage: scripts/run_ultra_debug_sweep.sh [cycles] [batch]
ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
cd "$ROOT_DIR"
# Positional args: cycle count (default 60000) and batch size (default 200).
cycles=${1:-60000}
batch=${2:-200}
make -s bench_fast >/dev/null
TS=$(date +%Y%m%d_%H%M%S)
OUTDIR="bench_results/ultra_debug_${TS}"
mkdir -p "$OUTDIR"
CSV="$OUTDIR/results.csv"
# One row per size: throughput plus the debug counters of that size's class.
echo "size,batch,cycles,throughput_mops,class,pop_hits,refills,resets,sll_count" > "$CSV"
sizes=(16 32 64)
# Map a tiny allocation size in bytes to its class index ( -1 = unknown size).
size_to_class() {
  local bytes="$1"
  case "$bytes" in
    8)   echo 0;;
    16)  echo 1;;
    32)  echo 2;;
    64)  echo 3;;
    128) echo 4;;
    *)   echo -1;;
  esac
}
for s in "${sizes[@]}"; do
  cls=$(size_to_class "$s")
  log="$OUTDIR/ultra_${s}_b=${batch}_c=${cycles}.log"
  # Run with Ultra + debug; capture stdout+stderr in one file
  HAKMEM_TINY_ULTRA=1 HAKMEM_TINY_ULTRA_DEBUG=1 HAKMEM_TINY_MAG_CAP=128 \
    ./bench_tiny_hot_hakmem "$s" "$batch" "$cycles" >"$log" 2>&1 || true
  # Throughput line format: "Throughput: N.NN M ops..."; keep the last match.
  thr=$(sed -n 's/^Throughput: \([0-9.][0-9.]*\) M ops.*/\1/p' "$log" | tail -n1)
  # Extract Ultra debug block
  start=$(grep -n '^\[Ultra Tiny Debug\]' "$log" | tail -n1 | cut -d: -f1)
  if [[ -n "$start" ]]; then
    # header is the next line; data follows
    data_start=$((start+2))
    # take next 8 lines (classes 0..7)
    sed -n "${data_start},$((data_start+7))p" "$log" > "$OUTDIR/tmp_ultra.txt" || true
    # pick the line for target class
    line=$(awk -F',' -v k="$cls" '($1==k){print $0}' "$OUTDIR/tmp_ultra.txt" | tail -n1)
    if [[ -n "$line" ]]; then
      # line format: class,pop_hits,refills,resets,sll_count
      IFS=',' read -r c ph rf rs sc <<<"$line"
      echo "$s,$batch,$cycles,${thr:-},$c,$ph,$rf,$rs,$sc" >> "$CSV"
    fi
  fi
done
echo "[done] CSV: $CSV"
sed -n '1,20p' "$CSV" || true

View File

@ -0,0 +1,101 @@
#!/usr/bin/env bash
set -euo pipefail
# Build with USDT tracepoints and run perf stat for USDT events + PMU on tiny_hot + mixed
# Usage: scripts/run_usdt_overview.sh [cycles]
ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
# Allow overriding perf binary (e.g., WSL generic tools). Usage:
# PERF_BIN=/usr/lib/linux-tools-6.8.0-86/perf bash scripts/run_usdt_overview.sh 40000
PERF_BIN=${PERF_BIN:-perf}
cd "$ROOT_DIR"
cycles=${1:-50000}
# Rebuild with -DHAKMEM_USDT=1 unless the caller opts out via SKIP_BUILD=1.
if [[ "${SKIP_BUILD:-0}" != "1" ]]; then
  echo "[build] USDT-enabled benches"
  make -s clean >/dev/null 2>&1 || true
  make -s bench_fast CFLAGS+=" -DHAKMEM_USDT=1" >/dev/null
else
  echo "[build] skipped (SKIP_BUILD=1)"
fi
TS=$(date +%Y%m%d_%H%M%S)
OUTDIR="bench_results/usdt_${TS}"
mkdir -p "$OUTDIR"
# USDT (sdt:hakmem:*) tracepoints to collect when the probe below succeeds.
EVENTS_USDT=(
  sdt:hakmem:sll_pop
  sdt:hakmem:mag_pop
  sdt:hakmem:front_pop
  sdt:hakmem:bump_hit
  sdt:hakmem:slow_alloc
  sdt:hakmem:sll_push
  sdt:hakmem:mag_push
  sdt:hakmem:spill_super
  sdt:hakmem:spill_tiny
  sdt:hakmem:remote_drain
  sdt:hakmem:superslab_alloc
  sdt:hakmem:superslab_fail
)
# PMU counters, declared as a single comma-joined element; join_events passes
# a single element through unchanged.
EVENTS_PMU=(cycles,instructions,L1-dcache-load-misses,branch-misses)
# Join all arguments into one comma-separated string on stdout.
join_events() {
  local IFS=','
  echo "$*"
}
PMU_JOINED=$(join_events "${EVENTS_PMU[@]}" )
# Detect USDT availability by actually probing a dummy run
USDT_JOINED=""
{
  "$PERF_BIN" stat -x , -e sdt:hakmem:front_pop true 1>/dev/null 2>"$OUTDIR/.usdt_probe.err"
} || true
# BUG FIX: the checks used `rg`; when ripgrep was not installed, both
# conditions failed silently (exit 127 inside `if`) and the else branch
# enabled USDT events regardless. grep is always present; -E handles the
# alternation in the second pattern.
if grep -q "unknown tracepoint" "$OUTDIR/.usdt_probe.err"; then
  echo "[warn] perf does not support 'sdt:' on this system (unknown tracepoint). Using PMU-only." | tee -a "$OUTDIR/summary.txt"
  echo "[hint] Install perf matching your kernel: sudo apt-get install linux-tools-\$(uname -r)" | tee -a "$OUTDIR/summary.txt"
  echo "[hint] Kernel must have UPROBE/SDT support (CONFIG_UPROBE_EVENTS)." | tee -a "$OUTDIR/summary.txt"
elif grep -Eq "can't access trace events|No permissions" "$OUTDIR/.usdt_probe.err"; then
  echo "[warn] USDT blocked by tracefs perms; falling back to PMU-only." | tee -a "$OUTDIR/summary.txt"
  echo "[hint] Try: sudo mount -t tracefs -o mode=755 nodev /sys/kernel/tracing" | tee -a "$OUTDIR/summary.txt"
  echo "[hint] And: sudo sysctl kernel.perf_event_paranoid=1" | tee -a "$OUTDIR/summary.txt"
else
  # Probe looked clean; enable the full USDT event list.
  USDT_JOINED=$(join_events "${EVENTS_USDT[@]}")
fi
# Basic environment info for troubleshooting
{
  echo "[env] perf=$($PERF_BIN --version 2>/dev/null | head -n1)";
  echo "[env] kernel=$(uname -r)";
  echo "[env] tracefs=$(ls -ld /sys/kernel/tracing 2>/dev/null || true)";
} | tee -a "$OUTDIR/summary.txt"
# Run `perf stat -x ,` over one bench binary; machine-readable counters land
# in <tag>_s<size>_b<batch>_c<cycles>.stat.csv (perf writes them to stderr).
run_perf() {
  local tag="$1"; shift
  local exe="$1"; shift
  local size="$1"; shift
  local batch="$1"; shift
  local cyc="$1"; shift
  local stat_csv="$OUTDIR/${tag}_s${size}_b${batch}_c${cyc}.stat.csv"
  echo "[perf] $tag size=$size batch=$batch cycles=$cyc" | tee -a "$OUTDIR/summary.txt"
  # USDT events go first when available; otherwise PMU counters only.
  local events="$PMU_JOINED"
  if [[ -n "$USDT_JOINED" ]]; then
    events="$USDT_JOINED,$PMU_JOINED"
  fi
  "$PERF_BIN" stat -x , -e "$events" "$exe" "$size" "$batch" "$cyc" 1>/dev/null 2>"$stat_csv" || true
}
# Tiny-hot focus (8/16/32/64)
for sz in 8 16 32 64; do
  for bt in 100; do
    HAKMEM_QUIET=1 run_perf "hakmem" ./bench_tiny_hot_hakmem "$sz" "$bt" "$cycles"
    HAKMEM_QUIET=1 run_perf "system" ./bench_tiny_hot_system "$sz" "$bt" "$cycles"
  done
done
# Random mixed overview
bash scripts/run_random_mixed_matrix.sh 80000 >/dev/null || true
echo "[done] USDT overview: $OUTDIR"
ls -1 "$OUTDIR" | head -n 20

View File

@ -0,0 +1,63 @@
#!/usr/bin/env bash
set -euo pipefail
# Save a short profiler sweep into docs/benchmarks/<YYYYMMDD_HHMMSS>/
# Usage: scripts/save_prof_sweep.sh [-d SEC] [-t CSV] [-s N]
RUNTIME=2
THREADS="1,4"
SAMPLE_N=8
BENCH_TIMEOUT=""
KILL_GRACE=${KILL_GRACE:-2}
while getopts ":d:t:s:h" opt; do
case $opt in
d) RUNTIME="$OPTARG" ;;
t) THREADS="$OPTARG" ;;
s) SAMPLE_N="$OPTARG" ;;
h) echo "Usage: $0 [-d SEC] [-t CSV] [-s N]"; exit 0 ;;
:) echo "Missing arg -$OPTARG"; exit 2 ;;
*) echo "Usage: $0 [-d SEC] [-t CSV] [-s N]"; exit 2 ;;
esac
done
ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
OUTDIR="$ROOT_DIR/docs/benchmarks/$(date +%Y%m%d_%H%M%S)"
mkdir -p "$OUTDIR"
LIB="$(readlink -f "$ROOT_DIR/libhakmem.so" || true)"
LARSON="$ROOT_DIR/mimalloc-bench/bench/larson/larson"
if [[ -z "${BENCH_TIMEOUT}" ]]; then
BENCH_TIMEOUT=$(( RUNTIME + 3 ))
fi
echo "Saving sweep into: $OUTDIR" | tee "$OUTDIR/summary.txt"
echo "RUNTIME=$RUNTIME THREADS=$THREADS SAMPLE=1/$((1<<SAMPLE_N)) TIMEOUT=${BENCH_TIMEOUT}s" | tee -a "$OUTDIR/summary.txt"
declare -a RUNS=(
"tiny 8 1024"
"mid 2048 32768"
"gap 33000 65536"
"large 65536 1048576"
)
IFS=',' read -r -a TARR <<< "$THREADS"
for r in "${RUNS[@]}"; do
read -r name rmin rmax <<< "$r"
for t in "${TARR[@]}"; do
label="${name}_T${t}_${rmin}-${rmax}"
echo "== $label ==" | tee -a "$OUTDIR/summary.txt"
if [[ -f "$LARSON" && -f "$ROOT_DIR/libhakmem.so" ]]; then
timeout -k "${KILL_GRACE}s" "${BENCH_TIMEOUT}s" \
env HAKMEM_PROF=1 HAKMEM_PROF_SAMPLE="$SAMPLE_N" \
LD_PRELOAD="$LIB" "$LARSON" "$RUNTIME" "$rmin" "$rmax" 10000 1 12345 "$t" 2>&1 \
| tee "$OUTDIR/${label}.log" | tail -n 80 | tee -a "$OUTDIR/summary.txt"
else
echo "Skip: missing larson or libhakmem.so" | tee -a "$OUTDIR/summary.txt"
fi
done
done
echo "Done. See $OUTDIR" | tee -a "$OUTDIR/summary.txt"

View File

@ -0,0 +1,32 @@
-- SQLite allocator workload: bulk insert, scans, point lookups, and an update.
-- Durability is disabled on purpose — benchmark data, not real data.
PRAGMA journal_mode = OFF;
PRAGMA synchronous = OFF;
PRAGMA temp_store = MEMORY;
-- schema
CREATE TABLE t (
  id INTEGER PRIMARY KEY,
  s TEXT
);
-- bulk insert via recursive CTE (~50k rows; the LIMIT bounds the recursion)
WITH RECURSIVE cnt(x) AS (
  SELECT 1
  UNION ALL
  SELECT x+1 FROM cnt LIMIT 50000
)
INSERT INTO t(s)
SELECT printf('str-%d-%d', x, x*x) FROM cnt;
-- simple read queries (full scans)
SELECT COUNT(*) FROM t;
SELECT SUM(LENGTH(s)) FROM t;
-- point lookups by primary key
SELECT s FROM t WHERE id IN (1, 100, 1000, 10000, 40000);
-- update a slice (every 50th row gets a '-x' suffix)
UPDATE t SET s = s || '-x' WHERE (id % 50) = 0;
-- final check: count the rows touched by the UPDATE above
SELECT COUNT(*) FROM t WHERE s LIKE '%-x';