Debug Counters Implementation - Clean History

Major Features: - Debug counter infrastructure for Refill Stage tracking - Free Pipeline counters (ss_local, ss_remote, tls_sll) - Diagnostic counters for early return analysis - Unified larson.sh benchmark runner with profiles - Phase 6-3 regression analysis documentation Bug Fixes: - Fix SuperSlab disabled by default (HAKMEM_TINY_USE_SUPERSLAB) - Fix profile variable naming consistency - Add .gitignore patterns for large files Performance: - Phase 6-3: 4.79 M ops/s (has OOM risk) - With SuperSlab: 3.13 M ops/s (+19% improvement) This is a clean repository without large log files. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-05 12:31:14 +09:00
commit 52386401b3
27144 changed files with 124451 additions and 0 deletions
--- a/benchmarks/scripts/comprehensive/run_all_benches_with_timeouts.sh
+++ b/benchmarks/scripts/comprehensive/run_all_benches_with_timeouts.sh
@ -0,0 +1,91 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Run (nearly) the full mimalloc-bench suite with timeouts.
+# Compares: HAKMEM (via LD_PRELOAD on sys), mimalloc (mi), and system (sys).
+#
+# Env knobs:
+#   TIMEOUT_SEC   per-run timeout seconds (default: 900)
+#   PROCS         concurrency list for bench.sh --procs (default: 1,4)
+#   INCLUDE_JE    include jemalloc reference (0/1, default: 0)
+
+ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+cd "$ROOT_DIR"
+
+TIMEOUT_SEC=${TIMEOUT_SEC:-900}
+PROCS=${PROCS:-1,4}
+INCLUDE_JE=${INCLUDE_JE:-0}
+TESTS=${TESTS:-allt}           # Space-separated tests or 'allt'
+REPEATS=${REPEATS:-1}          # Pass through to bench.sh (-r)
+
+RESULT_DIR="$ROOT_DIR/bench_results/mimalloc_full_$(date +%Y%m%d_%H%M%S)"
+mkdir -p "$RESULT_DIR"
+
+BENCH_ROOT="$ROOT_DIR/mimalloc-bench"
+BENCH_OUT="$BENCH_ROOT/out/bench"
+
+if [[ ! -d "$BENCH_OUT" ]]; then
+  echo "[warn] mimalloc-bench/out/bench not found. Attempting auto-build (bench only)."
+  if [[ -x "$BENCH_ROOT/build-bench-env.sh" ]]; then
+    pushd "$BENCH_ROOT" >/dev/null
+    ./build-bench-env.sh bench
+    popd >/dev/null
+  else
+    echo "[error] build-bench-env.sh not found under mimalloc-bench."
+    echo "        Please build manually: cd mimalloc-bench && ./build-bench-env.sh bench"
+    exit 1
+  fi
+fi
+
+if [[ ! -d "$BENCH_OUT" ]]; then
+  echo "[error] mimalloc-bench/out/bench still missing after auto-build."
+  echo "        Try: cd mimalloc-bench && ./build-bench-env.sh all"
+  exit 1
+fi
+
+echo "[info] Building HAKMEM shared library with PGO for LD_PRELOAD"
+make -s pgo-profile-shared pgo-build-shared >/dev/null
+
+pushd "$BENCH_OUT" >/dev/null
+
+# run_case NAME PRELOAD ARGS...
+#   NAME     label used for the log file and the saved benchres CSV
+#   PRELOAD  library path exported as LD_PRELOAD for the run ("" = none)
+#   ARGS     remaining arguments, forwarded verbatim to bench.sh
+# Runs bench.sh under timeout(1), mirroring all output into
+# $RESULT_DIR/NAME.log. A failing or timed-out bench.sh is recorded in the
+# log but does not abort the suite.
+run_case() {
+  local name="$1"; shift
+  local preload="$1"; shift
+  local args=("$@")
+  local log="$RESULT_DIR/${name}.log"
+  local rc=0
+  echo "[case] $name | timeout=${TIMEOUT_SEC}s | args: ${args[*]}" | tee -a "$log"
+  # Drop any benchres.csv left by an earlier case so that a run which dies
+  # before writing its own results cannot be saved under this case's name.
+  rm -f benchres.csv
+  if [[ -n "$preload" ]]; then
+    LD_PRELOAD="$preload" timeout -s INT "$TIMEOUT_SEC" bash ../../bench.sh -r="$REPEATS" "${args[@]}" 2>&1 | tee -a "$log" || rc=$?
+  else
+    timeout -s INT "$TIMEOUT_SEC" bash ../../bench.sh -r="$REPEATS" "${args[@]}" 2>&1 | tee -a "$log" || rc=$?
+  fi
+  # With pipefail the pipeline status is timeout's; 124 means the limit was hit.
+  if (( rc == 124 )); then
+    echo "[warn] $name timed out after ${TIMEOUT_SEC}s" | tee -a "$log"
+  elif (( rc != 0 )); then
+    echo "[warn] $name exited with status $rc" | tee -a "$log"
+  fi
+  # Save benchres.csv for this case if present
+  if [[ -f benchres.csv ]]; then
+    cp -f benchres.csv "$RESULT_DIR/${name}_benchres.csv" || true
+  fi
+}
+
+if [[ "$TESTS" == "allt" ]]; then
+  # HAKMEM vs mimalloc vs system (and optional jemalloc) for full-all tests
+  run_case "hakmem_procs=${PROCS//,/}" "$ROOT_DIR/libhakmem.so" --procs="$PROCS" sys allt
+  run_case "mimalloc_procs=${PROCS//,/}" "" --procs="$PROCS" mi allt
+  run_case "system_procs=${PROCS//,/}" "" --procs="$PROCS" sys allt
+  if [[ "$INCLUDE_JE" == "1" ]]; then
+    run_case "jemalloc_procs=${PROCS//,/}" "" --procs="$PROCS" je allt
+  fi
+else
+  # Split per test to enforce per-test timeouts and partial progress
+  for t in $TESTS; do
+    run_case "hakmem_${t}_p=${PROCS//,/}" "$ROOT_DIR/libhakmem.so" --procs="$PROCS" sys "$t"
+    run_case "mimalloc_${t}_p=${PROCS//,/}" "" --procs="$PROCS" mi "$t"
+    run_case "system_${t}_p=${PROCS//,/}" "" --procs="$PROCS" sys "$t"
+    if [[ "$INCLUDE_JE" == "1" ]]; then
+      run_case "jemalloc_${t}_p=${PROCS//,/}" "" --procs="$PROCS" je "$t"
+    fi
+  done
+fi
+
+popd >/dev/null
+
+echo "[info] Logs: $RESULT_DIR"
+echo "[hint] Parse logs to CSV: scripts/parse_mimalloc_logs.py $RESULT_DIR > $RESULT_DIR/summary.csv"
--- a/benchmarks/scripts/comprehensive/run_bench_suite.sh
+++ b/benchmarks/scripts/comprehensive/run_bench_suite.sh
@ -0,0 +1,71 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Run a small suite across system/mimalloc/hakmem and save logs under docs/benchmarks/<timestamp>.
+# Focus: mimalloc-bench/bench/larson patterns that cover Tiny/Mid/Large/Big.
+# Optionally include a 'hakmem best' run (WRAP L1 + learning on + DYN auto).
+
+RUNTIME=${RUNTIME:-1}
+THREADS_CSV=${THREADS:-"1,4"}
+BEST=${BEST:-0}
+# Hard wall-clock timeout per run (external). Defaults to RUNTIME+3s.
+BENCH_TIMEOUT=${BENCH_TIMEOUT:-}
+KILL_GRACE=${KILL_GRACE:-2}
+if [[ -z "${BENCH_TIMEOUT}" ]]; then
+  # Add small cushion to allow larson to exit cleanly
+  BENCH_TIMEOUT=$(( RUNTIME + 3 ))
+fi
+
+ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
+OUTDIR="$ROOT_DIR/docs/benchmarks/$(date +%Y%m%d_%H%M%S)_SUITE"
+mkdir -p "$OUTDIR"
+
+LARSON="$ROOT_DIR/mimalloc-bench/bench/larson/larson"
+LIB_HAK="$ROOT_DIR/libhakmem.so"
+LIB_HAK_ABS=$(readlink -f "$LIB_HAK")
+LIB_MI="/lib/x86_64-linux-gnu/libmimalloc.so.2"
+
+echo "Suite: RUNTIME=${RUNTIME}s THREADS=${THREADS_CSV} BEST=${BEST}" | tee "$OUTDIR/summary.txt"
+
+run_case() {
+  local label="$1"; shift
+  local min="$1"; local max="$2"; shift 2
+  local threads_csv="$1"; shift
+  IFS=',' read -r -a TARR <<< "$threads_csv"
+  for t in "${TARR[@]}"; do
+    echo "== ${label} | ${t}T | system ==" | tee -a "$OUTDIR/summary.txt"
+    timeout -k "${KILL_GRACE}s" "${BENCH_TIMEOUT}s" \
+      "$LARSON" "$RUNTIME" "$min" "$max" 10000 1 12345 "$t" 2>&1 \
+      | tee "$OUTDIR/${label}_system_T${t}.log" | tail -n 3 | tee -a "$OUTDIR/summary.txt"
+    if [[ -f "$LIB_MI" ]]; then
+      echo "== ${label} | ${t}T | mimalloc ==" | tee -a "$OUTDIR/summary.txt"
+      timeout -k "${KILL_GRACE}s" "${BENCH_TIMEOUT}s" \
+        env LD_PRELOAD="$LIB_MI" "$LARSON" "$RUNTIME" "$min" "$max" 10000 1 12345 "$t" 2>&1 \
+        | tee "$OUTDIR/${label}_mimalloc_T${t}.log" | tail -n 3 | tee -a "$OUTDIR/summary.txt"
+    fi
+    if [[ -f "$LIB_HAK" ]]; then
+      echo "== ${label} | ${t}T | hakmem(default) ==" | tee -a "$OUTDIR/summary.txt"
+      timeout -k "${KILL_GRACE}s" "${BENCH_TIMEOUT}s" \
+        env LD_PRELOAD="$LIB_HAK_ABS" "$LARSON" "$RUNTIME" "$min" "$max" 10000 1 12345 "$t" 2>&1 \
+        | tee "$OUTDIR/${label}_hakmem_default_T${t}.log" | tail -n 3 | tee -a "$OUTDIR/summary.txt"
+      if [[ "$BEST" == "1" ]]; then
+        echo "== ${label} | ${t}T | hakmem(best) ==" | tee -a "$OUTDIR/summary.txt"
+        timeout -k "${KILL_GRACE}s" "${BENCH_TIMEOUT}s" \
+          env HAKMEM_WRAP_L2=1 HAKMEM_WRAP_L25=1 HAKMEM_LEARN=1 HAKMEM_DYN1_AUTO=1 HAKMEM_DYN2_AUTO=1 HAKMEM_HIST_SAMPLE=7 HAKMEM_WMAX_LEARN=1 HAKMEM_WMAX_DWELL_SEC=2 \
+          LD_PRELOAD="$LIB_HAK_ABS" "$LARSON" "$RUNTIME" "$min" "$max" 10000 1 12345 "$t" 2>&1 \
+          | tee "$OUTDIR/${label}_hakmem_best_T${t}.log" | tail -n 3 | tee -a "$OUTDIR/summary.txt"
+      fi
+    fi
+  done
+}
+
+# Tiny band (8–64B)
+run_case tiny 8 64 "$THREADS_CSV"
+# Mid band (2–32KiB)
+run_case mid 2048 32768 "$THREADS_CSV"
+# Large band (64KiB–1MiB)
+run_case large 65536 1048576 "$THREADS_CSV"
+# Big band (2–4MiB)
+run_case big 2097152 4194304 "$THREADS_CSV"
+
+echo "Saved suite: $OUTDIR" | tee -a "$OUTDIR/summary.txt"
--- a/benchmarks/scripts/comprehensive/run_comprehensive_pair.sh
+++ b/benchmarks/scripts/comprehensive/run_comprehensive_pair.sh
@ -0,0 +1,31 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Run comprehensive bench for HAKMEM (direct) and mimalloc (direct) and parse to CSV
+# Usage: scripts/run_comprehensive_pair.sh
+
+ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+cd "$ROOT_DIR"
+
+TS=$(date +%Y%m%d_%H%M%S)
+OUTDIR="bench_results/comp_pair_${TS}"
+mkdir -p "$OUTDIR"
+
+echo "[build] HAKMEM + mi direct-link…"
+make -s bench_fast >/dev/null
+make -s bench_comprehensive_mi >/dev/null
+
+echo "[run] HAKMEM (direct)"
+HAKMEM_TINY_TLS_SLL=${HAKMEM_TINY_TLS_SLL:-1} \
+HAKMEM_TINY_MAG_CAP=${HAKMEM_TINY_MAG_CAP:-128} \
+HAKMEM_WRAP_TINY=1 \
+./bench_comprehensive_hakmem | tee "$OUTDIR/hakmem.log" >/dev/null
+
+echo "[run] mimalloc (direct)"
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}:$ROOT_DIR/mimalloc-bench/extern/mi/out/release"
+./bench_comprehensive_mi | tee "$OUTDIR/mimalloc.log" >/dev/null
+
+echo "[parse] to CSV"
+python3 scripts/parse_comprehensive_logs.py "$OUTDIR" > "$OUTDIR/summary.csv"
+echo "[done] CSV: $OUTDIR/summary.csv"
+sed -n '1,60p' "$OUTDIR/summary.csv" || true
--- a/benchmarks/scripts/comprehensive/run_mimalloc_suite_with_hakmem.sh
+++ b/benchmarks/scripts/comprehensive/run_mimalloc_suite_with_hakmem.sh
@ -0,0 +1,52 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Run a representative subset of mimalloc-bench with HAKMEM via LD_PRELOAD.
+# Produces logs under bench_results/.
+
+ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+cd "$ROOT_DIR"
+
+RESULT_DIR="$ROOT_DIR/bench_results/mimalloc_suite_$(date +%Y%m%d_%H%M%S)"
+mkdir -p "$RESULT_DIR"
+
+echo "[info] Building HAKMEM shared library with PGO (for LD_PRELOAD)"
+make -s pgo-profile-shared pgo-build-shared >/dev/null
+
+BENCH_ROOT="$ROOT_DIR/mimalloc-bench"
+BENCH_OUT="$BENCH_ROOT/out/bench"
+
+if [[ ! -d "$BENCH_OUT" ]]; then
+  echo "[warn] mimalloc-bench/out/bench not found."
+  echo "       Build it first: (may require sudo/network)"
+  echo "         cd mimalloc-bench"
+  echo "         ./build-bench-env.sh all"
+  exit 1
+fi
+
+pushd "$BENCH_OUT" >/dev/null
+
+TESTS=(larson cfrac espresso xmalloc-test sh6bench sh8bench cscratch cthrash mstress malloc-large)
+ALLOC_HAK=(sys)           # sys + LD_PRELOAD=hakmem
+ALLOC_BASE=(mi je sys)    # reference allocators for comparison
+PROCS=(1 4)
+
+echo "[info] Running mimalloc-bench subset with HAKMEM (LD_PRELOAD)"
+for p in "${PROCS[@]}"; do
+  log="$RESULT_DIR/hakmem_p${p}.log"
+  echo "[case] HAKMEM LD_PRELOAD --procs=$p | tests: ${TESTS[*]}" | tee -a "$log"
+  LD_PRELOAD="$ROOT_DIR/libhakmem.so" \
+  HAKMEM_WRAP_TINY=1 bash ../../bench.sh --procs="$p" "${ALLOC_HAK[@]}" "${TESTS[@]}" 2>&1 | tee -a "$log" || true
+done
+
+echo "[info] Running reference allocators (mi/je/sys)"
+for p in "${PROCS[@]}"; do
+  log="$RESULT_DIR/ref_p${p}.log"
+  echo "[case] REF mi/je/sys --procs=$p | tests: ${TESTS[*]}" | tee -a "$log"
+  bash ../../bench.sh --procs="$p" "${ALLOC_BASE[@]}" "${TESTS[@]}" 2>&1 | tee -a "$log" || true
+done
+
+popd >/dev/null
+
+echo "[info] Logs written to: $RESULT_DIR"
+echo "[done] mimalloc-bench subset complete"
--- a/benchmarks/scripts/comprehensive/run_perf_hot_triad.sh
+++ b/benchmarks/scripts/comprehensive/run_perf_hot_triad.sh
@ -0,0 +1,56 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# perf-stat triad (HAKMEM/System/mimalloc) for bench_tiny_hot at a single size/batch/cycles
+# Collects cycles/instructions/branches/branch-misses/L1-dcache-load-misses (CSV via perf -x ,)
+# Usage: scripts/run_perf_hot_triad.sh [size] [batch] [cycles] [reps]
+#   size  : bytes (default 32)
+#   batch : batch size (default 100)
+#   cycles: loop cycles per op (default 50000)
+#   reps  : perf repetitions (default 3)
+
+ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+cd "$ROOT_DIR"
+
+size=${1:-32}
+batch=${2:-100}
+cycles=${3:-50000}
+reps=${4:-3}
+
+echo "[build] benches (fast + mi)"
+make -s bench_fast bench_tiny_hot_mi >/dev/null
+
+# Ensure LD_LIBRARY_PATH is defined (set -u safety)
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}"
+MI_LIBDIR="$ROOT_DIR/mimalloc-bench/extern/mi/out/release"
+
+TS=$(date +%Y%m%d_%H%M%S)
+OUTDIR="bench_results/perf_hot_triad_${TS}"
+mkdir -p "$OUTDIR"
+
+echo "[info] size=$size batch=$batch cycles=$cycles reps=$reps"
+echo "[info] results → $OUTDIR"
+
+run_perf() {
+  local alloc="$1"; shift
+  local bin="$1"; shift
+  local tag="$alloc"_s${size}_b${batch}_c${cycles}
+  local log="$OUTDIR/${tag}.perf.csv"
+  echo "[perf] $tag"
+  if [[ "$alloc" == "mimalloc" ]]; then
+    LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$MI_LIBDIR" \
+      perf stat -x , -r "$reps" -e cycles,instructions,branches,branch-misses,L1-dcache-load-misses \
+        "$bin" "$size" "$batch" "$cycles" 1>/dev/null 2>"$log" || true
+  else
+    perf stat -x , -r "$reps" -e cycles,instructions,branches,branch-misses,L1-dcache-load-misses \
+      "$bin" "$size" "$batch" "$cycles" 1>/dev/null 2>"$log" || true
+  fi
+}
+
+run_perf hakmem ./bench_tiny_hot_hakmem
+run_perf system ./bench_tiny_hot_system
+run_perf mimalloc ./bench_tiny_hot_mi
+
+echo "[done] perf CSVs under: $OUTDIR"
+ls -1 "$OUTDIR" | sed -n '1,20p'
+
--- a/benchmarks/scripts/comprehensive/run_perf_main_triad.sh
+++ b/benchmarks/scripts/comprehensive/run_perf_main_triad.sh
@ -0,0 +1,27 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Safe mainline-oriented preset triad (no bench-only macros).
+# Usage: scripts/run_perf_main_triad.sh [cycles]
+
+ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+cd "$ROOT_DIR"
+
+cycles=${1:-60000}
+
+echo "[build] perf_main (no bench-only macros)"
+make -s perf_main >/dev/null
+
+# Recommended envs
+export HAKMEM_TINY_TLS_SLL=${HAKMEM_TINY_TLS_SLL:-1}
+export HAKMEM_TINY_REFILL_MAX=${HAKMEM_TINY_REFILL_MAX:-96}
+export HAKMEM_TINY_REFILL_MAX_HOT=${HAKMEM_TINY_REFILL_MAX_HOT:-192}
+export HAKMEM_TINY_SPILL_HYST=${HAKMEM_TINY_SPILL_HYST:-16}
+export HAKMEM_TINY_BG_REMOTE=${HAKMEM_TINY_BG_REMOTE:-0}
+
+echo "[info] env: TLS_SLL=$HAKMEM_TINY_TLS_SLL REFILL_MAX=$HAKMEM_TINY_REFILL_MAX HOT=$HAKMEM_TINY_REFILL_MAX_HOT HYST=$HAKMEM_TINY_SPILL_HYST BG_REMOTE=$HAKMEM_TINY_BG_REMOTE"
+
+bash scripts/run_tiny_hot_triad.sh "$cycles"
+
+echo "[done] perf_main triad finished"
+
--- a/benchmarks/scripts/comprehensive/run_random_mixed_matrix.sh
+++ b/benchmarks/scripts/comprehensive/run_random_mixed_matrix.sh
@ -0,0 +1,57 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Run random-mixed bench for HAKMEM, System, mimalloc across ws/seeds and write CSV
+# Usage: scripts/run_random_mixed_matrix.sh [cycles]
+
+ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+cd "$ROOT_DIR"
+
+cycles=${1:-150000}
+
+echo "[build] benches (fast + random-mixed variants)"
+make -s bench_fast bench_random_mixed_hakmem bench_random_mixed_system bench_random_mixed_mi >/dev/null
+
+# Ensure LD_LIBRARY_PATH is defined (set -u safety)
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}"
+MI_LIBDIR="$ROOT_DIR/mimalloc-bench/extern/mi/out/release"
+
+TS=$(date +%Y%m%d_%H%M%S)
+OUTDIR="bench_results/random_mixed_${TS}"
+mkdir -p "$OUTDIR"
+CSV="$OUTDIR/results.csv"
+echo "allocator,cycles,ws,seed,throughput_mops" > "$CSV"
+
+ws_list=(200 400 800)
+seeds=(42 1337)
+
+run_case() {
+  local bin="$1"; shift
+  local alloc="$1"; shift
+  local cyc="$1"; shift
+  local ws="$1"; shift
+  local seed="$1"; shift
+  local out
+  if [[ "$alloc" == "mimalloc" ]]; then
+    LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$MI_LIBDIR" \
+      $bin "$cyc" "$ws" "$seed" | sed -n 's/^Throughput: \([0-9.][0-9.]*\) M ops.*/\1/p' >"$OUTDIR/tmp.txt" || true
+  else
+    $bin "$cyc" "$ws" "$seed" | sed -n 's/^Throughput: \([0-9.][0-9.]*\) M ops.*/\1/p' >"$OUTDIR/tmp.txt" || true
+  fi
+  out=$(cat "$OUTDIR/tmp.txt" || true)
+  if [[ -n "$out" ]]; then echo "$alloc,$cyc,$ws,$seed,$out" >> "$CSV"; fi
+}
+
+for ws in "${ws_list[@]}"; do
+  for seed in "${seeds[@]}"; do
+    echo "[run] HAKMEM ws=$ws seed=$seed cycles=$cycles"
+    run_case ./bench_random_mixed_hakmem hakmem "$cycles" "$ws" "$seed"
+    echo "[run] SYSTEM ws=$ws seed=$seed cycles=$cycles"
+    run_case ./bench_random_mixed_system system "$cycles" "$ws" "$seed"
+    echo "[run] MIMALLOC ws=$ws seed=$seed cycles=$cycles"
+    run_case ./bench_random_mixed_mi mimalloc "$cycles" "$ws" "$seed"
+  done
+done
+
+echo "[done] CSV: $CSV"
+sed -n '1,40p' "$CSV" || true
--- a/benchmarks/scripts/comprehensive/run_suite_compare.sh
+++ b/benchmarks/scripts/comprehensive/run_suite_compare.sh
@ -0,0 +1,64 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Aggregate suite runner: tiny-hot triad, random-mixed triad, comprehensive pair, optional app benches
+# Usage: scripts/run_suite_compare.sh [cycles_hot] [cycles_mixed] [with_apps]
+#   cycles_hot   : tiny hot cycles (default 80000)
+#   cycles_mixed : random mixed cycles (default 120000)
+#   with_apps    : 0/1 (default 0) — if 1, runs scripts/run_apps_with_hakmem.sh
+
+ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+cd "$ROOT_DIR"
+
+cycles_hot=${1:-80000}
+cycles_mixed=${2:-120000}
+with_apps=${3:-0}
+
+TS=$(date +%Y%m%d_%H%M%S)
+OUTDIR="bench_results/suite_${TS}"
+mkdir -p "$OUTDIR"
+
+log() { echo "[$(date +%H:%M:%S)] $*"; }
+
+log "tiny hot triad ($cycles_hot)"
+bash scripts/run_tiny_hot_triad.sh "$cycles_hot" | tee "$OUTDIR/tiny_hot.log" >/dev/null || true
+th_csv=$(sed -n 's/^\[done\] CSV: \(.*\)$/\1/p' "$OUTDIR/tiny_hot.log" | tail -n1)
+
+log "random mixed triad ($cycles_mixed)"
+bash scripts/run_random_mixed_matrix.sh "$cycles_mixed" | tee "$OUTDIR/random_mixed.log" >/dev/null || true
+rm_csv=$(sed -n 's/^\[done\] CSV: \(.*\)$/\1/p' "$OUTDIR/random_mixed.log" | tail -n1)
+
+log "comprehensive pair"
+bash scripts/run_comprehensive_pair.sh | tee "$OUTDIR/comp_pair.log" >/dev/null || true
+cp_csv=$(sed -n 's/^\[done\] CSV: \(.*\)$/\1/p' "$OUTDIR/comp_pair.log" | tail -n1)
+
+if [[ "$with_apps" == "1" ]]; then
+  log "apps (LD-safe)"
+  bash scripts/run_apps_with_hakmem.sh | tee "$OUTDIR/apps.log" >/dev/null || true
+fi
+
+summary="$OUTDIR/summary.md"
+{
+  echo "# HAKMEM vs System vs mimalloc – Suite ($TS)"
+  echo ""
+  echo "- tiny hot triad CSV: \
+    \
+    $th_csv"
+  echo "- random mixed triad CSV: \
+    \
+    $rm_csv"
+  echo "- comprehensive pair CSV: \
+    \
+    $cp_csv"
+  if [[ "$with_apps" == "1" ]]; then
+    echo "- apps log: $OUTDIR/apps.log"
+  fi
+  echo ""
+  echo "Quick peek (head):"
+  echo ""; echo '```'; sed -n '1,20p' "$th_csv"; echo '```'
+  echo ""; echo '```'; sed -n '1,20p' "$rm_csv"; echo '```'
+  echo ""; echo '```'; sed -n '1,30p' "$cp_csv"; echo '```'
+} > "$summary"
+
+log "done → $summary"
+
--- a/benchmarks/scripts/mid/compare_mid_mt_allocators.sh
+++ b/benchmarks/scripts/mid/compare_mid_mt_allocators.sh
@ -0,0 +1,122 @@
+#!/bin/bash
+# Compare Mid Range MT performance across different allocators
+#
+# Runs bench_mid_large_mt with:
+# - System allocator (glibc)
+# - mimalloc
+# - HAKX (our implementation)
+#
+# Usage: ./compare_mid_mt_allocators.sh [threads] [cycles] [ws] [seed] [runs]
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+# Default parameters
+THREADS=${1:-4}
+CYCLES=${2:-60000}
+WORKING_SET=${3:-256}
+SEED=${4:-1}
+RUNS=${5:-3}
+
+# CPU affinity
+TASKSET="taskset -c 0-3"
+
+echo "=========================================="
+echo "Mid Range MT Allocator Comparison"
+echo "=========================================="
+echo "Configuration:"
+echo "  Threads:     $THREADS"
+echo "  Cycles:      $CYCLES"
+echo "  Working Set: $WORKING_SET"
+echo "  Seed:        $SEED"
+echo "  Runs/each:   $RUNS"
+echo ""
+
+cd "$PROJECT_ROOT"
+
+# Build all variants if needed
+for variant in system mi hakx; do
+    TARGET="bench_mid_large_mt_${variant}"
+    if [ ! -f "./$TARGET" ]; then
+        echo "Building $TARGET..."
+        make "$TARGET"
+    fi
+done
+
+echo ""
+echo "=========================================="
+
+# Function to run benchmark and extract median throughput
+# Run one allocator variant $RUNS times and report its median throughput.
+# Arguments: $1 - variant suffix of the ./bench_mid_large_mt_<variant> binary
+# Globals:   RUNS, TASKSET, THREADS, CYCLES, WORKING_SET, SEED (read)
+# Outputs:   progress and per-run figures on stderr; the median alone on
+#            stdout. Callers capture stdout with $(run_bench ...), so nothing
+#            but the number may be written there.
+# Returns:   1 if no run produced a "Throughput:" line
+run_bench() {
+    local variant=$1
+    local results=()
+    local i output mops median
+
+    {
+        echo ""
+        echo "Testing: $variant"
+        echo "----------------------------------------"
+    } >&2
+
+    for ((i = 1; i <= RUNS; i++)); do
+        # $TASKSET is intentionally unquoted: it holds a command plus options.
+        # A failing run is reported but tolerated; its output is still parsed.
+        output=$($TASKSET "./bench_mid_large_mt_${variant}" "$THREADS" "$CYCLES" "$WORKING_SET" "$SEED" 2>&1) \
+            || echo "  Run $i: benchmark exited with status $?" >&2
+        mops=$(awk '/Throughput:/ {print $2}' <<<"$output")
+        if [ -z "$mops" ]; then
+            echo "  Run $i: no 'Throughput:' line in output; run ignored" >&2
+            continue
+        fi
+        results+=($mops)
+        printf "  Run %d: %s M ops/sec\n" "$i" "$mops" >&2
+    done
+
+    if [ "${#results[@]}" -eq 0 ]; then
+        echo "  ERROR: no throughput results for $variant" >&2
+        return 1
+    fi
+
+    # Median of the sorted values: middle element for an odd count, mean of
+    # the two middle elements for an even count.
+    median=$(printf "%s\n" "${results[@]}" | sort -n | awk '{
+        a[NR] = $1
+    }
+    END {
+        if (NR % 2 == 1) {
+            print a[int(NR/2) + 1]
+        } else {
+            median = (a[NR/2] + a[NR/2 + 1]) / 2
+            printf "%.2f", median
+        }
+    }')
+
+    echo "  Median: $median M ops/sec" >&2
+    echo "$median"
+}
+
+# Run benchmarks
+echo "Running benchmarks..."
+
+SYSTEM_RESULT=$(run_bench "system")
+MI_RESULT=$(run_bench "mi")
+HAKX_RESULT=$(run_bench "hakx")
+
+# Summary
+echo ""
+echo "=========================================="
+echo "Summary"
+echo "=========================================="
+printf "%-20s %10s %15s\n" "Allocator" "Throughput" "vs System"
+echo "----------------------------------------"
+printf "%-20s %10.2f M  %15s\n" "System (glibc)" "$SYSTEM_RESULT" "1.00x"
+
+MI_RATIO=$(echo "scale=2; $MI_RESULT / $SYSTEM_RESULT" | bc)
+printf "%-20s %10.2f M  %15s\n" "mimalloc" "$MI_RESULT" "${MI_RATIO}x"
+
+HAKX_RATIO=$(echo "scale=2; $HAKX_RESULT / $SYSTEM_RESULT" | bc)
+printf "%-20s %10.2f M  %15s\n" "HAKX (Mid MT)" "$HAKX_RESULT" "${HAKX_RATIO}x"
+
+echo ""
+echo "HAKX vs mimalloc:"
+HAKX_VS_MI=$(echo "scale=2; $HAKX_RESULT / $MI_RESULT * 100" | bc)
+printf "  %.1f%% of mimalloc performance\n" "$HAKX_VS_MI"
+
+# Winner
+echo ""
+if (( $(echo "$HAKX_RESULT > $MI_RESULT" | bc -l) )); then
+    echo "🏆 HAKX is FASTER than mimalloc!"
+elif (( $(echo "$HAKX_RESULT > $SYSTEM_RESULT * 1.5" | bc -l) )); then
+    echo "✅ HAKX significantly faster than system allocator (>1.5x)"
+else
+    echo "⚠️  HAKX needs optimization"
+fi
+
+echo ""
+echo "Comparison completed!"
--- a/benchmarks/scripts/mid/run_mid_mt_bench.sh
+++ b/benchmarks/scripts/mid/run_mid_mt_bench.sh
@ -0,0 +1,123 @@
+#!/bin/bash
+# Mid Range MT Benchmark - Optimal Configuration
+#
+# Parameters discovered through performance tuning:
+# - threads=4: Optimal for quad-core systems
+# - cycles=60000: Sufficient iterations for stable results
+# - ws=256: Working set that fits in L3 cache (4MB)
+# - taskset -c 0-3: Pin to cores 0-3 for consistency
+#
+# Performance target: 95-99 M ops/sec
+# vs System allocator: ~1.87x faster
+
+set -e
+
+SCRIPT_DIR="$(cd "$(dirname "${BASH_SOURCE[0]}")" && pwd)"
+PROJECT_ROOT="$(cd "$SCRIPT_DIR/.." && pwd)"
+
+# Default parameters (optimized for cache efficiency)
+THREADS=${1:-4}
+CYCLES=${2:-60000}
+WORKING_SET=${3:-256}
+SEED=${4:-1}
+RUNS=${5:-5}
+
+# CPU affinity (use cores 0-3)
+TASKSET="taskset -c 0-3"
+
+echo "======================================"
+echo "Mid Range MT Benchmark (8-32KB)"
+echo "======================================"
+echo "Configuration:"
+echo "  Threads:     $THREADS"
+echo "  Cycles:      $CYCLES"
+echo "  Working Set: $WORKING_SET"
+echo "  Seed:        $SEED"
+echo "  Runs:        $RUNS"
+echo "  CPU Affinity: cores 0-3"
+echo ""
+echo "Working Set Analysis:"
+WS_SIZE=$((WORKING_SET * 16))  # Average 16KB per allocation
+echo "  Memory: ~${WS_SIZE} KB per thread"
+echo "  Total:  ~$((WS_SIZE * THREADS / 1024)) MB"
+echo ""
+
+cd "$PROJECT_ROOT"
+
+# Check if benchmark exists
+if [ ! -f "./bench_mid_large_mt_hakx" ]; then
+    echo "ERROR: bench_mid_large_mt_hakx not found!"
+    echo "Building benchmark..."
+    make bench_mid_large_mt_hakx
+    echo ""
+fi
+
+# Run benchmark multiple times and collect results
+echo "Running benchmark $RUNS times..."
+echo ""
+
+RESULTS=()
+for ((i = 1; i <= RUNS; i++)); do
+    echo "Run $i/$RUNS:"
+    # $TASKSET is intentionally unquoted: it holds a command plus options.
+    OUTPUT=$($TASKSET ./bench_mid_large_mt_hakx "$THREADS" "$CYCLES" "$WORKING_SET" "$SEED" 2>&1)
+    echo "$OUTPUT"
+
+    # Extract throughput (second field of the "Throughput:" line)
+    MOPS=$(awk '/Throughput:/ {print $2}' <<<"$OUTPUT")
+    if [ -z "$MOPS" ]; then
+        echo "WARNING: run $i printed no 'Throughput:' line; skipping it" >&2
+    else
+        RESULTS+=($MOPS)
+    fi
+    echo ""
+done
+
+# Without at least one sample the statistics below would divide by zero and
+# index an empty array, so stop with a clear message instead.
+if [ "${#RESULTS[@]}" -eq 0 ]; then
+    echo "ERROR: no throughput values collected; cannot compute statistics" >&2
+    exit 1
+fi
+
+# Calculate statistics
+echo "======================================"
+echo "Summary Statistics"
+echo "======================================"
+
+# Sort results for median calculation (one value per line through sort -n)
+mapfile -t SORTED < <(printf '%s\n' "${RESULTS[@]}" | sort -n)
+
+# Calculate average
+SUM=0
+for val in "${RESULTS[@]}"; do
+    SUM=$(echo "$SUM + $val" | bc)
+done
+AVG=$(echo "scale=2; $SUM / ${#RESULTS[@]}" | bc)
+
+# Get median: mean of the two middle values for an even count, otherwise the
+# middle value
+MID=$((${#RESULTS[@]} / 2))
+if [ $((${#RESULTS[@]} % 2)) -eq 0 ]; then
+    MEDIAN=$(echo "scale=2; (${SORTED[$MID-1]} + ${SORTED[$MID]}) / 2" | bc)
+else
+    MEDIAN=${SORTED[$MID]}
+fi
+
+# Get min/max
+MIN=${SORTED[0]}
+MAX=${SORTED[-1]}
+
+echo "Results (M ops/sec):"
+for i in "${!RESULTS[@]}"; do
+    printf "  Run %d: %s\n" $((i+1)) "${RESULTS[$i]}"
+done
+echo ""
+echo "Statistics:"
+printf "  Average: %.2f M ops/sec\n" $AVG
+printf "  Median:  %.2f M ops/sec\n" $MEDIAN
+printf "  Min:     %.2f M ops/sec\n" $MIN
+printf "  Max:     %.2f M ops/sec\n" $MAX
+printf "  Range:   %.2f - %.2f M\n" $MIN $MAX
+echo ""
+
+# Performance vs target
+TARGET_MIN=95
+TARGET_MAX=120
+if (( $(echo "$MEDIAN >= $TARGET_MIN" | bc -l) )); then
+    PCT=$(echo "scale=1; $MEDIAN / $TARGET_MAX * 100" | bc)
+    echo "Target Achievement: ${PCT}% of 120M target ✅"
+else
+    echo "Target Achievement: Below 95M target ❌"
+fi
+
+echo ""
+echo "Benchmark completed successfully!"
--- a/benchmarks/scripts/run_full_benchmark.sh
+++ b/benchmarks/scripts/run_full_benchmark.sh
@ -0,0 +1,65 @@
+#!/bin/bash
+# Full benchmark with jemalloc & mimalloc via LD_PRELOAD
+
+set -e
+
+WARMUP=2
+RUNS=50
+OUTPUT="full_benchmark.csv"
+
+echo "allocator,scenario,iterations,avg_ns,soft_pf,hard_pf,rss_kb,ops_per_sec" > "$OUTPUT"
+
+SCENARIOS=("json" "mir" "vm" "mixed")
+ALLOCATORS=(
+    "hakmem-baseline:./bench_allocators_hakmem:"
+    "hakmem-evolving:./bench_allocators_hakmem:"
+    "system:./bench_allocators_system:"
+    "jemalloc:./bench_allocators_system:LD_PRELOAD=/lib/x86_64-linux-gnu/libjemalloc.so.2"
+    "mimalloc:./bench_allocators_system:LD_PRELOAD=/lib/x86_64-linux-gnu/libmimalloc.so.2"
+)
+
+total_runs=$((${#ALLOCATORS[@]} * ${#SCENARIOS[@]} * RUNS))
+current=0
+
+echo "========================================"
+echo "Full Benchmark: jemalloc & mimalloc"
+echo "========================================"
+echo "Warmup: $WARMUP, Runs: $RUNS"
+echo "Total: $total_runs runs"
+echo ""
+
+# Each ALLOCATORS entry is "name:binary:env", where env is empty or a single
+# VAR=VALUE assignment to apply to the benchmark process.
+for alloc_spec in "${ALLOCATORS[@]}"; do
+    IFS=':' read -r alloc_name binary env_prefix <<< "$alloc_spec"
+
+    echo "📊 Allocator: $alloc_name"
+
+    # Build the command as an array. A VAR=VALUE word that comes out of a
+    # variable expansion is NOT treated as an assignment by the shell; it
+    # would be executed as a command name and fail. Passing it through env(1)
+    # applies it to the benchmark process as intended.
+    cmd=("$binary" --allocator "$alloc_name")
+    if [ -n "$env_prefix" ]; then
+        case "$env_prefix" in
+            LD_PRELOAD=*)
+                # Without the library the run would measure the binary's own
+                # allocator under the wrong label, so skip it.
+                preload_lib="${env_prefix#LD_PRELOAD=}"
+                if [ ! -f "$preload_lib" ]; then
+                    echo "  WARNING: $preload_lib not found; skipping $alloc_name" >&2
+                    continue
+                fi
+                ;;
+        esac
+        cmd=(env "$env_prefix" "${cmd[@]}")
+    fi
+
+    for scenario in "${SCENARIOS[@]}"; do
+        echo "  Scenario: $scenario"
+
+        # Warmup (output discarded, failures ignored)
+        for ((i=1; i<=WARMUP; i++)); do
+            "${cmd[@]}" --scenario "$scenario" --iterations 1 >> /dev/null 2>&1 || true
+        done
+
+        # Real runs: keep only the CSV rows that start with this allocator's
+        # name. `|| true` covers grep finding no such row.
+        for ((i=1; i<=RUNS; i++)); do
+            current=$((current + 1))
+            printf "\r    Progress: %d/%d" "$current" "$total_runs"
+
+            "${cmd[@]}" --scenario "$scenario" --iterations 1 2>/dev/null | grep "^$alloc_name," >> "$OUTPUT" || true
+        done
+        echo ""
+    done
+done
+
+echo ""
+echo "✅ Benchmark completed: $OUTPUT"
+wc -l "$OUTPUT"
--- a/benchmarks/scripts/tiny/measure_rss_tiny.sh
+++ b/benchmarks/scripts/tiny/measure_rss_tiny.sh
@ -0,0 +1,62 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Measure steady-state RSS for Tiny sizes by maintaining a live set
+# and churning short-lived allocations. Reports peak and end RSS.
+#
+# Usage: scripts/measure_rss_tiny.sh <size> <live_count> <iters>
+
+ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+cd "$ROOT_DIR"
+
+if [[ $# -lt 3 ]]; then
+  echo "usage: $0 <size> <live_count> <iters>" >&2
+  exit 1
+fi
+size=$1; live=$2; iters=$3
+
+cat > "$ROOT_DIR/.tmp_rss_bench.c" <<'EOF'
+#include <stdio.h>
+#include <stdlib.h>
+#include <stdint.h>
+#include <string.h>
+#include <time.h>
+#include <unistd.h>
+
+static size_t get_rss_kb(void) {
+  FILE* f = fopen("/proc/self/statm","r");
+  if (!f) return 0; unsigned long size, res; fscanf(f, "%lu %lu", &size, &res); fclose(f);
+  long ps = sysconf(_SC_PAGESIZE); return (size_t)((res * ps) / 1024);
+}
+
+int main(int argc, char** argv) {
+  size_t size = (size_t)strtoull(argv[1],NULL,10);
+  int live = atoi(argv[2]);
+  int iters = atoi(argv[3]);
+  void** L = (void**)malloc(sizeof(void*)*(size_t)live);
+  for (int i=0;i<live;i++) L[i] = malloc(size);
+  size_t peak=0;
+  for (int it=0; it<iters; it++) {
+    for (int i=0;i<live;i+=2) { free(L[i]); L[i]=malloc(size); }
+    size_t rss = get_rss_kb(); if (rss>peak) peak=rss;
+  }
+  size_t end_rss = get_rss_kb();
+  printf("peak_rss_kb=%zu end_rss_kb=%zu\n", peak, end_rss);
+  for (int i=0;i<live;i++) free(L[i]); free(L); return 0;
+}
+EOF
+
+gcc -O3 -march=native -mtune=native .tmp_rss_bench.c -o .tmp_rss_bench
+
+echo "[info] Building shared lib (for LD_PRELOAD HAKMEM case)"
+make -s pgo-build-shared >/dev/null || true
+
+echo "[case] HAKMEM (LD_PRELOAD)"
+out_h=$(HAKMEM_LD_SAFE=1 LD_PRELOAD="$ROOT_DIR/libhakmem.so" ./.tmp_rss_bench "$size" "$live" "$iters")
+echo "$out_h"
+
+echo "[case] System"
+out_s=$(./.tmp_rss_bench "$size" "$live" "$iters")
+echo "$out_s"
+
+rm -f .tmp_rss_bench .tmp_rss_bench.c
--- a/benchmarks/scripts/tiny/run_tiny_benchfast_triad.sh
+++ b/benchmarks/scripts/tiny/run_tiny_benchfast_triad.sh
@ -0,0 +1,24 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Build and run Tiny-Hot triad with bench-only fast path (SLL→Mag→tiny refill).
+# Usage: scripts/run_tiny_benchfast_triad.sh [cycles]
+
+ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+cd "$ROOT_DIR"
+
+cycles=${1:-60000}
+
+echo "[build] system/mimalloc (fast + mi)"
+make -s bench_fast bench_tiny_hot_mi >/dev/null
+
+echo "[build] HAKMEM bench-fastpath"
+make -s bench_fastpath >/dev/null
+
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}"
+
+echo "[run] triad (bench-fastpath HAKMEM vs System vs mimalloc)"
+SKIP_BUILD=1 bash scripts/run_tiny_hot_triad.sh "$cycles"
+
+echo "[note] Latest CSV printed by triad runner."
+
--- a/benchmarks/scripts/tiny/run_tiny_debug.sh
+++ b/benchmarks/scripts/tiny/run_tiny_debug.sh
@ -0,0 +1,32 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Debug-oriented Tiny triad run with counters + perf
+# Usage: scripts/run_tiny_debug.sh [cycles]
+
+ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+cd "$ROOT_DIR"
+
+cycles=${1:-40000}
+
+echo "[build] bench_debug (HAKMEM_DEBUG_COUNTERS=1)"
+make -s bench_debug >/dev/null
+
+export HAKMEM_TINY_PATH_DEBUG=1
+export HAKMEM_TINY_COUNTERS_DUMP=1
+
+echo "[run] tiny hot triad (cycles=$cycles)"
+HAKMEM_TINY_SPECIALIZE_32_64=${HAKMEM_TINY_SPECIALIZE_32_64:-1} \
+HAKMEM_TINY_BUMP_SHADOW=${HAKMEM_TINY_BUMP_SHADOW:-0} \
+HAKMEM_TINY_BG_BIN=${HAKMEM_TINY_BG_BIN:-0} \
+HAKMEM_TINY_ULTRA_SIMPLE=${HAKMEM_TINY_ULTRA_SIMPLE:-0} \
+HAKMEM_TINY_HOTMAG=${HAKMEM_TINY_HOTMAG:-0} \
+HAKMEM_WRAP_TINY=1 HAKMEM_INT_ENGINE=0 HAKMEM_TINY_TLS_SLL=1 \
+bash scripts/run_tiny_hot_triad.sh "$cycles"
+
+echo "[perf] 32B / 64B"
+HAKMEM_TINY_SPECIALIZE_32_64=1 bash scripts/run_perf_hot_triad.sh 32 100 50000 3
+HAKMEM_TINY_SPECIALIZE_32_64=1 bash scripts/run_perf_hot_triad.sh 64 100 50000 3
+
+echo "[done] Inspect stderr for [Tiny Path Debug] and [Tiny Extended Counters]"
+
--- a/benchmarks/scripts/tiny/run_tiny_hot_sweep.sh
+++ b/benchmarks/scripts/tiny/run_tiny_hot_sweep.sh
@ -0,0 +1,47 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Sweep Tiny hot-path microbench across sizes/batches and save CSV
+# Usage: scripts/run_tiny_hot_sweep.sh [cycles]
+#   cycles  passed to every benchmark invocation (default: 200000)
+
+# Relative paths below are resolved against the parent of this script's
+# directory.
+ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+cd "$ROOT_DIR"
+
+cycles=${1:-200000}
+
+echo "[info] Building tiny hot benches (bench_fast)"
+make -s bench_fast >/dev/null
+
+# Each invocation writes into a fresh timestamped directory; the CSV is
+# created with its header row here and appended to by run_case.
+TS=$(date +%Y%m%d_%H%M%S)
+OUTDIR="bench_results/tiny_hot_${TS}"
+mkdir -p "$OUTDIR"
+CSV="$OUTDIR/results.csv"
+echo "mode,size,batch,cycles,throughput_mops" > "$CSV"
+
+# Sweep matrix: every size is combined with every batch value.
+sizes=(8 16 24 32 40 48 56 64 128)
+batches=(50 100 200)
+
+run_case() {
+  local mode="$1"; shift
+  local size="$1"; shift
+  local batch="$1"; shift
+  local cyc="$1"; shift
+  local bin
+  if [[ "$mode" == "hakmem" ]]; then bin="./bench_tiny_hot_hakmem"; else bin="./bench_tiny_hot_system"; fi
+  local out
+  out=$($bin "$size" "$batch" "$cyc" | sed -n 's/^Throughput: \([0-9.][0-9.]*\) M ops.*/\1/p' || true)
+  if [[ -n "$out" ]]; then echo "$mode,$size,$batch,$cyc,$out" >> "$CSV"; fi
+}
+
+# Run HAKMEM and then the system allocator for each (size, batch) pair.
+for s in "${sizes[@]}"; do
+  for b in "${batches[@]}"; do
+    echo "[run] HAKMEM size=$s batch=$b cycles=$cycles"
+    run_case hakmem "$s" "$b" "$cycles"
+    echo "[run] SYSTEM size=$s batch=$b cycles=$cycles"
+    run_case system "$s" "$b" "$cycles"
+  done
+done
+
+echo "[done] CSV: $CSV"
+# Preview: the header plus HAKMEM rows only, capped at 20 lines.
+grep -E '^(mode|hakmem)' "$CSV" | sed -n '1,20p' || true
+
--- a/benchmarks/scripts/tiny/run_tiny_hot_triad.sh
+++ b/benchmarks/scripts/tiny/run_tiny_hot_triad.sh
@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Run tiny hot bench across sizes/batches for HAKMEM, System, mimalloc (direct-link triad)
+# Usage: scripts/run_tiny_hot_triad.sh [cycles]
+#   cycles      passed to every benchmark invocation (default: 100000)
+# Env knobs:
+#   SKIP_BUILD  set to 1 to skip the make step and use the existing binaries
+
+# Relative paths below are resolved against the parent of this script's
+# directory.
+ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+cd "$ROOT_DIR"
+
+cycles=${1:-100000}
+
+if [[ "${SKIP_BUILD:-0}" != "1" ]]; then
+  echo "[build] benches (fast + mi)"
+  make -s bench_fast bench_tiny_hot_mi >/dev/null
+else
+  echo "[build] skipped (SKIP_BUILD=1)"
+fi
+
+# Ensure LD_LIBRARY_PATH is defined (set -u safety)
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}"
+# Directory added to LD_LIBRARY_PATH for the mimalloc runs only.
+MI_LIBDIR="$ROOT_DIR/mimalloc-bench/extern/mi/out/release"
+
+# Each invocation writes into a fresh timestamped directory; the CSV is
+# created with its header row here and appended to by run_case.
+TS=$(date +%Y%m%d_%H%M%S)
+OUTDIR="bench_results/tiny_hot_triad_${TS}"
+mkdir -p "$OUTDIR"
+CSV="$OUTDIR/results.csv"
+echo "size,batch,cycles,allocator,throughput_mops" > "$CSV"
+
+# Sweep matrix: every size is combined with every batch value.
+sizes=(8 16 24 32 40 48 56 64 128)
+batches=(50 100 200)
+
+run_case() {
+  local size="$1"; shift
+  local batch="$1"; shift
+  local cyc="$1"; shift
+  local bin="$1"; shift
+  local alloc="$1"; shift
+  local out
+  if [[ "$alloc" == "mimalloc" ]]; then
+    LD_LIBRARY_PATH="$LD_LIBRARY_PATH:$MI_LIBDIR" \
+      $bin "$size" "$batch" "$cyc" | sed -n 's/^Throughput: \([0-9.][0-9.]*\) M ops.*/\1/p' >"$OUTDIR/tmp.txt" || true
+  else
+    $bin "$size" "$batch" "$cyc" | sed -n 's/^Throughput: \([0-9.][0-9.]*\) M ops.*/\1/p' >"$OUTDIR/tmp.txt" || true
+  fi
+  out=$(cat "$OUTDIR/tmp.txt" || true)
+  if [[ -n "$out" ]]; then echo "$size,$batch,$cyc,$alloc,$out" >> "$CSV"; fi
+}
+
+# For each (size, batch) pair run the three allocators back to back.
+for s in "${sizes[@]}"; do
+  for b in "${batches[@]}"; do
+    echo "[run] HAKMEM size=$s batch=$b cycles=$cycles"
+    run_case "$s" "$b" "$cycles" ./bench_tiny_hot_hakmem hakmem
+    echo "[run] SYSTEM size=$s batch=$b cycles=$cycles"
+    run_case "$s" "$b" "$cycles" ./bench_tiny_hot_system system
+    echo "[run] MIMALLOC size=$s batch=$b cycles=$cycles"
+    run_case "$s" "$b" "$cycles" ./bench_tiny_hot_mi mimalloc
+  done
+done
+
+echo "[done] CSV: $CSV"
+# Preview at most the first 40 lines of the CSV.
+sed -n '1,40p' "$CSV" || true
--- a/benchmarks/scripts/tiny/run_tiny_sllonly_r12w192_triad.sh
+++ b/benchmarks/scripts/tiny/run_tiny_sllonly_r12w192_triad.sh
@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Build and run Tiny-Hot triad with bench-only SLL-only + warmup tuned (REFILL=12, WARMUP32=192) + PGO.
+# Usage: scripts/run_tiny_sllonly_r12w192_triad.sh [cycles]
+#   cycles  forwarded to the triad runner (default: 60000)
+
+# Relative paths below are resolved against the parent of this script's
+# directory.
+ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+cd "$ROOT_DIR"
+
+cycles=${1:-60000}
+
+echo "[build] system/mimalloc (fast + mi)"
+make -s bench_fast bench_tiny_hot_mi >/dev/null
+
+echo "[build] HAKMEM bench_sll_only (r12 w32=192 PGO)"
+# Two-step PGO build: the -profile target runs first, then the -build target.
+make -s pgo-benchsll-r12w192-profile >/dev/null
+make -s pgo-benchsll-r12w192-build >/dev/null
+
+# Export LD_LIBRARY_PATH (empty if it was unset) so it is always defined for
+# the child script.
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}"
+
+echo "[run] triad (bench_sll_only r12 w32=192)"
+# SKIP_BUILD=1 is passed so the triad runner does not rebuild; presumably this
+# keeps the PGO binaries built above in place - confirm.
+SKIP_BUILD=1 bash scripts/run_tiny_hot_triad.sh "$cycles"
+
+echo "[note] Latest CSV printed by triad runner."
+
--- a/benchmarks/scripts/tiny/run_tiny_sllonly_triad.sh
+++ b/benchmarks/scripts/tiny/run_tiny_sllonly_triad.sh
@ -0,0 +1,25 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Build and run Tiny-Hot triad with bench-only SLL-only + warmup + PGO.
+# Usage: scripts/run_tiny_sllonly_triad.sh [cycles]
+
+ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+cd "$ROOT_DIR"
+
+cycles=${1:-60000}
+
+echo "[build] system/mimalloc (fast + mi)"
+make -s bench_fast bench_tiny_hot_mi >/dev/null
+
+echo "[build] HAKMEM bench_sll_only (PGO)"
+make -s pgo-benchsll-profile >/dev/null
+make -s pgo-benchsll-build >/dev/null
+
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}"
+
+echo "[run] triad (bench_sll_only HAKMEM vs System vs mimalloc)"
+SKIP_BUILD=1 bash scripts/run_tiny_hot_triad.sh "$cycles"
+
+echo "[note] Latest CSV printed by triad runner."
+
--- a/benchmarks/scripts/tiny/run_tiny_ultra_triad.sh
+++ b/benchmarks/scripts/tiny/run_tiny_ultra_triad.sh
@ -0,0 +1,23 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Run Tiny-Hot triad with Ultra (SLL-only) front for HAKMEM, comparing to System/mimalloc.
+# Usage: scripts/run_tiny_ultra_triad.sh [cycles]
+#   cycles  forwarded to the triad runner (default: 60000)
+
+# Relative paths below are resolved against the parent of this script's
+# directory.
+ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+cd "$ROOT_DIR"
+
+cycles=${1:-60000}
+
+echo "[build] system/mimalloc benches (fast + mi)"
+make -s bench_fast bench_tiny_hot_mi >/dev/null
+
+echo "[build] HAKMEM Ultra (SLL-only)"
+make -s bench_ultra >/dev/null
+
+# Export LD_LIBRARY_PATH (empty if it was unset) so it is always defined for
+# the child script.
+export LD_LIBRARY_PATH="${LD_LIBRARY_PATH:-}"
+
+echo "[run] triad (Ultra HAKMEM vs System vs mimalloc)"
+# SKIP_BUILD=1 is passed so the triad runner does not rebuild; presumably this
+# keeps the Ultra binaries built above in place - confirm.
+SKIP_BUILD=1 bash scripts/run_tiny_hot_triad.sh "$cycles"
+
+echo "[note] Latest CSV printed above by triad runner."
--- a/benchmarks/scripts/utils/head_to_head_large.sh
+++ b/benchmarks/scripts/utils/head_to_head_large.sh
@ -0,0 +1,46 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Head-to-head for Large(64KB–1MB), 10s, system/mimalloc/hakmem (P1/P2 profiles)
+#
+# Env knobs:
+#   BENCH_TIMEOUT  per-run timeout in seconds (default: 13)
+#   MIMALLOC_SO    mimalloc shared object to preload
+#                  (default: /lib/x86_64-linux-gnu/libmimalloc.so.2)
+
+ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+LARSON="$ROOT_DIR/mimalloc-bench/bench/larson/larson"
+LIB_HAK="$ROOT_DIR/libhakmem.so"
+LIB_MI="${MIMALLOC_SO:-/lib/x86_64-linux-gnu/libmimalloc.so.2}"
+RUN_TIMEOUT="${BENCH_TIMEOUT:-13}s"
+# The same larson arguments are used for every allocator so the runs are
+# directly comparable.
+LARSON_ARGS=(10 65536 1048576 10000 1 12345 4)
+
+if [[ ! -x "$LARSON" ]]; then
+  echo "[ERR] larson not found: $LARSON" >&2; exit 1
+fi
+if [[ ! -f "$LIB_HAK" ]]; then
+  echo "[ERR] libhakmem.so not found: $LIB_HAK" >&2; exit 1
+fi
+
+TS=$(date +%Y%m%d_%H%M%S)
+OUT="$ROOT_DIR/docs/benchmarks/${TS}_HEAD2HEAD_LARGE"
+mkdir -p "$OUT"
+echo "[OUT] $OUT"
+
+cd "$ROOT_DIR/mimalloc-bench/bench/larson"
+
+echo "System malloc LARGE_4T" | tee "$OUT/system_large_4t.log"
+timeout "$RUN_TIMEOUT" "$LARSON" "${LARSON_ARGS[@]}" 2>&1 | tee -a "$OUT/system_large_4t.log"
+
+echo "mimalloc LARGE_4T" | tee "$OUT/mimalloc_large_4t.log"
+if [[ -f "$LIB_MI" ]]; then
+  timeout "$RUN_TIMEOUT" env LD_PRELOAD="$LIB_MI" "$LARSON" "${LARSON_ARGS[@]}" 2>&1 | tee -a "$OUT/mimalloc_large_4t.log"
+else
+  # The dynamic loader only warns about a preload object it cannot open and
+  # then runs the program anyway, which would record system-malloc numbers
+  # under the mimalloc label. Skip the run instead.
+  echo "[SKIP] libmimalloc not found: $LIB_MI" | tee -a "$OUT/mimalloc_large_4t.log"
+fi
+
+# P1 best (alloc-priority)
+echo "hakmem P1 LARGE_4T (remote, factor=4, HDR=1)" | tee "$OUT/hakmem_p1_large_4t.log"
+timeout "$RUN_TIMEOUT" env LD_PRELOAD="$LIB_HAK" HAKMEM_WRAP_L25=1 HAKMEM_L25_PREF=remote HAKMEM_L25_RUN_FACTOR=4 \
+    HAKMEM_HDR_LIGHT=1 HAKMEM_SHARD_MIX=1 HAKMEM_TLS_LO_MAX=512 \
+    "$LARSON" "${LARSON_ARGS[@]}" 2>&1 | tee -a "$OUT/hakmem_p1_large_4t.log"
+
+# P2+TC best (free-priority)
+echo "hakmem P2+TC LARGE_4T (remote, factor=4, HDR=2, TC_SPILL=16)" | tee "$OUT/hakmem_p2_large_4t.log"
+timeout "$RUN_TIMEOUT" env LD_PRELOAD="$LIB_HAK" HAKMEM_WRAP_L25=1 HAKMEM_L25_PREF=remote HAKMEM_L25_RUN_FACTOR=4 \
+    HAKMEM_HDR_LIGHT=2 HAKMEM_L25_TC_SPILL=16 HAKMEM_SHARD_MIX=1 HAKMEM_TLS_LO_MAX=512 \
+    "$LARSON" "${LARSON_ARGS[@]}" 2>&1 | tee -a "$OUT/hakmem_p2_large_4t.log"
+
+cd - >/dev/null
+
+# grep instead of ripgrep so the summary does not depend on an optional tool;
+# -H/-n keep the file:line:text layout. Best-effort: no match is not an error.
+grep -Hn "Throughput" "$OUT"/*.log | tee "$OUT/summary.txt" || true
+echo "[DONE] Logs at $OUT"
--- a/benchmarks/scripts/utils/kill_bench.sh
+++ b/benchmarks/scripts/utils/kill_bench.sh
@ -0,0 +1,41 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Kill any lingering mimalloc-bench/larson runs and our bench runner scripts.
+# Usage: scripts/kill_bench.sh
+
+PATS=(
+  "mimalloc-bench/bench/larson/larson"
+  "scripts/run_bench_suite.sh"
+  "scripts/save_prof_sweep.sh"
+  "scripts/ab_sweep_mid.sh"
+)
+
+found=0
+for pat in "${PATS[@]}"; do
+  if pgrep -fa "$pat" >/dev/null 2>&1; then
+    echo "[kill_bench] Found processes for: $pat"
+    pgrep -fa "$pat" || true
+    found=1
+  fi
+done
+
+if [[ "$found" -eq 0 ]]; then
+  echo "[kill_bench] No matching bench processes found."
+  exit 0
+fi
+
+echo "[kill_bench] Sending SIGTERM..."
+for pat in "${PATS[@]}"; do
+  pgrep -f "$pat" >/dev/null 2>&1 && pkill -f "$pat" || true
+done
+
+sleep 1
+
+echo "[kill_bench] Forcing SIGKILL for leftovers..."
+for pat in "${PATS[@]}"; do
+  pgrep -f "$pat" >/dev/null 2>&1 && pkill -9 -f "$pat" || true
+done
+
+echo "[kill_bench] Done."
+
--- a/benchmarks/scripts/utils/lua_workload.lua
+++ b/benchmarks/scripts/utils/lua_workload.lua
@ -0,0 +1,25 @@
+-- lua_workload.lua - mixed string builder + table churn
+-- The iteration count N is read from the LUA_WORK_N environment variable and
+-- falls back to 500000 when that is unset or not a number.
+
+local N = tonumber(os.getenv("LUA_WORK_N")) or 500000
+
+-- String builder (amortized)
+-- Collects tostring(i) for every i, plus a "-" after every fifth i, and
+-- concatenates all pieces in a single table.concat call.
+local t = {}
+for i = 1, N do
+  t[#t+1] = tostring(i)
+  if (i % 5) == 0 then t[#t+1] = "-" end
+end
+local s = table.concat(t)
+
+-- Table churn (insert/remove)
+-- Fills arr[1..N] with i * 3, then visits every third index starting at 1,
+-- adding the value to sum and clearing the slot.
+local arr = {}
+for i = 1, N do
+  arr[i] = i * 3
+end
+local sum = 0
+for i = 1, N, 3 do
+  sum = sum + (arr[i] or 0)
+  arr[i] = nil
+end
+
+-- Print both results so the work above has an observable output.
+print("len(s)=", #s, " sum=", sum)
+
--- a/benchmarks/scripts/utils/parse_comprehensive_logs.py
+++ b/benchmarks/scripts/utils/parse_comprehensive_logs.py
@ -0,0 +1,54 @@
+#!/usr/bin/env python3
+import sys, os, re
+
+# Maps the test heading text searched for in each log line to the short test
+# name written to the CSV. 'Mixed Sizes' is skipped by the generic lookup in
+# parse_file and detected there through its own, more specific markers.
+TEST_MAP = {
+    'Sequential LIFO': 'lifo',
+    'Sequential FIFO': 'fifo',
+    'Random Order Free': 'random',
+    'Interleaved': 'interleave',
+    'Long-lived vs Short-lived': 'longshort',
+    'Mixed Sizes': 'mixed',
+}
+
+def parse_file(path, allocator):
+    size = None
+    cur_test = None
+    results = []
+    with open(path,'r',errors='ignore') as f:
+        for line in f:
+            m = re.search(r'^SIZE CLASS:\s*(\d+) Bytes', line)
+            if m:
+                size = int(m.group(1))
+                cur_test = None
+                continue
+            # Detect tests
+            for key, short in TEST_MAP.items():
+                if key != 'Mixed Sizes' and key in line:
+                    cur_test = short
+                    break
+            if 'Mixed Sizes ---' in line or 'Test 5: Mixed Sizes' in line:
+                size = 'mixed'
+                cur_test = 'mixed'
+            m2 = re.search(r'^Throughput:\s*([0-9.]+) M ops/sec', line)
+            if m2:
+                thr = float(m2.group(1))
+                results.append((allocator, size, cur_test, thr))
+    return results
+
+def main():
+    if len(sys.argv) != 2:
+        print('usage: parse_comprehensive_logs.py <dir>', file=sys.stderr)
+        sys.exit(1)
+    d = sys.argv[1]
+    out = []
+    for name, alloc in [('hakmem.log','hakmem'),('mimalloc.log','mimalloc'),('system.log','system')]:
+        p = os.path.join(d,name)
+        if os.path.exists(p):
+            out.extend(parse_file(p, alloc))
+    print('allocator,size,test,throughput_mops')
+    for rec in out:
+        print('{},{},{},{}'.format(rec[0], rec[1], rec[2], rec[3]))
+
+if __name__ == '__main__':
+    main()
+
--- a/benchmarks/scripts/utils/parse_mimalloc_logs.py
+++ b/benchmarks/scripts/utils/parse_mimalloc_logs.py
@ -0,0 +1,49 @@
+#!/usr/bin/env python3
+import sys, re, os, glob
+
+# Parse mimalloc-bench logs and print CSV: file,allocator,procs,test,throughput
+# Assumes lines like: "Throughput =   1234567 operations per second, ..."
+
+def infer_allocator_from_file(path):
+    fn = os.path.basename(path)
+    if fn.startswith('hakmem_'): return 'hakmem'
+    if fn.startswith('mimalloc_'): return 'mimalloc'
+    if fn.startswith('jemalloc_'): return 'jemalloc'
+    if fn.startswith('system_'): return 'system'
+    # fallback: try substring
+    for k in ('hakmem','mimalloc','jemalloc','system'):
+        if k in fn: return k
+    return 'unknown'
+
+def infer_procs_from_file(path):
+    m = re.search(r'procs=([0-9,]+)', os.path.basename(path))
+    return m.group(1) if m else ''
+
+def parse_file(path):
+    alloc = infer_allocator_from_file(path)
+    procs = infer_procs_from_file(path)
+    test = ''
+    with open(path,'r',errors='ignore') as f:
+        for line in f:
+            tl = line.strip()
+            m = re.search(r'^Test\s*:\s*(\S+)', tl)
+            if m: test = m.group(1)
+            m2 = re.search(r'Throughput\s*=\s*([0-9]+)\s+operations per second', tl)
+            if m2:
+                thr = int(m2.group(1))
+                yield (path, alloc, procs, test, thr)
+
+def main():
+    if len(sys.argv) != 2:
+        print(f"usage: {sys.argv[0]} <log_dir>")
+        sys.exit(1)
+    logdir = sys.argv[1]
+    files = sorted(glob.glob(os.path.join(logdir,'*.log')))
+    print('file,allocator,procs,test,throughput_ops_per_sec')
+    for fp in files:
+        for rec in parse_file(fp):
+            print(','.join(str(x) for x in rec))
+
+if __name__ == '__main__':
+    main()
+
--- a/benchmarks/scripts/utils/parse_usdt_stat.py
+++ b/benchmarks/scripts/utils/parse_usdt_stat.py
@ -0,0 +1,128 @@
+#!/usr/bin/env python3
+import sys
+import os
+import re
+import csv
+
+# Accept PMU events with or without user-only suffix (":u")
+# Keys are event names as they appear in the perf stat output; values are the
+# column names used in summary.csv.
+PMU_EVENTS = {
+    'cycles': 'cycles',
+    'cycles:u': 'cycles',
+    'instructions': 'instructions',
+    'instructions:u': 'instructions',
+    'L1-dcache-load-misses': 'l1_miss',
+    'L1-dcache-load-misses:u': 'l1_miss',
+    'branch-misses': 'br_miss',
+    'branch-misses:u': 'br_miss',
+}
+
+# USDT probe names (as given to perf) mapped to summary.csv column names.
+# Counts for the same column are summed by parse_stat_file.
+USDT_EVENTS = {
+    'sdt:hakmem:sll_pop': 'sll_pop',
+    'sdt:hakmem:mag_pop': 'mag_pop',
+    'sdt:hakmem:front_pop': 'front_pop',
+    'sdt:hakmem:bump_hit': 'bump_hit',
+    'sdt:hakmem:slow_alloc': 'slow_alloc',
+    'sdt:hakmem:sll_push': 'sll_push',
+    'sdt:hakmem:mag_push': 'mag_push',
+    'sdt:hakmem:spill_super': 'spill_super',
+    'sdt:hakmem:spill_tiny': 'spill_tiny',
+    'sdt:hakmem:remote_drain': 'remote_drain',
+    'sdt:hakmem:superslab_alloc': 'super_alloc',
+    'sdt:hakmem:superslab_fail': 'super_fail',
+    'sdt:hakmem:quick_pop': 'quick_pop',
+    'sdt:hakmem:quick_refill_sll': 'quick_refill_sll',
+    'sdt:hakmem:quick_refill_mag': 'quick_refill_mag',
+    'sdt:hakmem:bitmap_burst': 'bitmap_burst',
+    'sdt:hakmem:mag_refill': 'mag_refill',
+    'sdt:hakmem:bitmap_scan': 'bitmap_scan',
+}
+
+def parse_value(s):
+    s = s.strip()
+    # perf may print numbers with no separators in -x , mode; best-effort
+    try:
+        return int(s)
+    except ValueError:
+        # try float to int
+        try:
+            return int(float(s))
+        except Exception:
+            return None
+
+def parse_stat_file(path):
+    data = {}
+    with open(path, 'r', errors='ignore') as f:
+        for line in f:
+            parts = [p.strip() for p in line.strip().split(',')]
+            if len(parts) < 3:
+                continue
+            val = parse_value(parts[0])
+            event = parts[2]
+            if val is None:
+                continue
+            # Normalize PMU event key (strip optional ":u")
+            if not event.startswith('sdt:'):
+                base = event.split(':')[0]
+                if event not in PMU_EVENTS and base in PMU_EVENTS:
+                    event = base
+            if event in PMU_EVENTS:
+                data[PMU_EVENTS[event]] = val
+            elif event in USDT_EVENTS:
+                name = USDT_EVENTS[event]
+                data[name] = data.get(name, 0) + val
+            # else ignore
+    return data
+
+def main():
+    if len(sys.argv) != 2:
+        print("Usage: parse_usdt_stat.py <usdt_results_dir>")
+        sys.exit(1)
+    root = sys.argv[1]
+    rows = []
+    for fn in sorted(os.listdir(root)):
+        if not fn.endswith('.stat.csv'):
+            continue
+        m = re.match(r'(?P<alloc>hakmem|system)_s(?P<size>\d+)_b(?P<batch>\d+)_c(?P<cycles>\d+)\.stat\.csv', fn)
+        if not m:
+            continue
+        meta = m.groupdict()
+        path = os.path.join(root, fn)
+        stats = parse_stat_file(path)
+        row = {
+            'allocator': meta['alloc'],
+            'size': int(meta['size']),
+            'batch': int(meta['batch']),
+            'cycles_param': int(meta['cycles']),
+        }
+        row.update(stats)
+        # derived
+        total_pops = sum(row.get(k, 0) for k in ('sll_pop','mag_pop','front_pop'))
+        if total_pops > 0:
+            row['front_rate'] = row.get('front_pop',0)/total_pops
+            row['sll_rate'] = row.get('sll_pop',0)/total_pops
+            row['mag_rate'] = row.get('mag_pop',0)/total_pops
+        else:
+            row['front_rate'] = row['sll_rate'] = row['mag_rate'] = 0.0
+        rows.append(row)
+
+    # sort for readability
+    rows.sort(key=lambda r: (r['allocator'], r['size'], r['batch']))
+    out = os.path.join(root, 'summary.csv')
+    # collect headers
+    headers = ['allocator','size','batch','cycles_param'] + list(PMU_EVENTS.values()) + list(USDT_EVENTS.values()) + ['front_rate','sll_rate','mag_rate']
+    # remove duplicates but keep order
+    seen = set()
+    hdr_final = []
+    for h in headers:
+        if h not in seen:
+            hdr_final.append(h)
+            seen.add(h)
+    with open(out, 'w', newline='') as f:
+        w = csv.DictWriter(f, fieldnames=hdr_final)
+        w.writeheader()
+        for r in rows:
+            w.writerow(r)
+    print(out)
+
+if __name__ == '__main__':
+    main()
--- a/benchmarks/scripts/utils/run_benchfast_sweep.sh
+++ b/benchmarks/scripts/utils/run_benchfast_sweep.sh
@ -0,0 +1,37 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Sweep bench-fastpath refill sizes (8/12/16) and run Tiny-Hot triad each.
+# Usage: scripts/run_benchfast_sweep.sh [cycles]
+#   cycles  forwarded to the triad runner for every variant (default: 60000)
+
+# Relative paths below are resolved against the parent of this script's
+# directory.
+ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+cd "$ROOT_DIR"
+
+cycles=${1:-60000}
+
+echo "[build] system/mimalloc (fast + mi)"
+make -s bench_fast bench_tiny_hot_mi >/dev/null
+
+# One timestamped directory collects the per-variant CSV copies.
+TS=$(date +%Y%m%d_%H%M%S)
+OUTDIR="bench_results/tiny_benchfast_sweep_${TS}"
+mkdir -p "$OUTDIR"
+
+run_case() {
+  local tag="$1"; shift
+  echo "[build] HAKMEM bench-fastpath (${tag})"
+  make -s "bench_fastpath_${tag}" >/dev/null
+  echo "[run] triad (${tag})"
+  SKIP_BUILD=1 bash scripts/run_tiny_hot_triad.sh "$cycles"
+  # pick the latest triad CSV and copy with tag
+  local latest_csv
+  latest_csv=$(ls -1dt bench_results/tiny_hot_triad_* | head -1)/results.csv
+  cp "$latest_csv" "$OUTDIR/results_${tag}.csv"
+  echo "[saved] $OUTDIR/results_${tag}.csv"
+}
+
+run_case r8
+run_case r12
+run_case r16
+
+echo "[done] sweep outputs in: $OUTDIR"
+
--- a/benchmarks/scripts/utils/run_larson.sh
+++ b/benchmarks/scripts/utils/run_larson.sh
@ -0,0 +1,170 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Reproducible larson runner for hakmem/system/mimalloc.
+#
+# Usage:
+#   scripts/run_larson.sh [options] [runtime_sec] [threads_csv]
+# Examples:
+#   scripts/run_larson.sh                 # default: 10s, threads=1,4
+#   scripts/run_larson.sh 10 1            # 10s, 1 thread
+#
+# Optional env vars:
+#   HAKMEM_WRAP_TINY=0|1
+#   HAKMEM_WRAP_TINY_REFILL=0|1
+#   HAKMEM_TINY_MAG_CAP=INT
+#   HAKMEM_SAFE_FREE=0|1
+#   HAKMEM_EVO_SAMPLE=INT  (0 disables evo recording; default 0)
+#   MIMALLOC_SO=/path/to/libmimalloc.so.2 (optional; if not set, auto-detect)
+
+# Print the command-line help text to stdout.
+usage() {
+  printf '%s\n' \
+    'Usage: scripts/run_larson.sh [options] [runtime_sec] [threads_csv]' \
+    '' \
+    'Options:' \
+    '  -d SECONDS     Runtime seconds (default: 10)' \
+    '  -t CSV         Threads CSV, e.g. 1,4 (default: 1,4)' \
+    '  -c NUM         Chunks per thread (default: 10000)' \
+    '  -r NUM         Rounds (default: 1)' \
+    '  -m BYTES       Min size (default: 8)' \
+    '  -M BYTES       Max size (default: 1024)' \
+    '  -s SEED        Random seed (default: 12345)' \
+    '  -p PRESET      Preset: burst|loop (sets -c/-r)' \
+    '  -w             Include WRAP_TINY=1 runs (default: off)' \
+    '  -h             Show this help' \
+    '' \
+    'Env overrides (alternative to flags):' \
+    '  MIN, MAX, CHUNK_PER_THREAD, ROUNDS, SEED' \
+    '  HAKMEM_* toggles per README'
+}
+
+# Defaults
+RUNTIME="10"
+THREADS_ARG="1,4"
+
+# Workload defaults (burst preset)
+# Each value can be preset from the environment and overridden again by the
+# matching command-line flag below.
+MIN="${MIN:-8}"
+MAX="${MAX:-1024}"
+CHUNK_PER_THREAD="${CHUNK_PER_THREAD:-10000}"
+ROUNDS="${ROUNDS:-1}"
+SEED="${SEED:-12345}"
+
+PRESET=""
+
+INCLUDE_WRAP=0
+# The leading ':' in the optstring selects silent error reporting, so a
+# missing option argument arrives as ':' and an unknown option falls to '*'.
+while getopts ":d:t:c:r:m:M:s:p:wh" opt; do
+  case $opt in
+    d) RUNTIME="$OPTARG" ;;
+    t) THREADS_ARG="$OPTARG" ;;
+    c) CHUNK_PER_THREAD="$OPTARG" ;;
+    r) ROUNDS="$OPTARG" ;;
+    m) MIN="$OPTARG" ;;
+    M) MAX="$OPTARG" ;;
+    s) SEED="$OPTARG" ;;
+    p) PRESET="$OPTARG" ;;
+    w) INCLUDE_WRAP=1 ;;
+    h) usage; exit 0 ;;
+    :) echo "Missing argument for -$OPTARG" >&2; usage; exit 2 ;;
+    *) usage; exit 2 ;;
+  esac
+done
+shift $((OPTIND-1))
+
+# Backward-compatible positional args
+# These are applied after the flags, so they win over -d / -t.
+if [[ $# -ge 1 ]]; then RUNTIME="$1"; fi
+if [[ $# -ge 2 ]]; then THREADS_ARG="$2"; fi
+
+# A preset is applied after option parsing and therefore replaces any -c / -r
+# (or CHUNK_PER_THREAD / ROUNDS) value given alongside it.
+case "$PRESET" in
+  burst|BURST)
+    CHUNK_PER_THREAD=10000; ROUNDS=1 ;;
+  loop|LOOP)
+    CHUNK_PER_THREAD=100; ROUNDS=100 ;;
+  "" ) : ;;
+  *) echo "Unknown preset: $PRESET" >&2; exit 2 ;;
+esac
+
+# Params matching our standard runs (larson reads: runtime, min, max, chunks/thread, rounds, seed, threads)
+# Show resolved parameters for reproducibility
+echo "[CFG] runtime=${RUNTIME}s threads={${THREADS_ARG}} min=${MIN} max=${MAX} chunks/thread=${CHUNK_PER_THREAD} rounds=${ROUNDS} seed=${SEED}"
+
+# Paths are derived from the parent of this script's directory.
+ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
+LIB_HAKMEM="$ROOT_DIR/libhakmem.so"
+LARSON_BIN="$ROOT_DIR/mimalloc-bench/bench/larson/larson"
+
+if [[ ! -x "$LARSON_BIN" ]]; then
+  echo "[ERR] Larson binary not found at: $LARSON_BIN" >&2
+  echo "      Did you sync submodule/build bench?" >&2
+  exit 1
+fi
+
+# Build the shared library on demand; the subshell keeps the caller's working
+# directory unchanged.
+if [[ ! -f "$LIB_HAKMEM" ]]; then
+  echo "[INFO] libhakmem.so not found; building..."
+  (cd "$ROOT_DIR" && make -j4 shared >/dev/null)
+fi
+
+# Resolved path of the library, used as the LD_PRELOAD value for hakmem runs.
+abs_hakmem="$(readlink -f "$LIB_HAKMEM")"
+
+detect_mimalloc() {
+  if [[ -n "${MIMALLOC_SO:-}" && -f "$MIMALLOC_SO" ]]; then
+    echo "$MIMALLOC_SO"
+    return 0
+  fi
+  # try common paths or ldconfig
+  for p in \
+    /usr/lib/x86_64-linux-gnu/libmimalloc.so.2 \
+    /lib/x86_64-linux-gnu/libmimalloc.so.2; do
+    [[ -f "$p" ]] && { echo "$p"; return 0; }
+  done
+  if command -v ldconfig >/dev/null 2>&1; then
+    so="$(ldconfig -p | awk '/libmimalloc.so/ {print $4; exit}')"
+    [[ -n "$so" && -f "$so" ]] && { echo "$so"; return 0; }
+  fi
+  return 1
+}
+
+# Path of the mimalloc library to preload, or empty when none was found.
+mimalloc_so=""
+mimalloc_so=$(detect_mimalloc) || mimalloc_so=""
+
+run_case() {
+  local label="$1"; shift
+  local preload="$1"; shift
+  local threads="$1"; shift
+
+  echo "\n== $label | ${threads}T | ${RUNTIME}s =="
+  if [[ -n "$preload" ]]; then
+    env LD_PRELOAD="$preload" "$LARSON_BIN" "$RUNTIME" "$MIN" "$MAX" "$CHUNK_PER_THREAD" "$ROUNDS" "$SEED" "$threads" 2>&1 | tail -n 3
+  else
+    "$LARSON_BIN" "$RUNTIME" "$MIN" "$MAX" "$CHUNK_PER_THREAD" "$ROUNDS" "$SEED" "$threads" 2>&1 | tail -n 3
+  fi
+}
+
+IFS=',' read -r -a THREADS <<< "$THREADS_ARG"
+
+for t in "${THREADS[@]}"; do
+  # system malloc
+  run_case "system malloc" "" "$t"
+
+  # mimalloc (optional)
+  if [[ -n "$mimalloc_so" ]]; then
+    run_case "mimalloc" "$mimalloc_so" "$t"
+  else
+    echo "\n== mimalloc | ${t}T | ${RUNTIME}s =="
+    echo "[SKIP] libmimalloc not found"
+  fi
+
+  # hakmem default
+  run_case "hakmem (default)" "$abs_hakmem" "$t"
+
+  # hakmem wrap tiny (optional)
+  if [[ "$INCLUDE_WRAP" -eq 1 ]]; then
+    echo "\n== hakmem (HAKMEM_WRAP_TINY=1) | ${t}T | ${RUNTIME}s =="
+    HAKMEM_WRAP_TINY=1 LD_PRELOAD="$abs_hakmem" "$LARSON_BIN" "$RUNTIME" "$MIN" "$MAX" "$CHUNK_PER_THREAD" "$ROUNDS" "$SEED" "$t" 2>&1 | tail -n 3
+  fi
+done
+
+echo "\nDone."
--- a/benchmarks/scripts/utils/run_memory_efficiency.sh
+++ b/benchmarks/scripts/utils/run_memory_efficiency.sh
@ -0,0 +1,79 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Compare memory efficiency (Max RSS) between HAKMEM and System on tiny-hot bench.
+# - Runs selected sizes/batches with /usr/bin/time -v and parses Maximum resident set size (KB).
+# - Optionally toggles HAKMEM_TINY_FLUSH_ON_EXIT to evaluate exit-time trimming.
+# Output: bench_results/memory_eff_YYYYMMDD_HHMMSS/results.csv
+# Usage: scripts/run_memory_efficiency.sh [cycles]
+#   cycles  passed to every benchmark invocation (default: 60000)
+
+# Relative paths below are resolved against the parent of this script's
+# directory.
+ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+cd "$ROOT_DIR"
+
+cycles=${1:-60000}
+
+# The external time binary is required: run_case relies on its -v report.
+if [[ ! -x /usr/bin/time ]]; then
+  echo "[error] /usr/bin/time not found. Install 'time' package." >&2
+  exit 1
+fi
+
+echo "[build] perf_main benches (no bench-only macros)"
+make -s perf_main >/dev/null
+
+# Each invocation writes into a fresh timestamped directory; the CSV is
+# created with its header row here and appended to by run_case.
+TS=$(date +%Y%m%d_%H%M%S)
+OUTDIR="bench_results/memory_eff_${TS}"
+mkdir -p "$OUTDIR"
+CSV="$OUTDIR/results.csv"
+echo "allocator,size,batch,cycles,flush_on_exit,max_rss_kb,elapsed_ms" > "$CSV"
+
+# Sweep matrix: every size is combined with every batch value.
+sizes=(32 64 128)
+batches=(100)
+
+run_case() {
+  local alloc="$1"; shift
+  local size="$1"; shift
+  local batch="$1"; shift
+  local cyc="$1"; shift
+  local flush="$1"; shift
+  local bin
+  if [[ "$alloc" == "hakmem" ]]; then bin=./bench_tiny_hot_hakmem; else bin=./bench_tiny_hot_system; fi
+  local tmp_log="$OUTDIR/tmp_${alloc}_${size}_${batch}_${cyc}_${flush}.log"
+  local tmp_out="$OUTDIR/tmp_${alloc}_${size}_${batch}_${cyc}_${flush}.out"
+  if [[ "$alloc" == "hakmem" ]]; then
+    HAKMEM_TINY_FLUSH_ON_EXIT="$flush" /usr/bin/time -v "$bin" "$size" "$batch" "$cyc" >"$tmp_out" 2>"$tmp_log" || true
+  else
+    /usr/bin/time -v "$bin" "$size" "$batch" "$cyc" >"$tmp_out" 2>"$tmp_log" || true
+  fi
+  local rss=$(sed -n 's/^\s*Maximum resident set size (kbytes): \([0-9]\+\).*/\1/p' "$tmp_log" | tail -1)
+  local elapsed=$(sed -n 's/^\s*Elapsed (wall clock) time (h:mm:ss or m:ss): \(.*\)/\1/p' "$tmp_log" | tail -1)
+  # convert elapsed to ms (best-effort; handles m:ss or h:mm:ss)
+  local ms=0
+  if [[ -n "$elapsed" ]]; then
+    local e1="" e2="" e3=""
+    IFS=: read -r e1 e2 e3 <<<"$elapsed" || true
+    if [[ -n "$e3" ]]; then
+      # h:m:s
+      ms=$(( (10#${e1}*3600 + 10#${e2}*60) * 1000 ))
+      ms=$(( ms + (10#${e3%.*})*1000 ))
+    else
+      # m:s
+      ms=$(( (10#${e1}*60) * 1000 ))
+      ms=$(( ms + (10#${e2%.*})*1000 ))
+    fi
+  fi
+  echo "$alloc,$size,$batch,$cyc,$flush,${rss:-},${ms:-}" >> "$CSV"
+}
+
+# For each (size, batch) pair measure system once and HAKMEM twice, with the
+# flush-on-exit flag off and on.
+for s in "${sizes[@]}"; do
+  for b in "${batches[@]}"; do
+    echo "[run] SYSTEM size=$s batch=$b cycles=$cycles"
+    run_case system "$s" "$b" "$cycles" 0
+    echo "[run] HAKMEM (flush=0) size=$s batch=$b cycles=$cycles"
+    run_case hakmem "$s" "$b" "$cycles" 0
+    echo "[run] HAKMEM (flush=1) size=$s batch=$b cycles=$cycles"
+    run_case hakmem "$s" "$b" "$cycles" 1
+  done
+done
+
+echo "[done] CSV: $CSV"
+# Preview at most the first 40 lines of the CSV.
+sed -n '1,40p' "$CSV" || true
--- a/benchmarks/scripts/utils/run_ultra_debug_sweep.sh
+++ b/benchmarks/scripts/utils/run_ultra_debug_sweep.sh
@ -0,0 +1,61 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Sweep Ultra Tiny (SLL-only) with debug counters and output CSV
+# Usage: scripts/run_ultra_debug_sweep.sh [cycles] [batch]
+#   cycles  passed to every benchmark invocation (default: 60000)
+#   batch   passed to every benchmark invocation (default: 200)
+
+# Relative paths below are resolved against the parent of this script's
+# directory.
+ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+cd "$ROOT_DIR"
+
+cycles=${1:-60000}
+batch=${2:-200}
+
+make -s bench_fast >/dev/null
+
+# Each invocation writes into a fresh timestamped directory; the CSV is
+# created with its header row here.
+TS=$(date +%Y%m%d_%H%M%S)
+OUTDIR="bench_results/ultra_debug_${TS}"
+mkdir -p "$OUTDIR"
+CSV="$OUTDIR/results.csv"
+echo "size,batch,cycles,throughput_mops,class,pop_hits,refills,resets,sll_count" > "$CSV"
+
+# Allocation sizes swept; each must be known to size_to_class.
+sizes=(16 32 64)
+
+# Print the class index used for an allocation size in the debug output:
+# 8->0, 16->1, 32->2, 64->3, 128->4, anything else -> -1.
+# Arguments: $1 size in bytes.
+size_to_class() {
+  local cls=-1
+  case "$1" in
+    8)   cls=0 ;;
+    16)  cls=1 ;;
+    32)  cls=2 ;;
+    64)  cls=3 ;;
+    128) cls=4 ;;
+  esac
+  echo "$cls"
+}
+
+for s in "${sizes[@]}"; do
+  cls=$(size_to_class "$s")
+  log="$OUTDIR/ultra_${s}_b=${batch}_c=${cycles}.log"
+  # Run with Ultra + debug; capture stdout+stderr in one file
+  HAKMEM_TINY_ULTRA=1 HAKMEM_TINY_ULTRA_DEBUG=1 HAKMEM_TINY_MAG_CAP=128 \
+    ./bench_tiny_hot_hakmem "$s" "$batch" "$cycles" >"$log" 2>&1 || true
+
+  thr=$(sed -n 's/^Throughput: \([0-9.][0-9.]*\) M ops.*/\1/p' "$log" | tail -n1)
+  # Extract Ultra debug block
+  start=$(grep -n '^\[Ultra Tiny Debug\]' "$log" | tail -n1 | cut -d: -f1)
+  if [[ -n "$start" ]]; then
+    # header is the next line; data follows
+    data_start=$((start+2))
+    # take next 8 lines (classes 0..7)
+    sed -n "${data_start},$((data_start+7))p" "$log" > "$OUTDIR/tmp_ultra.txt" || true
+    # pick the line for target class
+    line=$(awk -F',' -v k="$cls" '($1==k){print $0}' "$OUTDIR/tmp_ultra.txt" | tail -n1)
+    if [[ -n "$line" ]]; then
+      # line format: class,pop_hits,refills,resets,sll_count
+      IFS=',' read -r c ph rf rs sc <<<"$line"
+      echo "$s,$batch,$cycles,${thr:-},$c,$ph,$rf,$rs,$sc" >> "$CSV"
+    fi
+  fi
+done
+
+echo "[done] CSV: $CSV"
+sed -n '1,20p' "$CSV" || true
+
--- a/benchmarks/scripts/utils/run_usdt_overview.sh
+++ b/benchmarks/scripts/utils/run_usdt_overview.sh
@ -0,0 +1,101 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Build with USDT tracepoints and run perf stat for USDT events + PMU on tiny_hot + mixed
+# Usage: scripts/run_usdt_overview.sh [cycles]
+#   cycles      passed to every tiny-hot benchmark invocation (default: 50000)
+# Env knobs:
+#   PERF_BIN    perf executable to use (default: perf)
+#   SKIP_BUILD  set to 1 to skip the clean + rebuild step
+
+# Relative paths below are resolved against the parent of this script's
+# directory.
+ROOT_DIR=$(cd "$(dirname "$0")/.." && pwd)
+# Allow overriding perf binary (e.g., WSL generic tools). Usage:
+#   PERF_BIN=/usr/lib/linux-tools-6.8.0-86/perf bash scripts/run_usdt_overview.sh 40000
+PERF_BIN=${PERF_BIN:-perf}
+cd "$ROOT_DIR"
+
+cycles=${1:-50000}
+
+if [[ "${SKIP_BUILD:-0}" != "1" ]]; then
+  echo "[build] USDT-enabled benches"
+  # A failing clean is tolerated; the build itself must succeed.
+  make -s clean >/dev/null 2>&1 || true
+  make -s bench_fast CFLAGS+=" -DHAKMEM_USDT=1" >/dev/null
+else
+  echo "[build] skipped (SKIP_BUILD=1)"
+fi
+
+# Each invocation writes into a fresh timestamped directory.
+TS=$(date +%Y%m%d_%H%M%S)
+OUTDIR="bench_results/usdt_${TS}"
+mkdir -p "$OUTDIR"
+
+# USDT probes requested from perf when the availability probe below succeeds.
+EVENTS_USDT=(
+  sdt:hakmem:sll_pop
+  sdt:hakmem:mag_pop
+  sdt:hakmem:front_pop
+  sdt:hakmem:bump_hit
+  sdt:hakmem:slow_alloc
+  sdt:hakmem:sll_push
+  sdt:hakmem:mag_push
+  sdt:hakmem:spill_super
+  sdt:hakmem:spill_tiny
+  sdt:hakmem:remote_drain
+  sdt:hakmem:superslab_alloc
+  sdt:hakmem:superslab_fail
+)
+# Note: this is a single array element that already holds a comma-separated
+# list, so joining it later leaves it unchanged.
+EVENTS_PMU=(cycles,instructions,L1-dcache-load-misses,branch-misses)
+
+# Print all arguments joined by commas, as a single line.
+join_events() {
+  # A function-local IFS changes the separator used by "$*" here only.
+  local IFS=','
+  local joined="$*"
+  echo "$joined"
+}
+
+PMU_JOINED=$(join_events "${EVENTS_PMU[@]}" )
+
+# Detect USDT availability by actually probing a dummy run
+USDT_JOINED=""
+{
+  "$PERF_BIN" stat -x , -e sdt:hakmem:front_pop true 1>/dev/null 2>"$OUTDIR/.usdt_probe.err"
+} || true
+if rg -q "unknown tracepoint" "$OUTDIR/.usdt_probe.err"; then
+  echo "[warn] perf does not support 'sdt:' on this system (unknown tracepoint). Using PMU-only." | tee -a "$OUTDIR/summary.txt"
+  echo "[hint] Install perf matching your kernel: sudo apt-get install linux-tools-\$(uname -r)" | tee -a "$OUTDIR/summary.txt"
+  echo "[hint] Kernel must have UPROBE/SDT support (CONFIG_UPROBE_EVENTS)." | tee -a "$OUTDIR/summary.txt"
+elif rg -q "can't access trace events|No permissions" "$OUTDIR/.usdt_probe.err"; then
+  echo "[warn] USDT blocked by tracefs perms; falling back to PMU-only." | tee -a "$OUTDIR/summary.txt"
+  echo "[hint] Try: sudo mount -t tracefs -o mode=755 nodev /sys/kernel/tracing" | tee -a "$OUTDIR/summary.txt"
+  echo "[hint] And: sudo sysctl kernel.perf_event_paranoid=1" | tee -a "$OUTDIR/summary.txt"
+else
+  # Looks good; enable USDT events
+  USDT_JOINED=$(join_events "${EVENTS_USDT[@]}")
+fi
+
+# Basic environment info for troubleshooting
+{
+  echo "[env] perf=$($PERF_BIN --version 2>/dev/null | head -n1)";
+  echo "[env] kernel=$(uname -r)";
+  echo "[env] tracefs=$(ls -ld /sys/kernel/tracing 2>/dev/null || true)";
+} | tee -a "$OUTDIR/summary.txt"
+
+run_perf() {
+  local tag="$1"; shift
+  local bin="$1"; shift
+  local size="$1"; shift
+  local batch="$1"; shift
+  local cyc="$1"; shift
+  local log="$OUTDIR/${tag}_s${size}_b${batch}_c${cyc}.stat.csv"
+  echo "[perf] $tag size=$size batch=$batch cycles=$cyc" | tee -a "$OUTDIR/summary.txt"
+  if [[ -n "$USDT_JOINED" ]]; then
+    "$PERF_BIN" stat -x , -e "$USDT_JOINED","$PMU_JOINED" "$bin" "$size" "$batch" "$cyc" 1>/dev/null 2>"$log" || true
+  else
+    "$PERF_BIN" stat -x , -e "$PMU_JOINED" "$bin" "$size" "$batch" "$cyc" 1>/dev/null 2>"$log" || true
+  fi
+}
+
+# Tiny-hot focus (8/16/32/64)
+# HAKMEM_QUIET=1 is set for the duration of each run_perf call.
+for s in 8 16 32 64; do
+  for b in 100; do
+    HAKMEM_QUIET=1 run_perf "hakmem" ./bench_tiny_hot_hakmem "$s" "$b" "$cycles"
+    HAKMEM_QUIET=1 run_perf "system" ./bench_tiny_hot_system "$s" "$b" "$cycles"
+  done
+done
+
+# Random mixed overview
+# Best-effort: its stdout is discarded and a failure does not abort this script.
+bash scripts/run_random_mixed_matrix.sh 80000 >/dev/null || true
+
+echo "[done] USDT overview: $OUTDIR"
+# List at most the first 20 entries of the output directory.
+ls -1 "$OUTDIR" | sed -n '1,20p'
--- a/benchmarks/scripts/utils/save_prof_sweep.sh
+++ b/benchmarks/scripts/utils/save_prof_sweep.sh
@ -0,0 +1,63 @@
+#!/usr/bin/env bash
+set -euo pipefail
+
+# Save a short profiler sweep into docs/benchmarks/<YYYYMMDD_HHMMSS>/
+# Usage: scripts/save_prof_sweep.sh [-d SEC] [-t CSV] [-s N]
+
+RUNTIME=2
+THREADS="1,4"
+SAMPLE_N=8
+BENCH_TIMEOUT=""
+KILL_GRACE=${KILL_GRACE:-2}
+
+while getopts ":d:t:s:h" opt; do
+  case $opt in
+    d) RUNTIME="$OPTARG" ;;
+    t) THREADS="$OPTARG" ;;
+    s) SAMPLE_N="$OPTARG" ;;
+    h) echo "Usage: $0 [-d SEC] [-t CSV] [-s N]"; exit 0 ;;
+    :) echo "Missing arg -$OPTARG"; exit 2 ;;
+    *) echo "Usage: $0 [-d SEC] [-t CSV] [-s N]"; exit 2 ;;
+  esac
+done
+
+ROOT_DIR="$(cd "$(dirname "$0")/.." && pwd)"
+OUTDIR="$ROOT_DIR/docs/benchmarks/$(date +%Y%m%d_%H%M%S)"
+mkdir -p "$OUTDIR"
+
+LIB="$(readlink -f "$ROOT_DIR/libhakmem.so" || true)"
+LARSON="$ROOT_DIR/mimalloc-bench/bench/larson/larson"
+
+if [[ -z "${BENCH_TIMEOUT}" ]]; then
+  BENCH_TIMEOUT=$(( RUNTIME + 3 ))
+fi
+
+echo "Saving sweep into: $OUTDIR" | tee "$OUTDIR/summary.txt"
+echo "RUNTIME=$RUNTIME THREADS=$THREADS SAMPLE=1/$((1<<SAMPLE_N)) TIMEOUT=${BENCH_TIMEOUT}s" | tee -a "$OUTDIR/summary.txt"
+
+declare -a RUNS=(
+  "tiny 8 1024"
+  "mid 2048 32768"
+  "gap 33000 65536"
+  "large 65536 1048576"
+)
+
+IFS=',' read -r -a TARR <<< "$THREADS"
+
+for r in "${RUNS[@]}"; do
+  read -r name rmin rmax <<< "$r"
+  for t in "${TARR[@]}"; do
+    label="${name}_T${t}_${rmin}-${rmax}"
+    echo "== $label ==" | tee -a "$OUTDIR/summary.txt"
+    if [[ -f "$LARSON" && -f "$ROOT_DIR/libhakmem.so" ]]; then
+      timeout -k "${KILL_GRACE}s" "${BENCH_TIMEOUT}s" \
+        env HAKMEM_PROF=1 HAKMEM_PROF_SAMPLE="$SAMPLE_N" \
+        LD_PRELOAD="$LIB" "$LARSON" "$RUNTIME" "$rmin" "$rmax" 10000 1 12345 "$t" 2>&1 \
+        | tee "$OUTDIR/${label}.log" | tail -n 80 | tee -a "$OUTDIR/summary.txt"
+    else
+      echo "Skip: missing larson or libhakmem.so" | tee -a "$OUTDIR/summary.txt"
+    fi
+  done
+done
+
+echo "Done. See $OUTDIR" | tee -a "$OUTDIR/summary.txt"
--- a/benchmarks/scripts/utils/sqlite_workload.sql
+++ b/benchmarks/scripts/utils/sqlite_workload.sql
@ -0,0 +1,32 @@
+-- Small SQLite workload: create one table, bulk-insert rows, then run a few
+-- reads, point lookups and an update. The three PRAGMAs below turn off the
+-- rollback journal and fsync-style syncing and keep temporary storage in
+-- memory, so the run is not dominated by disk I/O.
+PRAGMA journal_mode = OFF;
+PRAGMA synchronous = OFF;
+PRAGMA temp_store = MEMORY;
+
+-- schema
+-- id is never supplied by the INSERT below, so SQLite assigns it automatically.
+CREATE TABLE t (
+  id INTEGER PRIMARY KEY,
+  s  TEXT
+);
+
+-- bulk insert via recursive CTE (~50k rows)
+-- The LIMIT is what terminates the recursion: cnt yields x = 1..50000.
+WITH RECURSIVE cnt(x) AS (
+  SELECT 1
+  UNION ALL
+  SELECT x+1 FROM cnt LIMIT 50000
+)
+INSERT INTO t(s)
+SELECT printf('str-%d-%d', x, x*x) FROM cnt;
+
+-- simple read queries
+SELECT COUNT(*) FROM t;
+SELECT SUM(LENGTH(s)) FROM t;
+
+-- point lookups
+SELECT s FROM t WHERE id IN (1, 100, 1000, 10000, 40000);
+
+-- update a slice
+-- Appends '-x' to every row whose id is a multiple of 50.
+UPDATE t SET s = s || '-x' WHERE (id % 50) = 0;
+
+-- final check
+-- Counts rows whose string ends in '-x'. The inserted strings end in digits,
+-- so this should equal the number of rows the UPDATE touched.
+SELECT COUNT(*) FROM t WHERE s LIKE '%-x';
+