Files
hakorune/tools/perf/microbench.sh
nyash-codex 3d082ca131 fix(perf): propagate NYASH_AOT_NUMERIC_CORE env vars to hakorune_emit_mir.sh
## Problem
Phase 25 numeric_core transformation wasn't working in microbench.sh:
- NYASH_AOT_NUMERIC_CORE=1 was set by user externally
- But wasn't propagated to hakorune_emit_mir.sh
- Result: BoxCall(mul_naive) remained instead of Call("NyNumericMatI64.mul_naive")

## Solution
Add explicit env var propagation in microbench.sh (line 933-934):
```bash
NYASH_AOT_NUMERIC_CORE="${NYASH_AOT_NUMERIC_CORE:-0}" \
NYASH_AOT_NUMERIC_CORE_TRACE="${NYASH_AOT_NUMERIC_CORE_TRACE:-0}" \
```

This ensures user-set NYASH_AOT_NUMERIC_CORE is passed through to:
  hakorune_emit_mir.sh → Provider → AotPrep → numeric_core.hako

## Verification
Tested with:
```bash
NYASH_AOT_NUMERIC_CORE=1 tools/perf/microbench.sh --case matmul_core --backend llvm --exe --runs 1 --n 4
```

Now transformation works correctly (pending numeric_core phi propagation fix).

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude <noreply@anthropic.com>
2025-11-15 01:11:21 +09:00

1023 lines
30 KiB
Bash
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

#!/usr/bin/env bash
# Strict mode: abort on errors, on unset variables, and on failures anywhere in a pipeline.
set -e -u -o pipefail
# Absolute path of the directory that contains this script.
SCRIPT_DIR=$(cd "$(dirname "$0")" && pwd)
# Two directory levels above the script directory.
ROOT=$(cd "${SCRIPT_DIR}/../.." && pwd)
# Path of the release-built hakorune executable.
BIN="${ROOT}/target/release/hakorune"
# Print the one-line usage summary on stdout.
usage() { echo "Usage: $0 --case {loop|strlen|box|branch|call|stringchain|arraymap|chip8|kilo|sieve|matmul|matmul_core|linidx|maplin} [--n N] [--runs R] [--backend {llvm|vm}] [--exe] [--budget-ms B]"; }

# Defaults; each one can be overridden by the matching command-line flag.
CASE="loop"; N=5000000; RUNS=5; BACKEND="llvm"; EXE_MODE=0; BUDGET_MS=0

# Report a command-line error on stderr, show the usage line, and exit 2.
# Arguments: $1 - message
arg_error() {
  echo "$1" >&2
  usage >&2
  exit 2
}

# Parse command-line flags into CASE, N, RUNS, BACKEND, EXE_MODE and BUDGET_MS.
# Globals:   CASE, N, RUNS, BACKEND, EXE_MODE, BUDGET_MS (written)
# Arguments: the script's positional parameters
# Exits:     0 after --help; 2 on an unknown flag, a flag without its value,
#            or a non-integer / out-of-range numeric value.
parse_args() {
  # Decimal integer without leading zeros (a leading zero would be read as
  # octal by shell arithmetic and by the generated C source).
  local uint_re='^(0|[1-9][0-9]*)$'
  while [[ $# -gt 0 ]]; do
    case "$1" in
      --case|--n|--runs|--backend|--budget-ms)
        # Every one of these flags consumes the following argument.
        if [[ $# -lt 2 ]]; then arg_error "Missing value for $1"; fi
        case "$1" in
          --case) CASE="$2";;
          --n)
            # N is spliced into sed expressions and generated source code.
            [[ "$2" =~ $uint_re ]] || arg_error "--n expects a non-negative integer (got: $2)"
            N="$2";;
          --runs)
            # RUNS is used as a divisor when averaging, so it must be >= 1.
            [[ "$2" =~ $uint_re && "$2" != "0" ]] || arg_error "--runs expects a positive integer (got: $2)"
            RUNS="$2";;
          --backend) BACKEND="$2";;
          --budget-ms)
            # BUDGET_MS is compared numerically against the elapsed time.
            [[ "$2" =~ $uint_re ]] || arg_error "--budget-ms expects a non-negative integer (got: $2)"
            BUDGET_MS="$2";;
        esac
        shift 2;;
      --exe) EXE_MODE=1; shift 1;;
      --help|-h) usage; exit 0;;
      *) echo "Unknown arg: $1"; usage; exit 2;;
    esac
  done
}
parse_args "$@"
# Refuse to continue when the hakorune binary is missing or not executable.
[[ -x "$BIN" ]] || { echo "[FAIL] hakorune not built: $BIN" >&2; exit 2; }
# Helpers: build once, then reuse
# Build the ny-llvmc compiler if its release binary is not present yet.
# Globals: ROOT (read)
# Returns: always 0 - the build is best-effort and a cargo failure is
#          deliberately swallowed here.
ensure_llvmc() {
  if [[ ! -x "$ROOT/target/release/ny-llvmc" ]]; then
    # Run cargo from $ROOT (presumably the Cargo workspace root, since target/
    # lives there) so that "-p nyash-llvm-compiler" resolves regardless of the
    # directory the script was started from.
    (cd "$ROOT" && cargo build -q --release -p nyash-llvm-compiler >/dev/null 2>&1) || true
  fi
}
# Build the nyash_kernel runtime unless a built artifact is already present.
# Globals: ROOT (read)
# Returns: always 0 - a failing cargo build is swallowed.
ensure_nyrt() {
  local lib_dir="$ROOT/target/release"
  # Either the static archive or the rlib counts as "runtime already built".
  if [[ -f "$lib_dir/libnyash_kernel.a" || -f "$lib_dir/libnyash_kernel.rlib" ]]; then
    return 0
  fi
  (cd "$ROOT/crates/nyash_kernel" && cargo build -q --release >/dev/null 2>&1) || true
}
# Run one Hako program through the hakorune binary and print the elapsed
# wall-clock time in milliseconds on stdout.
# Globals:   ROOT, BIN (read)
# Arguments: $1 - path of the .hako file
#            $2 - backend; "llvm" selects the LLVM harness path, anything else
#                 runs the VM backend
# Outputs:   elapsed milliseconds (integer) on stdout
bench_hako() {
  local file="$1"; local backend="$2"; shift 2
  local start end
  # Make sure ny-llvmc exists BEFORE the clock starts, so a one-off compiler
  # build is never attributed to the benchmark. cargo is run from $ROOT so the
  # package lookup does not depend on the caller's working directory.
  if [[ "$backend" = "llvm" && ! -x "$ROOT/target/release/ny-llvmc" ]]; then
    (cd "$ROOT" && cargo build -q --release -p nyash-llvm-compiler >/dev/null 2>&1) || true
  fi
  start=$(date +%s%N)
  # The exit status of the benchmark program is ignored on purpose: the
  # generated programs return their computed value from main.
  if [[ "$backend" = "llvm" ]]; then
    PYTHONPATH="${PYTHONPATH:-$ROOT}" \
    NYASH_AOT_COLLECTIONS_HOT=1 NYASH_LLVM_FAST=1 NYASH_MIR_LOOP_HOIST=1 NYASH_AOT_MAP_KEY_MODE=auto \
    NYASH_ENABLE_USING=1 HAKO_ENABLE_USING=1 HAKO_USING_RESOLVER_FIRST=1 \
    NYASH_NY_LLVM_COMPILER="${NYASH_NY_LLVM_COMPILER:-$ROOT/target/release/ny-llvmc}" \
    NYASH_EMIT_EXE_NYRT="${NYASH_EMIT_EXE_NYRT:-$ROOT/target/release}" \
    NYASH_LLVM_USE_HARNESS=1 "$BIN" --backend llvm "$file" >/dev/null 2>&1 || true
  else
    "$BIN" --backend vm "$file" >/dev/null 2>&1 || true
  fi
  end=$(date +%s%N)
  # Nanoseconds -> milliseconds.
  echo $(( (end - start)/1000000 ))
}
# Build the C reference program (only when needed) and print the elapsed
# wall-clock time of one run in milliseconds on stdout.
# Arguments: $1 - C source path
#            $2 - output executable path
# Outputs:   elapsed milliseconds (integer) on stdout
# Returns:   1 if compilation fails (nothing is printed on stdout), else 0
bench_c() {
  local csrc="$1"; local exe="$2"
  # Compile only when the executable is missing or older than its source, so
  # repeated runs reuse the first build instead of recompiling every time.
  if [[ ! -x "$exe" || "$csrc" -nt "$exe" ]]; then
    if ! cc -O3 -march=native -o "$exe" "$csrc"; then
      # Without this check a failed build would be "timed" as a near-zero run.
      echo "[FAIL] C reference build failed: $csrc" >&2
      return 1
    fi
  fi
  local start end
  start=$(date +%s%N)
  # The program's exit status carries its computed value; it is not an error.
  "$exe" >/dev/null 2>&1 || true
  end=$(date +%s%N)
  # Nanoseconds -> milliseconds.
  echo $(( (end - start)/1000000 ))
}
# Build once and time executable runs (ms)
# Execute an already-built program once (output discarded) and print the
# elapsed wall-clock time in milliseconds on stdout.
# Arguments: $1 - path of the executable
time_exe_run() {
  local target=$1
  local t0 t1
  t0=$(date +%s%N)
  "$target" >/dev/null 2>&1
  t1=$(date +%s%N)
  # Nanoseconds -> milliseconds.
  printf '%s\n' "$(( (t1 - t0) / 1000000 ))"
}
# Create an empty temp file whose name ends in .hako and print its path.
mktemp_hako() {
  mktemp --suffix=.hako
}
# Create an empty temp file whose name ends in .c and print its path.
mktemp_c() {
  mktemp --suffix=.c
}
# Fallback diagnostics for EXE flow: check MIR JSON for externcall/boxcall/jsonfrag
# Print how many times a basic-regex pattern occurs in a file.
# Every occurrence is counted (not matching lines), so the result is also
# meaningful for JSON that is written on a single line.
# Arguments: $1 - grep basic regular expression
#            $2 - file to scan
# Outputs:   the count on stdout; 0 when the file is missing or nothing matches
_diag_count_matches() {
  local pattern="$1" file="$2" n
  # grep exits 1 on "no match"; with pipefail that status must not abort a
  # caller running under set -e, hence the "|| true".
  n=$(grep -o -- "$pattern" "$file" 2>/dev/null | wc -l) || true
  # Some wc implementations pad the number with blanks.
  n=${n//[[:space:]]/}
  echo "${n:-0}"
}

# Summarize a MIR JSON file on stderr: number of externcall ops, references to
# nyash.array.* / nyash.map.*, boxcall ops still present, and [emit/jsonfrag]
# markers. Uses plain grep, so no ripgrep installation is required.
# Arguments: $1 - path of the MIR JSON file
diag_mir_json() {
  local json="$1"
  local rewrites arrays maps boxcalls jsonfrag
  rewrites=$(_diag_count_matches '"op":"externcall"' "$json")
  arrays=$(_diag_count_matches 'nyash\.array\.' "$json")
  maps=$(_diag_count_matches 'nyash\.map\.' "$json")
  boxcalls=$(_diag_count_matches '"op":"boxcall"' "$json")
  jsonfrag=$(_diag_count_matches '\[emit/jsonfrag\]' "$json")
  echo "[diag] externcall=${rewrites} (array=${arrays}, map=${maps}), boxcall_left=${boxcalls}, jsonfrag=${jsonfrag}" >&2
}
case "$CASE" in
loop)
# Workload: sum the integers 0..n-1 in one counted loop.
# The Hako source comes from an unquoted heredoc, so ${N} is expanded by the shell.
HAKO_FILE=$(mktemp_hako)
cat >"$HAKO_FILE" <<HAKO
static box Main { method main(args) {
local n = ${N}
local i = 0
local s = 0
loop(i < n) { s = s + i i = i + 1 }
return s
} }
HAKO
# C reference: the same loop with volatile n and s. The heredoc is quoted
# (literal text), so N is patched in afterwards by replacing N_PLACEHOLDER.
C_FILE=$(mktemp_c)
cat >"$C_FILE" <<'C'
#include <stdint.h>
int main(){
volatile int64_t n = N_PLACEHOLDER;
volatile int64_t s=0; for (int64_t i=0;i<n;i++){ s+=i; }
return (int)(s&0xFF);
}
C
sed -i "s/N_PLACEHOLDER/${N}/" "$C_FILE"
;;
strlen)
# Workload: call length() on a fixed 26-character string n times and sum the results.
HAKO_FILE=$(mktemp_hako)
cat >"$HAKO_FILE" <<HAKO
static box Main { method main(args) {
local n = ${N}
local i = 0
local s = 0
local t = "abcdefghijklmnopqrstuvwxyz"
loop(i < n) { s = s + t.length() i = i + 1 }
return s
} }
HAKO
# C reference: strlen() on the same string inside the loop; N_PLACEHOLDER is
# replaced with N by sed after the literal heredoc is written.
C_FILE=$(mktemp_c)
cat >"$C_FILE" <<'C'
#include <stdint.h>
#include <string.h>
int main(){
volatile int64_t n = N_PLACEHOLDER; volatile int64_t s=0;
const char* t = "abcdefghijklmnopqrstuvwxyz";
for (int64_t i=0;i<n;i++){ s += (int64_t)strlen(t); }
return (int)(s&0xFF);
}
C
sed -i "s/N_PLACEHOLDER/${N}/" "$C_FILE"
;;
box)
# Workload: construct a new StringBox("x") on every one of n iterations.
HAKO_FILE=$(mktemp_hako)
cat >"$HAKO_FILE" <<HAKO
static box Main { method main(args) {
local n = ${N}
local i = 0
loop(i < n) { local t = new StringBox("x"); i = i + 1 }
return 0
} }
HAKO
# C reference: each iteration mallocs a small struct, strdup()s "x" into it and
# frees both again.
# NOTE(review): new_str() returns the pointer after freeing it; the caller
# ignores the return value.
C_FILE=$(mktemp_c)
cat >"$C_FILE" <<'C'
#include <stdint.h>
#include <stdlib.h>
#include <string.h>
typedef struct { char* p; } Str;
static inline Str* new_str(){ Str* s=(Str*)malloc(sizeof(Str)); s->p=strdup("x"); free(s->p); free(s); return s; }
int main(){ volatile int64_t n=N_PLACEHOLDER; for(int64_t i=0;i<n;i++){ new_str(); } return 0; }
C
sed -i "s/N_PLACEHOLDER/${N}/" "$C_FILE"
;;
branch)
# Workload: a four-way if/else chain keyed on i % 30 that adds to or subtracts
# from an accumulator, repeated n times.
HAKO_FILE=$(mktemp_hako)
cat >"$HAKO_FILE" <<HAKO
static box Main { method main(args) {
local n = ${N}
local i = 0
local acc = 0
loop(i < n) {
local mod = i % 30
if (mod == 0) {
acc = acc + 3
} else if (mod < 10) {
acc = acc + (i % 7)
} else if (mod < 20) {
acc = acc - (i % 11)
} else {
acc = acc + 1
}
i = i + 1
}
return acc
} }
HAKO
# C reference: the same branch chain with a volatile accumulator; N is patched
# in via sed.
C_FILE=$(mktemp_c)
cat >"$C_FILE" <<'C'
#include <stdint.h>
int main(){
volatile int64_t n = N_PLACEHOLDER;
volatile int64_t acc = 0;
for (int64_t i=0;i<n;i++){
int64_t mod = i % 30;
if (mod == 0) {
acc += 3;
} else if (mod < 10) {
acc += (i % 7);
} else if (mod < 20) {
acc -= (i % 11);
} else {
acc += 1;
}
}
return (int)(acc & 0xFF);
}
C
sed -i "s/N_PLACEHOLDER/${N}/" "$C_FILE"
;;
call)
# Workload: two small functions (mix, twist) called from the loop body; each
# iteration makes two mix() calls and two twist() calls.
HAKO_FILE=$(mktemp_hako)
cat >"$HAKO_FILE" <<HAKO
function mix(a, b, c) {
return (a + b) - c
}
function twist(v) {
if (v % 2 == 0) { return v / 2 }
return v * 3 + 1
}
static box Main { method main(args) {
local n = ${N}
local i = 0
local value = 1
loop(i < n) {
value = mix(value, i, value % 7)
value = mix(value, twist(i), twist(value))
i = i + 1
}
return value
} }
HAKO
# C reference: mix/twist as static inline functions, volatile loop state; N is
# patched in via sed.
C_FILE=$(mktemp_c)
cat >"$C_FILE" <<'C'
#include <stdint.h>
static inline int64_t mix(int64_t a, int64_t b, int64_t c){ return (a + b) - c; }
static inline int64_t twist(int64_t v){ return (v % 2 == 0) ? v / 2 : v * 3 + 1; }
int main(){
volatile int64_t n = N_PLACEHOLDER; volatile int64_t value = 1;
for (int64_t i=0;i<n;i++){
value = mix(value, i, value % 7);
value = mix(value, twist(i), twist(value));
}
return (int)(value & 0xFF);
}
C
sed -i "s/N_PLACEHOLDER/${N}/" "$C_FILE"
;;
stringchain)
# Workload: per iteration take three substrings of a fixed 36-character string,
# concatenate them and add the resulting length to an accumulator.
HAKO_FILE=$(mktemp_hako)
cat >"$HAKO_FILE" <<HAKO
static box Main { method main(args) {
local n = ${N}
local base = "abcdefghijklmnopqrstuvwxyz0123456789"
local acc = 0
local i = 0
loop(i < n) {
local part1 = base.substring(0, 12)
local part2 = base.substring(5, 20)
local s = part1 + part2 + base.substring(2, 18)
acc = acc + s.length()
i = i + 1
}
return acc
} }
HAKO
# C reference: builds the same 12 + 15 + 16 character string in stack buffers
# with memcpy/strcpy/strncat and sums strlen(); N is patched in via sed.
C_FILE=$(mktemp_c)
cat >"$C_FILE" <<'C'
#include <stdint.h>
#include <string.h>
int main(){
volatile int64_t n = N_PLACEHOLDER; volatile int64_t acc = 0;
const char* base = "abcdefghijklmnopqrstuvwxyz0123456789";
char tmp[128];
for (int64_t i=0;i<n;i++){
memcpy(tmp, base, 12); tmp[12] = '\0';
char buf[192];
strcpy(buf, tmp);
strncat(buf, base+5, 15);
strncat(buf, base+2, 16);
acc += (int64_t)strlen(buf);
}
return (int)(acc & 0xFF);
}
C
sed -i "s/N_PLACEHOLDER/${N}/" "$C_FILE"
;;
arraymap)
# Workload: an ArrayBox and a MapBox with 32 entries each are prefilled, then n
# iterations do an array get/set plus a map set/get on a string key "k<idx>".
HAKO_FILE=$(mktemp_hako)
cat >"$HAKO_FILE" <<HAKO
static box Main { method main(args) {
local n = ${N}
local arr = new ArrayBox()
local map = new MapBox()
local bucket = 32
local i = 0
loop(i < bucket) {
arr.push(i)
map.set("k" + i.toString(), i)
i = i + 1
}
local sum = 0
i = 0
loop(i < n) {
local idx = i % bucket
local val = arr.get(idx)
arr.set(idx, val + 1)
local key = "k" + idx.toString()
map.set(key, val)
sum = sum + map.get(key)
i = i + 1
}
return sum
} }
HAKO
# C reference: the map is modelled as a second plain int64_t[32] array indexed
# by idx (no string keys are built); N is patched in via sed.
C_FILE=$(mktemp_c)
cat >"$C_FILE" <<'C'
#include <stdint.h>
int main(){
volatile int64_t n = N_PLACEHOLDER; volatile int64_t sum = 0;
int64_t bucket = 32;
int64_t arr[32];
int64_t mapv[32];
for (int i=0;i<32;i++){ arr[i]=i; mapv[i]=i; }
for (int64_t i=0;i<n;i++){
int64_t idx = i % bucket;
int64_t val = arr[idx];
arr[idx] = val + 1;
mapv[idx] = val;
sum += mapv[idx];
}
return (int)(sum & 0xFF);
}
C
sed -i "s/N_PLACEHOLDER/${N}/" "$C_FILE"
;;
chip8)
# Workload: a tiny CHIP-8 style interpreter loop over a fixed 10-byte program
# (opcodes 6005, 6107, 7003, 7102, 1200), executed for N cycles.
# The jump opcode (high nibble 1) takes its target from the low 12 bits of the
# opcode, modulo the program size. The Hako side masks with "% 4096" so that it
# follows exactly the same path as the C reference's "opcode & 0x0FFF"; without
# the mask, opcode 0x1200 would jump to 4608 % 10 = 8 (the jump itself) and the
# Hako program would spin on that one instruction while C keeps running the
# register opcodes from pc = 512 % 10 = 2.
HAKO_FILE=$(mktemp_hako)
cat >"$HAKO_FILE" <<HAKO
box Chip8Bench {
init { program, registers, pc, program_size }
birth() {
me.program = new ArrayBox()
me.registers = new ArrayBox()
me.pc = 0
local i = 0
loop(i < 16) { me.registers.push(0); i = i + 1 }
local opcodes = new ArrayBox()
// 6005, 6107, 7003, 7102, 1200 pattern
opcodes.push(96); opcodes.push(5)
opcodes.push(97); opcodes.push(7)
opcodes.push(112); opcodes.push(3)
opcodes.push(113); opcodes.push(2)
opcodes.push(18); opcodes.push(0)
local count_box = opcodes.length()
local count = 0
if count_box != null { count = count_box.toString().toInteger() }
i = 0
loop(i < count) {
me.program.push(opcodes.get(i))
i = i + 1
}
me.program_size = count
}
execute_cycle() {
local hi = me.program.get(me.pc)
local lo = me.program.get((me.pc + 1) % me.program_size)
local opcode = (hi * 256) + lo
me.pc = (me.pc + 2) % me.program_size
local nib = opcode / 4096
if (nib == 1) {
me.pc = (opcode % 4096) % me.program_size
} else if (nib == 6) {
local reg = (opcode / 256) % 16
local value = opcode % 256
me.registers.set(reg, value)
} else if (nib == 7) {
local reg = (opcode / 256) % 16
local value = opcode % 256
local cur = me.registers.get(reg)
me.registers.set(reg, cur + value)
}
}
run(cycles) {
local i = 0
loop(i < cycles) { me.execute_cycle(); i = i + 1 }
}
checksum() {
local total = 0
local len = me.registers.length().toString().toInteger()
local i = 0
loop(i < len) { total = total + me.registers.get(i); i = i + 1 }
return total
}
}
static box Main { method main(args) {
local cycles = ${N}
local bench = new Chip8Bench()
bench.birth()
bench.run(cycles)
return bench.checksum()
} }
HAKO
# C reference: same program and dispatch on plain int arrays; N is patched in
# via sed after the literal heredoc is written.
C_FILE=$(mktemp_c)
cat >"$C_FILE" <<'C'
#include <stdint.h>
int main(){
volatile int64_t cycles = N_PLACEHOLDER;
int pc = 0;
int program_size = 10;
int program[10] = {96,5,97,7,112,3,113,2,18,0};
int regs[16] = {0};
for (int64_t i=0;i<cycles;i++){
int hi = program[pc];
int lo = program[(pc+1)%program_size];
int opcode = (hi<<8) | lo;
pc = (pc + 2) % program_size;
int nib = opcode >> 12;
if (nib == 1) {
pc = opcode & 0x0FFF;
pc %= program_size;
} else if (nib == 6) {
int reg = (opcode >> 8) & 0xF;
regs[reg] = opcode & 0xFF;
} else if (nib == 7) {
int reg = (opcode >> 8) & 0xF;
regs[reg] += opcode & 0xFF;
}
}
int64_t sum = 0; for (int i=0;i<16;i++){ sum += regs[i]; }
return (int)(sum & 0xFF);
}
C
sed -i "s/N_PLACEHOLDER/${N}/" "$C_FILE"
;;
sieve)
# Workload: sieve of Eratosthenes counting the primes up to N (N is the upper limit).
# The default N=5,000,000 is used unchanged in EXE mode as well: an earlier
# rounding down to 500,000 was dropped because it increased timer-granularity
# noise. (The former "if EXE mode and N is the default then N=5000000" branch
# assigned N its existing value and has been removed as dead code.)
HAKO_FILE=$(mktemp_hako)
cat >"$HAKO_FILE" <<HAKO
static box Main { method main(args) {
local limit = ${N}
// true=prime候補
local flags = new ArrayBox()
local i = 0
loop(i <= limit) { flags.push(1) i = i + 1 }
flags.set(0, 0) flags.set(1, 0)
local p = 2
loop(p * p <= limit) {
if (flags.get(p) == 1) {
local m = p * p
loop(m <= limit) { flags.set(m, 0) m = m + p }
}
p = p + 1
}
local count = 0
i = 0
loop(i <= limit) { count = count + flags.get(i) i = i + 1 }
return count
} }
HAKO
# C reference: the same sieve over a malloc'ed byte array; N is patched in via sed.
C_FILE=$(mktemp_c)
cat >"$C_FILE" <<'C'
#include <stdint.h>
#include <stdlib.h>
int main(){
int64_t limit = N_PLACEHOLDER;
unsigned char *flags = (unsigned char*)malloc((limit+1));
for (int64_t i=0;i<=limit;i++) flags[i]=1;
flags[0]=flags[1]=0;
for (int64_t p=2;p*p<=limit;p++) if (flags[p]) for (int64_t m=p*p;m<=limit;m+=p) flags[m]=0;
int64_t count=0; for (int64_t i=0;i<=limit;i++) count+=flags[i];
free(flags);
return (int)(count & 0xFF);
}
C
sed -i "s/N_PLACEHOLDER/${N}/" "$C_FILE"
;;
matmul)
# Workload: naive n x n matrix multiply over one-dimensional ArrayBoxes, plus
# REPS_M extra additions per output element.
# N is the matrix size. In EXE mode the default becomes N=512 and the default
# REPS_M is raised from 8 to 16 to reduce timer-granularity noise.
if [[ "$EXE_MODE" = "1" && "$N" = "5000000" ]]; then
N=512
fi
# REPS_M may be preset through the environment; 8 is treated as "not overridden".
REPS_M=${REPS_M:-8}
if [[ "$EXE_MODE" = "1" && "${REPS_M}" = "8" ]]; then
REPS_M=16
fi
HAKO_FILE=$(mktemp_hako)
cat >"$HAKO_FILE" <<HAKO
static box Main { method main(args) {
local n = ${N}
local reps = ${REPS_M}
// A,B,C を一次元ArrayBoxに格納row-major
local A = new ArrayBox(); local B = new ArrayBox(); local C = new ArrayBox()
local i = 0
loop(i < n*n) { A.push(i % 97) B.push((i*3) % 101) C.push(0) i = i + 1 }
i = 0
loop(i < n) {
local j = 0
loop(j < n) {
local sum = 0
local k = 0
loop(k < n) {
local a = A.get(i*n + k)
local b = B.get(k*n + j)
sum = sum + a * b
k = k + 1
}
// repeat accumulation to scale work per element
local r = 0
loop(r < reps) { sum = sum + (r % 7) r = r + 1 }
C.set(i*n + j, sum)
j = j + 1
}
i = i + 1
}
// 端を返して最適化抑止
return C.get((n-1)*n + (n-1))
} }
HAKO
# C reference: the same triple loop on malloc'ed int arrays; both N_PLACEHOLDER
# and REPS_PLACE are patched in via sed.
C_FILE=$(mktemp_c)
cat >"$C_FILE" <<'C'
#include <stdint.h>
#include <stdlib.h>
int main(){
int n = N_PLACEHOLDER;
int reps = REPS_PLACE;
int *A = (int*)malloc(sizeof(int)*n*n);
int *B = (int*)malloc(sizeof(int)*n*n);
int *C = (int*)malloc(sizeof(int)*n*n);
for (int i=0;i<n*n;i++){ A[i]=i%97; B[i]=(i*3)%101; C[i]=0; }
for (int i=0;i<n;i++){
for (int j=0;j<n;j++){
long long sum=0;
for (int k=0;k<n;k++) sum += (long long)A[i*n+k]*B[k*n+j];
for (int r=0;r<reps;r++) sum += (r % 7);
C[i*n+j]=(int)sum;
}
}
int r = C[(n-1)*n + (n-1)];
free(A); free(B); free(C);
return r & 0xFF;
}
C
sed -i "s/N_PLACEHOLDER/${N}/; s/REPS_PLACE/${REPS_M}/" "$C_FILE"
# Pre-check: verify emit stability for matmul in EXE mode
# A trial MIR emit is run into a throwaway JSON file; if it fails, the case is
# skipped (exit 0) after removing the generated temp files.
if [[ "$EXE_MODE" = "1" ]]; then
TMP_CHECK_JSON=$(mktemp --suffix .json)
if ! \
HAKO_SELFHOST_BUILDER_FIRST=0 HAKO_SELFHOST_NO_DELEGATE=0 \
NYASH_AOT_COLLECTIONS_HOT=1 NYASH_LLVM_FAST=1 NYASH_MIR_LOOP_HOIST=1 NYASH_AOT_MAP_KEY_MODE=auto \
HAKO_MIR_BUILDER_LOOP_JSONFRAG="${HAKO_MIR_BUILDER_LOOP_JSONFRAG:-0}" \
bash "$ROOT/tools/hakorune_emit_mir.sh" "$HAKO_FILE" "$TMP_CHECK_JSON" \
>/dev/null 2>&1; then
echo "[SKIP] matmul emit unstable (try PERF_USE_JSONFRAG=1 for diagnosis)" >&2
rm -f "$TMP_CHECK_JSON" "$HAKO_FILE" "$C_FILE" 2>/dev/null || true
exit 0
fi
rm -f "$TMP_CHECK_JSON" 2>/dev/null || true
fi
;;
matmul_core)
# Core numeric matmul using MatI64 + IntArrayCore
# Use smaller default N to keep runtime reasonable
# (only in EXE mode, and only when N is still the global default of 5000000)
if [[ "$EXE_MODE" = "1" && "$N" = "5000000" ]]; then
N=256
fi
HAKO_FILE=$(mktemp_hako)
cat >"$HAKO_FILE" <<HAKO
using nyash.core.numeric.matrix_i64 as MatI64
static box Main { method main(args) {
local n = ${N}
// Initialize A, B, C as n x n matrices
local A = MatI64.new(n, n)
local B = MatI64.new(n, n)
local C = MatI64.new(n, n)
local i = 0
loop(i < n) {
local j = 0
loop(j < n) {
local idx = i*n + j
A.set(i, j, idx % 97)
B.set(i, j, (idx * 3) % 101)
C.set(i, j, 0)
j = j + 1
}
i = i + 1
}
// Naive matmul via MatI64.mul_naive
local out = A.mul_naive(B)
return out.at(n-1, n-1)
} }
HAKO
# C reference: a small struct (ptr/rows/cols/stride) over malloc'ed int64_t
# storage, multiplied with an i-k-j loop order; N is patched in via sed.
C_FILE=$(mktemp_c)
cat >"$C_FILE" <<'C'
#include <stdint.h>
#include <stdlib.h>
typedef struct {
int64_t *ptr;
int64_t rows;
int64_t cols;
int64_t stride;
} MatI64Core;
static inline int64_t mat_get(MatI64Core *m, int64_t r, int64_t c) {
return m->ptr[r * m->stride + c];
}
static inline void mat_set(MatI64Core *m, int64_t r, int64_t c, int64_t v) {
m->ptr[r * m->stride + c] = v;
}
int main() {
int64_t n = N_PLACEHOLDER;
int64_t total = n * n;
MatI64Core A, B, C;
A.rows = B.rows = C.rows = n;
A.cols = B.cols = C.cols = n;
A.stride = B.stride = C.stride = n;
A.ptr = (int64_t*)malloc(sizeof(int64_t)*total);
B.ptr = (int64_t*)malloc(sizeof(int64_t)*total);
C.ptr = (int64_t*)malloc(sizeof(int64_t)*total);
for (int64_t idx = 0; idx < total; idx++) {
A.ptr[idx] = idx % 97;
B.ptr[idx] = (idx * 3) % 101;
C.ptr[idx] = 0;
}
for (int64_t i = 0; i < n; i++) {
for (int64_t k = 0; k < n; k++) {
int64_t aik = mat_get(&A, i, k);
for (int64_t j = 0; j < n; j++) {
int64_t idx = i * C.stride + j;
int64_t v = C.ptr[idx] + aik * mat_get(&B, k, j);
C.ptr[idx] = v;
}
}
}
int64_t r = mat_get(&C, n-1, n-1);
free(A.ptr); free(B.ptr); free(C.ptr);
return (int)(r & 0xFF);
}
C
sed -i "s/N_PLACEHOLDER/${N}/" "$C_FILE"
;;
linidx)
# Linear index pattern: idx = i*cols + j
# Rows/cols are fixed at 10000 x 32; in EXE mode with the default N they become
# 20000 x 32. A user-supplied --n value is otherwise not consulted by this case.
ROWS=10000; COLS=32
if [[ "$EXE_MODE" = "1" && "$N" = "5000000" ]]; then ROWS=20000; COLS=32; fi
HAKO_FILE=$(mktemp_hako)
cat >"$HAKO_FILE" <<HAKO
static box Main { method main(args) {
local rows = ${ROWS}
local cols = ${COLS}
local total = rows * cols
local A = new ArrayBox()
local i = 0
loop(i < total) { A.push(i % 97) i = i + 1 }
local acc = 0
i = 0
loop(i < rows) {
local j = 0
loop(j < cols) {
local idx = i * cols + j
local v = A.get(idx)
acc = acc + v
A.set(idx, (v + acc) % 17)
j = j + 1
}
i = i + 1
}
return acc & 255
} }
HAKO
# C reference: the same row/column walk over a malloc'ed int64_t array;
# ROWS_P and COLS_P are patched in via sed.
C_FILE=$(mktemp_c)
cat >"$C_FILE" <<'C'
#include <stdint.h>
#include <stdlib.h>
int main(){
const int64_t rows = ROWS_P; const int64_t cols = COLS_P;
const int64_t total = rows * cols;
int64_t *A = (int64_t*)malloc(sizeof(int64_t)*total);
for (int64_t i=0;i<total;i++) A[i]=i%97;
int64_t acc=0;
for (int64_t i=0;i<rows;i++){
for (int64_t j=0;j<cols;j++){
int64_t idx = i*cols + j;
int64_t v = A[idx];
acc += v;
A[idx] = (v + acc) % 17;
}
}
free(A);
return (int)(acc & 255);
}
C
sed -i "s/ROWS_P/${ROWS}/; s/COLS_P/${COLS}/" "$C_FILE"
;;
maplin)
# Map with integer linear key: key = i*bucket + j
# Keep bucket small to stress get/set hot path; add REPS to increase per-iter work
# Interpret N as rows when provided (except when default 5_000_000)
# Defaults: 50000 rows, bucket 32, 8 reps; EXE mode with the default N uses
# 200000 rows and 16 reps. (BUCKET was previously assigned 32 a second time
# right after this block; the redundant assignment has been removed.)
ROWS=50000; BUCKET=32; REPS=8
if [[ "$EXE_MODE" = "1" && "$N" = "5000000" ]]; then
ROWS=200000; REPS=16
elif [[ "$N" != "5000000" ]]; then
ROWS="$N"
fi
HAKO_FILE=$(mktemp_hako)
cat >"$HAKO_FILE" <<HAKO
static box Main { method main(args) {
local rows = ${ROWS}
local bucket = ${BUCKET}
local reps = ${REPS}
local arr = new ArrayBox()
local map = new MapBox()
// Prefill
local i = 0
loop(i < bucket) { arr.push(i) i = i + 1 }
// Run
i = 0
local acc = 0
loop(i < rows) {
local j = i % bucket
local key = (i / bucket) * bucket + j
local v = arr.get(j)
arr.set(j, v + 1)
map.set(key, v)
acc = acc + map.get(key)
// additional reps to reduce timer granularity effects
local r = 0
loop(r < reps) {
// keep keys within [0, rows)
local ii = (i + r) % rows
local jj = (j + r) % bucket
local k2 = (ii / bucket) * bucket + jj
map.set(k2, v)
acc = acc + map.get(k2)
r = r + 1
}
i = i + 1
}
return acc & 255
} }
HAKO
# C reference: the map is modelled as a plain int64_t array of `rows` slots
# indexed by the integer key; ROWS_P, BUCKET_P and REPS_P are patched in via sed.
C_FILE=$(mktemp_c)
cat >"$C_FILE" <<'C'
#include <stdint.h>
#include <stdlib.h>
int main(){
const int64_t rows = ROWS_P; const int64_t bucket = BUCKET_P; const int64_t reps = REPS_P;
int64_t *arr = (int64_t*)malloc(sizeof(int64_t)*bucket);
int64_t *mapv = (int64_t*)malloc(sizeof(int64_t)*rows);
for (int64_t i=0;i<bucket;i++) arr[i]=i;
int64_t acc=0;
for (int64_t i=0;i<rows;i++){
int64_t j = i % bucket;
int64_t key = (i / bucket) * bucket + j;
int64_t v = arr[j];
arr[j] = v + 1;
mapv[key] = v;
acc += mapv[key];
for (int64_t r=0;r<reps;r++){
int64_t ii = (i + r) % rows;
int64_t jj = (j + r) % bucket;
int64_t k2 = (ii / bucket) * bucket + jj;
mapv[k2] = v;
acc += mapv[k2];
}
}
free(arr); free(mapv);
return (int)(acc & 255);
}
C
sed -i "s/ROWS_P/${ROWS}/; s/BUCKET_P/${BUCKET}/; s/REPS_P/${REPS}/" "$C_FILE"
;;
kilo)
# Workload: a text-editor style benchmark - 64 lines receive repeated mid-line
# insertions, and every 8th operation appends a replacement to matching lines.
# The C reference side of kilo is heavy; with the default N=5_000_000 the run
# becomes extremely long.
# In the Phase 21.5 optimization phase, LLVM benchmarks target the EXE path only.
# - With the LLVM backend and N at its default (5_000_000), N is always lowered to 200_000.
# - With the LLVM backend and EXE_MODE=0, the run is force-promoted to the EXE
#   path (no VM fallback).
if [[ "$BACKEND" = "llvm" && "$N" = "5000000" ]]; then
N=200000
fi
if [[ "$BACKEND" = "llvm" && "$EXE_MODE" = "0" ]]; then
echo "[info] kilo: forcing --exe for llvm backend (Phase 21.5 optimization)" >&2
EXE_MODE=1
fi
HAKO_FILE=$(mktemp_hako)
cat >"$HAKO_FILE" <<HAKO
box KiloBench {
init { lines, undo }
birth() {
me.lines = new ArrayBox()
me.undo = new ArrayBox()
local i = 0
loop(i < 64) {
me.lines.push("line-" + i.toString())
i = i + 1
}
}
insert_chunk(row, text) {
local line = me.lines.get(row)
local len_box = line.length()
local len = 0
if len_box != null { len = len_box.toString().toInteger() }
local split = len / 2
local new_line = line.substring(0, split) + text + line.substring(split, len)
me.lines.set(row, new_line)
me.undo.push(text)
}
replace(pattern, replacement) {
local count = me.lines.length().toString().toInteger()
local i = 0
loop(i < count) {
local line = me.lines.get(i)
if (line.indexOf(pattern) >= 0) {
me.lines.set(i, line + replacement)
}
i = i + 1
}
}
digest() {
local total = 0
local count = me.lines.length().toString().toInteger()
local i = 0
loop(i < count) {
total = total + me.lines.get(i).length()
i = i + 1
}
return total + me.undo.length().toString().toInteger()
}
}
static box Main { method main(args) {
local ops = ${N}
local bench = new KiloBench()
bench.birth()
local i = 0
loop(i < ops) {
bench.insert_chunk(i % 64, "xx")
if (i % 8 == 0) {
bench.replace("line", "ln")
}
i = i + 1
}
return bench.digest()
} }
HAKO
# C reference: the 64 lines are heap strings that are reallocated on every
# insert/append; there is no undo list on the C side. N is patched in via sed.
C_FILE=$(mktemp_c)
cat >"$C_FILE" <<'C'
#include <stdint.h>
#include <string.h>
#include <stdlib.h>
#include <stdio.h>
static void insert_chunk(char **lines, int row, const char *text){
char *line = lines[row];
size_t len = strlen(line);
size_t split = len/2;
char *out = malloc(len + strlen(text) + 1);
memcpy(out, line, split);
strcpy(out+split, text);
strcpy(out+split+strlen(text), line+split);
free(line);
lines[row] = out;
}
static void replace_line(char **lines, const char *pattern, const char *repl){
for (int i=0;i<64;i++){
if (strstr(lines[i], pattern)){
size_t len = strlen(lines[i]) + strlen(repl) + 1;
char *out = malloc(len);
strcpy(out, lines[i]);
strcat(out, repl);
free(lines[i]);
lines[i] = out;
}
}
}
int main(){
volatile int64_t ops = N_PLACEHOLDER;
char *lines[64];
for (int i=0;i<64;i++){
char buf[32]; sprintf(buf, "line-%d", i);
lines[i] = strdup(buf);
}
for (int64_t i=0;i<ops;i++){
insert_chunk(lines, i % 64, "xx");
if (i % 8 == 0) replace_line(lines, "line", "ln");
}
int64_t total = 0;
for (int i=0;i<64;i++){ total += strlen(lines[i]); }
for (int i=0;i<64;i++){ free(lines[i]); }
return (int)(total & 0xFF);
}
C
sed -i "s/N_PLACEHOLDER/${N}/" "$C_FILE"
;;
*) echo "Unknown case: $CASE"; exit 2;;
esac
# ---- Run phase: build (EXE mode) or interpret, time RUNS iterations, report ----
echo "[perf] case=$CASE n=$N runs=$RUNS backend=$BACKEND" >&2
sum_c=0; sum_h=0

# Temp artifacts created below; declared up front so the cleanup trap can
# reference them safely under "set -u".
C_EXE=""; HAKO_EXE=""; TMP_JSON=""

# Remove every temp artifact. Installed as an EXIT trap so that failing paths
# (exit 2 / exit 3, or an abort under set -e) no longer leave files behind.
perf_cleanup() {
  local f
  for f in "${HAKO_FILE:-}" "${C_FILE:-}" "${C_FILE:+${C_FILE%.c}}" "$C_EXE" "$HAKO_EXE" "$TMP_JSON"; do
    if [[ -n "$f" ]]; then rm -f -- "$f" 2>/dev/null || true; fi
  done
}
trap perf_cleanup EXIT

# Print the hak/C time ratio in percent (2 decimals), or NA when python3 is
# unavailable or fails.
# Arguments: $1 - hakorune time in ms, $2 - C time in ms
perf_ratio_pct() {
  if command -v python3 >/dev/null 2>&1; then
    python3 -c "print(round(${1}/max(${2},1)*100,2))" 2>/dev/null || echo NA
  else
    echo NA
  fi
}

# Report the averages over RUNS on stderr and, when python3 exists, the overall
# ratio on stdout.
# Globals: sum_c, sum_h, RUNS (read); avg_c, avg_h (written)
perf_report_avg() {
  avg_c=$((sum_c / RUNS)); avg_h=$((sum_h / RUNS))
  echo "avg c=${avg_c}ms hak=${avg_h}ms" >&2
  if [ "$avg_c" -lt 5 ]; then
    echo "[WARN] C runtime is very small (${avg_c}ms). Increase --n to reduce timer granularity noise." >&2
  fi
  if command -v python3 >/dev/null 2>&1; then
    python3 - <<PY
c=$avg_c; h=$avg_h
ratio = (h/max(c,1))*100.0
print(f"ratio={ratio:.2f}%")
PY
  fi
}

if [[ "$EXE_MODE" = "1" ]]; then
  # Reject an unsupported backend before building anything.
  if [[ "$BACKEND" != "llvm" ]]; then
    echo "[FAIL] --exe requires --backend llvm" >&2; exit 2
  fi
  # Build C exe once
  C_EXE=$(mktemp --suffix .out)
  cc -O3 -march=native -o "$C_EXE" "$C_FILE"
  # Build Nyash exe once (requires llvm harness)
  ensure_llvmc
  ensure_nyrt
  HAKO_EXE=$(mktemp --suffix .out)
  TMP_JSON=$(mktemp --suffix .json)
  # Default: use provider-first with AotPrep for maximum optimization
  # DEBUG: Show file paths
  echo "[matmul/debug] HAKO_FILE=$HAKO_FILE TMP_JSON=$TMP_JSON" >&2
  # PERF_USE_JSONFRAG=1 switches the default of both JSONFRAG knobs to 1;
  # explicitly set HAKO_MIR_BUILDER_LOOP_* variables still take precedence.
  jsonfrag_default=0
  if [[ "${PERF_USE_JSONFRAG:-0}" = 1 ]]; then jsonfrag_default=1; fi
  # The emit step's own exit status decides success. Its log is written to a
  # fixed debug path and filtered afterwards; filtering must not influence the
  # result (piping straight into grep under pipefail turned "no matching log
  # line" into a reported emit failure).
  emit_log=/tmp/matmul_emit_log.txt
  emit_rc=0
  HAKO_SELFHOST_TRACE=1 \
  HAKO_SELFHOST_BUILDER_FIRST=0 HAKO_SELFHOST_NO_DELEGATE=0 \
  HAKO_APPLY_AOT_PREP=1 \
  NYASH_AOT_COLLECTIONS_HOT=1 NYASH_LLVM_FAST=1 NYASH_MIR_LOOP_HOIST=1 NYASH_AOT_MAP_KEY_MODE=auto \
  HAKO_MIR_BUILDER_LOOP_JSONFRAG="${HAKO_MIR_BUILDER_LOOP_JSONFRAG:-$jsonfrag_default}" \
  HAKO_MIR_BUILDER_LOOP_FORCE_JSONFRAG="${HAKO_MIR_BUILDER_LOOP_FORCE_JSONFRAG:-$jsonfrag_default}" \
  HAKO_MIR_BUILDER_JSONFRAG_NORMALIZE="${HAKO_MIR_BUILDER_JSONFRAG_NORMALIZE:-1}" \
  HAKO_MIR_BUILDER_JSONFRAG_PURIFY="${HAKO_MIR_BUILDER_JSONFRAG_PURIFY:-1}" \
  NYASH_AOT_NUMERIC_CORE="${NYASH_AOT_NUMERIC_CORE:-0}" \
  NYASH_AOT_NUMERIC_CORE_TRACE="${NYASH_AOT_NUMERIC_CORE_TRACE:-0}" \
  NYASH_ENABLE_USING=1 HAKO_ENABLE_USING=1 \
  NYASH_JSON_ONLY=1 bash "$ROOT/tools/hakorune_emit_mir.sh" "$HAKO_FILE" "$TMP_JSON" >"$emit_log" 2>&1 || emit_rc=$?
  # Surface only the prep/provider lines; finding none is not an error.
  grep -E "\[prep:|provider/emit\]" "$emit_log" >&2 || true
  if [[ "$emit_rc" -ne 0 ]]; then
    echo "[FAIL] emit MIR JSON failed (hint: set PERF_USE_PROVIDER=1 or HAKO_MIR_BUILDER_LOOP_FORCE_JSONFRAG=1)" >&2; exit 3
  fi
  # Quick diagnostics: ensure AotPrep rewrites are present and jsonfrag fallback is not used
  # DEBUG: Copy TMP_JSON for inspection
  cp "$TMP_JSON" /tmp/matmul_from_perf.json 2>/dev/null || true
  echo "[matmul/debug] TMP_JSON copied to /tmp/matmul_from_perf.json" >&2
  echo "[matmul/debug] Direct externcall count: $(grep -o '"op":"externcall"' "$TMP_JSON" 2>/dev/null | wc -l)" >&2
  diag_mir_json "$TMP_JSON"
  # AotPrep is now applied in hakorune_emit_mir.sh via HAKO_APPLY_AOT_PREP=1
  # Build EXE via helper (selects crate backend ny-llvmc under the hood)
  if ! NYASH_LLVM_BACKEND=crate NYASH_LLVM_SKIP_BUILD=1 \
    NYASH_NY_LLVM_COMPILER="${NYASH_NY_LLVM_COMPILER:-$ROOT/target/release/ny-llvmc}" \
    NYASH_EMIT_EXE_NYRT="${NYASH_EMIT_EXE_NYRT:-$ROOT/target/release}" \
    NYASH_LLVM_VERIFY=1 NYASH_LLVM_VERIFY_IR=1 NYASH_LLVM_FAST=1 \
    bash "$ROOT/tools/ny_mir_builder.sh" --in "$TMP_JSON" --emit exe -o "$HAKO_EXE" --quiet >/dev/null 2>&1; then
    echo "[FAIL] build Nyash EXE failed (crate backend). Ensure ny-llvmc exists or try NYASH_LLVM_BACKEND=crate." >&2; exit 3
  fi
  # Execute runs. If BUDGET_MS>0, keep running until budget is exhausted.
  if [[ "$BUDGET_MS" != "0" ]]; then
    i=0; used=0
    while true; do
      i=$((i+1))
      t_c=$(time_exe_run "$C_EXE"); t_h=$(time_exe_run "$HAKO_EXE")
      # Only the hakorune time counts against the budget.
      sum_c=$((sum_c + t_c)); sum_h=$((sum_h + t_h)); used=$((used + t_h))
      ratio=$(perf_ratio_pct "$t_h" "$t_c")
      echo "run#$i c=${t_c}ms hak=${t_h}ms ratio=${ratio}% (budget used=${used}/${BUDGET_MS}ms)" >&2
      if [[ $used -ge $BUDGET_MS ]]; then RUNS=$i; break; fi
      # Safety valve to avoid infinite loop if t_h is 0ms
      if [[ $i -ge 999 ]]; then RUNS=$i; break; fi
    done
  else
    for ((i = 1; i <= RUNS; i++)); do
      t_c=$(time_exe_run "$C_EXE")
      t_h=$(time_exe_run "$HAKO_EXE")
      sum_c=$((sum_c + t_c)); sum_h=$((sum_h + t_h))
      ratio=$(perf_ratio_pct "$t_h" "$t_c")
      echo "run#$i c=${t_c}ms hak=${t_h}ms ratio=${ratio}%" >&2
    done
  fi
  perf_report_avg
else
  # Non-EXE mode: the C program is built next to its source (name without .c)
  # and the Hako program is run through the hakorune binary each time.
  for ((i = 1; i <= RUNS; i++)); do
    t_c=$(bench_c "$C_FILE" "${C_FILE%.c}")
    t_h=$(bench_hako "$HAKO_FILE" "$BACKEND")
    sum_c=$((sum_c + t_c)); sum_h=$((sum_h + t_h))
    ratio=$(perf_ratio_pct "$t_h" "$t_c")
    echo "run#$i c=${t_c}ms hak=${t_h}ms ratio=${ratio}%" >&2
  done
  perf_report_avg
fi
# Temp files are removed by the perf_cleanup EXIT trap.