From 9a7d77d868850c8ef03fca85aa80ca3855e29b63 Mon Sep 17 00:00:00 2001
From: Selfhosting Dev
Date: Thu, 18 Sep 2025 04:38:14 +0900
Subject: [PATCH] selfhost parser: add // and /* */ comment skipping; support \n\r\t and \uXXXX escapes in strings (read_string_lit/parse_string2); add tools/selfhost_parser_json_smoke.sh (optional)

---
Review notes (commentary only; not part of the commit message or the diff):

* parser_box.nyash hunks: content is unchanged. Line breaks and indentation
  were reconstructed from a whitespace-collapsed copy of this patch, so the
  context lines may differ in whitespace from the target file; if the patch
  does not apply cleanly, try `git apply --ignore-whitespace`.
* trim_ws_and_line_comments(s) skips leading spaces and tabs, returns "" when
  the remainder starts with "//", and otherwise returns the remainder.
  Trailing whitespace and trailing "//" comments are left in place. No hunk
  in this patch calls it.
* read_string_lit / parse_string2 now decode \n, \r and \t. A backslash-u
  followed by at least four more characters is copied through as the six
  source characters: it is not decoded and the four characters are not
  checked to be hex digits.
  NOTE(review): parse_string2 passes that text through esc_json afterwards;
  confirm the backslash is not escaped a second time.
* NOTE(review): the subject mentions /* */ comment skipping, but no hunk here
  changes block-comment handling; presumably it lives elsewhere - confirm.
* tools/selfhost_parser_json_smoke.sh differs from the first submission:
  - creates tmp/ before writing into it;
  - checks the first JSON line with bash builtins instead of `head | rg -q`
    (no ripgrep dependency, no pipeline under `pipefail`);
  - reports a PyVM failure as a warning instead of discarding it; the step
    stays non-fatal and the script still exits 0.
  The `index` hash of the new file has not been recomputed for this content.

 apps/selfhost-compiler/boxes/parser_box.nyash | 16 ++++++++++++---
 tools/selfhost_parser_json_smoke.sh           | 33 +++++++++++++++++++++++++++++++++
 2 files changed, 46 insertions(+), 3 deletions(-)
 create mode 100644 tools/selfhost_parser_json_smoke.sh

diff --git a/apps/selfhost-compiler/boxes/parser_box.nyash b/apps/selfhost-compiler/boxes/parser_box.nyash
index eaf1099e..36437430 100644
--- a/apps/selfhost-compiler/boxes/parser_box.nyash
+++ b/apps/selfhost-compiler/boxes/parser_box.nyash
@@ -69,6 +69,17 @@ box ParserBox {
         return s.substring(i, j)
     }
 
+    // Enhanced whitespace skipper (inline lines): used by line-based using extractor
+    trim_ws_and_line_comments(s) {
+        local i = 0
+        local n = s.length()
+        // leading spaces/tabs
+        loop(i < n && (s.substring(i,i+1) == " " || s.substring(i,i+1) == "\t")) { i = i + 1 }
+        // strip line comments
+        if i + 1 < n && s.substring(i, i+2) == "//" { return "" }
+        return s.substring(i, n)
+    }
+
     // keyword match at position i with word-boundary (next char not [A-Za-z0-9_])
     starts_with_kw(src, i, kw) {
         if me.starts_with(src, i, kw) == 0 { return 0 }
@@ -114,8 +125,7 @@ box ParserBox {
             if ch == "\"" { j = j + 1 me.gpos_set(j) return out }
             if ch == "\\" && j + 1 < n {
                 local nx = src.substring(j+1, j+2)
-                if nx == "\"" { out = out + "\"" } else { if nx == "\\" { out = out + "\\" } else { out = out + nx } }
-                j = j + 2
+                if nx == "\"" { out = out + "\"" j = j + 2 } else { if nx == "\\" { out = out + "\\" j = j + 2 } else { if nx == "n" { out = out + "\n" j = j + 2 } else { if nx == "r" { out = out + "\r" j = j + 2 } else { if nx == "t" { out = out + "\t" j = j + 2 } else { if nx == "u" && j + 5 < n { out = out + src.substring(j, j+6) j = j + 6 } else { out = out + nx j = j + 2 } } } } } }
             } else { out = out + ch j = j + 1 }
         }
         me.gpos_set(j)
@@ -239,7 +249,7 @@ box ParserBox {
     // using metadata omitted in Stage‑1
     parse_number2(src, i) { local n = src.length() local j = i local cont = 1 local guard = 0 local max = 100000 loop(cont == 1) { if guard > max { cont = 0 } else { guard = guard + 1 if j < n { if me.is_digit(src.substring(j, j+1)) { j = j + 1 } else { cont = 0 } } else { cont = 0 } } } local s = src.substring(i, j) me.gpos_set(j) return "{\"type\":\"Int\",\"value\":" + s + "}" }
 
-    parse_string2(src, i) { local n = src.length() local j = i + 1 local out = "" local guard = 0 local max = 200000 loop(j < n) { if guard > max { break } guard = guard + 1 local ch = src.substring(j, j+1) if ch == "\"" { j = j + 1 me.gpos_set(j) return "{\"type\":\"Str\",\"value\":\"" + me.esc_json(out) + "\"}" } if ch == "\\" && j + 1 < n { local nx = src.substring(j+1, j+2) if nx == "\"" { out = out + "\"" } else { if nx == "\\" { out = out + "\\" } else { out = out + nx } } j = j + 2 } else { out = out + ch j = j + 1 } } me.gpos_set(j) return "{\"type\":\"Str\",\"value\":\"" + me.esc_json(out) + "\"}" }
+    parse_string2(src, i) { local n = src.length() local j = i + 1 local out = "" local guard = 0 local max = 200000 loop(j < n) { if guard > max { break } guard = guard + 1 local ch = src.substring(j, j+1) if ch == "\"" { j = j + 1 me.gpos_set(j) return "{\"type\":\"Str\",\"value\":\"" + me.esc_json(out) + "\"}" } if ch == "\\" && j + 1 < n { local nx = src.substring(j+1, j+2) if nx == "\"" { out = out + "\"" j = j + 2 } else { if nx == "\\" { out = out + "\\" j = j + 2 } else { if nx == "n" { out = out + "\n" j = j + 2 } else { if nx == "r" { out = out + "\r" j = j + 2 } else { if nx == "t" { out = out + "\t" j = j + 2 } else { if nx == "u" && j + 5 < n { out = out + src.substring(j, j+6) j = j + 6 } else { out = out + nx j = j + 2 } } } } } } } else { out = out + ch j = j + 1 } } me.gpos_set(j) return "{\"type\":\"Str\",\"value\":\"" + me.esc_json(out) + "\"}" }
 
     parse_factor2(src, i) {
         local j = me.skip_ws(src, i)
diff --git a/tools/selfhost_parser_json_smoke.sh b/tools/selfhost_parser_json_smoke.sh
new file mode 100644
index 00000000..2fe6ce66
--- /dev/null
+++ b/tools/selfhost_parser_json_smoke.sh
@@ -0,0 +1,33 @@
+#!/usr/bin/env bash
+# Smoke test: build the selfhost compiler, compile a one-line sample written
+# with /* */ and // comments, and check that the first JSON line is a Program.
+set -euo pipefail
+
+if [[ "${NYASH_CLI_VERBOSE:-0}" == "1" ]]; then set -x; fi
+
+ROOT=$(CDPATH= cd -- "$(dirname -- "$0")/.." && pwd)
+cd "$ROOT"
+
+echo "[1/3] Build selfhost compiler EXE (no pack) ..." >&2
+timeout -s KILL 10m bash tools/build_compiler_exe.sh --no-pack -o nyc >/dev/null
+
+echo "[2/3] Run compiler on sample source ..." >&2
+# The redirects below fail under `set -e` if tmp/ is missing, so create it first.
+mkdir -p tmp
+echo '/*c*/ return 1+2*3 // ok' > tmp/selfhost_sample.nyash
+./nyc tmp/selfhost_sample.nyash > tmp/selfhost_sample.json
+# Check the first line with bash builtins only: no ripgrep dependency and no
+# pipeline whose writer could be cut short and trip `pipefail`.
+first_line=""
+IFS= read -r first_line < tmp/selfhost_sample.json || true
+if [[ "$first_line" != *'"kind":"Program"'* ]]; then
+  echo "error: not a Program" >&2
+  exit 2
+fi
+
+echo "[3/3] Execute via PyVM harness ..." >&2
+# Best-effort step: a failure stays non-fatal, but it is reported, not hidden.
+if ! NYASH_VM_USE_PY=1 ./target/release/nyash --backend vm tmp/selfhost_sample.json --json-file >/dev/null 2>&1; then
+  echo "warning: PyVM execution failed (ignored)" >&2
+fi
+echo "✅ selfhost_parser_json_smoke OK" >&2