Files
hakorune/tools/hako_parser/tokenizer.hako
nyash-codex fa3091061d trace: add execution route visibility + debug passthrough; phase2170 canaries; docs
- Add HAKO_TRACE_EXECUTION to trace executor route
  - Rust hv1_inline: stderr [trace] executor: hv1_inline (rust)
  - Hakovm dispatcher: stdout [trace] executor: hakovm (hako)
  - test_runner: trace lines for hv1_inline/core/hakovm routes
- Add HAKO_VERIFY_SHOW_LOGS and HAKO_DEBUG=1 (enables both)
  - verify_v1_inline_file() log passthrough with numeric rc extraction
  - test_runner exports via HAKO_DEBUG
- Canary expansion under phase2170 (state spec)
  - Array: push×5/10 → size, len/length alias, per-recv/global, flow across blocks
  - Map: set dup-key non-increment, value_state get/has
  - run_all.sh: unify, remove SKIPs; all PASS
- Docs
  - ENV_VARS.md: add Debug/Tracing toggles and examples
  - PLAN.md/CURRENT_TASK.md: mark 21.7 green, add Quickstart lines

All changes gated by env vars; default behavior unchanged.
2025-11-08 23:45:29 +09:00

137 lines
5.3 KiB
Plaintext

// tools/hako_parser/tokenizer.hako - HakoTokenizerBox (Stage-3 aware tokenizer, MVP)
// Produces tokens with type, lexeme, line, col. Handles strings (escapes), numbers,
// identifiers, and punctuation. Keywords are normalized to upper-case kinds.
// no external deps (self-contained tokenizer)
static box HakoTokenizerBox {
// Tokenizer for Hako source text (Stage-3 aware, MVP).
// Each token is a MapBox with keys: type, lexeme, line, col (1-based).
// Recognized: // and /* */ comments, "..." strings with \" \\ \n \t escapes,
// unsigned integer literals, identifiers/keywords, single-char punctuation.
// Unknown characters are emitted as PUNC so the parser can skip gracefully.

// tokenize(text) -> ArrayBox of token MapBoxes.
// Returns an empty array when text is null. Tolerant at EOF: an unterminated
// string or block comment is consumed to the end without an error token.
tokenize(text) {
local out = new ArrayBox()
if text == null { return out }
local n = text.length()
local i = 0
local line = 1
local col = 1
while i < n {
local ch = text.substring(i,i+1)
// whitespace and newlines (a tab counts as one column for MVP)
if ch == " " || ch == "\t" { i = i + 1; col = col + 1; continue }
if ch == "\r" { i = i + 1; continue }
if ch == "\n" { i = i + 1; line = line + 1; col = 1; continue }
// line comment // ... (consume until EOL; the newline itself is handled above)
if ch == "/" && i+1 < n && text.substring(i+1,i+2) == "/" {
i = i + 2; col = col + 2
while i < n {
local c2 = text.substring(i,i+1)
if c2 == "\n" { break }
i = i + 1; col = col + 1
}
continue
}
// block comment /* ... */ (consume until closing, track newlines; EOF-tolerant)
if ch == "/" && i+1 < n && text.substring(i+1,i+2) == "*" {
i = i + 2; col = col + 2
while i < n {
local c2 = text.substring(i,i+1)
if c2 == "*" && i+1 < n && text.substring(i+1,i+2) == "/" { i = i + 2; col = col + 2; break }
if c2 == "\n" { i = i + 1; line = line + 1; col = 1; continue }
i = i + 1; col = col + 1
}
continue
}
// string literal "..." with escapes \" \\ \n \t; an unknown escape keeps the char as-is.
// A trailing backslash at EOF is dropped (MVP tolerance).
if ch == '"' {
local start_line = line
local start_col = col
local buf = ""
i = i + 1; col = col + 1
while i < n {
local c3 = text.substring(i,i+1)
if c3 == '"' { i = i + 1; col = col + 1; break }
if c3 == "\\" {
if i+1 < n {
local esc = text.substring(i+1,i+2)
if esc == '"' { buf = buf.concat('"') }
else if esc == "\\" { buf = buf.concat("\\") }
else if esc == "n" { buf = buf.concat("\n") }
else if esc == "t" { buf = buf.concat("\t") }
else { buf = buf.concat(esc) }
i = i + 2; col = col + 2
continue
} else { i = i + 1; col = col + 1; break }
}
// bug fix: a raw newline inside a string stays in the lexeme but must also
// advance the line counter so positions of later tokens remain correct
if c3 == "\n" { buf = buf.concat(c3); i = i + 1; line = line + 1; col = 1; continue }
buf = buf.concat(c3)
i = i + 1; col = col + 1
}
// report the position where the string STARTED, not where it ended
local tok = new MapBox(); tok.set("type","STRING"); tok.set("lexeme", buf); tok.set("line", start_line); tok.set("col", start_col)
out.push(tok); continue
}
// number literal (unsigned integer only for MVP; no sign, no decimal point)
if ch >= "0" && ch <= "9" {
local start = i; local start_col = col
while i < n {
local c4 = text.substring(i,i+1)
if !(c4 >= "0" && c4 <= "9") { break }
i = i + 1; col = col + 1
}
local lex = text.substring(start, i)
local tok = new MapBox(); tok.set("type","NUMBER"); tok.set("lexeme", lex); tok.set("line", line); tok.set("col", start_col)
out.push(tok); continue
}
// identifier or keyword (keywords get upper-case type kinds via _kw_kind)
if me._is_ident_start(ch) == 1 {
local start = i; local start_col = col
while i < n {
local c5 = text.substring(i,i+1)
if me._is_ident_char(c5) == 0 { break }
i = i + 1; col = col + 1
}
local lex = text.substring(start, i)
local kind = me._kw_kind(lex)
local tok = new MapBox(); tok.set("type", kind); tok.set("lexeme", lex); tok.set("line", line); tok.set("col", start_col)
out.push(tok); continue
}
// punctuation / symbols we care about
local sym_kind = me._sym_kind(ch)
if sym_kind != null {
local tok = new MapBox(); tok.set("type", sym_kind); tok.set("lexeme", ch); tok.set("line", line); tok.set("col", col)
out.push(tok); i = i + 1; col = col + 1; continue
}
// unknown char -> emit as PUNC so parser can skip gracefully
local tok = new MapBox(); tok.set("type","PUNC"); tok.set("lexeme", ch); tok.set("line", line); tok.set("col", col)
out.push(tok); i = i + 1; col = col + 1
}
return out
}
// 1 if c can begin an identifier (letter or underscore), else 0
_is_ident_start(c) { if c=="_" {return 1}; if c>="A"&&c<="Z" {return 1}; if c>="a"&&c<="z" {return 1}; return 0 }
// 1 if c can continue an identifier (ident-start or digit), else 0
_is_ident_char(c) { if me._is_ident_start(c)==1 { return 1 }; if c>="0"&&c<="9" { return 1 }; return 0 }
// Map a lexeme to its keyword token type, or "IDENT" for plain identifiers.
_kw_kind(lex) {
if lex == "using" { return "USING" }
if lex == "as" { return "AS" }
if lex == "static" { return "STATIC" }
if lex == "box" { return "BOX" }
if lex == "method" { return "METHOD" }
if lex == "include" { return "INCLUDE" }
if lex == "while" { return "WHILE" } // Stage-3 tokens (MVP)
if lex == "for" { return "FOR" }
if lex == "in" { return "IN" }
return "IDENT"
}
// Map a single char to its punctuation token type, or null if not tracked.
_sym_kind(c) {
if c == "{" { return "LBRACE" }
if c == "}" { return "RBRACE" }
if c == "(" { return "LPAREN" }
if c == ")" { return "RPAREN" }
if c == "," { return "COMMA" }
if c == "." { return "DOT" }
if c == ":" { return "COLON" }
if c == "=" { return "EQ" }
if c == ";" { return "SEMI" }
return null
}
}
// Entry-point stub: this file is used as a library by the parser; running it
// directly does nothing and exits successfully.
static box HakoTokenizerMain {
method main(args) {
return 0
}
}