selfhost(pyvm): MiniVmPrints – prefer JSON route early-return (ok==1) to avoid fallback loops; keep default behavior unchanged elsewhere

Selfhosting Dev
2025-09-22 07:54:25 +09:00
parent 27568eb4a6
commit 8e4cadd349
348 changed files with 9981 additions and 30074 deletions
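
The MiniVmPrints diff itself is not among the files excerpted below. As a rough illustration of the shape the summary describes (try the JSON route first, treat ok == 1 as fully handled, and early-return instead of re-entering the fallback), here is a minimal Rust sketch; every name except MiniVmPrints is hypothetical:

struct JsonResp {
    ok: i32,
    output: String,
}

// Hypothetical JSON route: Some(..) when it could interpret the payload at all.
fn try_json_route(payload: &str) -> Option<JsonResp> {
    payload
        .starts_with('{')
        .then(|| JsonResp { ok: 1, output: String::from("printed") })
}

// Hypothetical legacy path, kept as the default behavior.
fn legacy_fallback(payload: &str) -> String {
    format!("legacy: {}", payload)
}

fn run_prints(payload: &str) -> String {
    // Prefer the JSON route; ok == 1 means it fully handled the request,
    // so return early rather than looping back into the fallback.
    if let Some(resp) = try_json_route(payload) {
        if resp.ok == 1 {
            return resp.output;
        }
    }
    legacy_fallback(payload)
}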

32
src/tokenizer/cursor.rs Normal file

@@ -0,0 +1,32 @@
use super::NyashTokenizer;
impl NyashTokenizer {
/// Get the current character
pub(crate) fn current_char(&self) -> Option<char> {
self.input.get(self.position).copied()
}
/// Peek at the next character
pub(crate) fn peek_char(&self) -> Option<char> {
self.input.get(self.position + 1).copied()
}
/// Advance one character (also updating line/column)
pub(crate) fn advance(&mut self) {
if let Some(c) = self.current_char() {
self.position += 1;
if c == '\n' {
self.line += 1;
self.column = 1;
} else {
self.column += 1;
}
}
}
/// Whether the end of input has been reached
pub(crate) fn is_at_end(&self) -> bool {
self.position >= self.input.len()
}
}
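
These cursor primitives are easy to sanity-check. Below is a sketch of a unit test (not part of this commit) that one could drop into cursor.rs, exercising line/column tracking across a newline:

#[cfg(test)]
mod tests {
    use super::NyashTokenizer;

    #[test]
    fn advance_updates_line_and_column() {
        let mut t = NyashTokenizer::new("a\nb");
        assert_eq!(t.current_char(), Some('a'));
        t.advance(); // past 'a': still line 1, now column 2
        assert_eq!((t.line, t.column), (1, 2));
        t.advance(); // past '\n': line 2, column resets to 1
        assert_eq!((t.line, t.column), (2, 1));
        assert_eq!(t.current_char(), Some('b'));
        assert!(!t.is_at_end());
    }
}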

256
src/tokenizer/engine.rs Normal file

@@ -0,0 +1,256 @@
use super::{NyashTokenizer, Token, TokenType, TokenizeError};
impl NyashTokenizer {
#[inline]
pub(crate) fn allow_semicolon() -> bool {
match std::env::var("NYASH_PARSER_ALLOW_SEMICOLON").ok() {
Some(v) => {
let lv = v.to_ascii_lowercase();
lv == "1" || lv == "true" || lv == "on"
}
None => false,
}
}
#[inline]
pub(crate) fn strict_12_7() -> bool {
std::env::var("NYASH_STRICT_12_7").ok().as_deref() == Some("1")
}
/// Create a new tokenizer
pub fn new(input: impl Into<String>) -> Self {
let input_string = input.into();
Self {
input: input_string.chars().collect(),
position: 0,
line: 1,
column: 1,
}
}
/// Run a full tokenize pass
pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizeError> {
let mut tokens = Vec::new();
while !self.is_at_end() {
// Skip whitespace and comments
self.skip_whitespace();
// Also skip runs of consecutive block and line comments
loop {
// block comment: /* ... */
if self.current_char() == Some('/') && self.peek_char() == Some('*') {
self.skip_block_comment()?;
self.skip_whitespace();
continue;
}
// line comments: // ... or # ...
if (self.current_char() == Some('/') && self.peek_char() == Some('/'))
|| self.current_char() == Some('#')
{
self.skip_line_comment();
self.skip_whitespace();
continue;
}
break;
}
if self.is_at_end() {
break;
}
// Read the next token
let token = self.tokenize_next()?;
if std::env::var("NYASH_TOK_TRACE").ok().as_deref() == Some("1") {
eprintln!("[tok] {:?}", token.token_type);
}
tokens.push(token);
}
// Append the EOF token
tokens.push(Token::new(TokenType::EOF, self.line, self.column));
Ok(tokens)
}
/// Read the next single token
fn tokenize_next(&mut self) -> Result<Token, TokenizeError> {
let start_line = self.line;
let start_column = self.column;
match self.current_char() {
// Optional statement separator ';' (gated)
Some(';') if Self::allow_semicolon() => {
self.advance();
return Ok(Token::new(TokenType::SEMICOLON, start_line, start_column));
}
// Block comment should have been skipped by tokenize() pre-loop, but be defensive here
Some('/') if self.peek_char() == Some('*') => {
self.skip_block_comment()?;
// After skipping, restart tokenization for next token
return self.tokenize_next();
}
// Match operators of two or more characters first, longest match wins
Some('|') if self.peek_char() == Some('|') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::OR, start_line, start_column));
}
Some('&') if self.peek_char() == Some('&') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::AND, start_line, start_column));
}
Some('|') if self.peek_char() == Some('>') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::PipeForward, start_line, start_column));
}
Some('?') if self.peek_char() == Some('.') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::QmarkDot, start_line, start_column));
}
Some('?') if self.peek_char() == Some('?') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::QmarkQmark, start_line, start_column));
}
Some('?') => {
self.advance();
return Ok(Token::new(TokenType::QUESTION, start_line, start_column));
}
Some('+') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::PlusAssign, start_line, start_column));
}
Some('-') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::MinusAssign, start_line, start_column));
}
Some('*') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::MulAssign, start_line, start_column));
}
Some('/') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::DivAssign, start_line, start_column));
}
Some('.') if self.peek_char() == Some('.') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::RANGE, start_line, start_column));
}
Some('"') => {
let string_value = self.read_string()?;
Ok(Token::new(
TokenType::STRING(string_value),
start_line,
start_column,
))
}
Some(c) if c.is_ascii_digit() => {
let token_type = self.read_numeric_literal()?;
Ok(Token::new(token_type, start_line, start_column))
}
Some(c) if c.is_alphabetic() || c == '_' => {
let token_type = self.read_keyword_or_identifier();
Ok(Token::new(token_type, start_line, start_column))
}
Some('/') if self.peek_char() == Some('/') => {
self.skip_line_comment();
self.skip_whitespace(); // also skip whitespace after the comment
return self.tokenize_next();
}
Some('#') => {
self.skip_line_comment();
self.skip_whitespace(); // also skip whitespace after the comment
return self.tokenize_next();
}
Some('>') if self.peek_char() == Some('>') && !Self::strict_12_7() => {
self.advance();
self.advance();
Ok(Token::new(TokenType::ShiftRight, start_line, start_column))
}
Some(':') if self.peek_char() == Some(':') => {
self.advance();
self.advance();
Ok(Token::new(TokenType::DoubleColon, start_line, start_column))
}
Some(':') => {
self.advance();
Ok(Token::new(TokenType::COLON, start_line, start_column))
}
Some('=') if self.peek_char() == Some('>') => {
self.advance();
self.advance();
Ok(Token::new(TokenType::FatArrow, start_line, start_column))
}
Some('=') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
Ok(Token::new(TokenType::EQUALS, start_line, start_column))
}
Some('!') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
Ok(Token::new(TokenType::NotEquals, start_line, start_column))
}
Some('<') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
Ok(Token::new(TokenType::LessEquals, start_line, start_column))
}
Some('>') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
Ok(Token::new(TokenType::GreaterEquals, start_line, start_column))
}
Some(c) => {
if let Some(token) = self.single_char_token(c) {
self.advance();
Ok(Token::new(token, start_line, start_column))
} else {
Err(TokenizeError::UnexpectedCharacter {
char: c,
line: start_line,
column: start_column,
})
}
}
None => Ok(Token::new(TokenType::EOF, start_line, start_column)),
}
}
// Map of single-character tokens (longest-match cases are handled by the caller)
fn single_char_token(&self, c: char) -> Option<TokenType> {
// '?' and ':' are already dispatched above; only pure single characters are handled here.
match c {
'!' => Some(TokenType::NOT),
'<' => Some(TokenType::LESS),
'>' => Some(TokenType::GREATER),
'&' => Some(TokenType::BitAnd),
'|' => Some(TokenType::BitOr),
'^' => Some(TokenType::BitXor),
'=' => Some(TokenType::ASSIGN),
'+' => Some(TokenType::PLUS),
'-' => Some(TokenType::MINUS),
'*' => Some(TokenType::MULTIPLY),
'/' => Some(TokenType::DIVIDE),
'%' => Some(TokenType::MODULO),
'.' => Some(TokenType::DOT),
'(' => Some(TokenType::LPAREN),
')' => Some(TokenType::RPAREN),
'[' => Some(TokenType::LBRACK),
']' => Some(TokenType::RBRACK),
'{' => Some(TokenType::LBRACE),
'}' => Some(TokenType::RBRACE),
',' => Some(TokenType::COMMA),
'\n' => Some(TokenType::NEWLINE),
_ => None,
}
}
}
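
Taken end to end, the arm ordering above makes multi-character operators win over their single-character prefixes. A sketch of a test (not part of this commit) pinning that down:

#[cfg(test)]
mod tests {
    use super::{NyashTokenizer, TokenType};

    #[test]
    fn longest_match_operators() {
        let mut t = NyashTokenizer::new("x ?? y |> f");
        let tokens = t.tokenize().expect("tokenize");
        assert!(matches!(tokens[0].token_type, TokenType::IDENTIFIER(_)));
        assert!(matches!(tokens[1].token_type, TokenType::QmarkQmark));
        assert!(matches!(tokens[2].token_type, TokenType::IDENTIFIER(_)));
        assert!(matches!(tokens[3].token_type, TokenType::PipeForward));
        assert!(matches!(tokens[4].token_type, TokenType::IDENTIFIER(_)));
        assert!(matches!(tokens[5].token_type, TokenType::EOF));
    }
}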

134
src/tokenizer/kinds.rs Normal file

@@ -0,0 +1,134 @@
use thiserror::Error;
/// Token kinds
#[derive(Debug, Clone, PartialEq)]
pub enum TokenType {
// Literals
STRING(String),
NUMBER(i64),
FLOAT(f64),
TRUE,
FALSE,
NULL,
// Keywords
BOX,
GLOBAL,
SINGLETON,
NEW,
MATCH,
IF,
ELSE,
LOOP,
BREAK,
CONTINUE,
RETURN,
FUNCTION,
FN,
PRINT,
THIS,
ME,
INIT,
PACK,
BIRTH,
NOWAIT,
AWAIT,
INTERFACE,
COLON,
INCLUDE,
TRY,
CATCH,
CLEANUP,
THROW,
LOCAL,
STATIC,
OUTBOX,
NOT,
OVERRIDE,
FROM,
WEAK,
USING,
IMPORT,
// Operators
ShiftLeft,
ShiftRight,
BitAnd,
BitOr,
BitXor,
FatArrow,
EQUALS,
NotEquals,
LessEquals,
GreaterEquals,
AND,
OR,
// Two-character operators (longest match)
PipeForward,
QmarkDot,
QmarkQmark,
PlusAssign,
MinusAssign,
MulAssign,
DivAssign,
RANGE,
LESS,
GREATER,
ASSIGN,
PLUS,
MINUS,
MULTIPLY,
DIVIDE,
MODULO,
// Symbols
DOT,
DoubleColon,
LPAREN,
RPAREN,
LBRACK,
RBRACK,
LBRACE,
RBRACE,
COMMA,
QUESTION,
NEWLINE,
SEMICOLON, // optional statement separator
// Identifiers
IDENTIFIER(String),
// Special
EOF,
}
/// A token, with position information
#[derive(Debug, Clone)]
pub struct Token {
pub token_type: TokenType,
pub line: usize,
pub column: usize,
}
impl Token {
pub fn new(token_type: TokenType, line: usize, column: usize) -> Self {
Self { token_type, line, column }
}
}
/// Tokenization errors
#[derive(Error, Debug)]
pub enum TokenizeError {
#[error("Unexpected character '{char}' at line {line}, column {column}")]
UnexpectedCharacter { char: char, line: usize, column: usize },
#[error("Unterminated string literal at line {line}")]
UnterminatedString { line: usize },
#[error("Invalid number format at line {line}")]
InvalidNumber { line: usize },
#[error("Comment not closed at line {line}")]
UnterminatedComment { line: usize },
}
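
Because TokenizeError derives thiserror::Error, each #[error(...)] string doubles as its Display output. A quick sketch (not part of this commit):

#[cfg(test)]
mod tests {
    use super::TokenizeError;

    #[test]
    fn error_messages_render() {
        let err = TokenizeError::UnexpectedCharacter { char: '@', line: 3, column: 7 };
        assert_eq!(err.to_string(), "Unexpected character '@' at line 3, column 7");
        let err = TokenizeError::UnterminatedString { line: 9 };
        assert_eq!(err.to_string(), "Unterminated string literal at line 9");
    }
}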

102
src/tokenizer/lex_ident.rs Normal file

@@ -0,0 +1,102 @@
use super::{NyashTokenizer, TokenType};
use crate::grammar::engine;
impl NyashTokenizer {
/// Read a keyword or identifier
pub(crate) fn read_keyword_or_identifier(&mut self) -> TokenType {
let mut identifier = String::new();
while let Some(c) = self.current_char() {
if c.is_alphanumeric() || c == '_' {
identifier.push(c);
self.advance();
} else {
break;
}
}
// Keyword check
let mut tok = match identifier.as_str() {
"box" => TokenType::BOX,
"global" => TokenType::GLOBAL,
"singleton" => TokenType::SINGLETON,
"new" => TokenType::NEW,
"match" => TokenType::MATCH,
"if" => TokenType::IF,
"else" => TokenType::ELSE,
"loop" => TokenType::LOOP,
"break" => TokenType::BREAK,
"continue" => TokenType::CONTINUE,
"return" => TokenType::RETURN,
"function" => TokenType::FUNCTION,
"fn" => TokenType::FN,
"print" => TokenType::PRINT,
"this" => TokenType::THIS,
"me" => TokenType::ME,
"init" => TokenType::INIT,
"pack" => TokenType::PACK,
"birth" => TokenType::BIRTH,
"nowait" => TokenType::NOWAIT,
"await" => TokenType::AWAIT,
"interface" => TokenType::INTERFACE,
"include" => TokenType::INCLUDE,
"import" => TokenType::IMPORT,
"try" => TokenType::TRY,
"catch" => TokenType::CATCH,
"cleanup" => TokenType::CLEANUP,
"throw" => TokenType::THROW,
"local" => TokenType::LOCAL,
"static" => TokenType::STATIC,
"outbox" => TokenType::OUTBOX,
"not" => TokenType::NOT,
"override" => TokenType::OVERRIDE,
"from" => TokenType::FROM,
"weak" => TokenType::WEAK,
"using" => TokenType::USING,
"and" => TokenType::AND,
"or" => TokenType::OR,
"true" => TokenType::TRUE,
"false" => TokenType::FALSE,
"null" => TokenType::NULL,
_ => TokenType::IDENTIFIER(identifier.clone()),
};
// 12.7 strict mode: fall back to IDENTIFIER for extended keywords
if Self::strict_12_7() {
let is_extended = matches!(
tok,
TokenType::INTERFACE
| TokenType::USING
| TokenType::INCLUDE
| TokenType::OUTBOX
| TokenType::NOWAIT
| TokenType::OVERRIDE
| TokenType::WEAK
| TokenType::PACK
);
if is_extended {
tok = TokenType::IDENTIFIER(identifier.clone());
}
}
// Diff check against the unified grammar engine (behavior is unchanged)
if std::env::var("NYASH_GRAMMAR_DIFF").ok().as_deref() == Some("1") {
if let Some(kw) = engine::get().is_keyword_str(&identifier) {
if let TokenType::IDENTIFIER(_) = tok {
eprintln!(
"[GRAMMAR-DIFF] tokenizer=IDENT, grammar=KEYWORD({}) word='{}'",
kw, identifier
);
}
} else if !matches!(tok, TokenType::IDENTIFIER(_)) {
eprintln!(
"[GRAMMAR-DIFF] tokenizer=KEYWORD, grammar=IDENT word='{}'",
identifier
);
}
}
tok
}
}
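
With no environment overrides, extended words such as interface lex as keywords; setting NYASH_STRICT_12_7=1 makes the strict branch above demote them to identifiers. A sketch (not part of this commit) asserting only the default behavior, since env-gated branches are awkward to pin in tests:

#[cfg(test)]
mod tests {
    use super::{NyashTokenizer, TokenType};

    #[test]
    fn keywords_versus_identifiers() {
        // Assumes NYASH_STRICT_12_7 is unset when the test runs.
        let mut t = NyashTokenizer::new("interface");
        assert!(matches!(t.read_keyword_or_identifier(), TokenType::INTERFACE));
        // Longer words that merely contain a keyword stay identifiers.
        let mut t = NyashTokenizer::new("interfaces");
        assert!(matches!(t.read_keyword_or_identifier(), TokenType::IDENTIFIER(_)));
    }
}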

43
src/tokenizer/lex_number.rs Normal file

@@ -0,0 +1,43 @@
use super::{NyashTokenizer, TokenType, TokenizeError};
impl NyashTokenizer {
/// Read a numeric literal (integer or floating point)
pub(crate) fn read_numeric_literal(&mut self) -> Result<TokenType, TokenizeError> {
let start_line = self.line;
let mut number_str = String::new();
let mut has_dot = false;
// Read digits, accepting at most one decimal point
while let Some(c) = self.current_char() {
if c.is_ascii_digit() {
number_str.push(c);
self.advance();
} else if c == '.'
&& !has_dot
&& self.peek_char().map_or(false, |ch| ch.is_ascii_digit())
{
// Accept the dot only when a digit follows it
has_dot = true;
number_str.push(c);
self.advance();
} else {
break;
}
}
if has_dot {
// Parse as a floating-point number
number_str
.parse::<f64>()
.map(TokenType::FLOAT)
.map_err(|_| TokenizeError::InvalidNumber { line: start_line })
} else {
// Parse as an integer
number_str
.parse::<i64>()
.map(TokenType::NUMBER)
.map_err(|_| TokenizeError::InvalidNumber { line: start_line })
}
}
}
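
The digit-after-dot guard is what keeps range expressions intact: in 1..3 the first dot is not followed by a digit, so 1 closes as an integer and .. lexes as RANGE, while 2.5 still becomes a float. A sketch (not part of this commit):

#[cfg(test)]
mod tests {
    use super::{NyashTokenizer, TokenType};

    #[test]
    fn ranges_and_floats() {
        let mut t = NyashTokenizer::new("1..3 2.5");
        let tokens = t.tokenize().expect("tokenize");
        assert!(matches!(tokens[0].token_type, TokenType::NUMBER(1)));
        assert!(matches!(tokens[1].token_type, TokenType::RANGE));
        assert!(matches!(tokens[2].token_type, TokenType::NUMBER(3)));
        assert!(matches!(tokens[3].token_type, TokenType::FLOAT(f) if f == 2.5));
        assert!(matches!(tokens[4].token_type, TokenType::EOF));
    }
}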

42
src/tokenizer/lex_string.rs Normal file

@@ -0,0 +1,42 @@
use super::{NyashTokenizer, TokenizeError};
impl NyashTokenizer {
/// Read a string literal
pub(crate) fn read_string(&mut self) -> Result<String, TokenizeError> {
let start_line = self.line;
self.advance(); // skip the opening '"'
let mut string_value = String::new();
while let Some(c) = self.current_char() {
if c == '"' {
self.advance(); // skip the closing '"'
return Ok(string_value);
}
// Handle escape sequences
if c == '\\' {
self.advance();
match self.current_char() {
Some('n') => string_value.push('\n'),
Some('t') => string_value.push('\t'),
Some('r') => string_value.push('\r'),
Some('\\') => string_value.push('\\'),
Some('"') => string_value.push('"'),
Some(c) => {
string_value.push('\\');
string_value.push(c);
}
None => break,
}
} else {
string_value.push(c);
}
self.advance();
}
Err(TokenizeError::UnterminatedString { line: start_line })
}
}
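
Known escapes are decoded; unknown ones keep their backslash verbatim. A sketch (not part of this commit):

#[cfg(test)]
mod tests {
    use super::NyashTokenizer;

    #[test]
    fn escapes_in_strings() {
        // \n is decoded; the unknown \q is kept as backslash + 'q'.
        let mut t = NyashTokenizer::new(r#""a\nb\q""#);
        assert_eq!(t.read_string().expect("string"), "a\nb\\q");
    }
}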

24
src/tokenizer/mod.rs Normal file

@@ -0,0 +1,24 @@
/*!
* Nyash Tokenizer — split modules (kinds/cursor/whitespace/lexers/engine)
*/
mod kinds;
mod cursor;
mod whitespace;
mod lex_string;
mod lex_number;
mod lex_ident;
mod engine;
pub use kinds::{Token, TokenType, TokenizeError};
/// The Nyash tokenizer
pub struct NyashTokenizer {
pub(crate) input: Vec<char>,
pub(crate) position: usize,
pub(crate) line: usize,
pub(crate) column: usize,
}
// Public API and core logic are implemented in submodules via impl NyashTokenizer

43
src/tokenizer/whitespace.rs Normal file

@@ -0,0 +1,43 @@
use super::{NyashTokenizer, TokenizeError};
impl NyashTokenizer {
/// Skip a line comment
pub(crate) fn skip_line_comment(&mut self) {
while let Some(c) = self.current_char() {
if c == '\n' {
break; // leave the newline unconsumed
}
self.advance();
}
}
/// Skip a block comment: /* ... */ (nesting not supported)
pub(crate) fn skip_block_comment(&mut self) -> Result<(), TokenizeError> {
// Assume current position is at '/' and next is '*'
self.advance(); // '/'
self.advance(); // '*'
while let Some(c) = self.current_char() {
// detect end '*/'
if c == '*' && self.peek_char() == Some('/') {
self.advance(); // '*'
self.advance(); // '/'
return Ok(());
}
self.advance();
}
// EOF reached without closing */
Err(TokenizeError::UnterminatedComment { line: self.line })
}
/// Skip whitespace (newlines excluded; they are emitted as NEWLINE tokens)
pub(crate) fn skip_whitespace(&mut self) {
while let Some(c) = self.current_char() {
if c.is_whitespace() && c != '\n' {
self.advance();
} else {
break;
}
}
}
}
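
An unclosed block comment surfaces as UnterminatedComment from tokenize(), since the pre-loop in engine.rs calls skip_block_comment with the ? operator. A sketch (not part of this commit):

#[cfg(test)]
mod tests {
    use super::{NyashTokenizer, TokenizeError};

    #[test]
    fn unterminated_block_comment_errors() {
        let mut t = NyashTokenizer::new("/* never closed");
        assert!(matches!(t.tokenize(), Err(TokenizeError::UnterminatedComment { .. })));
    }
}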