selfhost(pyvm): MiniVmPrints – prefer JSON route early-return (ok==1) to avoid fallback loops; keep default behavior unchanged elsewhere

Selfhosting Dev
2025-09-22 07:54:25 +09:00
parent 27568eb4a6
commit 8e4cadd349
348 changed files with 9981 additions and 30074 deletions
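
The MiniVmPrints diff itself is not among the files excerpted below. As a rough illustration of the shape the summary describes (try the JSON route first, treat ok == 1 as fully handled, and early-return instead of re-entering the fallback), here is a minimal Rust sketch; every name except MiniVmPrints is hypothetical:

struct JsonResp {
    ok: i32,
    output: String,
}

// Hypothetical JSON route: Some(..) when it could interpret the payload at all.
fn try_json_route(payload: &str) -> Option<JsonResp> {
    payload
        .starts_with('{')
        .then(|| JsonResp { ok: 1, output: String::from("printed") })
}

// Hypothetical legacy path, kept as the default behavior.
fn legacy_fallback(payload: &str) -> String {
    format!("legacy: {}", payload)
}

fn run_prints(payload: &str) -> String {
    // Prefer the JSON route; ok == 1 means it fully handled the request,
    // so return early rather than looping back into the fallback.
    if let Some(resp) = try_json_route(payload) {
        if resp.ok == 1 {
            return resp.output;
        }
    }
    legacy_fallback(payload)
}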

32
src/tokenizer/cursor.rs Normal file

@@ -0,0 +1,32 @@
use super::NyashTokenizer;
impl NyashTokenizer {
/// Get the current character
pub(crate) fn current_char(&self) -> Option<char> {
self.input.get(self.position).copied()
}
/// Peek at the next character
pub(crate) fn peek_char(&self) -> Option<char> {
self.input.get(self.position + 1).copied()
}
/// Advance one character (also updating line/column)
pub(crate) fn advance(&mut self) {
if let Some(c) = self.current_char() {
self.position += 1;
if c == '\n' {
self.line += 1;
self.column = 1;
} else {
self.column += 1;
}
}
}
/// Whether the end of input has been reached
pub(crate) fn is_at_end(&self) -> bool {
self.position >= self.input.len()
}
}
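
These cursor primitives are easy to sanity-check. Below is a sketch of a unit test (not part of this commit) that one could drop into cursor.rs, exercising line/column tracking across a newline:

#[cfg(test)]
mod tests {
    use super::NyashTokenizer;

    #[test]
    fn advance_updates_line_and_column() {
        let mut t = NyashTokenizer::new("a\nb");
        assert_eq!(t.current_char(), Some('a'));
        t.advance(); // past 'a': still line 1, now column 2
        assert_eq!((t.line, t.column), (1, 2));
        t.advance(); // past '\n': line 2, column resets to 1
        assert_eq!((t.line, t.column), (2, 1));
        assert_eq!(t.current_char(), Some('b'));
        assert!(!t.is_at_end());
    }
}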

256
src/tokenizer/engine.rs Normal file

@@ -0,0 +1,256 @@
use super::{NyashTokenizer, Token, TokenType, TokenizeError};
impl NyashTokenizer {
#[inline]
pub(crate) fn allow_semicolon() -> bool {
match std::env::var("NYASH_PARSER_ALLOW_SEMICOLON").ok() {
Some(v) => {
let lv = v.to_ascii_lowercase();
lv == "1" || lv == "true" || lv == "on"
}
None => false,
}
}
#[inline]
pub(crate) fn strict_12_7() -> bool {
std::env::var("NYASH_STRICT_12_7").ok().as_deref() == Some("1")
}
/// Create a new tokenizer
pub fn new(input: impl Into<String>) -> Self {
let input_string = input.into();
Self {
input: input_string.chars().collect(),
position: 0,
line: 1,
column: 1,
}
}
/// Run a full tokenize pass
pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizeError> {
let mut tokens = Vec::new();
while !self.is_at_end() {
// Skip whitespace and comments
self.skip_whitespace();
// Also skip runs of consecutive block and line comments
loop {
// block comment: /* ... */
if self.current_char() == Some('/') && self.peek_char() == Some('*') {
self.skip_block_comment()?;
self.skip_whitespace();
continue;
}
// line comments: // ... or # ...
if (self.current_char() == Some('/') && self.peek_char() == Some('/'))
|| self.current_char() == Some('#')
{
self.skip_line_comment();
self.skip_whitespace();
continue;
}
break;
}
if self.is_at_end() {
break;
}
// Read the next token
let token = self.tokenize_next()?;
if std::env::var("NYASH_TOK_TRACE").ok().as_deref() == Some("1") {
eprintln!("[tok] {:?}", token.token_type);
}
tokens.push(token);
}
// Append the EOF token
tokens.push(Token::new(TokenType::EOF, self.line, self.column));
Ok(tokens)
}
/// Read the next single token
fn tokenize_next(&mut self) -> Result<Token, TokenizeError> {
let start_line = self.line;
let start_column = self.column;
match self.current_char() {
// Optional statement separator ';' (gated)
Some(';') if Self::allow_semicolon() => {
self.advance();
return Ok(Token::new(TokenType::SEMICOLON, start_line, start_column));
}
// Block comment should have been skipped by tokenize() pre-loop, but be defensive here
Some('/') if self.peek_char() == Some('*') => {
self.skip_block_comment()?;
// After skipping, restart tokenization for next token
return self.tokenize_next();
}
// Match operators of two or more characters first, longest match wins
Some('|') if self.peek_char() == Some('|') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::OR, start_line, start_column));
}
Some('&') if self.peek_char() == Some('&') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::AND, start_line, start_column));
}
Some('|') if self.peek_char() == Some('>') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::PipeForward, start_line, start_column));
}
Some('?') if self.peek_char() == Some('.') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::QmarkDot, start_line, start_column));
}
Some('?') if self.peek_char() == Some('?') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::QmarkQmark, start_line, start_column));
}
Some('?') => {
self.advance();
return Ok(Token::new(TokenType::QUESTION, start_line, start_column));
}
Some('+') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::PlusAssign, start_line, start_column));
}
Some('-') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::MinusAssign, start_line, start_column));
}
Some('*') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::MulAssign, start_line, start_column));
}
Some('/') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::DivAssign, start_line, start_column));
}
Some('.') if self.peek_char() == Some('.') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::RANGE, start_line, start_column));
}
Some('"') => {
let string_value = self.read_string()?;
Ok(Token::new(
TokenType::STRING(string_value),
start_line,
start_column,
))
}
Some(c) if c.is_ascii_digit() => {
let token_type = self.read_numeric_literal()?;
Ok(Token::new(token_type, start_line, start_column))
}
Some(c) if c.is_alphabetic() || c == '_' => {
let token_type = self.read_keyword_or_identifier();
Ok(Token::new(token_type, start_line, start_column))
}
Some('/') if self.peek_char() == Some('/') => {
self.skip_line_comment();
self.skip_whitespace(); // also skip whitespace after the comment
return self.tokenize_next();
}
Some('#') => {
self.skip_line_comment();
self.skip_whitespace(); // also skip whitespace after the comment
return self.tokenize_next();
}
Some('>') if self.peek_char() == Some('>') && !Self::strict_12_7() => {
self.advance();
self.advance();
Ok(Token::new(TokenType::ShiftRight, start_line, start_column))
}
Some(':') if self.peek_char() == Some(':') => {
self.advance();
self.advance();
Ok(Token::new(TokenType::DoubleColon, start_line, start_column))
}
Some(':') => {
self.advance();
Ok(Token::new(TokenType::COLON, start_line, start_column))
}
Some('=') if self.peek_char() == Some('>') => {
self.advance();
self.advance();
Ok(Token::new(TokenType::FatArrow, start_line, start_column))
}
Some('=') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
Ok(Token::new(TokenType::EQUALS, start_line, start_column))
}
Some('!') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
Ok(Token::new(TokenType::NotEquals, start_line, start_column))
}
Some('<') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
Ok(Token::new(TokenType::LessEquals, start_line, start_column))
}
Some('>') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
Ok(Token::new(TokenType::GreaterEquals, start_line, start_column))
}
Some(c) => {
if let Some(token) = self.single_char_token(c) {
self.advance();
Ok(Token::new(token, start_line, start_column))
} else {
Err(TokenizeError::UnexpectedCharacter {
char: c,
line: start_line,
column: start_column,
})
}
}
None => Ok(Token::new(TokenType::EOF, start_line, start_column)),
}
}
// Map of single-character tokens (longest-match cases are handled by the caller)
fn single_char_token(&self, c: char) -> Option<TokenType> {
// '?' and ':' are already dispatched above; only pure single characters are handled here.
match c {
'!' => Some(TokenType::NOT),
'<' => Some(TokenType::LESS),
'>' => Some(TokenType::GREATER),
'&' => Some(TokenType::BitAnd),
'|' => Some(TokenType::BitOr),
'^' => Some(TokenType::BitXor),
'=' => Some(TokenType::ASSIGN),
'+' => Some(TokenType::PLUS),
'-' => Some(TokenType::MINUS),
'*' => Some(TokenType::MULTIPLY),
'/' => Some(TokenType::DIVIDE),
'%' => Some(TokenType::MODULO),
'.' => Some(TokenType::DOT),
'(' => Some(TokenType::LPAREN),
')' => Some(TokenType::RPAREN),
'[' => Some(TokenType::LBRACK),
']' => Some(TokenType::RBRACK),
'{' => Some(TokenType::LBRACE),
'}' => Some(TokenType::RBRACE),
',' => Some(TokenType::COMMA),
'\n' => Some(TokenType::NEWLINE),
_ => None,
}
}
}
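
Taken end to end, the arm ordering above makes multi-character operators win over their single-character prefixes. A sketch of a test (not part of this commit) pinning that down:

#[cfg(test)]
mod tests {
    use super::{NyashTokenizer, TokenType};

    #[test]
    fn longest_match_operators() {
        let mut t = NyashTokenizer::new("x ?? y |> f");
        let tokens = t.tokenize().expect("tokenize");
        assert!(matches!(tokens[0].token_type, TokenType::IDENTIFIER(_)));
        assert!(matches!(tokens[1].token_type, TokenType::QmarkQmark));
        assert!(matches!(tokens[2].token_type, TokenType::IDENTIFIER(_)));
        assert!(matches!(tokens[3].token_type, TokenType::PipeForward));
        assert!(matches!(tokens[4].token_type, TokenType::IDENTIFIER(_)));
        assert!(matches!(tokens[5].token_type, TokenType::EOF));
    }
}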

134
src/tokenizer/kinds.rs Normal file

@@ -0,0 +1,134 @@
use thiserror::Error;
/// Token kinds
#[derive(Debug, Clone, PartialEq)]
pub enum TokenType {
// Literals
STRING(String),
NUMBER(i64),
FLOAT(f64),
TRUE,
FALSE,
NULL,
// Keywords
BOX,
GLOBAL,
SINGLETON,
NEW,
MATCH,
IF,
ELSE,
LOOP,
BREAK,
CONTINUE,
RETURN,
FUNCTION,
FN,
PRINT,
THIS,
ME,
INIT,
PACK,
BIRTH,
NOWAIT,
AWAIT,
INTERFACE,
COLON,
INCLUDE,
TRY,
CATCH,
CLEANUP,
THROW,
LOCAL,
STATIC,
OUTBOX,
NOT,
OVERRIDE,
FROM,
WEAK,
USING,
IMPORT,
// Operators
ShiftLeft,
ShiftRight,
BitAnd,
BitOr,
BitXor,
FatArrow,
EQUALS,
NotEquals,
LessEquals,
GreaterEquals,
AND,
OR,
// Two-character operators (longest match)
PipeForward,
QmarkDot,
QmarkQmark,
PlusAssign,
MinusAssign,
MulAssign,
DivAssign,
RANGE,
LESS,
GREATER,
ASSIGN,
PLUS,
MINUS,
MULTIPLY,
DIVIDE,
MODULO,
// Symbols
DOT,
DoubleColon,
LPAREN,
RPAREN,
LBRACK,
RBRACK,
LBRACE,
RBRACE,
COMMA,
QUESTION,
NEWLINE,
SEMICOLON, // optional statement separator
// Identifiers
IDENTIFIER(String),
// Special
EOF,
}
/// A token, with position information
#[derive(Debug, Clone)]
pub struct Token {
pub token_type: TokenType,
pub line: usize,
pub column: usize,
}
impl Token {
pub fn new(token_type: TokenType, line: usize, column: usize) -> Self {
Self { token_type, line, column }
}
}
/// Tokenization errors
#[derive(Error, Debug)]
pub enum TokenizeError {
#[error("Unexpected character '{char}' at line {line}, column {column}")]
UnexpectedCharacter { char: char, line: usize, column: usize },
#[error("Unterminated string literal at line {line}")]
UnterminatedString { line: usize },
#[error("Invalid number format at line {line}")]
InvalidNumber { line: usize },
#[error("Comment not closed at line {line}")]
UnterminatedComment { line: usize },
}
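
Because TokenizeError derives thiserror::Error, each #[error(...)] string doubles as its Display output. A quick sketch (not part of this commit):

#[cfg(test)]
mod tests {
    use super::TokenizeError;

    #[test]
    fn error_messages_render() {
        let err = TokenizeError::UnexpectedCharacter { char: '@', line: 3, column: 7 };
        assert_eq!(err.to_string(), "Unexpected character '@' at line 3, column 7");
        let err = TokenizeError::UnterminatedString { line: 9 };
        assert_eq!(err.to_string(), "Unterminated string literal at line 9");
    }
}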

102
src/tokenizer/lex_ident.rs Normal file

@@ -0,0 +1,102 @@
use super::{NyashTokenizer, TokenType};
use crate::grammar::engine;
impl NyashTokenizer {
/// Read a keyword or identifier
pub(crate) fn read_keyword_or_identifier(&mut self) -> TokenType {
let mut identifier = String::new();
while let Some(c) = self.current_char() {
if c.is_alphanumeric() || c == '_' {
identifier.push(c);
self.advance();
} else {
break;
}
}
// Keyword check
let mut tok = match identifier.as_str() {
"box" => TokenType::BOX,
"global" => TokenType::GLOBAL,
"singleton" => TokenType::SINGLETON,
"new" => TokenType::NEW,
"match" => TokenType::MATCH,
"if" => TokenType::IF,
"else" => TokenType::ELSE,
"loop" => TokenType::LOOP,
"break" => TokenType::BREAK,
"continue" => TokenType::CONTINUE,
"return" => TokenType::RETURN,
"function" => TokenType::FUNCTION,
"fn" => TokenType::FN,
"print" => TokenType::PRINT,
"this" => TokenType::THIS,
"me" => TokenType::ME,
"init" => TokenType::INIT,
"pack" => TokenType::PACK,
"birth" => TokenType::BIRTH,
"nowait" => TokenType::NOWAIT,
"await" => TokenType::AWAIT,
"interface" => TokenType::INTERFACE,
"include" => TokenType::INCLUDE,
"import" => TokenType::IMPORT,
"try" => TokenType::TRY,
"catch" => TokenType::CATCH,
"cleanup" => TokenType::CLEANUP,
"throw" => TokenType::THROW,
"local" => TokenType::LOCAL,
"static" => TokenType::STATIC,
"outbox" => TokenType::OUTBOX,
"not" => TokenType::NOT,
"override" => TokenType::OVERRIDE,
"from" => TokenType::FROM,
"weak" => TokenType::WEAK,
"using" => TokenType::USING,
"and" => TokenType::AND,
"or" => TokenType::OR,
"true" => TokenType::TRUE,
"false" => TokenType::FALSE,
"null" => TokenType::NULL,
_ => TokenType::IDENTIFIER(identifier.clone()),
};
// 12.7 strict mode: fall back to IDENTIFIER for extended keywords
if Self::strict_12_7() {
let is_extended = matches!(
tok,
TokenType::INTERFACE
| TokenType::USING
| TokenType::INCLUDE
| TokenType::OUTBOX
| TokenType::NOWAIT
| TokenType::OVERRIDE
| TokenType::WEAK
| TokenType::PACK
);
if is_extended {
tok = TokenType::IDENTIFIER(identifier.clone());
}
}
// Diff check against the unified grammar engine (behavior is unchanged)
if std::env::var("NYASH_GRAMMAR_DIFF").ok().as_deref() == Some("1") {
if let Some(kw) = engine::get().is_keyword_str(&identifier) {
if let TokenType::IDENTIFIER(_) = tok {
eprintln!(
"[GRAMMAR-DIFF] tokenizer=IDENT, grammar=KEYWORD({}) word='{}'",
kw, identifier
);
}
} else if !matches!(tok, TokenType::IDENTIFIER(_)) {
eprintln!(
"[GRAMMAR-DIFF] tokenizer=KEYWORD, grammar=IDENT word='{}'",
identifier
);
}
}
tok
}
}
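
With no environment overrides, extended words such as interface lex as keywords; setting NYASH_STRICT_12_7=1 makes the strict branch above demote them to identifiers. A sketch (not part of this commit) asserting only the default behavior, since env-gated branches are awkward to pin in tests:

#[cfg(test)]
mod tests {
    use super::{NyashTokenizer, TokenType};

    #[test]
    fn keywords_versus_identifiers() {
        // Assumes NYASH_STRICT_12_7 is unset when the test runs.
        let mut t = NyashTokenizer::new("interface");
        assert!(matches!(t.read_keyword_or_identifier(), TokenType::INTERFACE));
        // Longer words that merely contain a keyword stay identifiers.
        let mut t = NyashTokenizer::new("interfaces");
        assert!(matches!(t.read_keyword_or_identifier(), TokenType::IDENTIFIER(_)));
    }
}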

43
src/tokenizer/lex_number.rs Normal file

@@ -0,0 +1,43 @@
use super::{NyashTokenizer, TokenType, TokenizeError};
impl NyashTokenizer {
/// Read a numeric literal (integer or floating point)
pub(crate) fn read_numeric_literal(&mut self) -> Result<TokenType, TokenizeError> {
let start_line = self.line;
let mut number_str = String::new();
let mut has_dot = false;
// Read digits, accepting at most one decimal point
while let Some(c) = self.current_char() {
if c.is_ascii_digit() {
number_str.push(c);
self.advance();
} else if c == '.'
&& !has_dot
&& self.peek_char().map_or(false, |ch| ch.is_ascii_digit())
{
// Accept the dot only when a digit follows it
has_dot = true;
number_str.push(c);
self.advance();
} else {
break;
}
}
if has_dot {
// Parse as a floating-point number
number_str
.parse::<f64>()
.map(TokenType::FLOAT)
.map_err(|_| TokenizeError::InvalidNumber { line: start_line })
} else {
// Parse as an integer
number_str
.parse::<i64>()
.map(TokenType::NUMBER)
.map_err(|_| TokenizeError::InvalidNumber { line: start_line })
}
}
}
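
The digit-after-dot guard is what keeps range expressions intact: in 1..3 the first dot is not followed by a digit, so 1 closes as an integer and .. lexes as RANGE, while 2.5 still becomes a float. A sketch (not part of this commit):

#[cfg(test)]
mod tests {
    use super::{NyashTokenizer, TokenType};

    #[test]
    fn ranges_and_floats() {
        let mut t = NyashTokenizer::new("1..3 2.5");
        let tokens = t.tokenize().expect("tokenize");
        assert!(matches!(tokens[0].token_type, TokenType::NUMBER(1)));
        assert!(matches!(tokens[1].token_type, TokenType::RANGE));
        assert!(matches!(tokens[2].token_type, TokenType::NUMBER(3)));
        assert!(matches!(tokens[3].token_type, TokenType::FLOAT(f) if f == 2.5));
        assert!(matches!(tokens[4].token_type, TokenType::EOF));
    }
}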

42
src/tokenizer/lex_string.rs Normal file

@@ -0,0 +1,42 @@
use super::{NyashTokenizer, TokenizeError};
impl NyashTokenizer {
/// Read a string literal
pub(crate) fn read_string(&mut self) -> Result<String, TokenizeError> {
let start_line = self.line;
self.advance(); // skip the opening '"'
let mut string_value = String::new();
while let Some(c) = self.current_char() {
if c == '"' {
self.advance(); // skip the closing '"'
return Ok(string_value);
}
// Handle escape sequences
if c == '\\' {
self.advance();
match self.current_char() {
Some('n') => string_value.push('\n'),
Some('t') => string_value.push('\t'),
Some('r') => string_value.push('\r'),
Some('\\') => string_value.push('\\'),
Some('"') => string_value.push('"'),
Some(c) => {
string_value.push('\\');
string_value.push(c);
}
None => break,
}
} else {
string_value.push(c);
}
self.advance();
}
Err(TokenizeError::UnterminatedString { line: start_line })
}
}
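
Known escapes are decoded; unknown ones keep their backslash verbatim. A sketch (not part of this commit):

#[cfg(test)]
mod tests {
    use super::NyashTokenizer;

    #[test]
    fn escapes_in_strings() {
        // \n is decoded; the unknown \q is kept as backslash + 'q'.
        let mut t = NyashTokenizer::new(r#""a\nb\q""#);
        assert_eq!(t.read_string().expect("string"), "a\nb\\q");
    }
}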

24
src/tokenizer/mod.rs Normal file

@@ -0,0 +1,24 @@
/*!
* Nyash Tokenizer — split modules (kinds/cursor/whitespace/lexers/engine)
*/
mod kinds;
mod cursor;
mod whitespace;
mod lex_string;
mod lex_number;
mod lex_ident;
mod engine;
pub use kinds::{Token, TokenType, TokenizeError};
/// The Nyash tokenizer
pub struct NyashTokenizer {
pub(crate) input: Vec<char>,
pub(crate) position: usize,
pub(crate) line: usize,
pub(crate) column: usize,
}
// Public API and core logic are implemented in submodules via impl NyashTokenizer

43
src/tokenizer/whitespace.rs Normal file

@@ -0,0 +1,43 @@
use super::{NyashTokenizer, TokenizeError};
impl NyashTokenizer {
/// Skip a line comment
pub(crate) fn skip_line_comment(&mut self) {
while let Some(c) = self.current_char() {
if c == '\n' {
break; // leave the newline unconsumed
}
self.advance();
}
}
/// Skip a block comment: /* ... */ (nesting not supported)
pub(crate) fn skip_block_comment(&mut self) -> Result<(), TokenizeError> {
// Assume current position is at '/' and next is '*'
self.advance(); // '/'
self.advance(); // '*'
while let Some(c) = self.current_char() {
// detect end '*/'
if c == '*' && self.peek_char() == Some('/') {
self.advance(); // '*'
self.advance(); // '/'
return Ok(());
}
self.advance();
}
// EOF reached without closing */
Err(TokenizeError::UnterminatedComment { line: self.line })
}
/// Skip whitespace (newlines excluded; they are emitted as NEWLINE tokens)
pub(crate) fn skip_whitespace(&mut self) {
while let Some(c) = self.current_char() {
if c.is_whitespace() && c != '\n' {
self.advance();
} else {
break;
}
}
}
}
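
An unclosed block comment surfaces as UnterminatedComment from tokenize(), since the pre-loop in engine.rs calls skip_block_comment with the ? operator. A sketch (not part of this commit):

#[cfg(test)]
mod tests {
    use super::{NyashTokenizer, TokenizeError};

    #[test]
    fn unterminated_block_comment_errors() {
        let mut t = NyashTokenizer::new("/* never closed");
        assert!(matches!(t.tokenize(), Err(TokenizeError::UnterminatedComment { .. })));
    }
}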