/*! * Nyash Tokenizer - .nyashソースコードをトークン列に変換 * * Python版nyashc_v4.pyのNyashTokenizerをRustで完全再実装 * 正規表現ベース → 高速なcharレベル処理に最適化 */ use thiserror::Error; /// トークンの種類を表すenum #[derive(Debug, Clone, PartialEq)] pub enum TokenType { // リテラル STRING(String), NUMBER(i64), FLOAT(f64), // 浮動小数点数サポート追加 TRUE, FALSE, NULL, // null リテラル // キーワード BOX, GLOBAL, SINGLETON, NEW, IF, ELSE, LOOP, BREAK, RETURN, FUNCTION, PRINT, THIS, ME, INIT, // init (初期化ブロック) PACK, // pack (コンストラクタ - 互換性) BIRTH, // birth (コンストラクタ) NOWAIT, // nowait AWAIT, // await INTERFACE, // interface COLON, // : (継承用) INCLUDE, // include (ファイル読み込み) TRY, // try CATCH, // catch FINALLY, // finally THROW, // throw LOCAL, // local (一時変数宣言) STATIC, // static (静的メソッド) OUTBOX, // outbox (所有権移転変数) NOT, // not (否定演算子) OVERRIDE, // override (明示的オーバーライド) FROM, // from (親メソッド呼び出し) WEAK, // weak (弱参照修飾子) USING, // using (名前空間インポート) // 演算子 (長いものから先に定義) ARROW, // >> EQUALS, // == NotEquals, // != LessEquals, // <= GreaterEquals, // >= AND, // && または and OR, // || または or LESS, // < GREATER, // > ASSIGN, // = PLUS, // + MINUS, // - MULTIPLY, // * DIVIDE, // / MODULO, // % // 記号 DOT, // . LPAREN, // ( RPAREN, // ) LBRACE, // { RBRACE, // } COMMA, // , NEWLINE, // \n // 識別子 IDENTIFIER(String), // 特殊 EOF, } /// トークンの位置情報を含む構造体 #[derive(Debug, Clone)] pub struct Token { pub token_type: TokenType, pub line: usize, pub column: usize, } impl Token { pub fn new(token_type: TokenType, line: usize, column: usize) -> Self { Self { token_type, line, column } } } /// トークナイズエラー #[derive(Error, Debug)] pub enum TokenizeError { #[error("Unexpected character '{char}' at line {line}, column {column}")] UnexpectedCharacter { char: char, line: usize, column: usize }, #[error("Unterminated string literal at line {line}")] UnterminatedString { line: usize }, #[error("Invalid number format at line {line}")] InvalidNumber { line: usize }, #[error("Comment not closed at line {line}")] UnterminatedComment { line: usize }, } /// Nyashトークナイザー pub struct NyashTokenizer { input: Vec, position: usize, line: usize, column: usize, } impl NyashTokenizer { /// 新しいトークナイザーを作成 pub fn new(input: impl Into) -> Self { let input_string = input.into(); Self { input: input_string.chars().collect(), position: 0, line: 1, column: 1, } } /// 完全なトークナイズを実行 pub fn tokenize(&mut self) -> Result, TokenizeError> { let mut tokens = Vec::new(); while !self.is_at_end() { // 空白をスキップ self.skip_whitespace(); if self.is_at_end() { break; } // 次のトークンを読み取り let token = self.tokenize_next()?; tokens.push(token); } // EOF トークンを追加 tokens.push(Token::new(TokenType::EOF, self.line, self.column)); Ok(tokens) } /// 次の一つのトークンを読み取り fn tokenize_next(&mut self) -> Result { let start_line = self.line; let start_column = self.column; match self.current_char() { Some('"') => { let string_value = self.read_string()?; Ok(Token::new(TokenType::STRING(string_value), start_line, start_column)) } Some(c) if c.is_ascii_digit() => { let token_type = self.read_numeric_literal()?; Ok(Token::new(token_type, start_line, start_column)) } Some(c) if c.is_alphabetic() || c == '_' => { let token_type = self.read_keyword_or_identifier(); Ok(Token::new(token_type, start_line, start_column)) } Some('/') if self.peek_char() == Some('/') => { self.skip_line_comment(); self.skip_whitespace(); // コメント後の空白もスキップ return self.tokenize_next(); } Some('#') => { self.skip_line_comment(); self.skip_whitespace(); // コメント後の空白もスキップ return self.tokenize_next(); } Some('>') if self.peek_char() == Some('>') => { self.advance(); self.advance(); Ok(Token::new(TokenType::ARROW, start_line, start_column)) } Some('=') if self.peek_char() == Some('=') => { self.advance(); self.advance(); Ok(Token::new(TokenType::EQUALS, start_line, start_column)) } Some('!') if self.peek_char() == Some('=') => { self.advance(); self.advance(); Ok(Token::new(TokenType::NotEquals, start_line, start_column)) } Some('<') if self.peek_char() == Some('=') => { self.advance(); self.advance(); Ok(Token::new(TokenType::LessEquals, start_line, start_column)) } Some('>') if self.peek_char() == Some('=') => { self.advance(); self.advance(); Ok(Token::new(TokenType::GreaterEquals, start_line, start_column)) } Some('&') if self.peek_char() == Some('&') => { self.advance(); self.advance(); Ok(Token::new(TokenType::AND, start_line, start_column)) } Some('|') if self.peek_char() == Some('|') => { self.advance(); self.advance(); Ok(Token::new(TokenType::OR, start_line, start_column)) } Some('<') => { self.advance(); Ok(Token::new(TokenType::LESS, start_line, start_column)) } Some('>') => { self.advance(); Ok(Token::new(TokenType::GREATER, start_line, start_column)) } Some('=') => { self.advance(); Ok(Token::new(TokenType::ASSIGN, start_line, start_column)) } Some('+') => { self.advance(); Ok(Token::new(TokenType::PLUS, start_line, start_column)) } Some('-') => { self.advance(); Ok(Token::new(TokenType::MINUS, start_line, start_column)) } Some('*') => { self.advance(); Ok(Token::new(TokenType::MULTIPLY, start_line, start_column)) } Some('/') => { self.advance(); Ok(Token::new(TokenType::DIVIDE, start_line, start_column)) } Some('%') => { self.advance(); Ok(Token::new(TokenType::MODULO, start_line, start_column)) } Some('.') => { self.advance(); Ok(Token::new(TokenType::DOT, start_line, start_column)) } Some('(') => { self.advance(); Ok(Token::new(TokenType::LPAREN, start_line, start_column)) } Some(')') => { self.advance(); Ok(Token::new(TokenType::RPAREN, start_line, start_column)) } Some('{') => { self.advance(); Ok(Token::new(TokenType::LBRACE, start_line, start_column)) } Some('}') => { self.advance(); Ok(Token::new(TokenType::RBRACE, start_line, start_column)) } Some(',') => { self.advance(); Ok(Token::new(TokenType::COMMA, start_line, start_column)) } Some(':') => { self.advance(); Ok(Token::new(TokenType::COLON, start_line, start_column)) } Some('\n') => { self.advance(); Ok(Token::new(TokenType::NEWLINE, start_line, start_column)) } Some(c) => { Err(TokenizeError::UnexpectedCharacter { char: c, line: self.line, column: self.column, }) } None => { Ok(Token::new(TokenType::EOF, self.line, self.column)) } } } /// 文字列リテラルを読み取り fn read_string(&mut self) -> Result { let start_line = self.line; self.advance(); // 開始の '"' をスキップ let mut string_value = String::new(); while let Some(c) = self.current_char() { if c == '"' { self.advance(); // 終了の '"' をスキップ return Ok(string_value); } // エスケープ文字の処理 if c == '\\' { self.advance(); match self.current_char() { Some('n') => string_value.push('\n'), Some('t') => string_value.push('\t'), Some('r') => string_value.push('\r'), Some('\\') => string_value.push('\\'), Some('"') => string_value.push('"'), Some(c) => { string_value.push('\\'); string_value.push(c); } None => break, } } else { string_value.push(c); } self.advance(); } Err(TokenizeError::UnterminatedString { line: start_line }) } /// 数値リテラル(整数または浮動小数点数)を読み取り fn read_numeric_literal(&mut self) -> Result { let start_line = self.line; let mut number_str = String::new(); let mut has_dot = false; // 整数部分を読み取り while let Some(c) = self.current_char() { if c.is_ascii_digit() { number_str.push(c); self.advance(); } else if c == '.' && !has_dot && self.peek_char().map_or(false, |ch| ch.is_ascii_digit()) { // 小数点の後に数字が続く場合のみ受け入れる has_dot = true; number_str.push(c); self.advance(); } else { break; } } if has_dot { // 浮動小数点数として解析 number_str.parse::() .map(TokenType::FLOAT) .map_err(|_| TokenizeError::InvalidNumber { line: start_line }) } else { // 整数として解析 number_str.parse::() .map(TokenType::NUMBER) .map_err(|_| TokenizeError::InvalidNumber { line: start_line }) } } /// キーワードまたは識別子を読み取り fn read_keyword_or_identifier(&mut self) -> TokenType { let mut identifier = String::new(); while let Some(c) = self.current_char() { if c.is_alphanumeric() || c == '_' { identifier.push(c); self.advance(); } else { break; } } // キーワードチェック match identifier.as_str() { "box" => TokenType::BOX, "global" => TokenType::GLOBAL, "singleton" => TokenType::SINGLETON, "new" => TokenType::NEW, "if" => TokenType::IF, "else" => TokenType::ELSE, "loop" => TokenType::LOOP, "break" => TokenType::BREAK, "return" => TokenType::RETURN, "function" => TokenType::FUNCTION, "print" => TokenType::PRINT, "this" => TokenType::THIS, "me" => TokenType::ME, "init" => TokenType::INIT, "pack" => TokenType::PACK, "birth" => TokenType::BIRTH, "nowait" => TokenType::NOWAIT, "await" => TokenType::AWAIT, "interface" => TokenType::INTERFACE, "include" => TokenType::INCLUDE, "try" => TokenType::TRY, "catch" => TokenType::CATCH, "finally" => TokenType::FINALLY, "throw" => TokenType::THROW, "local" => TokenType::LOCAL, "static" => TokenType::STATIC, "outbox" => TokenType::OUTBOX, "not" => TokenType::NOT, "override" => TokenType::OVERRIDE, "from" => TokenType::FROM, "weak" => TokenType::WEAK, "using" => TokenType::USING, "and" => TokenType::AND, "or" => TokenType::OR, "true" => TokenType::TRUE, "false" => TokenType::FALSE, "null" => TokenType::NULL, _ => TokenType::IDENTIFIER(identifier), } } /// 行コメントをスキップ fn skip_line_comment(&mut self) { while let Some(c) = self.current_char() { if c == '\n' { break; // 改行文字は消費せずに残す } self.advance(); } } /// 空白文字をスキップ(改行は除く) fn skip_whitespace(&mut self) { while let Some(c) = self.current_char() { if c.is_whitespace() && c != '\n' { self.advance(); } else { break; } } } /// 現在の文字を取得 fn current_char(&self) -> Option { self.input.get(self.position).copied() } /// 次の文字を先読み fn peek_char(&self) -> Option { self.input.get(self.position + 1).copied() } /// 位置を1つ進める fn advance(&mut self) { if let Some(c) = self.current_char() { if c == '\n' { self.line += 1; self.column = 1; } else { self.column += 1; } self.position += 1; } } /// 入力の終端に達したかチェック fn is_at_end(&self) -> bool { self.position >= self.input.len() } } // ===== Tests ===== #[cfg(test)] mod tests { use super::*; #[test] fn test_simple_tokens() { let mut tokenizer = NyashTokenizer::new("box new = + - *"); let tokens = tokenizer.tokenize().unwrap(); assert_eq!(tokens.len(), 7); // 6 tokens + EOF assert_eq!(tokens[0].token_type, TokenType::BOX); assert_eq!(tokens[1].token_type, TokenType::NEW); assert_eq!(tokens[2].token_type, TokenType::ASSIGN); assert_eq!(tokens[3].token_type, TokenType::PLUS); assert_eq!(tokens[4].token_type, TokenType::MINUS); assert_eq!(tokens[5].token_type, TokenType::MULTIPLY); assert_eq!(tokens[6].token_type, TokenType::EOF); } #[test] fn test_string_literal() { let mut tokenizer = NyashTokenizer::new(r#""Hello, World!""#); let tokens = tokenizer.tokenize().unwrap(); assert_eq!(tokens.len(), 2); // STRING + EOF match &tokens[0].token_type { TokenType::STRING(s) => assert_eq!(s, "Hello, World!"), _ => panic!("Expected STRING token"), } } #[test] fn test_number_literal() { let mut tokenizer = NyashTokenizer::new("42 123 0"); let tokens = tokenizer.tokenize().unwrap(); assert_eq!(tokens.len(), 4); // 3 numbers + EOF match &tokens[0].token_type { TokenType::NUMBER(n) => assert_eq!(*n, 42), _ => panic!("Expected NUMBER token"), } match &tokens[1].token_type { TokenType::NUMBER(n) => assert_eq!(*n, 123), _ => panic!("Expected NUMBER token"), } match &tokens[2].token_type { TokenType::NUMBER(n) => assert_eq!(*n, 0), _ => panic!("Expected NUMBER token"), } } #[test] fn test_identifier() { let mut tokenizer = NyashTokenizer::new("test_var myBox getValue"); let tokens = tokenizer.tokenize().unwrap(); assert_eq!(tokens.len(), 4); // 3 identifiers + EOF match &tokens[0].token_type { TokenType::IDENTIFIER(s) => assert_eq!(s, "test_var"), _ => panic!("Expected IDENTIFIER token"), } match &tokens[1].token_type { TokenType::IDENTIFIER(s) => assert_eq!(s, "myBox"), _ => panic!("Expected IDENTIFIER token"), } match &tokens[2].token_type { TokenType::IDENTIFIER(s) => assert_eq!(s, "getValue"), _ => panic!("Expected IDENTIFIER token"), } } #[test] fn test_operators() { let mut tokenizer = NyashTokenizer::new(">> == != <= >= < >"); let tokens = tokenizer.tokenize().unwrap(); assert_eq!(tokens[0].token_type, TokenType::ARROW); assert_eq!(tokens[1].token_type, TokenType::EQUALS); assert_eq!(tokens[2].token_type, TokenType::NotEquals); assert_eq!(tokens[3].token_type, TokenType::LessEquals); assert_eq!(tokens[4].token_type, TokenType::GreaterEquals); assert_eq!(tokens[5].token_type, TokenType::LESS); assert_eq!(tokens[6].token_type, TokenType::GREATER); } #[test] fn test_complex_code() { let code = r#" box TestBox { value getValue() { return this.value } } obj = new TestBox() obj.value = "test123" "#; let mut tokenizer = NyashTokenizer::new(code); let tokens = tokenizer.tokenize().unwrap(); // 基本的なトークンがある事を確認 let token_types: Vec<_> = tokens.iter().map(|t| &t.token_type).collect(); assert!(token_types.contains(&&TokenType::BOX)); assert!(token_types.contains(&&TokenType::NEW)); assert!(token_types.contains(&&TokenType::THIS)); assert!(token_types.contains(&&TokenType::RETURN)); assert!(token_types.contains(&&TokenType::DOT)); } #[test] fn test_line_numbers() { let code = "box\ntest\nvalue"; let mut tokenizer = NyashTokenizer::new(code); let tokens = tokenizer.tokenize().unwrap(); assert_eq!(tokens[0].line, 1); // box assert_eq!(tokens[1].line, 2); // test assert_eq!(tokens[2].line, 3); // value } #[test] fn test_comments() { let code = r#"box Test // this is a comment # this is also a comment value"#; let mut tokenizer = NyashTokenizer::new(code); let tokens = tokenizer.tokenize().unwrap(); // コメントは除外されている let token_types: Vec<_> = tokens.iter().map(|t| &t.token_type).collect(); assert_eq!(token_types.len(), 4); // box, Test, value, EOF } #[test] fn test_error_handling() { let mut tokenizer = NyashTokenizer::new("@#$%"); let result = tokenizer.tokenize(); assert!(result.is_err()); match result { Err(TokenizeError::UnexpectedCharacter { char, line, column }) => { assert_eq!(char, '@'); assert_eq!(line, 1); assert_eq!(column, 1); } _ => panic!("Expected UnexpectedCharacter error"), } } }