281 lines
11 KiB
Rust
281 lines
11 KiB
Rust
use super::{NyashTokenizer, Token, TokenType, TokenizeError};
|
||
|
||
impl NyashTokenizer {
|
||
#[inline]
|
||
pub(crate) fn allow_semicolon() -> bool {
|
||
// Default: ON (semicolon is an optional statement separator)
|
||
// Allow opt-out via NYASH_PARSER_ALLOW_SEMICOLON=0|false|off
|
||
match std::env::var("NYASH_PARSER_ALLOW_SEMICOLON").ok() {
|
||
Some(v) => {
|
||
let lv = v.to_ascii_lowercase();
|
||
if lv == "0" || lv == "false" || lv == "off" {
|
||
return false;
|
||
}
|
||
true
|
||
}
|
||
None => true,
|
||
}
|
||
}
|
||
|
||
#[inline]
|
||
pub(crate) fn strict_12_7() -> bool {
|
||
std::env::var("NYASH_STRICT_12_7").ok().as_deref() == Some("1")
|
||
}
|
||
|
||
/// 新しいトークナイザーを作成
|
||
pub fn new(input: impl Into<String>) -> Self {
|
||
let input_string = input.into();
|
||
Self {
|
||
input: input_string.chars().collect(),
|
||
position: 0,
|
||
line: 1,
|
||
column: 1,
|
||
}
|
||
}
|
||
|
||
/// 完全なトークナイズを実行
|
||
pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizeError> {
|
||
let mut tokens = Vec::new();
|
||
|
||
while !self.is_at_end() {
|
||
// 空白・コメントをスキップ
|
||
self.skip_whitespace();
|
||
// 連続するブロックコメントや行コメントもまとめてスキップ
|
||
loop {
|
||
// block comment: /* ... */
|
||
if self.current_char() == Some('/') && self.peek_char() == Some('*') {
|
||
self.skip_block_comment()?;
|
||
self.skip_whitespace();
|
||
continue;
|
||
}
|
||
// line comments: // ... or # ...
|
||
if (self.current_char() == Some('/') && self.peek_char() == Some('/'))
|
||
|| self.current_char() == Some('#')
|
||
{
|
||
self.skip_line_comment();
|
||
self.skip_whitespace();
|
||
continue;
|
||
}
|
||
break;
|
||
}
|
||
|
||
if self.is_at_end() {
|
||
break;
|
||
}
|
||
|
||
// 次のトークンを読み取り
|
||
let token = self.tokenize_next()?;
|
||
if std::env::var("NYASH_TOK_TRACE").ok().as_deref() == Some("1") {
|
||
eprintln!("[tok] {:?}", token.token_type);
|
||
}
|
||
tokens.push(token);
|
||
}
|
||
|
||
// EOF トークンを追加
|
||
tokens.push(Token::new(TokenType::EOF, self.line, self.column));
|
||
|
||
Ok(tokens)
|
||
}
|
||
|
||
/// 次の一つのトークンを読み取り
|
||
fn tokenize_next(&mut self) -> Result<Token, TokenizeError> {
|
||
let start_line = self.line;
|
||
let start_column = self.column;
|
||
|
||
match self.current_char() {
|
||
// Optional statement separator ';' (gated)
|
||
Some(';') if Self::allow_semicolon() => {
|
||
self.advance();
|
||
return Ok(Token::new(TokenType::SEMICOLON, start_line, start_column));
|
||
}
|
||
// Block comment should have been skipped by tokenize() pre-loop, but be defensive here
|
||
Some('/') if self.peek_char() == Some('*') => {
|
||
self.skip_block_comment()?;
|
||
// After skipping, restart tokenization for next token
|
||
return self.tokenize_next();
|
||
}
|
||
// 2文字(またはそれ以上)の演算子は最長一致で先に判定
|
||
Some('|') if self.peek_char() == Some('|') => {
|
||
self.advance();
|
||
self.advance();
|
||
return Ok(Token::new(TokenType::OR, start_line, start_column));
|
||
}
|
||
Some('&') if self.peek_char() == Some('&') => {
|
||
self.advance();
|
||
self.advance();
|
||
return Ok(Token::new(TokenType::AND, start_line, start_column));
|
||
}
|
||
Some('|') if self.peek_char() == Some('>') => {
|
||
self.advance();
|
||
self.advance();
|
||
return Ok(Token::new(TokenType::PipeForward, start_line, start_column));
|
||
}
|
||
Some('?') if self.peek_char() == Some('.') => {
|
||
self.advance();
|
||
self.advance();
|
||
return Ok(Token::new(TokenType::QmarkDot, start_line, start_column));
|
||
}
|
||
Some('?') if self.peek_char() == Some('?') => {
|
||
self.advance();
|
||
self.advance();
|
||
return Ok(Token::new(TokenType::QmarkQmark, start_line, start_column));
|
||
}
|
||
Some('?') => {
|
||
self.advance();
|
||
return Ok(Token::new(TokenType::QUESTION, start_line, start_column));
|
||
}
|
||
Some('+') if self.peek_char() == Some('=') => {
|
||
self.advance();
|
||
self.advance();
|
||
return Ok(Token::new(TokenType::PlusAssign, start_line, start_column));
|
||
}
|
||
Some('-') if self.peek_char() == Some('=') => {
|
||
self.advance();
|
||
self.advance();
|
||
return Ok(Token::new(TokenType::MinusAssign, start_line, start_column));
|
||
}
|
||
Some('*') if self.peek_char() == Some('=') => {
|
||
self.advance();
|
||
self.advance();
|
||
return Ok(Token::new(TokenType::MulAssign, start_line, start_column));
|
||
}
|
||
Some('/') if self.peek_char() == Some('=') => {
|
||
self.advance();
|
||
self.advance();
|
||
return Ok(Token::new(TokenType::DivAssign, start_line, start_column));
|
||
}
|
||
Some('.') if self.peek_char() == Some('.') => {
|
||
self.advance();
|
||
self.advance();
|
||
return Ok(Token::new(TokenType::RANGE, start_line, start_column));
|
||
}
|
||
Some('"') => {
|
||
let string_value = self.read_string()?;
|
||
Ok(Token::new(
|
||
TokenType::STRING(string_value),
|
||
start_line,
|
||
start_column,
|
||
))
|
||
}
|
||
// Stage‑3: シングルクォート文字列(オプトイン、現行デフォルトON)
|
||
Some('\'') if crate::config::env::parser_stage3_enabled() => {
|
||
let string_value = self.read_single_quoted_string()?;
|
||
Ok(Token::new(
|
||
TokenType::STRING(string_value),
|
||
start_line,
|
||
start_column,
|
||
))
|
||
}
|
||
Some(c) if c.is_ascii_digit() => {
|
||
let token_type = self.read_numeric_literal()?;
|
||
Ok(Token::new(token_type, start_line, start_column))
|
||
}
|
||
Some(c) if c.is_alphabetic() || c == '_' => {
|
||
let token_type = self.read_keyword_or_identifier();
|
||
Ok(Token::new(token_type, start_line, start_column))
|
||
}
|
||
Some('/') if self.peek_char() == Some('/') => {
|
||
self.skip_line_comment();
|
||
self.skip_whitespace(); // コメント後の空白もスキップ
|
||
return self.tokenize_next();
|
||
}
|
||
Some('#') => {
|
||
self.skip_line_comment();
|
||
self.skip_whitespace(); // コメント後の空白もスキップ
|
||
return self.tokenize_next();
|
||
}
|
||
Some('>') if self.peek_char() == Some('>') && !Self::strict_12_7() => {
|
||
self.advance();
|
||
self.advance();
|
||
Ok(Token::new(TokenType::ShiftRight, start_line, start_column))
|
||
}
|
||
Some('<') if self.peek_char() == Some('<') && !Self::strict_12_7() => {
|
||
self.advance();
|
||
self.advance();
|
||
Ok(Token::new(TokenType::ShiftLeft, start_line, start_column))
|
||
}
|
||
Some(':') if self.peek_char() == Some(':') => {
|
||
self.advance();
|
||
self.advance();
|
||
Ok(Token::new(TokenType::DoubleColon, start_line, start_column))
|
||
}
|
||
Some(':') => {
|
||
self.advance();
|
||
Ok(Token::new(TokenType::COLON, start_line, start_column))
|
||
}
|
||
Some('=') if self.peek_char() == Some('>') => {
|
||
self.advance();
|
||
self.advance();
|
||
Ok(Token::new(TokenType::FatArrow, start_line, start_column))
|
||
}
|
||
Some('=') if self.peek_char() == Some('=') => {
|
||
self.advance();
|
||
self.advance();
|
||
Ok(Token::new(TokenType::EQUALS, start_line, start_column))
|
||
}
|
||
Some('!') if self.peek_char() == Some('=') => {
|
||
self.advance();
|
||
self.advance();
|
||
Ok(Token::new(TokenType::NotEquals, start_line, start_column))
|
||
}
|
||
Some('<') if self.peek_char() == Some('=') => {
|
||
self.advance();
|
||
self.advance();
|
||
Ok(Token::new(TokenType::LessEquals, start_line, start_column))
|
||
}
|
||
Some('>') if self.peek_char() == Some('=') => {
|
||
self.advance();
|
||
self.advance();
|
||
Ok(Token::new(
|
||
TokenType::GreaterEquals,
|
||
start_line,
|
||
start_column,
|
||
))
|
||
}
|
||
Some(c) => {
|
||
if let Some(token) = self.single_char_token(c) {
|
||
self.advance();
|
||
Ok(Token::new(token, start_line, start_column))
|
||
} else {
|
||
Err(TokenizeError::UnexpectedCharacter {
|
||
char: c,
|
||
line: start_line,
|
||
column: start_column,
|
||
})
|
||
}
|
||
}
|
||
None => Ok(Token::new(TokenType::EOF, start_line, start_column)),
|
||
}
|
||
}
|
||
|
||
// 単文字トークンのマップ(最長一致系は呼び出し元で処理済み)
|
||
fn single_char_token(&self, c: char) -> Option<TokenType> {
|
||
// '?' は上位で分岐済み、':' も同様。ここでは純粋な1文字を扱う。
|
||
match c {
|
||
'!' => Some(TokenType::NOT),
|
||
'~' => Some(TokenType::BitNot),
|
||
'<' => Some(TokenType::LESS),
|
||
'>' => Some(TokenType::GREATER),
|
||
'&' => Some(TokenType::BitAnd),
|
||
'|' => Some(TokenType::BitOr),
|
||
'^' => Some(TokenType::BitXor),
|
||
'=' => Some(TokenType::ASSIGN),
|
||
'+' => Some(TokenType::PLUS),
|
||
'-' => Some(TokenType::MINUS),
|
||
'*' => Some(TokenType::MULTIPLY),
|
||
'/' => Some(TokenType::DIVIDE),
|
||
'%' => Some(TokenType::MODULO),
|
||
'.' => Some(TokenType::DOT),
|
||
'(' => Some(TokenType::LPAREN),
|
||
')' => Some(TokenType::RPAREN),
|
||
'[' => Some(TokenType::LBRACK),
|
||
']' => Some(TokenType::RBRACK),
|
||
'{' => Some(TokenType::LBRACE),
|
||
'}' => Some(TokenType::RBRACE),
|
||
',' => Some(TokenType::COMMA),
|
||
'\n' => Some(TokenType::NEWLINE),
|
||
_ => None,
|
||
}
|
||
}
|
||
}
|