Files
hakorune/src/tokenizer/engine.rs
nyash-codex f9d100ce01 chore: Phase 25.1 完了 - LoopForm v2/Stage1 CLI/環境変数削減 + Phase 26-D からの変更
Phase 25.1 完了成果:
-  LoopForm v2 テスト・ドキュメント・コメント完備
  - 4ケース(A/B/C/D)完全テストカバレッジ
  - 最小再現ケース作成(SSAバグ調査用)
  - SSOT文書作成(loopform_ssot.md)
  - 全ソースに [LoopForm] コメントタグ追加

-  Stage-1 CLI デバッグ環境構築
  - stage1_cli.hako 実装
  - stage1_bridge.rs ブリッジ実装
  - デバッグツール作成(stage1_debug.sh/stage1_minimal.sh)
  - アーキテクチャ改善提案文書

-  環境変数削減計画策定
  - 25変数の完全調査・分類
  - 6段階削減ロードマップ(25→5、80%削減)
  - 即時削除可能変数特定(NYASH_CONFIG/NYASH_DEBUG)

Phase 26-D からの累積変更:
- PHI実装改善(ExitPhiBuilder/HeaderPhiBuilder等)
- MIRビルダーリファクタリング
- 型伝播・最適化パス改善
- その他約300ファイルの累積変更

🎯 技術的成果:
- SSAバグ根本原因特定(条件分岐内loop変数変更)
- Region+next_iパターン適用完了(UsingCollectorBox等)
- LoopFormパターン文書化・テスト化完了
- セルフホスティング基盤強化

Co-Authored-By: Claude <noreply@anthropic.com>
Co-Authored-By: ChatGPT <noreply@openai.com>
Co-Authored-By: Task Assistant <task@anthropic.com>
2025-11-21 06:25:17 +09:00

281 lines
11 KiB
Rust
Raw Blame History

This file contains ambiguous Unicode characters

This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.

use super::{NyashTokenizer, Token, TokenType, TokenizeError};
impl NyashTokenizer {
#[inline]
/// Whether ';' is accepted as an optional statement separator.
///
/// Default is ON. Users can opt out by setting the environment variable
/// `NYASH_PARSER_ALLOW_SEMICOLON` to `0`, `false`, or `off` (case-insensitive).
/// Any other value — or an unset/non-unicode variable — leaves semicolons enabled.
pub(crate) fn allow_semicolon() -> bool {
    // Only an explicit opt-out value disables semicolons; everything else is ON.
    !std::env::var("NYASH_PARSER_ALLOW_SEMICOLON")
        .map(|v| matches!(v.to_ascii_lowercase().as_str(), "0" | "false" | "off"))
        .unwrap_or(false)
}
#[inline]
/// Whether strict 12.7 mode is active.
///
/// Enabled only when the environment variable `NYASH_STRICT_12_7` is set to
/// exactly `"1"`; any other value (or an unset variable) leaves it off.
pub(crate) fn strict_12_7() -> bool {
    matches!(std::env::var("NYASH_STRICT_12_7").as_deref(), Ok("1"))
}
/// Create a new tokenizer over the given source text.
///
/// The tokenizer starts at position 0, line 1, column 1. The source is
/// collected into a char buffer up front so scanning can index by char.
pub fn new(input: impl Into<String>) -> Self {
    Self {
        input: input.into().chars().collect(),
        position: 0,
        line: 1,
        column: 1,
    }
}
/// Run a full tokenization pass over the input.
///
/// Between tokens, whitespace and any run of consecutive comments
/// (block `/* ... */`, line `// ...`, and `# ...`) are skipped.
/// The returned stream is always terminated with an `EOF` token.
///
/// # Errors
/// Propagates `TokenizeError` from `skip_block_comment` (e.g. an unterminated
/// block comment) and from `tokenize_next`.
pub fn tokenize(&mut self) -> Result<Vec<Token>, TokenizeError> {
    let mut tokens = Vec::new();
    // Hoisted out of the loop: the original read NYASH_TOK_TRACE from the
    // environment once per token, which is wasted work — the flag cannot
    // meaningfully change mid-run.
    let trace = std::env::var("NYASH_TOK_TRACE").ok().as_deref() == Some("1");
    while !self.is_at_end() {
        // Skip whitespace, then any number of back-to-back comments.
        self.skip_whitespace();
        loop {
            // Block comment: /* ... */
            if self.current_char() == Some('/') && self.peek_char() == Some('*') {
                self.skip_block_comment()?;
                self.skip_whitespace();
                continue;
            }
            // Line comments: // ... or # ...
            if (self.current_char() == Some('/') && self.peek_char() == Some('/'))
                || self.current_char() == Some('#')
            {
                self.skip_line_comment();
                self.skip_whitespace();
                continue;
            }
            break;
        }
        if self.is_at_end() {
            break;
        }
        // Read the next token.
        let token = self.tokenize_next()?;
        if trace {
            eprintln!("[tok] {:?}", token.token_type);
        }
        tokens.push(token);
    }
    // Terminate the stream with an EOF marker.
    tokens.push(Token::new(TokenType::EOF, self.line, self.column));
    Ok(tokens)
}
/// Read the next single token starting at the current position.
///
/// Longest-match rule: multi-character operators (`||`, `&&`, `|>`, `?.`, `??`,
/// `+=`, `-=`, `*=`, `/=`, `..`, `>>`, `<<`, `::`, `=>`, `==`, `!=`, `<=`, `>=`)
/// are matched by the earlier guarded arms; anything left over falls through to
/// `single_char_token`. The arm ORDER is load-bearing — do not reorder casually.
///
/// Returns `TokenizeError::UnexpectedCharacter` for a character that forms no
/// known token, and an `EOF` token if called at end of input.
fn tokenize_next(&mut self) -> Result<Token, TokenizeError> {
// Record the token's start so multi-character tokens report their first column.
let start_line = self.line;
let start_column = self.column;
match self.current_char() {
// Optional statement separator ';' (gated by NYASH_PARSER_ALLOW_SEMICOLON)
Some(';') if Self::allow_semicolon() => {
self.advance();
return Ok(Token::new(TokenType::SEMICOLON, start_line, start_column));
}
// Block comment should have been skipped by tokenize() pre-loop, but be defensive here
Some('/') if self.peek_char() == Some('*') => {
self.skip_block_comment()?;
// After skipping, restart tokenization for next token
return self.tokenize_next();
}
// Two-character (or longer) operators are matched first, longest match wins.
Some('|') if self.peek_char() == Some('|') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::OR, start_line, start_column));
}
Some('&') if self.peek_char() == Some('&') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::AND, start_line, start_column));
}
// Pipe-forward '|>'
Some('|') if self.peek_char() == Some('>') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::PipeForward, start_line, start_column));
}
// Optional-chaining '?.'
Some('?') if self.peek_char() == Some('.') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::QmarkDot, start_line, start_column));
}
// Null-coalescing '??'
Some('?') if self.peek_char() == Some('?') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::QmarkQmark, start_line, start_column));
}
// Bare '?' — must come after '?.' and '??' above.
Some('?') => {
self.advance();
return Ok(Token::new(TokenType::QUESTION, start_line, start_column));
}
// Compound assignment operators: += -= *= /=
Some('+') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::PlusAssign, start_line, start_column));
}
Some('-') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::MinusAssign, start_line, start_column));
}
Some('*') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::MulAssign, start_line, start_column));
}
Some('/') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::DivAssign, start_line, start_column));
}
// Range operator '..'
Some('.') if self.peek_char() == Some('.') => {
self.advance();
self.advance();
return Ok(Token::new(TokenType::RANGE, start_line, start_column));
}
// Double-quoted string literal
Some('"') => {
let string_value = self.read_string()?;
Ok(Token::new(
TokenType::STRING(string_value),
start_line,
start_column,
))
}
// Stage3: single-quoted string literals (opt-in via parser_stage3 config)
Some('\'') if crate::config::env::parser_stage3() => {
let string_value = self.read_single_quoted_string()?;
Ok(Token::new(
TokenType::STRING(string_value),
start_line,
start_column,
))
}
// Numeric literal (leading ASCII digit)
Some(c) if c.is_ascii_digit() => {
let token_type = self.read_numeric_literal()?;
Ok(Token::new(token_type, start_line, start_column))
}
// Keyword or identifier (alphabetic or '_' start)
Some(c) if c.is_alphabetic() || c == '_' => {
let token_type = self.read_keyword_or_identifier();
Ok(Token::new(token_type, start_line, start_column))
}
// Defensive: line comments are normally consumed by tokenize()'s pre-loop,
// but handle them here too in case this is called directly.
Some('/') if self.peek_char() == Some('/') => {
self.skip_line_comment();
self.skip_whitespace(); // also skip whitespace following the comment
return self.tokenize_next();
}
Some('#') => {
self.skip_line_comment();
self.skip_whitespace(); // also skip whitespace following the comment
return self.tokenize_next();
}
// Shift operators '>>' / '<<' are disabled under NYASH_STRICT_12_7
// (strict mode falls through to single '>' / '<' tokens instead).
Some('>') if self.peek_char() == Some('>') && !Self::strict_12_7() => {
self.advance();
self.advance();
Ok(Token::new(TokenType::ShiftRight, start_line, start_column))
}
Some('<') if self.peek_char() == Some('<') && !Self::strict_12_7() => {
self.advance();
self.advance();
Ok(Token::new(TokenType::ShiftLeft, start_line, start_column))
}
// Path separator '::' — must come before bare ':'.
Some(':') if self.peek_char() == Some(':') => {
self.advance();
self.advance();
Ok(Token::new(TokenType::DoubleColon, start_line, start_column))
}
Some(':') => {
self.advance();
Ok(Token::new(TokenType::COLON, start_line, start_column))
}
// '=>' and '==' — must come before the bare '=' handled by single_char_token.
Some('=') if self.peek_char() == Some('>') => {
self.advance();
self.advance();
Ok(Token::new(TokenType::FatArrow, start_line, start_column))
}
Some('=') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
Ok(Token::new(TokenType::EQUALS, start_line, start_column))
}
// Comparison operators with '=' suffix: != <= >=
Some('!') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
Ok(Token::new(TokenType::NotEquals, start_line, start_column))
}
Some('<') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
Ok(Token::new(TokenType::LessEquals, start_line, start_column))
}
Some('>') if self.peek_char() == Some('=') => {
self.advance();
self.advance();
Ok(Token::new(
TokenType::GreaterEquals,
start_line,
start_column,
))
}
// Anything else: try the single-character token table, otherwise error out.
Some(c) => {
if let Some(token) = self.single_char_token(c) {
self.advance();
Ok(Token::new(token, start_line, start_column))
} else {
Err(TokenizeError::UnexpectedCharacter {
char: c,
line: start_line,
column: start_column,
})
}
}
// At end of input, report EOF rather than an error.
None => Ok(Token::new(TokenType::EOF, start_line, start_column)),
}
}
/// Map a single character to its token type, if it is a valid one-character token.
///
/// Multi-character operators (and the '?' / ':' families) are resolved by the
/// caller via longest-match before this table is consulted; only pure
/// one-character tokens live here. Returns `None` for unrecognized characters.
fn single_char_token(&self, c: char) -> Option<TokenType> {
    let token = match c {
        // Arithmetic
        '+' => TokenType::PLUS,
        '-' => TokenType::MINUS,
        '*' => TokenType::MULTIPLY,
        '/' => TokenType::DIVIDE,
        '%' => TokenType::MODULO,
        // Comparison and assignment
        '<' => TokenType::LESS,
        '>' => TokenType::GREATER,
        '=' => TokenType::ASSIGN,
        // Logical and bitwise
        '!' => TokenType::NOT,
        '~' => TokenType::BitNot,
        '&' => TokenType::BitAnd,
        '|' => TokenType::BitOr,
        '^' => TokenType::BitXor,
        // Punctuation and delimiters
        '.' => TokenType::DOT,
        ',' => TokenType::COMMA,
        '(' => TokenType::LPAREN,
        ')' => TokenType::RPAREN,
        '[' => TokenType::LBRACK,
        ']' => TokenType::RBRACK,
        '{' => TokenType::LBRACE,
        '}' => TokenType::RBRACE,
        // Newlines are significant tokens in this language
        '\n' => TokenType::NEWLINE,
        _ => return None,
    };
    Some(token)
}
}