hakorune/apps/lib/json_native/lexer/tokenizer.nyash

// JsonTokenizer — 精度重視の字句解析器（yyjson相当精度）
// 責務: 文字列をトークン列に変換、エラー検出、位置情報管理

local JsonScanner = include "apps/lib/json_native/lexer/scanner.nyash"
local JsonToken = include "apps/lib/json_native/lexer/token.nyash"
// Removed other dependencies - using self-contained methods

// JSON tokenizer box ("Everything is Box").
// Turns the text handed to birth() into JsonToken objects by driving a
// JsonScanner, and keeps a separate list of the tokens that are errors.
box JsonTokenizer {
    scanner: JsonScanner   // character scanner over the input text
    tokens: ArrayBox       // tokens produced by the last tokenize() call
    errors: ArrayBox       // subset of `tokens` for which is_error() is true

    // Constructor: wraps input_text in a new JsonScanner and starts with
    // empty token and error lists.
    birth(input_text) {
        me.scanner = new JsonScanner(input_text)
        me.tokens = new ArrayBox()
        me.errors = new ArrayBox()
    }

    // ===== Main analysis methods =====

    // Tokenizes the remaining scanner input.
    // Returns me.tokens, which always ends with an EOF token unless an
    // internal error aborted the loop (then an ERROR token precedes the EOF).
    // The token and error lists are reset on entry, but the scanner is not:
    // a second call continues from wherever the scanner currently is.
    tokenize() {
        // Start from clean result lists.
        me.tokens = new ArrayBox()
        me.errors = new ArrayBox()

        // Main loop: pull tokens until the scanner is exhausted.
        loop(not me.scanner.is_eof()) {
            local token = me.next_token()

            if token != null {
                me.tokens.push(token)

                // Error tokens stay in the main stream and are also recorded separately.
                if token.is_error() {
                    me.errors.push(token)
                }

                // next_token() yields EOF when only whitespace was left.
                if token.is_eof() {
                    break
                }
            } else {
                // next_token() produced nothing: report an internal error and stop.
                local error_token = new JsonToken("ERROR", "Internal tokenizer error", me.scanner.get_position(), me.scanner.get_position() + 1)
                me.tokens.push(error_token)
                me.errors.push(error_token)
                break
            }
        }

        // Append a trailing EOF token unless the last token already is one.
        // Written with explicit nesting so that get(count - 1) is never
        // evaluated on an empty array, independent of how `or` is evaluated.
        local needs_eof = true
        local count = me.tokens.length()
        if count > 0 {
            if me.tokens.get(count - 1).is_eof() {
                needs_eof = false
            }
        }
        if needs_eof {
            me.tokens.push(new JsonToken("EOF", "", me.scanner.get_position(), me.scanner.get_position()))
        }

        return me.tokens
    }

    // Reads and returns the next single token.
    // Skips whitespace first, then dispatches on the current character:
    // structural character, string, number, keyword, or an ERROR token for
    // anything else. Returns an EOF token when no input is left.
    next_token() {
        // Skip leading whitespace.
        me.scanner.skip_whitespace()

        // Nothing left after the whitespace: report EOF.
        if me.scanner.is_eof() {
            return new JsonToken("EOF", "", me.scanner.get_position(), me.scanner.get_position())
        }

        local start_pos = me.scanner.get_position()
        local ch = me.scanner.current()

        // Structural characters (single character each).
        local structural_type = me.char_to_token_type(ch)
        if structural_type != null {
            me.scanner.advance()
            return me.create_structural_token(structural_type, start_pos)
        }

        // String literal.
        if ch == "\"" {
            return me.tokenize_string()
        }

        // Number literal.
        if me.is_number_start_char(ch) {
            return me.tokenize_number()
        }

        // Keyword (null, true, false).
        if me.is_alpha_char(ch) {
            return me.tokenize_keyword()
        }

        // Unknown character: consume it so the caller makes progress, then report it.
        me.scanner.advance()
        return new JsonToken("ERROR", "Unexpected character: '" + ch + "'", start_pos, me.scanner.get_position())
    }

    // ===== Specialized tokenizer methods =====

    // Tokenizes a string literal starting at the current scanner position.
    // Returns a STRING token whose value is the literal with its surrounding
    // quotes removed, or an ERROR token when the scanner returns null or
    // validate_string() rejects the content.
    tokenize_string() {
        local start_pos = me.scanner.get_position()
        local literal = me.scanner.read_string_literal()

        if literal == null {
            return new JsonToken("ERROR", "Unterminated string literal", start_pos, me.scanner.get_position())
        }

        // Strip the surrounding quotes. unquote_string() does not decode
        // escape sequences, so they remain in the token value as written.
        local unescaped = me.unquote_string(literal)

        // Validate the string content.
        if not me.validate_string(unescaped) {
            return new JsonToken("ERROR", "Invalid string content", start_pos, me.scanner.get_position())
        }

        return new JsonToken("STRING", unescaped, start_pos, me.scanner.get_position())
    }

    // Tokenizes a number literal starting at the current scanner position.
    // Returns a NUMBER token holding the literal text, or an ERROR token when
    // the scanner returns null or validate_number_format() rejects the text.
    tokenize_number() {
        local start_pos = me.scanner.get_position()
        local number_str = me.scanner.read_number()

        if number_str == null {
            return new JsonToken("ERROR", "Invalid number format", start_pos, me.scanner.get_position())
        }

        // Re-check the text the scanner accepted.
        if not me.validate_number_format(number_str) {
            return new JsonToken("ERROR", "Malformed number: " + number_str, start_pos, me.scanner.get_position())
        }

        return new JsonToken("NUMBER", number_str, start_pos, me.scanner.get_position())
    }

    // Tokenizes a keyword (null / true / false).
    // Returns a NULL, TRUE or FALSE token, or an ERROR token for any other word.
    tokenize_keyword() {
        local start_pos = me.scanner.get_position()

        // Read the run of identifier characters.
        // NOTE(review): the predicate is handed over as a member reference;
        // how read_while invokes it is defined by JsonScanner, which is not
        // visible here — confirm against scanner.nyash.
        local keyword = me.scanner.read_while(me.is_identifier_char)

        // Classify the word.
        local token_type = me.keyword_to_token_type(keyword)
        if token_type != null {
            return new JsonToken(token_type, keyword, start_pos, me.scanner.get_position())
        }

        // Unknown keyword.
        return new JsonToken("ERROR", "Unknown keyword: " + keyword, start_pos, me.scanner.get_position())
    }

    // ===== Helper methods =====

    // Builds the token for a one-character structural symbol that starts at
    // start_pos; the token spans [start_pos, start_pos + 1).
    create_structural_token(token_type, start_pos) {
        return new JsonToken(token_type, me.token_type_to_char(token_type), start_pos, start_pos + 1)
    }

    // Maps a structural token type name back to its character.
    // Returns "" for any type that is not structural.
    // Inverse of char_to_token_type().
    token_type_to_char(token_type) {
        return match token_type {
            "LBRACE" => "{",
            "RBRACE" => "}",
            "LBRACKET" => "[",
            "RBRACKET" => "]",
            "COMMA" => ",",
            "COLON" => ":",
            _ => ""
        }
    }

    // True when ch is a letter, a digit, or an underscore.
    is_identifier_char(ch) {
        return me.is_alphanumeric_char(ch) or ch == "_"
    }

    // Sanity-checks the start of a JSON number literal.
    // Returns false for: the empty string, a lone "-", an integer part that
    // does not start with a digit, and an integer part with a leading zero
    // followed by another digit ("01", "-01"). Everything else returns true;
    // the fraction and exponent parts are not inspected here.
    validate_number_format(num_str) {
        if num_str.length() == 0 {
            return false
        }

        // Skip an optional leading minus so the same rules apply to
        // negative numbers ("-01" is just as invalid as "01").
        local digits_start = 0
        if me.starts_with(num_str, "-") {
            digits_start = 1
        }

        // A sign with nothing after it is not a number.
        if num_str.length() <= digits_start {
            return false
        }

        // The integer part must begin with a digit.
        local first_digit = num_str.substring(digits_start, digits_start + 1)
        if not me.is_digit_char(first_digit) {
            return false
        }

        // Leading zero: "0" may only be followed by a non-digit (".", "e", ...).
        if first_digit == "0" and num_str.length() > digits_start + 1 {
            local next_char = num_str.substring(digits_start + 1, digits_start + 2)
            if me.is_digit_char(next_char) {
                return false
            }
        }

        return true
    }

    // ===== Result accessors =====

    // Returns the token list built by the last tokenize() call.
    get_tokens() {
        return me.tokens
    }

    // Returns the list of error tokens recorded by the last tokenize() call.
    get_errors() {
        return me.errors
    }

    // True when at least one error token has been recorded.
    has_errors() {
        return me.errors.length() > 0
    }

    // Number of recorded error tokens.
    get_error_count() {
        return me.errors.length()
    }

    // ===== Debugging / analysis methods =====

    // Prints a summary, then every error token (debug form), then every
    // token; error tokens in the full list are prefixed with a cross mark.
    print_tokens() {
        print("🔍 Tokenization Results:")
        print("Total tokens: " + me.tokens.length())
        print("Errors: " + me.errors.length())

        if me.has_errors() {
            print("\n❌ Errors found:")
            local error_index = 0
            loop(error_index < me.errors.length()) {
                local error = me.errors.get(error_index)
                print("  " + error.to_debug_string())
                error_index = error_index + 1
            }
        }

        print("\n📋 Token list:")
        local token_index = 0
        loop(token_index < me.tokens.length()) {
            local token = me.tokens.get(token_index)
            local prefix = "  "
            if token.is_error() {
                prefix = "❌ "
            }
            print(prefix + token.to_string())
            token_index = token_index + 1
        }
    }

    // Returns a MapBox with:
    //   "total_tokens"      - number of tokens
    //   "error_count"       - number of error tokens
    //   "success_rate"      - (total - errors) / total, or 0 when there are no tokens
    //   "type_distribution" - MapBox from token type to occurrence count
    get_statistics() {
        local stats = new MapBox()

        // Basic counters.
        local total = me.tokens.length()
        local failed = me.errors.length()
        stats.set("total_tokens", total)
        stats.set("error_count", failed)

        // Guard the division: before tokenize() has run the token list is empty.
        // NOTE(review): whether "/" on two integers truncates depends on the
        // runtime's integer division semantics — confirm if a fraction is expected.
        local success_rate = 0
        if total > 0 {
            success_rate = (total - failed) / total
        }
        stats.set("success_rate", success_rate)

        // Per-type occurrence counts.
        local type_counts = new MapBox()
        local i = 0
        loop(i < total) {
            local token = me.tokens.get(i)
            local kind = token.get_type()

            if type_counts.has(kind) {
                type_counts.set(kind, type_counts.get(kind) + 1)
            } else {
                type_counts.set(kind, 1)
            }
            i = i + 1
        }
        stats.set("type_distribution", type_counts)

        return stats
    }

    // ===== Built-in utility methods =====

    // True when ch lies in "a".."z" or "A".."Z".
    is_alpha_char(ch) {
        return (ch >= "a" and ch <= "z") or (ch >= "A" and ch <= "Z")
    }

    // True when ch lies in "0".."9".
    is_digit_char(ch) {
        return ch >= "0" and ch <= "9"
    }

    // True when ch is a letter or a digit.
    is_alphanumeric_char(ch) {
        return me.is_alpha_char(ch) or me.is_digit_char(ch)
    }

    // True when str begins with prefix.
    starts_with(str, prefix) {
        if prefix.length() > str.length() {
            return false
        }
        return str.substring(0, prefix.length()) == prefix
    }

    // Removes one pair of surrounding double quotes, if present.
    // Strings shorter than two characters, or not quoted on both ends, are
    // returned unchanged. Escape sequences are left untouched.
    unquote_string(quoted_str) {
        if quoted_str.length() < 2 {
            return quoted_str
        }
        if quoted_str.substring(0, 1) == "\"" and quoted_str.substring(quoted_str.length() - 1, quoted_str.length()) == "\"" {
            return quoted_str.substring(1, quoted_str.length() - 1)
        }
        return quoted_str
    }

    // Placeholder string validation: only checks that the length is
    // non-negative; no JSON escape or control-character validation is
    // performed here.
    validate_string(str) {
        return str.length() >= 0
    }

    // Maps a structural character to its token type name, or null when ch
    // is not a structural character.
    char_to_token_type(ch) {
        return match ch {
            "{" => "LBRACE",
            "}" => "RBRACE",
            "[" => "LBRACKET",
            "]" => "RBRACKET",
            "," => "COMMA",
            ":" => "COLON",
            _ => null
        }
    }

    // True when ch can begin a number literal: a minus sign or a digit.
    is_number_start_char(ch) {
        return ch == "-" or me.is_digit_char(ch)
    }

    // Maps a keyword to its token type name, or null for any other word.
    keyword_to_token_type(keyword) {
        return match keyword {
            "null" => "NULL",
            "true" => "TRUE",
            "false" => "FALSE",
            _ => null
        }
    }
}

// Static box required by the Nyash include system; exposes a factory
// for JsonTokenizer.
static box JsonTokenizerModule {
    // Builds a new JsonTokenizer over input_text and returns it.
    create_tokenizer(input_text) {
        local tokenizer = new JsonTokenizer(input_text)
        return tokenizer
    }
}