2025-09-23 04:51:17 +09:00
|
|
|
|
// JsonTokenizer — 精度重視の字句解析器(yyjson相当精度)
|
|
|
|
|
|
// 責務: 文字列をトークン列に変換、エラー検出、位置情報管理
|
|
|
|
|
|
|
2025-09-25 00:41:56 +09:00
|
|
|
|
using "apps/lib/json_native/lexer/scanner.nyash" as JsonScanner
|
|
|
|
|
|
using "apps/lib/json_native/lexer/token.nyash" as JsonToken
|
2025-09-25 10:23:14 +09:00
|
|
|
|
using "apps/lib/json_native/utils/escape.nyash" as EscapeUtils
|
2025-09-23 04:51:17 +09:00
|
|
|
|
// No further dependencies: the remaining helpers are self-contained methods on JsonTokenizer
|
|
|
|
|
|
|
|
|
|
|
|
// 🎯 高精度JSONトークナイザー(Everything is Box)
|
|
|
|
|
|
box JsonTokenizer {
    // Lexer for JSON text: pulls characters from a JsonScanner and produces
    // JsonToken values (structural characters, strings, numbers, keywords,
    // ERROR and EOF), collecting error tokens separately as it goes.

    scanner: JsonScanner    // character scanner over the input text
    tokens: ArrayBox        // tokens produced by the last tokenize() run
    errors: ArrayBox        // ERROR tokens / internal error tokens recorded so far

    // Build a tokenizer over `input_text`.
    birth(input_text) {
        // Avoid static module wrapper to ensure constructor args are preserved on VM path
        // (create_scanner(...) lost the argument under VM fallback in some cases)
        me.scanner = new JsonScanner(input_text)
        me.tokens = new ArrayBox()
        me.errors = new ArrayBox()
    }

    // ===== Main tokenization methods =====

    // Tokenize everything the scanner has left.
    // Returns the token array; the last element is always an EOF token.
    // Tokens for which is_error() is true are also pushed to me.errors.
    tokenize() {
        // Start from empty result lists on every run.
        me.tokens = new ArrayBox()
        me.errors = new ArrayBox()

        loop(not me.scanner.is_eof()) {
            local token = me.next_token()

            if token != null {
                me.tokens.push(token)

                // Keep error tokens in the stream and record them separately.
                if token.is_error() {
                    me.errors.push(token)
                }

                // next_token() yields EOF when only whitespace remained.
                if token.is_eof() {
                    break
                }
            } else {
                // next_token() produced nothing: report an internal error and stop.
                local fail_pos = me.scanner.get_position()
                local error_token = new JsonToken("ERROR", "Internal tokenizer error", fail_pos, fail_pos + 1).set_line_column(me.scanner.get_line(), me.scanner.get_column())
                me.tokens.push(error_token)
                me.errors.push(error_token)
                break
            }
        }

        // Append a trailing EOF token unless the loop already produced one.
        // It carries line/column like the EOF token built in next_token().
        if me.tokens.length() == 0 or not me.tokens.get(me.tokens.length() - 1).is_eof() {
            local end_pos = me.scanner.get_position()
            me.tokens.push(new JsonToken("EOF", "", end_pos, end_pos).set_line_column(me.scanner.get_line(), me.scanner.get_column()))
        }

        return me.tokens
    }

    // Read and return the next single token.
    // Every returned token has its line/column set to where the token started.
    next_token() {
        me.scanner.skip_whitespace()

        // Nothing left after whitespace: zero-width EOF token.
        if me.scanner.is_eof() {
            return new JsonToken("EOF", "", me.scanner.get_position(), me.scanner.get_position()).set_line_column(me.scanner.get_line(), me.scanner.get_column())
        }

        // Capture the start location before any character is consumed.
        local start_pos = me.scanner.get_position()
        local start_line = me.scanner.get_line()
        local start_col = me.scanner.get_column()
        local ch = me.scanner.current()

        // Structural characters (single character tokens).
        local structural_type = me.char_to_token_type(ch)
        if structural_type != null {
            me.scanner.advance()
            return me.create_structural_token(structural_type, start_pos).set_line_column(start_line, start_col)
        }

        // String literal.
        if ch == "\"" {
            return me.tokenize_string().set_line_column(start_line, start_col)
        }

        // Number literal.
        if me.is_number_start_char(ch) {
            return me.tokenize_number().set_line_column(start_line, start_col)
        }

        // Keyword (null, true, false).
        if me.is_alpha_char(ch) {
            return me.tokenize_keyword().set_line_column(start_line, start_col)
        }

        // Anything else: consume one character so the caller's loop makes
        // progress, and report it as an error token.
        me.scanner.advance()
        return new JsonToken("ERROR", "Unexpected character: '" + ch + "'", start_pos, me.scanner.get_position()).set_line_column(start_line, start_col)
    }

    // ===== Specialised tokenizer methods =====

    // Tokenize a string literal starting at the scanner's current position.
    // Returns a STRING token holding the unescaped value, or an ERROR token.
    tokenize_string() {
        local start_pos = me.scanner.get_position()
        local literal = me.scanner.read_string_literal()

        if literal == null {
            return new JsonToken("ERROR", "Unterminated string literal", start_pos, me.scanner.get_position())
        }

        // Strip the quotes and resolve escapes (strict variant).
        local unescaped = EscapeUtils.unquote_string(literal)

        if not me.validate_string(unescaped) {
            return new JsonToken("ERROR", "Invalid string content", start_pos, me.scanner.get_position())
        }

        return new JsonToken("STRING", unescaped, start_pos, me.scanner.get_position())
    }

    // Tokenize a number literal starting at the scanner's current position.
    // Returns a NUMBER token holding the source text, or an ERROR token.
    tokenize_number() {
        local start_pos = me.scanner.get_position()
        local number_str = me.scanner.read_number()

        if number_str == null {
            return new JsonToken("ERROR", "Invalid number format", start_pos, me.scanner.get_position())
        }

        // Re-check the text the scanner accepted.
        if not me.validate_number_format(number_str) {
            return new JsonToken("ERROR", "Malformed number: " + number_str, start_pos, me.scanner.get_position())
        }

        return new JsonToken("NUMBER", number_str, start_pos, me.scanner.get_position())
    }

    // Tokenize a keyword starting at the scanner's current position.
    // Returns a NULL / TRUE / FALSE token, or an ERROR token for any other word.
    tokenize_keyword() {
        local start_pos = me.scanner.get_position()

        // Read letters / digits / underscore (safe variant that avoids function references).
        local keyword = me.scanner.read_identifier()

        local token_type = me.keyword_to_token_type(keyword)
        if token_type != null {
            return new JsonToken(token_type, keyword, start_pos, me.scanner.get_position())
        }

        // Unknown keyword.
        return new JsonToken("ERROR", "Unknown keyword: " + keyword, start_pos, me.scanner.get_position())
    }

    // ===== Helper methods =====

    // Build the one-character token for a structural token type.
    create_structural_token(token_type, start_pos) {
        return new JsonToken(token_type, me.token_type_to_char(token_type), start_pos, start_pos + 1)
    }

    // Map a structural token type name to its character.
    // Returns "" for any other token type.
    token_type_to_char(token_type) {
        return match token_type {
            "LBRACE" => "{",
            "RBRACE" => "}",
            "LBRACKET" => "[",
            "RBRACKET" => "]",
            "COMMA" => ",",
            "COLON" => ":",
            _ => ""
        }
    }

    // Check the sign / leading-digit rules of a JSON number's text.
    // Returns false for "", for a lone "-", for a "-" not followed by a digit,
    // and for a leading zero followed by another digit ("01", "-01").
    // The fraction and exponent parts are not inspected here.
    validate_number_format(num_str) {
        if num_str.length() == 0 {
            return false
        }

        // Drop an optional minus sign so the leading-zero rule below applies
        // to negative numbers as well.
        local unsigned = num_str
        if me.starts_with(num_str, "-") {
            if num_str.length() == 1 {
                return false    // "-" alone is invalid
            }
            unsigned = num_str.substring(1, num_str.length())
            if not me.is_digit_char(unsigned.substring(0, 1)) {
                return false    // the sign must be followed by a digit
            }
        }

        // No leading zeros: a "0" may not be followed by another digit.
        if unsigned.length() > 1 and unsigned.substring(0, 1) == "0" {
            local second_char = unsigned.substring(1, 2)
            if me.is_digit_char(second_char) {
                return false    // "01", "02", "-01", ... are invalid
            }
        }

        return true
    }

    // ===== Result accessors =====

    // Tokens produced by the last tokenize() run.
    get_tokens() {
        return me.tokens
    }

    // Error tokens recorded by the last tokenize() run.
    get_errors() {
        return me.errors
    }

    // True when at least one error token was recorded.
    has_errors() {
        return me.errors.length() > 0
    }

    // Number of recorded error tokens.
    get_error_count() {
        return me.errors.length()
    }

    // ===== Debug / analysis methods =====

    // Print a summary, then the recorded errors (if any), then every token.
    print_tokens() {
        print("🔍 Tokenization Results:")
        print("Total tokens: " + me.tokens.length())
        print("Errors: " + me.errors.length())

        if me.has_errors() {
            print("\n❌ Errors found:")
            local err_idx = 0
            loop(err_idx < me.errors.length()) {
                local error = me.errors.get(err_idx)
                print(" " + error.to_debug_string())
                err_idx = err_idx + 1
            }
        }

        print("\n📋 Token list:")
        local tok_idx = 0
        loop(tok_idx < me.tokens.length()) {
            local token = me.tokens.get(tok_idx)
            local prefix = " "
            if token.is_error() {
                prefix = "❌ "
            }
            print(prefix + token.to_string())
            tok_idx = tok_idx + 1
        }
    }

    // Build a MapBox with "total_tokens", "error_count", "success_rate" and
    // "type_distribution" (a MapBox counting tokens per get_type() value).
    get_statistics() {
        local stats = new MapBox()

        // Basic counters.
        local total = me.tokens.length()
        local error_total = me.errors.length()
        stats.set("total_tokens", total)
        stats.set("error_count", error_total)

        // Guard the ratio: with no tokens (e.g. before tokenize() has run)
        // the division below would divide by zero, so report 0 instead.
        if total == 0 {
            stats.set("success_rate", 0)
        } else {
            stats.set("success_rate", (total - error_total) / total)
        }

        // Count tokens per token type.
        local type_counts = new MapBox()
        local i = 0
        loop(i < total) {
            local token = me.tokens.get(i)
            local kind = token.get_type()

            if type_counts.has(kind) {
                type_counts.set(kind, type_counts.get(kind) + 1)
            } else {
                type_counts.set(kind, 1)
            }
            i = i + 1
        }
        stats.set("type_distribution", type_counts)

        return stats
    }

    // ===== Built-in utility methods =====

    // True when ch lies in "a".."z" or "A".."Z".
    is_alpha_char(ch) {
        return (ch >= "a" and ch <= "z") or (ch >= "A" and ch <= "Z")
    }

    // True when ch lies in "0".."9".
    is_digit_char(ch) {
        return ch >= "0" and ch <= "9"
    }

    // True when ch is a letter or a digit.
    is_alphanumeric_char(ch) {
        return me.is_alpha_char(ch) or me.is_digit_char(ch)
    }

    // True when str begins with prefix.
    starts_with(str, prefix) {
        if prefix.length() > str.length() {
            return false
        }
        return str.substring(0, prefix.length()) == prefix
    }

    // Simple unquote: strip one pair of surrounding double quotes if present.
    // Escape sequences are left untouched; anything else is returned as is.
    unquote_string(quoted_str) {
        if quoted_str.length() < 2 {
            return quoted_str
        }
        if quoted_str.substring(0, 1) == "\"" and quoted_str.substring(quoted_str.length() - 1, quoted_str.length()) == "\"" {
            return quoted_str.substring(1, quoted_str.length() - 1)
        }
        return quoted_str
    }

    // Simple string validation: presence check only.
    // Real JSON escape validation is left to the unquoting step.
    validate_string(str) {
        // NOTE(review): guards against a null result from the unquoting step;
        // whether EscapeUtils.unquote_string can return null is not visible here.
        if str == null {
            return false
        }
        return str.length() >= 0
    }

    // Map a structural character to its token type name, or null.
    char_to_token_type(ch) {
        return match ch {
            "{" => "LBRACE",
            "}" => "RBRACE",
            "[" => "LBRACKET",
            "]" => "RBRACKET",
            "," => "COMMA",
            ":" => "COLON",
            _ => null
        }
    }

    // True when ch can start a number: "-" or a digit.
    is_number_start_char(ch) {
        return ch == "-" or me.is_digit_char(ch)
    }

    // Map a keyword's text to its token type name, or null.
    keyword_to_token_type(keyword) {
        return match keyword {
            "null" => "NULL",
            "true" => "TRUE",
            "false" => "FALSE",
            _ => null
        }
    }
}
|
|
|
|
|
|
|
|
|
|
|
|
// 🎯 Static Box - Nyashインクルードシステム要件
|
|
|
|
|
|
static box JsonTokenizerModule {
    // Factory entry point: build a JsonTokenizer over `input_text` and hand it back.
    create_tokenizer(input_text) {
        local tokenizer = new JsonTokenizer(input_text)
        return tokenizer
    }
}
|