diff --git a/apps/tests/string_cp_mode_min.hako b/apps/tests/string_cp_mode_min.hako new file mode 100644 index 00000000..36867037 --- /dev/null +++ b/apps/tests/string_cp_mode_min.hako @@ -0,0 +1,9 @@ +static box Main { + main() { + local s = "aé𝄞" + local sb = new StringBox("aé𝄞") + + print(1000 + s.length()) + print(2000 + sb.length()) + } +} diff --git a/docs/reference/environment-variables.md b/docs/reference/environment-variables.md index df1ebbde..4053b244 100644 --- a/docs/reference/environment-variables.md +++ b/docs/reference/environment-variables.md @@ -108,6 +108,14 @@ NYASH_USE_STAGE1_CLI=1 STAGE1_EMIT_MIR_JSON=1 \ --- +## String / Unicode + +| 変数 | デフォルト | 適用経路 | 説明 | +| --- | --- | --- | --- | +| `NYASH_STR_CP=1` | OFF | Any | 文字列の `length` / `indexOf` / `lastIndexOf` / `substring` をコードポイント基準にする（既定はバイト基準） | + +--- + ## PHI デバッグ関連 (Phase 277 P2 統合版) **Phase 277 P2** で PHI 関連環境変数を **8個 → 3個** に統合しました。 diff --git a/src/backend/mir_interpreter/handlers/boxes_string.rs b/src/backend/mir_interpreter/handlers/boxes_string.rs index cd4767b7..3a2d54f3 100644 --- a/src/backend/mir_interpreter/handlers/boxes_string.rs +++ b/src/backend/mir_interpreter/handlers/boxes_string.rs @@ -3,6 +3,7 @@ use super::string_method_helpers::{ parse_index_of_args, parse_last_index_of_args, parse_substring_args, ArgParsePolicy, }; use crate::boxes::string_ops; +use crate::config::env::string_codepoint_mode; pub(super) fn try_handle_string_box( this: &mut MirInterpreter, @@ -20,7 +21,7 @@ pub(super) fn try_handle_string_box( && std::env::var("NYASH_VM_FAST").ok().as_deref() == Some("1") { if let VMValue::String(ref raw) = recv { - let use_cp = std::env::var("NYASH_STR_CP").ok().as_deref() == Some("1"); + let use_cp = string_codepoint_mode(); let n = if use_cp { raw.chars().count() as i64 } else { @@ -54,7 +55,7 @@ pub(super) fn try_handle_string_box( 
"length" | "size" => { // Bench/profile fast path: return VMValue::Integer directly (avoid boxing overhead) if std::env::var("NYASH_VM_FAST").ok().as_deref() == Some("1") { - let use_cp = std::env::var("NYASH_STR_CP").ok().as_deref() == Some("1"); + let use_cp = string_codepoint_mode(); let n = if use_cp { sb_norm.value.chars().count() as i64 } else { diff --git a/src/boxes/basic/string_box.rs b/src/boxes/basic/string_box.rs index 3e5670fb..dd8afd20 100644 --- a/src/boxes/basic/string_box.rs +++ b/src/boxes/basic/string_box.rs @@ -107,7 +107,7 @@ impl StringBox { /// otherwise use UTF-8 byte length (legacy/default). pub fn length(&self) -> Box { use crate::box_trait::IntegerBox; - let use_cp = std::env::var("NYASH_STR_CP").ok().as_deref() == Some("1"); + let use_cp = crate::config::env::string_codepoint_mode(); let n = if use_cp { self.value.chars().count() as i64 } else { @@ -144,10 +144,9 @@ impl StringBox { /// Get substring from start to end (exclusive) pub fn substring(&self, start: usize, end: usize) -> Box { - let chars: Vec = self.value.chars().collect(); - let actual_end = end.min(chars.len()); - let actual_start = start.min(actual_end); - let substring: String = chars[actual_start..actual_end].iter().collect(); + let mode = crate::boxes::string_ops::index_mode_from_env(); + let substring = + crate::boxes::string_ops::substring(&self.value, start as i64, Some(end as i64), mode); Box::new(StringBox::new(substring)) } } diff --git a/src/boxes/string_ops.rs b/src/boxes/string_ops.rs index f0eac34c..647024e0 100644 --- a/src/boxes/string_ops.rs +++ b/src/boxes/string_ops.rs @@ -7,7 +7,7 @@ pub enum StringIndexMode { } pub fn index_mode_from_env() -> StringIndexMode { - if std::env::var("NYASH_STR_CP").ok().as_deref() == Some("1") { + if crate::config::env::string_codepoint_mode() { StringIndexMode::CodePoint } else { StringIndexMode::Byte diff --git a/src/config/env.rs b/src/config/env.rs index 5c29155d..939a4fdc 100644 --- a/src/config/env.rs +++ 
b/src/config/env.rs @@ -25,6 +25,7 @@ //! | `using_flags` | `NYASH_USING_*` | Using / Namespace 設定 | //! | `verification_flags` | `NYASH_VERIFY_*` | Verification 設定 | //! | `selfhost_flags` | `NYASH_NY_COMPILER_*` | Selfhost compiler 設定 | +//! | `string_flags` | `NYASH_STR_*` | String / Unicode 設定 | //! //! ## 新規環境変数追加の手順 //! @@ -79,6 +80,7 @@ //! - `using_flags` - Using/namespace flags (10+ functions) //! - `verification_flags` - Verification flags (8+ functions) //! - `selfhost_flags` - Selfhost compiler flags (10+ functions) +//! - `string_flags` - String/Unicode flags (1+ functions) //! //! All functions are re-exported at the top level for backward compatibility. @@ -96,6 +98,7 @@ mod macro_flags; mod mir_flags; mod parser_flags; mod selfhost_flags; +mod string_flags; mod using_flags; mod verification_flags; mod vm_backend_flags; @@ -114,6 +117,7 @@ pub use macro_flags::*; pub use mir_flags::*; pub use parser_flags::*; pub use selfhost_flags::*; +pub use string_flags::*; pub use using_flags::*; pub use verification_flags::*; pub use vm_backend_flags::*; diff --git a/src/config/env/catalog.rs b/src/config/env/catalog.rs index bd6c928a..1578d340 100644 --- a/src/config/env/catalog.rs +++ b/src/config/env/catalog.rs @@ -236,6 +236,13 @@ pub fn env_vars() -> Vec { applies_to: AppliesTo::BoxFactory, default: None, }, + // Runtime (Phase 29ab) + EnvVarMeta { + name: "NYASH_STR_CP", + description: "String index mode: 1=codepoint, 0=byte (default)", + applies_to: AppliesTo::Runtime, + default: Some("0"), + }, // Selfhost (Phase 286B) EnvVarMeta { name: "NYASH_NY_COMPILER_TIMEOUT_MS", diff --git a/src/config/env/string_flags.rs b/src/config/env/string_flags.rs new file mode 100644 index 00000000..4b0c315f --- /dev/null +++ b/src/config/env/string_flags.rs @@ -0,0 +1,9 @@ +//! String-related environment flags. + +use super::env_bool; + +/// NYASH_STR_CP=1: use Unicode code point indexing for string operations. +/// Default: OFF (byte indexing). 
+pub fn string_codepoint_mode() -> bool { + env_bool("NYASH_STR_CP") +} diff --git a/tools/smokes/v2/profiles/integration/apps/string_cp_mode_min_vm.sh b/tools/smokes/v2/profiles/integration/apps/string_cp_mode_min_vm.sh new file mode 100644 index 00000000..c24f0c63 --- /dev/null +++ b/tools/smokes/v2/profiles/integration/apps/string_cp_mode_min_vm.sh @@ -0,0 +1,45 @@ +#!/bin/bash +# String CP/byte mode parity for primitive String and StringBox. + +source "$(dirname "$0")/../../../lib/test_runner.sh" +export SMOKES_USE_PYVM=0 +require_env || exit 2 + +INPUT="$NYASH_ROOT/apps/tests/string_cp_mode_min.hako" +RUN_TIMEOUT_SECS=${RUN_TIMEOUT_SECS:-10} + +run_case() { + local mode="$1" + local expect_len="$2" + + set +e + OUTPUT=$(timeout "$RUN_TIMEOUT_SECS" env -u NYASH_ROOT NYASH_STR_CP="$mode" "$NYASH_BIN" "$INPUT" 2>&1) + EXIT_CODE=$? + set -e + + if [ "$EXIT_CODE" -eq 124 ]; then + test_fail "string_cp_mode_min_vm: hakorune timed out (>${RUN_TIMEOUT_SECS}s)" + exit 1 + fi + + OUTPUT_CLEAN="$OUTPUT" + + expect_line "$((1000 + expect_len))" + expect_line "$((2000 + expect_len))" +} + +expect_line() { + local line="$1" + if ! echo "$OUTPUT_CLEAN" | grep -Fxq "$line"; then + echo "[FAIL] Missing expected line: $line" + echo "[INFO] Output (clean):" + echo "$OUTPUT_CLEAN" | tail -n 20 || true + test_fail "string_cp_mode_min_vm: output mismatch" + exit 1 + fi +} + +run_case 0 7 +run_case 1 3 + +test_pass "string_cp_mode_min_vm: NYASH_STR_CP byte/cp modes match for String and StringBox"