refactor: unify string helpers and pattern2 derived slot

This commit is contained in:
2025-12-28 13:22:02 +09:00
parent 84e1cd7c7b
commit 10e6a15552
41 changed files with 2044 additions and 585 deletions

View File

@ -41,10 +41,9 @@ impl StringBox {
/// Find substring and return position (or -1 if not found)
pub fn find(&self, search: &str) -> Box<dyn NyashBox> {
use crate::box_trait::IntegerBox;
// NOTE(review): this listing is a diff hunk rendered without +/- markers, so two
// bodies are interleaved here. The `match` below looks like the pre-change
// implementation (byte offset from `str::find`, `-1` when absent) — confirm against
// the actual file before relying on it.
match self.value.find(search) {
Some(pos) => Box::new(IntegerBox::new(pos as i64)),
None => Box::new(IntegerBox::new(-1)),
}
// Presumably the replacement body: the search is delegated to the shared
// `string_ops` helpers. `None` means "no start offset", and the index unit
// (byte vs codepoint) is whatever `index_mode_from_env()` reports.
let mode = crate::boxes::string_ops::index_mode_from_env();
let idx = crate::boxes::string_ops::index_of(&self.value, search, None, mode);
Box::new(IntegerBox::new(idx))
}
/// Replace all occurrences of old with new

View File

@ -65,6 +65,7 @@ pub mod integer_box;
pub mod math_box;
pub mod random_box;
pub mod string_box;
pub mod string_ops;
pub mod time_box;
// These boxes use web APIs that require special handling in WASM
pub mod aot_compiler_box;

View File

@ -13,6 +13,7 @@
* - `toLowerCase()` - 小文字変換
* - `trim()` - 前後の空白除去
* - `indexOf(search)` - 文字列検索
* - `indexOf(search, fromIndex)` - 指定位置から検索
* - `replace(from, to)` - 文字列置換
* - `charAt(index)` - 指定位置の文字取得
*
@ -71,18 +72,18 @@ impl StringBox {
/// Env gate: NYASH_STR_CP=1 → return codepoint index; default is byte index
pub fn find(&self, search: &str) -> Box<dyn NyashBox> {
use crate::boxes::integer_box::IntegerBox;
// NOTE(review): this listing is a diff hunk rendered without +/- markers, so two
// bodies are interleaved here. The `match` below looks like the pre-change
// implementation — confirm against the actual file.
match self.value.find(search) {
Some(byte_pos) => {
// Env gate read inline: only the exact value "1" enables codepoint indices.
let use_cp = std::env::var("NYASH_STR_CP").ok().as_deref() == Some("1");
let idx = if use_cp {
// Codepoint index = number of chars in the prefix before the match.
self.value[..byte_pos].chars().count() as i64
} else {
byte_pos as i64
};
Box::new(IntegerBox::new(idx))
}
None => Box::new(IntegerBox::new(-1)),
}
// Presumably the replacement body: the same search routed through the shared
// `string_ops` helpers, with no start offset (`None`) and the index unit taken
// from `index_mode_from_env()`.
let mode = crate::boxes::string_ops::index_mode_from_env();
let idx = crate::boxes::string_ops::index_of(&self.value, search, None, mode);
Box::new(IntegerBox::new(idx))
}
/// Searches for `search` beginning at position `start` and returns the index of the
/// first match wrapped in an `IntegerBox`, or `-1` when there is no match.
///
/// Both `start` and the returned index use the unit selected by
/// `string_ops::index_mode_from_env()`.
/// Env gate: NYASH_STR_CP=1 → indices are codepoint-based; default is byte index
pub fn find_from(&self, search: &str, start: i64) -> Box<dyn NyashBox> {
    use crate::boxes::integer_box::IntegerBox;
    use crate::boxes::string_ops::{index_mode_from_env, index_of};

    let position = index_of(&self.value, search, Some(start), index_mode_from_env());
    Box::new(IntegerBox::new(position))
}
/// Replace all occurrences of old with new
@ -94,18 +95,9 @@ impl StringBox {
/// Env gate: NYASH_STR_CP=1 → return codepoint index; default is byte index.
pub fn lastIndexOf(&self, search: &str) -> Box<dyn NyashBox> {
use crate::boxes::integer_box::IntegerBox;
// NOTE(review): this listing is a diff hunk rendered without +/- markers, so two
// bodies are interleaved here. The `match` below looks like the pre-change
// implementation — confirm against the actual file.
match self.value.rfind(search) {
Some(byte_pos) => {
// Env gate read inline: only the exact value "1" enables codepoint indices.
let use_cp = std::env::var("NYASH_STR_CP").ok().as_deref() == Some("1");
let idx = if use_cp {
// Codepoint index = number of chars in the prefix before the match.
self.value[..byte_pos].chars().count() as i64
} else {
byte_pos as i64
};
Box::new(IntegerBox::new(idx))
}
None => Box::new(IntegerBox::new(-1)),
}
// Presumably the replacement body: the reverse search is delegated to
// `string_ops::last_index_of`, with the index unit taken from
// `index_mode_from_env()`.
let mode = crate::boxes::string_ops::index_mode_from_env();
let idx = crate::boxes::string_ops::last_index_of(&self.value, search, mode);
Box::new(IntegerBox::new(idx))
}
/// Trim whitespace from both ends

101
src/boxes/string_ops.rs Normal file
View File

@ -0,0 +1,101 @@
//! Shared string indexing helpers (byte vs codepoint).
/// Unit in which the helpers of this module express string positions.
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
pub enum StringIndexMode {
/// Positions are UTF-8 byte offsets into the string.
Byte,
/// Positions count `char`s (as produced by `str::chars`) from the start of the string.
CodePoint,
}
/// Picks the string index mode from the `NYASH_STR_CP` environment variable.
///
/// Returns [`StringIndexMode::CodePoint`] only when the variable is set to exactly
/// `"1"`. An unset, unreadable, or differently-valued variable yields
/// [`StringIndexMode::Byte`].
pub fn index_mode_from_env() -> StringIndexMode {
    match std::env::var("NYASH_STR_CP") {
        Ok(flag) if flag == "1" => StringIndexMode::CodePoint,
        _ => StringIndexMode::Byte,
    }
}
pub fn index_of(haystack: &str, needle: &str, start: Option<i64>, mode: StringIndexMode) -> i64 {
match mode {
StringIndexMode::Byte => index_of_bytes(haystack, needle, start),
StringIndexMode::CodePoint => index_of_codepoints(haystack, needle, start),
}
}
/// Returns the position of the last occurrence of `needle` in `haystack`, or `-1`
/// when it does not occur.
///
/// The position is a byte offset in [`StringIndexMode::Byte`] and a `char` count in
/// [`StringIndexMode::CodePoint`].
pub fn last_index_of(haystack: &str, needle: &str, mode: StringIndexMode) -> i64 {
    let Some(byte_pos) = haystack.rfind(needle) else {
        return -1;
    };
    match mode {
        StringIndexMode::Byte => byte_pos as i64,
        // Codepoint index = number of chars in the prefix before the match.
        StringIndexMode::CodePoint => haystack[..byte_pos].chars().count() as i64,
    }
}
pub fn substring(haystack: &str, start: i64, end: Option<i64>, mode: StringIndexMode) -> String {
match mode {
StringIndexMode::Byte => substring_bytes(haystack, start, end),
StringIndexMode::CodePoint => substring_codepoints(haystack, start, end),
}
}
/// Byte-indexed search: returns the byte offset of the first occurrence of `needle`
/// at or after byte offset `start`, or `-1` when there is none.
///
/// * `start == None` or a negative `start` searches from the beginning.
/// * A `start` beyond the end of `haystack` yields `-1`.
/// * A `start` that falls inside a multi-byte UTF-8 character is rounded up to the
///   next character boundary instead of panicking.
fn index_of_bytes(haystack: &str, needle: &str, start: Option<i64>) -> i64 {
    // Checked conversion: a value that does not fit in `usize` is necessarily past
    // the end of the string (an `as` cast would silently truncate on 32-bit targets).
    let Ok(mut start_idx) = usize::try_from(start.unwrap_or(0).max(0)) else {
        return -1;
    };
    if start_idx > haystack.len() {
        return -1;
    }
    // Slicing a `&str` off a char boundary panics. A match can only begin on a
    // boundary, so rounding up skips no candidate. `haystack.len()` is always a
    // boundary, which guarantees the loop terminates.
    while !haystack.is_char_boundary(start_idx) {
        start_idx += 1;
    }
    haystack[start_idx..]
        .find(needle)
        .map(|rel| (start_idx + rel) as i64)
        .unwrap_or(-1)
}
/// Codepoint-indexed search: returns the `char` index of the first occurrence of
/// `needle` at or after `char` index `start`, or `-1` when there is none.
///
/// * `start == None` or a negative `start` searches from the beginning.
/// * A `start` greater than the number of `char`s in `haystack` yields `-1`.
fn index_of_codepoints(haystack: &str, needle: &str, start: Option<i64>) -> i64 {
    // Checked conversion: a value that does not fit in `usize` is necessarily past
    // the end of the string (an `as` cast would silently truncate on 32-bit targets).
    let Ok(start_idx) = usize::try_from(start.unwrap_or(0).max(0)) else {
        return -1;
    };
    // `byte_offset_for_cp` yields `None` when `start_idx` is past the end; otherwise
    // the offset is a char boundary no greater than `haystack.len()`, so the slice
    // below cannot panic and needs no further bounds check.
    let Some(byte_start) = byte_offset_for_cp(haystack, start_idx) else {
        return -1;
    };
    haystack[byte_start..]
        .find(needle)
        .map(|rel| {
            // Exactly `start_idx` chars precede `byte_start`; only the gap between
            // the search start and the match still has to be counted.
            let gap = haystack[byte_start..byte_start + rel].chars().count();
            (start_idx + gap) as i64
        })
        .unwrap_or(-1)
}
/// Byte-indexed substring: returns `haystack[start..end]` as an owned `String`.
///
/// Both bounds are clamped to `0..=haystack.len()`; `end == None` means the end of
/// the string. An inverted range yields an empty string, and so does a range whose
/// bounds do not both fall on UTF-8 character boundaries.
fn substring_bytes(haystack: &str, start: i64, end: Option<i64>) -> String {
    let len = haystack.len() as i64;
    let lo = start.clamp(0, len);
    let hi = end.map_or(len, |e| e.clamp(0, len));
    if lo > hi {
        return String::new();
    }
    // `str::get` returns `None` when either bound splits a character; that case
    // maps to the empty string.
    haystack
        .get(lo as usize..hi as usize)
        .map(str::to_owned)
        .unwrap_or_default()
}
/// Codepoint-indexed substring: returns the `char`s of `haystack` from index `start`
/// (inclusive) up to index `end` (exclusive).
///
/// A negative `start` is treated as 0, `end == None` means the end of the string,
/// bounds past the end are clamped to it, and an `end` at or before `start` yields an
/// empty string.
fn substring_codepoints(haystack: &str, start: i64, end: Option<i64>) -> String {
    // Single pass with `skip`/`take`: the iterator running dry performs the clamping
    // to the string length, so no up-front char count or `Vec<char>` is needed.
    let first = usize::try_from(start.max(0)).unwrap_or(usize::MAX);
    let tail = haystack.chars().skip(first);
    match end {
        None => tail.collect(),
        Some(end) => {
            let stop = usize::try_from(end.max(0)).unwrap_or(usize::MAX);
            // `saturating_sub` turns an inverted range into a zero-length take.
            tail.take(stop.saturating_sub(first)).collect()
        }
    }
}
/// Converts a `char` index into the byte offset at which that `char` starts.
///
/// An index equal to the number of `char`s maps to `haystack.len()` (one past the last
/// character); any larger index yields `None`.
fn byte_offset_for_cp(haystack: &str, cp_index: usize) -> Option<usize> {
    haystack
        .char_indices()
        .map(|(byte_pos, _)| byte_pos)
        // The end of the string is a valid position too.
        .chain(std::iter::once(haystack.len()))
        .nth(cp_index)
}