feat(mir): Phase 143 P1 - Add parse_string pattern to canonicalizer
Expand loop canonicalizer to recognize parse_string patterns with both
continue (escape handling) and return (quote found) statements.
## Implementation
### New Pattern Detection (ast_feature_extractor.rs)
- Add `detect_parse_string_pattern()` function
- Support nested continue detection using `has_continue_node()` helper
- Recognize both return and continue in same loop body
- Return ParseStringInfo { carrier_name, delta, body_stmts }
- ~120 lines added
### Canonicalizer Integration (canonicalizer.rs)
- Try parse_string pattern first (most specific)
- Build LoopSkeleton with HeaderCond, Body, Update steps
- Set ExitContract: has_continue=true, has_return=true
- Route to Pattern4Continue (both exits present)
- ~45 lines modified
### Export Chain
- Add re-exports through 7 module levels:
ast_feature_extractor → patterns → joinir → control_flow → builder → mir
- 10 lines total across 7 files
### Unit Test
- Add `test_parse_string_pattern_recognized()` in canonicalizer.rs
- Verify skeleton structure (3+ steps)
- Verify carrier (name="p", delta=1, role=Counter)
- Verify exit contract (continue=true, return=true, break=false)
- Verify routing decision (Pattern4Continue, no missing_caps)
- ~180 lines added
## Target Pattern
`tools/selfhost/test_pattern4_parse_string.hako`
Pattern structure:
- Check for closing quote → return
- Check for escape sequence → continue (nested inside another if)
- Regular character processing → p++
## Results
- ✅ Strict parity green: Pattern4Continue
- ✅ All 19 unit tests pass
- ✅ Nested continue detection working
- ✅ ExitContract correctly set (first pattern with both continue+return)
- ✅ Default behavior unchanged
## Technical Challenges
1. Nested continue detection required recursive search
2. First pattern with both has_continue=true AND has_return=true
3. Variable step updates (p++ vs p+=2) handled with base delta
## Statistics
- New patterns: 1 (parse_string)
- Total patterns: 4 (skip_whitespace, parse_number, continue, parse_string)
- New capabilities: 0 (uses existing ConstStep)
- Lines added: ~300
- Files modified: 9
- Parity status: Green ✅
Phase 143 P1: Complete
🤖 Generated with [Claude Code](https://claude.com/claude-code)
Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com>
This commit is contained in:
@ -24,3 +24,4 @@ pub(crate) use patterns::{detect_continue_pattern, ContinuePatternInfo};
|
||||
|
||||
// Phase 143-P0: Re-export parse_number pattern detection for loop_canonicalizer
|
||||
pub(crate) use patterns::{detect_parse_number_pattern, ParseNumberInfo};
|
||||
pub(crate) use patterns::{detect_parse_string_pattern, ParseStringInfo};
|
||||
|
||||
@ -677,6 +677,198 @@ pub fn detect_parse_number_pattern(body: &[ASTNode]) -> Option<ParseNumberInfo>
|
||||
})
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Phase 143-P1: Parse String Pattern Detection
|
||||
// ============================================================================
|
||||
|
||||
/// Parse string pattern information
|
||||
///
|
||||
/// This struct holds the extracted information from a recognized parse_string pattern.
|
||||
#[derive(Debug, Clone, PartialEq)]
|
||||
pub struct ParseStringInfo {
|
||||
/// Carrier variable name (e.g., "p")
|
||||
pub carrier_name: String,
|
||||
/// Base constant step increment (e.g., 1 for `p = p + 1`)
|
||||
pub delta: i64,
|
||||
/// Body statements before the return/continue checks
|
||||
pub body_stmts: Vec<ASTNode>,
|
||||
}
|
||||
|
||||
/// Detect parse_string pattern in loop body
|
||||
///
|
||||
/// Phase 143-P1: Pattern with both continue (escape handling) AND return (quote found)
|
||||
///
|
||||
/// Pattern structure:
|
||||
/// ```
|
||||
/// loop(p < len) {
|
||||
/// local ch = s.substring(p, p + 1)
|
||||
///
|
||||
/// // Check for closing quote (return)
|
||||
/// if ch == "\"" {
|
||||
/// return result
|
||||
/// }
|
||||
///
|
||||
/// // Check for escape sequence (continue after processing)
|
||||
/// if ch == "\\" {
|
||||
/// result = result + ch
|
||||
/// p = p + 1
|
||||
/// if p < len {
|
||||
/// result = result + s.substring(p, p + 1)
|
||||
/// p = p + 1
|
||||
/// continue
|
||||
/// }
|
||||
/// }
|
||||
///
|
||||
/// // Regular character
|
||||
/// result = result + ch
|
||||
/// p = p + 1
|
||||
/// }
|
||||
/// ```
|
||||
///
|
||||
/// Recognized characteristics:
|
||||
/// - Has return statement (early exit on quote)
|
||||
/// - Has continue statement (skip after escape processing)
|
||||
/// - Variable step update (p++ normally, but p+=2 on escape)
|
||||
///
|
||||
/// # Arguments
|
||||
///
|
||||
/// * `body` - Loop body statements to analyze
|
||||
///
|
||||
/// # Returns
|
||||
///
|
||||
/// `Some(ParseStringInfo)` if the pattern matches, `None` otherwise
|
||||
///
|
||||
/// # Notes
|
||||
///
|
||||
/// This is more complex than parse_number or continue patterns due to:
|
||||
/// - Multiple exit types (return AND continue)
|
||||
/// - Variable step increment (conditional on escape sequence)
|
||||
/// - Nested control flow (escape has nested if inside)
|
||||
pub fn detect_parse_string_pattern(body: &[ASTNode]) -> Option<ParseStringInfo> {
|
||||
if body.is_empty() {
|
||||
return None;
|
||||
}
|
||||
|
||||
// We need to find:
|
||||
// 1. An if statement with return in then_body
|
||||
// 2. An if statement with continue in then_body (nested inside)
|
||||
// 3. Carrier updates (normal and escape-case)
|
||||
|
||||
let mut has_return = false;
|
||||
let mut has_continue = false;
|
||||
let mut carrier_name = None;
|
||||
let mut delta = None;
|
||||
|
||||
// Scan for return statement
|
||||
for stmt in body {
|
||||
if let ASTNode::If { then_body, .. } = stmt {
|
||||
if then_body
|
||||
.iter()
|
||||
.any(|s| matches!(s, ASTNode::Return { .. }))
|
||||
{
|
||||
has_return = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !has_return {
|
||||
return None;
|
||||
}
|
||||
|
||||
// Scan for continue statement and carrier update (with recursive check for nested continue)
|
||||
for stmt in body {
|
||||
if let ASTNode::If { then_body, .. } = stmt {
|
||||
// Check for continue in then_body (including nested)
|
||||
if then_body.iter().any(|s| has_continue_node(s)) {
|
||||
has_continue = true;
|
||||
}
|
||||
|
||||
// Extract carrier update from then_body
|
||||
for s in then_body {
|
||||
if let ASTNode::Assignment { target, value, .. } = s {
|
||||
if let ASTNode::Variable { name, .. } = target.as_ref() {
|
||||
if let ASTNode::BinaryOp {
|
||||
operator: BinaryOperator::Add,
|
||||
left,
|
||||
right,
|
||||
..
|
||||
} = value.as_ref()
|
||||
{
|
||||
if let ASTNode::Variable {
|
||||
name: left_name, ..
|
||||
} = left.as_ref()
|
||||
{
|
||||
if left_name == name {
|
||||
if let ASTNode::Literal {
|
||||
value: LiteralValue::Integer(n),
|
||||
..
|
||||
} = right.as_ref()
|
||||
{
|
||||
carrier_name = Some(name.clone());
|
||||
delta = Some(*n);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
// Also check for carrier update in main body
|
||||
if let ASTNode::Assignment { target, value, .. } = stmt {
|
||||
if let ASTNode::Variable { name, .. } = target.as_ref() {
|
||||
if let ASTNode::BinaryOp {
|
||||
operator: BinaryOperator::Add,
|
||||
left,
|
||||
right,
|
||||
..
|
||||
} = value.as_ref()
|
||||
{
|
||||
if let ASTNode::Variable {
|
||||
name: left_name, ..
|
||||
} = left.as_ref()
|
||||
{
|
||||
if left_name == name {
|
||||
if let ASTNode::Literal {
|
||||
value: LiteralValue::Integer(n),
|
||||
..
|
||||
} = right.as_ref()
|
||||
{
|
||||
if carrier_name.is_none() {
|
||||
carrier_name = Some(name.clone());
|
||||
delta = Some(*n);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
if !has_return || !has_continue {
|
||||
return None;
|
||||
}
|
||||
|
||||
let carrier_name = carrier_name?;
|
||||
let delta = delta?;
|
||||
|
||||
// Extract body statements (for now, just the first statement which should be ch assignment)
|
||||
let body_stmts = if !body.is_empty() {
|
||||
vec![body[0].clone()]
|
||||
} else {
|
||||
vec![]
|
||||
};
|
||||
|
||||
Some(ParseStringInfo {
|
||||
carrier_name,
|
||||
delta,
|
||||
body_stmts,
|
||||
})
|
||||
}
|
||||
|
||||
// ============================================================================
|
||||
// Phase 140-P4-A: Skip Whitespace Pattern Detection (SSOT)
|
||||
// ============================================================================
|
||||
|
||||
@ -73,3 +73,6 @@ pub(crate) use ast_feature_extractor::{detect_continue_pattern, ContinuePatternI
|
||||
|
||||
// Phase 143-P0: Re-export parse_number pattern detection for loop_canonicalizer
|
||||
pub(crate) use ast_feature_extractor::{detect_parse_number_pattern, ParseNumberInfo};
|
||||
|
||||
// Phase 143-P1: Re-export parse_string pattern detection for loop_canonicalizer
|
||||
pub(crate) use ast_feature_extractor::{detect_parse_string_pattern, ParseStringInfo};
|
||||
|
||||
@ -62,6 +62,7 @@ pub(crate) use joinir::{detect_continue_pattern, ContinuePatternInfo};
|
||||
|
||||
// Phase 143-P0: Re-export parse_number pattern detection for loop_canonicalizer
|
||||
pub(crate) use joinir::{detect_parse_number_pattern, ParseNumberInfo};
|
||||
pub(crate) use joinir::{detect_parse_string_pattern, ParseStringInfo};
|
||||
|
||||
impl super::MirBuilder {
|
||||
/// Control-flow: block
|
||||
|
||||
Reference in New Issue
Block a user