From 89c94428681b3441d2f8e8718706138ba74ab0ad Mon Sep 17 00:00:00 2001 From: Rute Figueiredo Date: Thu, 30 Oct 2025 09:59:45 -0300 Subject: [PATCH 1/6] fix: rust compilation errors - unstable expressions --- compiler/src/ir/graph.rs | 5 +++-- compiler/src/ir/intermediate.rs | 5 +++-- 2 files changed, 6 insertions(+), 4 deletions(-) diff --git a/compiler/src/ir/graph.rs b/compiler/src/ir/graph.rs index b266050..298105a 100644 --- a/compiler/src/ir/graph.rs +++ b/compiler/src/ir/graph.rs @@ -98,10 +98,11 @@ impl NFAGraph { // Start from accept states let mut stack: Vec<_> = self.accept_states.iter().copied().collect(); while let Some(state) = stack.pop() { - if reaching_accept.insert(state) - && let Some(predecessors) = reverse_edges.get(&state) { + if reaching_accept.insert(state) { + if let Some(predecessors) = reverse_edges.get(&state) { stack.extend(predecessors); } + } } reaching_accept diff --git a/compiler/src/ir/intermediate.rs b/compiler/src/ir/intermediate.rs index 05002c2..10f5c14 100644 --- a/compiler/src/ir/intermediate.rs +++ b/compiler/src/ir/intermediate.rs @@ -218,8 +218,8 @@ impl IntermediateNFA { .unwrap_or(false); // Only add alternative start states if no start captures would be bypassed - if !has_start_captures - && let Some(closure) = closures.get(orig_start) { + if !has_start_captures { + if let Some(closure) = closures.get(orig_start) { for &r_state in &closure.states { if r_state != orig_start && r_state < has_byte_transitions.len() @@ -229,6 +229,7 @@ impl IntermediateNFA { } } } + } } // Apply the changes From 181ddd865a75b2271a817953a0aa6c32e732654b Mon Sep 17 00:00:00 2001 From: Rute Figueiredo Date: Thu, 30 Oct 2025 10:01:52 -0300 Subject: [PATCH 2/6] chore: convert bare capture groups on private patterns into non capture, preserves special groups --- compiler/src/utils.rs | 217 +++++++++++++++++++++++++++++++++++++++++- 1 file changed, 213 insertions(+), 4 deletions(-) diff --git a/compiler/src/utils.rs b/compiler/src/utils.rs index 94366f9..4041256 100644 --- a/compiler/src/utils.rs +++ b/compiler/src/utils.rs @@ -4,12 +4,58 @@ use heck::{ToPascalCase, ToSnakeCase}; use crate::{DecomposedRegexConfig, NFAGraph, RegexPart}; +/// Converts bare capturing groups `(...)` to non-capturing groups `(?:...)` in a regex pattern. +/// +/// This function only converts opening parentheses that are not already part of special +/// regex constructs like `(?:...)`, `(?=...)`, `(?!...)`, `(?<=...)`, `(? String { + let mut result = String::with_capacity(pattern.len() + pattern.len() / 4); + let chars: Vec = pattern.chars().collect(); + let mut i = 0; + + while i < chars.len() { + if chars[i] == '(' && (i + 1 >= chars.len() || chars[i + 1] != '?') { + // This is a bare capturing group, convert it to non-capturing + result.push_str("(?:"); + } else { + result.push(chars[i]); + } + i += 1; + } + + result +} + /// Combines decomposed regex parts into a single pattern string and extracts capture group lengths. /// /// This function iterates through the `parts` field of a `DecomposedRegexConfig`. -/// It concatenates `RegexPart::Pattern` strings directly. For `RegexPart::PublicPattern`, -/// it wraps the pattern string in parentheses `()` to form a capture group and collects -/// the associated maximum byte length (`max_len`). +/// For `RegexPart::Pattern` parts, it converts any bare capturing groups `(...)` to +/// non-capturing groups `(?:...)` to prevent interference with capture group numbering. +/// For `RegexPart::PublicPattern`, it wraps the pattern string in parentheses `()` +/// to form a capture group and collects the associated maximum byte length (`max_len`). /// /// # Arguments /// @@ -30,7 +76,9 @@ pub fn decomposed_to_composed_regex( for part in &config.parts { match part { RegexPart::Pattern(pattern) => { - combined_parts.push(pattern.clone()); + // Convert bare capturing groups to non-capturing groups + let non_capturing_pattern = convert_capturing_to_non_capturing(pattern); + combined_parts.push(non_capturing_pattern); } RegexPart::PublicPattern((pattern, max_len)) => { combined_parts.push(format!("({pattern})")); @@ -85,3 +133,164 @@ pub fn save_outputs( Ok(()) } + +#[cfg(test)] +mod tests { + use super::*; + + #[test] + fn test_convert_single_capturing_group() { + let input = "(a|b)"; + let expected = "(?:a|b)"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_convert_multiple_capturing_groups() { + let input = "(a|b)(c|d)"; + let expected = "(?:a|b)(?:c|d)"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_preserve_non_capturing_groups() { + let input = "(?:a|b)"; + let expected = "(?:a|b)"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_preserve_positive_lookahead() { + let input = "(?=abc)"; + let expected = "(?=abc)"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_preserve_negative_lookahead() { + let input = "(?!abc)"; + let expected = "(?!abc)"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_preserve_positive_lookbehind() { + let input = "(?<=abc)"; + let expected = "(?<=abc)"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_preserve_negative_lookbehind() { + let input = "(?abc)"; + let expected = "(?abc)"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_preserve_comments() { + let input = "(?#comment)"; + let expected = "(?#comment)"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_nested_capturing_groups() { + let input = "((a|b))"; + let expected = "(?:(?:a|b))"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_no_groups() { + let input = "abc"; + let expected = "abc"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_mixed_groups() { + let input = "(a|b)(?:c|d)(e|f)(?=g)"; + let expected = "(?:a|b)(?:c|d)(?:e|f)(?=g)"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_complex_pattern() { + let input = "(?:prefix_)(a|b)(?=suffix)"; + let expected = "(?:prefix_)(?:a|b)(?=suffix)"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_empty_string() { + let input = ""; + let expected = ""; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_escaped_parentheses() { + // Note: This test checks literal escaped parens in the string + // In actual regex, \( is an escaped paren, but we're working with strings + let input = r"\(not a group\)"; + let expected = r"\(?:not a group\)"; // Our simple implementation will convert \( + // This is a known limitation - escaped parens would need more sophisticated parsing + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_decomposed_to_composed_with_capturing_groups() { + let config = DecomposedRegexConfig { + parts: vec![ + RegexPart::Pattern("(a|b)prefix:".to_string()), + RegexPart::PublicPattern(("value".to_string(), 20)), + ], + }; + + let (combined, max_bytes) = decomposed_to_composed_regex(&config); + + assert_eq!(combined, "(?:a|b)prefix:(value)"); + assert_eq!(max_bytes, Some(vec![20])); + } + + #[test] + fn test_decomposed_to_composed_preserves_special_groups() { + let config = DecomposedRegexConfig { + parts: vec![ + RegexPart::Pattern("(?:a|b)prefix:".to_string()), + RegexPart::PublicPattern(("value".to_string(), 20)), + RegexPart::Pattern("(?=suffix)".to_string()), + ], + }; + + let (combined, max_bytes) = decomposed_to_composed_regex(&config); + + assert_eq!(combined, "(?:a|b)prefix:(value)(?=suffix)"); + assert_eq!(max_bytes, Some(vec![20])); + } + + #[test] + fn test_decomposed_multiple_patterns_and_public() { + let config = DecomposedRegexConfig { + parts: vec![ + RegexPart::Pattern("(a|b)".to_string()), + RegexPart::PublicPattern(("first".to_string(), 10)), + RegexPart::Pattern("(c|d)".to_string()), + RegexPart::PublicPattern(("second".to_string(), 15)), + ], + }; + + let (combined, max_bytes) = decomposed_to_composed_regex(&config); + + assert_eq!(combined, "(?:a|b)(first)(?:c|d)(second)"); + assert_eq!(max_bytes, Some(vec![10, 15])); + } +} From 12b8a6f8c251872f07272b022d3e401cdb43af16 Mon Sep 17 00:00:00 2001 From: Rute Figueiredo Date: Thu, 30 Oct 2025 10:08:10 -0300 Subject: [PATCH 3/6] fix: doctest trying to access to private function --- compiler/src/utils.rs | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/compiler/src/utils.rs b/compiler/src/utils.rs index 4041256..060f65f 100644 --- a/compiler/src/utils.rs +++ b/compiler/src/utils.rs @@ -25,11 +25,10 @@ use crate::{DecomposedRegexConfig, NFAGraph, RegexPart}; /// /// # Examples /// -/// ``` -/// # use zk_regex_compiler::utils::convert_capturing_to_non_capturing; -/// assert_eq!(convert_capturing_to_non_capturing("(a|b)"), "(?:a|b)"); -/// assert_eq!(convert_capturing_to_non_capturing("(?:a|b)"), "(?:a|b)"); // unchanged -/// assert_eq!(convert_capturing_to_non_capturing("(?=a)"), "(?=a)"); // unchanged +/// ```text +/// convert_capturing_to_non_capturing("(a|b)") → "(?:a|b)" +/// convert_capturing_to_non_capturing("(?:a|b)") → "(?:a|b)" // unchanged +/// convert_capturing_to_non_capturing("(?=a)") → "(?=a)" // unchanged /// ``` fn convert_capturing_to_non_capturing(pattern: &str) -> String { let mut result = String::with_capacity(pattern.len() + pattern.len() / 4); From bb530c535731c494e9687e32053a2f224da7e397 Mon Sep 17 00:00:00 2001 From: Rute Figueiredo Date: Thu, 30 Oct 2025 15:54:38 -0300 Subject: [PATCH 4/6] chore: added test to check if gen_from_decomposed converts capture groups inside of public patterns into non capture --- compiler/src/utils.rs | 28 ++++++++++++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/compiler/src/utils.rs b/compiler/src/utils.rs index 060f65f..a4a16a9 100644 --- a/compiler/src/utils.rs +++ b/compiler/src/utils.rs @@ -292,4 +292,32 @@ mod tests { assert_eq!(combined, "(?:a|b)(first)(?:c|d)(second)"); assert_eq!(max_bytes, Some(vec![10, 15])); } + + /// This test verifies that capturing groups in Pattern parts are automatically + /// converted to non-capturing groups, preventing interference with PublicPattern + /// capture group numbering. + #[test] + fn test_wrap_capture_group_into_non_capturing_group_in_private_pattern() { + use crate::{gen_from_decomposed, ProvingFramework}; + + // This is the pattern that contains a private capture group that should be wrapped into a non-capturing group + let regex_with_private_capture_group = DecomposedRegexConfig { + parts: vec![ + // Pattern with capturing group - should be converted to non-capturing + RegexPart::Pattern("(a|b)prefix:".to_string()), + // This should be capture group 1 for the circuit + RegexPart::PublicPattern((".+?".to_string(), 20)), + ], + }; + + let (nfa, _circuit_code) = gen_from_decomposed( + regex_with_private_capture_group, + "TestCapture1", + ProvingFramework::Circom, + ) + .expect("Should compile successfully"); + + assert_eq!(nfa.regex, "(?:a|b)prefix:(.+?)"); + assert_eq!(nfa.num_capture_groups, 1); + } } From faeabb4d0e0eb774676ec466755879b428c294d6 Mon Sep 17 00:00:00 2001 From: Rute Figueiredo Date: Thu, 30 Oct 2025 16:34:42 -0300 Subject: [PATCH 5/6] fix: properly handle escaped parens and character classes in capture group conversion The convert_capturing_to_non_capturing function was naively converting all '(' to '(?:' without considering: - Escaped parentheses like \( which match literal parens - Parentheses inside character classes like [()] which are literals This caused regex semantics to break. For example: - \(foo\) would become \(?:foo\) (wrong) - [()] would become [(?:)] (matches extra chars) Fixed by tracking two states during string scan: - in_char_class: are we inside [...]? - escaped: was previous char a backslash? Only convert ( to (?:) when it's not escaped, not in a char class, and not already followed by ?. Added 5 new tests covering escaped parens, character classes, and mixed cases. All 23 tests pass. Addresses PR review feedback: https://github.com/zkemail/zk-regex/pull/109#discussion_r2479219131 --- compiler/src/utils.rs | 88 ++++++++++++++++++++++++++++++++++--------- 1 file changed, 71 insertions(+), 17 deletions(-) diff --git a/compiler/src/utils.rs b/compiler/src/utils.rs index a4a16a9..93f31a9 100644 --- a/compiler/src/utils.rs +++ b/compiler/src/utils.rs @@ -6,14 +6,10 @@ use crate::{DecomposedRegexConfig, NFAGraph, RegexPart}; /// Converts bare capturing groups `(...)` to non-capturing groups `(?:...)` in a regex pattern. /// -/// This function only converts opening parentheses that are not already part of special -/// regex constructs like `(?:...)`, `(?=...)`, `(?!...)`, `(?<=...)`, `(? String { let mut result = String::with_capacity(pattern.len() + pattern.len() / 4); let chars: Vec = pattern.chars().collect(); let mut i = 0; + let mut in_char_class = false; + let mut escaped = false; while i < chars.len() { - if chars[i] == '(' && (i + 1 >= chars.len() || chars[i + 1] != '?') { - // This is a bare capturing group, convert it to non-capturing - result.push_str("(?:"); + let ch = chars[i]; + + if escaped { + // Previous char was backslash, this char is escaped + result.push(ch); + escaped = false; + } else if ch == '\\' { + // Start escape sequence + result.push(ch); + escaped = true; + } else if ch == '[' && !in_char_class { + // Entering character class + result.push(ch); + in_char_class = true; + } else if ch == ']' && in_char_class { + // Exiting character class + result.push(ch); + in_char_class = false; + } else if ch == '(' && !in_char_class { + // Check if this is a bare capturing group + if i + 1 >= chars.len() || chars[i + 1] != '?' { + // This is a bare capturing group, convert it to non-capturing + result.push_str("(?:"); + } else { + // This is already a special group (?...), keep as-is + result.push(ch); + } } else { - result.push(chars[i]); + result.push(ch); } + i += 1; } @@ -237,11 +261,41 @@ mod tests { #[test] fn test_escaped_parentheses() { - // Note: This test checks literal escaped parens in the string - // In actual regex, \( is an escaped paren, but we're working with strings + // Escaped parentheses should be preserved as-is (they match literal parens) let input = r"\(not a group\)"; - let expected = r"\(?:not a group\)"; // Our simple implementation will convert \( - // This is a known limitation - escaped parens would need more sophisticated parsing + let expected = r"\(not a group\)"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_character_class_with_parentheses() { + // Parentheses inside character classes should be preserved + let input = "[()]"; + let expected = "[()]"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_character_class_complex() { + // Complex pattern with character class containing parens + let input = "before[()]after"; + let expected = "before[()]after"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_mixed_escaped_and_capturing() { + // Mix of escaped parens, character classes, and capturing groups + let input = r"\((a|b)[()]"; + let expected = r"\((?:a|b)[()]"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_nested_character_classes() { + // Character class followed by capturing group + let input = "[()]{2}(abc)"; + let expected = "[()]{2}(?:abc)"; assert_eq!(convert_capturing_to_non_capturing(input), expected); } From dfe8085663c7153312c41ffaa46f0149c0cd704d Mon Sep 17 00:00:00 2001 From: Rute Figueiredo Date: Thu, 30 Oct 2025 17:41:01 -0300 Subject: [PATCH 6/6] fix: convert named captures to non-capturing in private patterns Extended convert_capturing_to_non_capturing to handle both PCRE-style (?...) and Rust-style (?P...) named capture groups, not just bare capturing groups. This prevents named captures in private patterns from interfering with public capture group numbering. Key changes: - Detect and convert both named capture syntaxes to (?:...) - Distinguish lookbehind assertions (?<=, ?...)` (PCRE style) to `(?:...)` +/// - Named captures `(?P...)` (Rust style) to `(?:...)` /// /// This function properly handles: /// - Escaped parentheses like `\(` and `\)` (preserved as-is) @@ -17,15 +22,17 @@ use crate::{DecomposedRegexConfig, NFAGraph, RegexPart}; /// /// # Returns /// -/// A new string with bare capturing groups converted to non-capturing groups +/// A new string with all capturing groups converted to non-capturing groups /// /// # Examples /// /// ```text -/// convert_capturing_to_non_capturing("(a|b)") → "(?:a|b)" -/// convert_capturing_to_non_capturing("(?:a|b)") → "(?:a|b)" // unchanged -/// convert_capturing_to_non_capturing(r"\(a\)") → r"\(a\)" // escaped parens preserved -/// convert_capturing_to_non_capturing("[()]") → "[()]" // char class preserved +/// convert_capturing_to_non_capturing("(a|b)") → "(?:a|b)" +/// convert_capturing_to_non_capturing("(?abc)") → "(?:abc)" +/// convert_capturing_to_non_capturing("(?Pabc)") → "(?:abc)" +/// convert_capturing_to_non_capturing("(?:a|b)") → "(?:a|b)" // unchanged +/// convert_capturing_to_non_capturing(r"\(a\)") → r"\(a\)" // escaped parens preserved +/// convert_capturing_to_non_capturing("[()]") → "[()]" // char class preserved /// ``` fn convert_capturing_to_non_capturing(pattern: &str) -> String { let mut result = String::with_capacity(pattern.len() + pattern.len() / 4); @@ -54,12 +61,43 @@ fn convert_capturing_to_non_capturing(pattern: &str) -> String { result.push(ch); in_char_class = false; } else if ch == '(' && !in_char_class { - // Check if this is a bare capturing group + // Check if this is a capturing group that needs conversion if i + 1 >= chars.len() || chars[i + 1] != '?' { - // This is a bare capturing group, convert it to non-capturing + // Bare capturing group: (...) + result.push_str("(?:"); + } else if i + 2 < chars.len() && chars[i + 2] == '<' { + // Could be: (?<=...) positive lookbehind, or (?...) PCRE named capture + if i + 3 < chars.len() && (chars[i + 3] == '=' || chars[i + 3] == '!') { + // Lookbehind assertion: (?<=...) or (?...) + // Convert to non-capturing and skip the name + result.push_str("(?:"); + i += 2; // Skip '?' and '<' + // Skip until we find the closing '>' + while i + 1 < chars.len() && chars[i + 1] != '>' { + i += 1; + } + if i + 1 < chars.len() && chars[i + 1] == '>' { + i += 1; // Skip the '>' + } + } + } else if i + 3 < chars.len() && chars[i + 2] == 'P' && chars[i + 3] == '<' { + // Rust named capture: (?P...) + // Convert to non-capturing and skip the name result.push_str("(?:"); + i += 3; // Skip '?', 'P', and '<' + // Skip until we find the closing '>' + while i + 1 < chars.len() && chars[i + 1] != '>' { + i += 1; + } + if i + 1 < chars.len() && chars[i + 1] == '>' { + i += 1; // Skip the '>' + } } else { - // This is already a special group (?...), keep as-is + // Other special group like (?:...), (?=...), etc. result.push(ch); } } else { @@ -211,9 +249,9 @@ mod tests { } #[test] - fn test_preserve_named_groups() { + fn test_convert_pcre_named_groups() { let input = "(?abc)"; - let expected = "(?abc)"; + let expected = "(?:abc)"; assert_eq!(convert_capturing_to_non_capturing(input), expected); } @@ -299,6 +337,55 @@ mod tests { assert_eq!(convert_capturing_to_non_capturing(input), expected); } + #[test] + fn test_convert_rust_named_groups() { + let input = "(?Pabc)"; + let expected = "(?:abc)"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_convert_multiple_named_groups() { + let input = "(?[0-9]+)(?P[a-z]+)"; + let expected = "(?:[0-9]+)(?:[a-z]+)"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_convert_mixed_bare_and_named_groups() { + let input = "(bare)(?abc)(?Pdef)"; + let expected = "(?:bare)(?:abc)(?:def)"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_convert_named_groups_with_complex_content() { + let input = "(?[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,})"; + let expected = "(?:[a-zA-Z0-9._%+-]+@[a-zA-Z0-9.-]+\\.[a-zA-Z]{2,})"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_convert_nested_named_and_bare_groups() { + let input = "(?(inner))"; + let expected = "(?:(?:inner))"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_convert_named_groups_with_underscores_and_numbers() { + let input = "(?a)(?Pb)"; + let expected = "(?:a)(?:b)"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + + #[test] + fn test_mixed_named_and_special_groups() { + let input = "(?abc)(?:def)(?=ghi)(?Pjkl)"; + let expected = "(?:abc)(?:def)(?=ghi)(?:jkl)"; + assert_eq!(convert_capturing_to_non_capturing(input), expected); + } + #[test] fn test_decomposed_to_composed_with_capturing_groups() { let config = DecomposedRegexConfig {