Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -10,6 +10,18 @@ and this project adheres to [Semantic Versioning](http://semver.org/spec/v2.0.0.

- enable frozen string literals (#42)

### Fixed

- handling of backreference numbers in conjunction with conditionals
  - e.g. `(a)?(b)(?(1)c|d)\2` expanded with a wrong number for the false branch
- thanks to https://github.com/slevithan for reporting all of the following:
  - handling of backreferences to non-participating groups, e.g. `\1()` (#26)
  - handling of backreference multiplexing, e.g. `(?<n>a)(?<n>b)\k<n>` (#28)
  - handling of backreferences to subexp calls, e.g. `(?<a>[ab])\g<a>\k<a>` (#29)
  - handling of indirect recursion, e.g. `(a\g<2>?b)(c\g<1>?d)` (#35)
  - handling of UTF-8 hex escapes, e.g. `\xEF\xC0\xBB` (#37)
  - improved extended grapheme type handling (`\X`) (#38)

## [3.13.0] - 2025-01-27

### Added
Expand Down
6 changes: 3 additions & 3 deletions Gemfile
Original file line number Diff line number Diff line change
Expand Up @@ -7,8 +7,8 @@ gemspec

gem 'debug'
gem 'gouteur', '~> 1.0'
gem 'mini_racer', '~> 0.16'
gem 'rake', '~> 13.0'
gem 'mini_racer', '~> 0.19'
gem 'rake', '~> 13.3'
gem 'rspec', '~> 3.13'
gem 'rubocop', '~> 1.68'
gem 'rubocop', '~> 1.79'
gem 'simplecov-cobertura'
2 changes: 1 addition & 1 deletion js_regex.gemspec
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,6 @@ Gem::Specification.new do |s|
s.required_ruby_version = '>= 2.1.0'

s.add_dependency 'character_set', '~> 1.4'
s.add_dependency 'regexp_parser', '~> 2.10'
s.add_dependency 'regexp_parser', '~> 2.11'
s.add_dependency 'regexp_property_values', '~> 1.0'
end
62 changes: 58 additions & 4 deletions lib/js_regex/converter/backreference_converter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -21,20 +21,41 @@ def convert_data
end

def convert_name_ref
if context.es_2018_or_higher?
# ES 2018+ supports named backrefs, but only the angled-bracket syntax
Node.new("\\k<#{expression.name}>", reference: new_position, type: :backref)
# Check if this is a multiplexed named group reference
if expression.referenced_expressions.count > 1
convert_multiplexed_name_ref
else
# Always use numeric backrefs since we convert all named groups to numbered
# (see comment in GroupConverter)
convert_to_plain_num_ref
end
end

# Builds a numeric backreference node (e.g. `\2`) for the current expression.
#
# The group number defaults to the converted position of the referenced
# group. If the context has recorded a group created by a recursive subexp
# call for the original target position, that recorded number wins.
def convert_to_plain_num_ref
  default_position = new_position
  recursive_position = context.get_recursive_group_position(target_position)
  group_number = recursive_position || default_position

  # In extended (x) mode an empty passive group is appended after the number
  # (presumably so that following digits cannot be read as part of the
  # backreference - TODO confirm).
  suffix = expression.x? ? '(?:)' : ''
  Node.new("\\#{group_number}#{suffix}", reference: group_number, type: :backref)
end

# Emulates a backreference to a group name that is shared by several groups.
#
# Each referenced group is translated to its converted group position and
# the result is a passive group alternating between numeric backreferences
# to all of them, e.g. `(?:\1|\2)`.
def convert_multiplexed_name_ref
  numbered_refs = expression.referenced_expressions.map do |group|
    "\\#{context.new_capturing_group_position(group.number)}"
  end

  Node.new("(?:#{numbered_refs.join('|')})")
end

# Returns the group position that the context reports for this
# expression's target position.
def new_position
context.new_capturing_group_position(target_position)
end
Expand All @@ -50,16 +71,49 @@ def convert_call
end

context.count_recursion(expression)

# Track groups before the wrapper group is added
groups_before_wrapper = context.capturing_group_count

context.increment_local_capturing_group_count
target_copy = expression.referenced_expression.unquantified_clone
# avoid "Duplicate capture group name" error in JS
target_copy.token = :capture if target_copy.is?(:named, :group)
context.start_subexp_recursion
result = convert_expression(target_copy)
context.end_subexp_recursion
# wrap in group if it is a full-pattern recursion

# Track all groups created during this recursive call
# This handles both the directly called group and any nested groups within it
# Get all group numbers from the referenced expression
original_groups = collect_group_numbers(expression.referenced_expression)

# The first new group number is groups_before_wrapper + 1
# (the wrapper group from increment_local_capturing_group_count doesn't appear in output)
first_new_group = groups_before_wrapper + 1

# Map each original group to its corresponding new group
# For example, if we recursively called group 1 which contains group 2,
# and this created groups 3 and 4, then:
# - group 1 -> group 3
# - group 2 -> group 4
original_groups.each_with_index do |old_group_num, index|
new_group_num = first_new_group + index
context.track_recursive_group_call(old_group_num, new_group_num)
end

# wrap in passive group if it is a full-pattern recursion
expression.reference == 0 ? Node.new('(?:', result, ')') : result
end

# Collects the numbers of all capturing groups found in +exp+, including
# +exp+ itself, in depth-first pre-order (the order of the opening parens).
#
# Only direct children are visited at each level and the recursion handles
# the deeper levels. regexp_parser's `each_expression` already walks every
# descendant, so combining it with recursion listed groups nested more than
# one level below +exp+ several times, which shifted the old-to-new group
# mapping built from this list.
#
# @param exp [Regexp::Expression::Base] root of the subtree to scan
# @return [Array<Integer>] group numbers, each group listed exactly once
def collect_group_numbers(exp)
  return [] if exp.terminal?

  own_number = exp.capturing? ? [exp.number] : []
  own_number + exp.expressions.flat_map { |sub| collect_group_numbers(sub) }
end
end
end
end
11 changes: 9 additions & 2 deletions lib/js_regex/converter/base.rb
Original file line number Diff line number Diff line change
Expand Up @@ -46,7 +46,9 @@ def apply_quantifier(node)
end

def convert_subexpressions
Node.new(*expression.map { |subexp| convert_expression(subexp) })
# mark alternation and conditional branches for processing in second pass
type = expression.is?(:sequence) ? :branch : :plain
Node.new(*expression.map { |subexp| convert_expression(subexp) }, type: type)
end

def convert_expression(expression)
Expand Down Expand Up @@ -80,10 +82,15 @@ def drop
def wrap_in_backrefed_lookahead(content)
number = context.capturing_group_count + 1
backref_node = Node.new("\\#{number}", reference: number, type: :backref)
backrefed_group = Node.new('(', *content, ')', reference: number, type: :captured_group)
context.increment_local_capturing_group_count
# The surrounding group is added so that quantifiers apply to the whole.
# Without it, `(?:)` would need to be appended as literal digits may follow.
Node.new('(?:(?=(', *content, '))', backref_node, ')')
Node.new('(?:(?=', backrefed_group, ')', backref_node, ')')
end

# Returns `(?!)`, an empty negative lookahead, i.e. a pattern that can
# never match.
def unmatchable_substitution
'(?!)'
end
end
end
Expand Down
28 changes: 26 additions & 2 deletions lib/js_regex/converter/context.rb
Original file line number Diff line number Diff line change
Expand Up @@ -20,8 +20,10 @@ def initialize(case_insensitive_root: false, fail_fast: false, target: nil)
self.capturing_group_count = 0
self.fail_fast = fail_fast
self.recursions_per_expression = {}
self.recursion_stack = []
self.required_options_hash = {}
self.warnings = []
self.recursive_group_map = {}

self.case_insensitive_root = case_insensitive_root
self.target = target
Expand Down Expand Up @@ -73,11 +75,12 @@ def increment_local_capturing_group_count
end

def recursions(exp)
recursions_per_expression[recursion_id(exp)] || 0
# Count recursions in the current stack path only
recursion_stack.count { |e| recursion_id(e) == recursion_id(exp) }
end

def count_recursion(exp)
recursions_per_expression[recursion_id(exp)] = recursions(exp) + 1
recursion_stack.push(exp)
end

def recursion_id(exp)
Expand All @@ -86,10 +89,18 @@ def recursion_id(exp)

# Flags the context as being inside a subexp recursion and stores the
# capturing group count as it is at this moment.
def start_subexp_recursion
self.in_subexp_recursion = true
self.recursion_start_group_count = capturing_group_count
end

# Clears the subexp recursion flag and removes the most recent entry from
# the recursion stack, if there is one.
def end_subexp_recursion
self.in_subexp_recursion = false
# Pop the last recursion from stack when exiting
recursion_stack.pop if recursion_stack.any?
end

# Get the number of groups at the start of the current recursion.
#
# Reads the instance variable directly: calling
# `self.recursion_start_group_count` here would invoke this very method
# again and recurse until the stack overflows.
#
# @return [Integer] the count stored via the private writer (see
#   #start_subexp_recursion), or 0 if nothing has been stored yet
def recursion_start_group_count
  @recursion_start_group_count || 0
end

# takes and returns 1-indexed group positions.
Expand All @@ -106,18 +117,31 @@ def original_capturing_group_count
capturing_group_count - total_added_capturing_groups
end

# Track that a group was created by a recursive call.
#
# Stores new_group_num under original_group_num, replacing any number
# stored earlier for the same original group.
def track_recursive_group_call(original_group_num, new_group_num)
recursive_group_map[original_group_num] = new_group_num
end

# Get the group number created by a recursive call.
#
# Returns the number last stored for original_group_num via
# #track_recursive_group_call, or nil if none has been stored.
def get_recursive_group_position(original_group_num)
recursive_group_map[original_group_num]
end

private

attr_accessor :added_capturing_groups_after_group,
:recursions_per_expression,
:recursion_stack,
:required_options_hash,
:recursive_group_map,
:target

attr_writer :capturing_group_count,
:case_insensitive_root,
:fail_fast,
:in_atomic_group,
:in_subexp_recursion,
:recursion_start_group_count,
:warnings

def total_added_capturing_groups
Expand Down
2 changes: 1 addition & 1 deletion lib/js_regex/converter/escape_converter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -40,7 +40,7 @@ def convert_data
case subtype
when :codepoint_list
convert_codepoint_list
when :control, :meta_sequence
when :control, :meta_sequence, :utf8_hex
unicode_escape_codepoint
when :literal
LiteralConverter.convert_data(expression.char, context)
Expand Down
14 changes: 4 additions & 10 deletions lib/js_regex/converter/group_converter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -24,12 +24,10 @@ def convert_data
end

def build_named_group
if context.es_2018_or_higher?
# ES 2018+ supports named groups, but only the angled-bracket syntax
build_group(head: "(?<#{expression.name}>")
else
build_group
end
# Always convert named groups to numbered groups. ES2018+ supports named
# groups, but can not handle repeated names in multiplexing or conditional
# expansion scenarios.
build_group
end

def emulate_atomic_group
Expand Down Expand Up @@ -71,10 +69,6 @@ def unmatchable_absence_group?
expression.empty?
end

def unmatchable_substitution
'(?!)'
end

def build_absence_group
head = "(?:(?:.|\\n){,#{expression.inner_match_length.min - 1}}|(?:(?!"
tail = ')(?:.|\n))*)'
Expand Down
31 changes: 29 additions & 2 deletions lib/js_regex/converter/type_converter.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,10 +12,37 @@ class TypeConverter < JsRegex::Converter::Base
NONHEX_EXPANSION = '[^0-9A-Fa-f]'
I_MODE_HEX_EXPANSION = '[0-9A-F]'
I_MODE_NONHEX_EXPANSION = '[^0-9A-F]'
LINEBREAK_EXPANSION = '(?:\r\n|[\n\v\f\r\u0085\u2028\u2029])'
ES2018_HEX_EXPANSION = '\p{AHex}'
ES2018_NONHEX_EXPANSION = '\P{AHex}'
ES2018_XGRAPHEME_EXPANSION = '[\P{M}\P{Lm}](?:(?:[\u035C\u0361]\P{M}\p{M}*)|\u200d|\p{M}|\p{Lm}|\p{Emoji_Modifier})*'
LINEBREAK_EXPANSION = '(?:\r\n|[\n\v\f\r\u0085\u2028\u2029])'
# partially taken from https://unicode.org/reports/tr51/#EBNF_and_Regex
ES2018_XGRAPHEME_EXPANSION = <<-'REGEXP'.gsub(/\s+/, '')
(?:
\r\n
|
\p{RI}\p{RI}
|
\p{Emoji}
(?:
\p{EMod}
|
\uFE0F\u20E3?
|
[\u{E0020}-\u{E007E}]+\u{E007F}
)?
(?:
\u200D
(?:
\p{RI}\p{RI}
|
\p{Emoji}(?:\p{EMod}|\uFE0F\u20E3?|[\u{E0020}-\u{E007E}]+\u{E007F})?
)
)*
|
[\P{M}\P{Lm}](?:\u200d|\p{M}|\p{Lm}|\p{Emoji_Modifier})*
)
REGEXP


def self.directly_compatible?(expression, _context = nil)
case expression.token
Expand Down
7 changes: 6 additions & 1 deletion lib/js_regex/node.rb
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,7 @@ class Node

TYPES = %i[
backref
branch
captured_group
conditional
dropped
Expand Down Expand Up @@ -49,7 +50,7 @@ def to_s
case type
when :dropped
''
when :backref, :captured_group, :plain
when :backref, :branch, :captured_group, :plain
children.join << quantifier.to_s
else
raise TypeError.new(
Expand All @@ -65,6 +66,10 @@ def update(attrs)
self
end

# Whether this node has a quantifier whose minimum is zero.
# Without a quantifier, the falsy quantifier value itself is returned
# rather than `false`.
def optional?
quantifier && quantifier.min == 0
end

private

TypeError = Class.new(::TypeError).extend(JsRegex::Error)
Expand Down
Loading