Skip to content

Commit bca5cce

Browse files
committed
Cleanup: extract shared conversion code into utils
1 parent c09b609 commit bca5cce

File tree

15 files changed

+195
-148
lines changed

15 files changed

+195
-148
lines changed

lib/js_regex/conversion.rb

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10,7 +10,6 @@ class Conversion
1010
require 'regexp_parser'
1111
require_relative 'converter'
1212
require_relative 'error'
13-
require_relative 'node'
1413
require_relative 'second_pass'
1514
require_relative 'target'
1615

lib/js_regex/converter/base.rb

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,8 @@
11
# frozen_string_literal: true
22

3+
require_relative '../node'
4+
require_relative '../utils'
5+
36
class JsRegex
47
module Converter
58
#

lib/js_regex/converter/escape_converter.rb

Lines changed: 3 additions & 29 deletions
Original file line numberDiff line numberDiff line change
@@ -1,39 +1,13 @@
11
# frozen_string_literal: true
22

33
require_relative 'base'
4-
require_relative 'literal_converter'
54

65
class JsRegex
76
module Converter
87
#
98
# Template class implementation.
109
#
1110
class EscapeConverter < JsRegex::Converter::Base
12-
ESCAPES_SHARED_BY_RUBY_AND_JS = %i[
13-
alternation
14-
backslash
15-
backspace
16-
bol
17-
carriage
18-
codepoint
19-
dot
20-
eol
21-
form_feed
22-
group_close
23-
group_open
24-
hex
25-
interval_close
26-
interval_open
27-
newline
28-
one_or_more
29-
set_close
30-
set_open
31-
tab
32-
vertical_tab
33-
zero_or_more
34-
zero_or_one
35-
].freeze
36-
3711
private
3812

3913
def convert_data
@@ -43,10 +17,10 @@ def convert_data
4317
when :control, :meta_sequence, :utf8_hex
4418
unicode_escape_codepoint
4519
when :literal
46-
LiteralConverter.convert_data(expression.char, context)
20+
Utils::Literals.convert_data(expression.char, context)
4721
when :bell, :escape, :hex, :octal
4822
hex_escape_codepoint
49-
when *ESCAPES_SHARED_BY_RUBY_AND_JS
23+
when *Utils::Escapes::ESCAPES_SHARED_BY_RUBY_AND_JS
5024
pass_through
5125
else
5226
warn_of_unsupported_feature
@@ -58,7 +32,7 @@ def convert_codepoint_list
5832
split_codepoint_list
5933
else
6034
expression.chars.each_with_object(Node.new) do |char, node|
61-
node << LiteralConverter.convert_data(Regexp.escape(char), context)
35+
node << Utils::Literals.convert_data(Regexp.escape(char), context)
6236
end
6337
end
6438
end

lib/js_regex/converter/literal_converter.rb

Lines changed: 1 addition & 42 deletions
Original file line numberDiff line numberDiff line change
@@ -8,51 +8,10 @@ module Converter
88
# Template class implementation.
99
#
1010
class LiteralConverter < JsRegex::Converter::Base
11-
ASTRAL_PLANE_CODEPOINT_PATTERN = /[\u{10000}-\u{10FFFF}]/.freeze
12-
LITERAL_REQUIRING_ESCAPE_PATTERN = /[\/\f\n\r\t\v]/.freeze
13-
14-
class << self
15-
def convert_data(data, context)
16-
if !context.u? && data =~ ASTRAL_PLANE_CODEPOINT_PATTERN
17-
if context.enable_u_option
18-
escape_incompatible_bmp_literals(data)
19-
else
20-
convert_astral_data(data)
21-
end
22-
else
23-
escape_incompatible_bmp_literals(data)
24-
end
25-
end
26-
27-
def convert_astral_data(data)
28-
data.each_char.each_with_object(Node.new) do |char, node|
29-
if char.ord > 0xFFFF
30-
node << surrogate_substitution_for(char)
31-
else
32-
node << escape_incompatible_bmp_literals(char)
33-
end
34-
end
35-
end
36-
37-
ESCAPES = Hash.new { |h, k| raise KeyError, "#{h}[#{k.inspect}]" }
38-
.merge("\f\n\r\t\v".chars.to_h { |c| [c, Regexp.escape(c)] })
39-
.merge('/' => '\\/')
40-
41-
def escape_incompatible_bmp_literals(data)
42-
data.gsub(LITERAL_REQUIRING_ESCAPE_PATTERN, ESCAPES)
43-
end
44-
45-
private
46-
47-
def surrogate_substitution_for(char)
48-
CharacterSet::Writer.write_surrogate_ranges([], [char.codepoints])
49-
end
50-
end
51-
5211
private
5312

5413
def convert_data
55-
result = self.class.convert_data(data, context)
14+
result = Utils::Literals.convert_data(data, context)
5615
if context.case_insensitive_root && !expression.case_insensitive?
5716
warn_of_unsupported_feature('nested case-sensitive literal')
5817
elsif !context.case_insensitive_root && expression.case_insensitive?

lib/js_regex/converter/meta_converter.rb

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -8,19 +8,6 @@ module Converter
88
# Template class implementation.
99
#
1010
class MetaConverter < JsRegex::Converter::Base
11-
DOT_EXPANSION = '(?:[\uD800-\uDBFF][\uDC00-\uDFFF]|[^\n\uD800-\uDFFF])'
12-
ML_DOT_EXPANSION = '(?:[\uD800-\uDBFF][\uDC00-\uDFFF]|[^\uD800-\uDFFF])'
13-
# Possible improvements for dot conversion:
14-
#
15-
# In ES2015, the 'u' flag allows dots to match astral chars. Unfortunately
16-
# the dot keeps matching lone surrogates even with this flag, so the use
17-
# of an expansion is still necessary to get the same behavior as in Ruby.
18-
#
19-
# ES2018 has the dotall flag 's', but it is tricky to use in conversions.
20-
# 's' activates matching of BOTH astral chars and "\n", whereas the dot in
21-
# Ruby doesn't match "\n" by default, and even with the 'm' flag set on
22-
# the root, subexps might still exclude "\n" like so: /.(?-m:.)./m
23-
2411
private
2512

2613
def convert_data
@@ -34,6 +21,19 @@ def convert_data
3421
end
3522
end
3623

24+
DOT_EXPANSION = '(?:[\uD800-\uDBFF][\uDC00-\uDFFF]|[^\n\uD800-\uDFFF])'
25+
ML_DOT_EXPANSION = '(?:[\uD800-\uDBFF][\uDC00-\uDFFF]|[^\uD800-\uDFFF])'
26+
# Possible improvements for dot conversion:
27+
#
28+
# In ES2015, the 'u' flag allows dots to match astral chars. Unfortunately
29+
# the dot keeps matching lone surrogates even with this flag, so the use
30+
# of an expansion is still necessary to get the same behavior as in Ruby.
31+
#
32+
# ES2018 has the dotall flag 's', but it is tricky to use in conversions.
33+
# 's' activates matching of BOTH astral chars and "\n", whereas the dot in
34+
# Ruby doesn't match "\n" by default, and even with the 'm' flag set on
35+
# the root, subexps might still exclude "\n" like so: /.(?-m:.)./m
36+
3737
def convert_alternatives
3838
kept_any_previous_branch = nil
3939

lib/js_regex/converter/property_converter.rb

Lines changed: 1 addition & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -12,16 +12,11 @@ module Converter
1212
# codepoints matched by the property and build a set string from them.
1313
#
1414
class PropertyConverter < JsRegex::Converter::Base
15-
# A map of normalized Ruby property names to names supported by ES2018+.
16-
def self.map
17-
@map ||= File.read("#{__dir__}/property_map.csv").scan(/(.+),(.+)/).to_h
18-
end
19-
2015
private
2116

2217
def convert_data
2318
if context.es_2018_or_higher? &&
24-
(prop_name_in_js = self.class.map[subtype.to_s.tr('_', '')])
19+
(prop_name_in_js = Utils::Properties.name_in_js(subtype))
2520
context.enable_u_option
2621
"\\#{expression.negative? ? 'P' : 'p'}{#{prop_name_in_js}}"
2722
else

lib/js_regex/converter/set_converter.rb

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -1,8 +1,6 @@
11
# frozen_string_literal: true
22

33
require_relative 'base'
4-
require_relative 'escape_converter'
5-
require_relative 'type_converter'
64
require 'character_set'
75

86
class JsRegex
@@ -50,37 +48,36 @@ def simple_convert_child(exp)
5048
simple_convert_child(op) or return false
5149
end.join('-')
5250
when :type
53-
TypeConverter.directly_compatible?(exp, context) &&
54-
exp.text
51+
Utils::CharTypes.directly_compatible?(exp) && exp.text
5552
when :escape
5653
return exp.text if SET_SPECIFIC_ESCAPES_PATTERN.match?(exp.text)
5754

5855
case exp.token
5956
when *CONVERTIBLE_ESCAPE_TOKENS
60-
EscapeConverter.new.convert(exp, context)
57+
JsRegex::Converter.convert(exp, context)
6158
when :literal
6259
exp.char.ord <= 0xFFFF &&
63-
LiteralConverter.escape_incompatible_bmp_literals(exp.char)
60+
Utils::Literals.escape_incompatible_bmp_literals(exp.char)
6461
end
6562
end
6663
end
6764

6865
def simple_convert_literal_child(exp)
6966
if !context.u? &&
70-
exp.text =~ LiteralConverter::ASTRAL_PLANE_CODEPOINT_PATTERN &&
67+
exp.text =~ Utils::Literals::ASTRAL_PLANE_CODEPOINT_PATTERN &&
7168
!context.enable_u_option
7269
false
7370
elsif SET_LITERALS_REQUIRING_ESCAPE_PATTERN.match?(exp.text)
7471
"\\#{exp.text}"
7572
else
76-
LiteralConverter.escape_incompatible_bmp_literals(exp.text)
73+
Utils::Literals.escape_incompatible_bmp_literals(exp.text)
7774
end
7875
end
7976

8077
SET_LITERALS_REQUIRING_ESCAPE_PATTERN = Regexp.union(%w<( ) [ ] { } / - |>)
8178
SET_SPECIFIC_ESCAPES_PATTERN = /[\^\-]/.freeze
8279
CONVERTIBLE_ESCAPE_TOKENS = %i[control meta_sequence bell escape octal] +
83-
EscapeConverter::ESCAPES_SHARED_BY_RUBY_AND_JS
80+
Utils::Escapes::ESCAPES_SHARED_BY_RUBY_AND_JS
8481

8582
def full_recalculation
8683
# Fetch codepoints as if the set was case-sensitive, then re-add

lib/js_regex/converter/type_converter.rb

Lines changed: 42 additions & 47 deletions
Original file line numberDiff line numberDiff line change
@@ -8,51 +8,6 @@ module Converter
88
# Template class implementation.
99
#
1010
class TypeConverter < JsRegex::Converter::Base
11-
HEX_EXPANSION = '[0-9A-Fa-f]'
12-
NONHEX_EXPANSION = '[^0-9A-Fa-f]'
13-
I_MODE_HEX_EXPANSION = '[0-9A-F]'
14-
I_MODE_NONHEX_EXPANSION = '[^0-9A-F]'
15-
LINEBREAK_EXPANSION = '(?:\r\n|[\n\v\f\r\u0085\u2028\u2029])'
16-
ES2018_HEX_EXPANSION = '\p{AHex}'
17-
ES2018_NONHEX_EXPANSION = '\P{AHex}'
18-
# partially taken from https://unicode.org/reports/tr51/#EBNF_and_Regex
19-
ES2018_XGRAPHEME_EXPANSION = <<-'REGEXP'.gsub(/\s+/, '')
20-
(?:
21-
\r\n
22-
|
23-
\p{RI}\p{RI}
24-
|
25-
\p{Emoji}
26-
(?:
27-
\p{EMod}
28-
|
29-
\uFE0F\u20E3?
30-
|
31-
[\u{E0020}-\u{E007E}]+\u{E007F}
32-
)?
33-
(?:
34-
\u200D
35-
(?:
36-
\p{RI}\p{RI}
37-
|
38-
\p{Emoji}(?:\p{EMod}|\uFE0F\u20E3?|[\u{E0020}-\u{E007E}]+\u{E007F})?
39-
)
40-
)*
41-
|
42-
[\P{M}\P{Lm}](?:\u200d|\p{M}|\p{Lm}|\p{Emoji_Modifier})*
43-
)
44-
REGEXP
45-
46-
47-
def self.directly_compatible?(expression, _context = nil)
48-
case expression.token
49-
when :space, :nonspace
50-
!expression.ascii_classes?
51-
when :digit, :nondigit, :word, :nonword
52-
!expression.unicode_classes?
53-
end
54-
end
55-
5611
private
5712

5813
def convert_data
@@ -62,10 +17,12 @@ def convert_data
6217
when :linebreak then linebreak_expansion
6318
when :xgrapheme then xgrapheme
6419
when :digit, :space, :word
65-
return pass_through if self.class.directly_compatible?(expression)
20+
return pass_through if Utils::CharTypes.directly_compatible?(expression)
21+
6622
set_substitution
6723
when :nondigit, :nonspace, :nonword
68-
return pass_through if self.class.directly_compatible?(expression)
24+
return pass_through if Utils::CharTypes.directly_compatible?(expression)
25+
6926
negative_set_substitution
7027
else
7128
warn_of_unsupported_feature
@@ -82,6 +39,10 @@ def hex_expansion
8239
end
8340
end
8441

42+
ES2018_HEX_EXPANSION = '\p{AHex}'
43+
I_MODE_HEX_EXPANSION = '[0-9A-F]'
44+
HEX_EXPANSION = '[0-9A-Fa-f]'
45+
8546
def nonhex_expansion
8647
if context.es_2018_or_higher? && context.enable_u_option
8748
ES2018_NONHEX_EXPANSION
@@ -92,10 +53,16 @@ def nonhex_expansion
9253
end
9354
end
9455

56+
NONHEX_EXPANSION = '[^0-9A-Fa-f]'
57+
I_MODE_NONHEX_EXPANSION = '[^0-9A-F]'
58+
ES2018_NONHEX_EXPANSION = '\P{AHex}'
59+
9560
def linebreak_expansion
9661
wrap_in_backrefed_lookahead(LINEBREAK_EXPANSION)
9762
end
9863

64+
LINEBREAK_EXPANSION = '(?:\r\n|[\n\v\f\r\u0085\u2028\u2029])'
65+
9966
def negative_set_substitution
10067
# ::of_expression returns an inverted set for negative expressions,
10168
# so we need to un-invert before wrapping in [^ and ]. Kinda lame.
@@ -117,6 +84,34 @@ def xgrapheme
11784
warn_of_unsupported_feature
11885
end
11986
end
87+
88+
# partially taken from https://unicode.org/reports/tr51/#EBNF_and_Regex
89+
ES2018_XGRAPHEME_EXPANSION = <<-'REGEXP'.gsub(/\s+/, '')
90+
(?:
91+
\r\n
92+
|
93+
\p{RI}\p{RI}
94+
|
95+
\p{Emoji}
96+
(?:
97+
\p{EMod}
98+
|
99+
\uFE0F\u20E3?
100+
|
101+
[\u{E0020}-\u{E007E}]+\u{E007F}
102+
)?
103+
(?:
104+
\u200D
105+
(?:
106+
\p{RI}\p{RI}
107+
|
108+
\p{Emoji}(?:\p{EMod}|\uFE0F\u20E3?|[\u{E0020}-\u{E007E}]+\u{E007F})?
109+
)
110+
)*
111+
|
112+
[\P{M}\P{Lm}](?:\u200d|\p{M}|\p{Lm}|\p{Emoji_Modifier})*
113+
)
114+
REGEXP
120115
end
121116
end
122117
end

lib/js_regex/utils.rb

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,4 @@
1+
require_relative 'utils/char_types'
2+
require_relative 'utils/escapes'
3+
require_relative 'utils/literals'
4+
require_relative 'utils/properties'

lib/js_regex/utils/char_types.rb

Lines changed: 18 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,18 @@
1+
# frozen_string_literal: true
2+
3+
class JsRegex
4+
module Utils
5+
module CharTypes
6+
def self.directly_compatible?(expression)
7+
case expression.token
8+
when :space, :nonspace
9+
!expression.ascii_classes?
10+
when :digit, :nondigit, :word, :nonword
11+
!expression.unicode_classes?
12+
else # :hex, :nonhex, :linebreak, :xgrapheme
13+
false
14+
end
15+
end
16+
end
17+
end
18+
end

0 commit comments

Comments
 (0)