Cleanup: extract shared conversion code into utils

jaynetics · jaynetics · commit bca5cceddc9c · 2025-08-03T17:31:54.000+02:00
diff --git a/lib/js_regex/conversion.rb b/lib/js_regex/conversion.rb
@@ -10,7 +10,6 @@ class Conversion
     require 'regexp_parser'
     require_relative 'converter'
     require_relative 'error'
-    require_relative 'node'
     require_relative 'second_pass'
     require_relative 'target'
 
diff --git a/lib/js_regex/converter/base.rb b/lib/js_regex/converter/base.rb
@@ -1,5 +1,8 @@
 # frozen_string_literal: true
 
+require_relative '../node'
+require_relative '../utils'
+
 class JsRegex
   module Converter
     #
diff --git a/lib/js_regex/converter/escape_converter.rb b/lib/js_regex/converter/escape_converter.rb
@@ -1,39 +1,13 @@
 # frozen_string_literal: true
 
 require_relative 'base'
-require_relative 'literal_converter'
 
 class JsRegex
   module Converter
     #
     # Template class implementation.
     #
     class EscapeConverter < JsRegex::Converter::Base
-      ESCAPES_SHARED_BY_RUBY_AND_JS = %i[
-        alternation
-        backslash
-        backspace
-        bol
-        carriage
-        codepoint
-        dot
-        eol
-        form_feed
-        group_close
-        group_open
-        hex
-        interval_close
-        interval_open
-        newline
-        one_or_more
-        set_close
-        set_open
-        tab
-        vertical_tab
-        zero_or_more
-        zero_or_one
-      ].freeze
-
       private
 
       def convert_data
@@ -43,10 +17,10 @@ def convert_data
         when :control, :meta_sequence, :utf8_hex
           unicode_escape_codepoint
         when :literal
-          LiteralConverter.convert_data(expression.char, context)
+          Utils::Literals.convert_data(expression.char, context)
         when :bell, :escape, :hex, :octal
           hex_escape_codepoint
-        when *ESCAPES_SHARED_BY_RUBY_AND_JS
+        when *Utils::Escapes::ESCAPES_SHARED_BY_RUBY_AND_JS
           pass_through
         else
           warn_of_unsupported_feature
@@ -58,7 +32,7 @@ def convert_codepoint_list
           split_codepoint_list
         else
           expression.chars.each_with_object(Node.new) do |char, node|
-            node << LiteralConverter.convert_data(Regexp.escape(char), context)
+            node << Utils::Literals.convert_data(Regexp.escape(char), context)
           end
         end
       end
diff --git a/lib/js_regex/converter/literal_converter.rb b/lib/js_regex/converter/literal_converter.rb
@@ -8,51 +8,10 @@ module Converter
     # Template class implementation.
     #
     class LiteralConverter < JsRegex::Converter::Base
-      ASTRAL_PLANE_CODEPOINT_PATTERN = /[\u{10000}-\u{10FFFF}]/.freeze
-      LITERAL_REQUIRING_ESCAPE_PATTERN = /[\/\f\n\r\t\v]/.freeze
-
-      class << self
-        def convert_data(data, context)
-          if !context.u? && data =~ ASTRAL_PLANE_CODEPOINT_PATTERN
-            if context.enable_u_option
-              escape_incompatible_bmp_literals(data)
-            else
-              convert_astral_data(data)
-            end
-          else
-            escape_incompatible_bmp_literals(data)
-          end
-        end
-
-        def convert_astral_data(data)
-          data.each_char.each_with_object(Node.new) do |char, node|
-            if char.ord > 0xFFFF
-              node << surrogate_substitution_for(char)
-            else
-              node << escape_incompatible_bmp_literals(char)
-            end
-          end
-        end
-
-        ESCAPES = Hash.new { |h, k| raise KeyError, "#{h}[#{k.inspect}]" }
-          .merge("\f\n\r\t\v".chars.to_h { |c| [c, Regexp.escape(c)] })
-          .merge('/' => '\\/')
-
-        def escape_incompatible_bmp_literals(data)
-          data.gsub(LITERAL_REQUIRING_ESCAPE_PATTERN, ESCAPES)
-        end
-
-        private
-
-        def surrogate_substitution_for(char)
-          CharacterSet::Writer.write_surrogate_ranges([], [char.codepoints])
-        end
-      end
-
       private
 
       def convert_data
-        result = self.class.convert_data(data, context)
+        result = Utils::Literals.convert_data(data, context)
         if context.case_insensitive_root && !expression.case_insensitive?
           warn_of_unsupported_feature('nested case-sensitive literal')
         elsif !context.case_insensitive_root && expression.case_insensitive?
diff --git a/lib/js_regex/converter/meta_converter.rb b/lib/js_regex/converter/meta_converter.rb
@@ -8,19 +8,6 @@ module Converter
     # Template class implementation.
     #
     class MetaConverter < JsRegex::Converter::Base
-      DOT_EXPANSION    = '(?:[\uD800-\uDBFF][\uDC00-\uDFFF]|[^\n\uD800-\uDFFF])'
-      ML_DOT_EXPANSION = '(?:[\uD800-\uDBFF][\uDC00-\uDFFF]|[^\uD800-\uDFFF])'
-      # Possible improvements for dot conversion:
-      #
-      # In ES2015, the 'u' flag allows dots to match astral chars. Unfortunately
-      # the dot keeps matching lone surrogates even with this flag, so the use
-      # of an expansion is still necessary to get the same behavior as in Ruby.
-      #
-      # ES2018 has the dotall flag 's', but it is tricky to use in conversions.
-      # 's' activates matching of BOTH astral chars and "\n", whereas the dot in
-      # Ruby doesn't match "\n" by default, and even with the 'm' flag set on
-      # the root, subexps might still exclude "\n" like so: /.(?-m:.)./m
-
       private
 
       def convert_data
@@ -34,6 +21,19 @@ def convert_data
         end
       end
 
+      DOT_EXPANSION    = '(?:[\uD800-\uDBFF][\uDC00-\uDFFF]|[^\n\uD800-\uDFFF])'
+      ML_DOT_EXPANSION = '(?:[\uD800-\uDBFF][\uDC00-\uDFFF]|[^\uD800-\uDFFF])'
+      # Possible improvements for dot conversion:
+      #
+      # In ES2015, the 'u' flag allows dots to match astral chars. Unfortunately
+      # the dot keeps matching lone surrogates even with this flag, so the use
+      # of an expansion is still necessary to get the same behavior as in Ruby.
+      #
+      # ES2018 has the dotall flag 's', but it is tricky to use in conversions.
+      # 's' activates matching of BOTH astral chars and "\n", whereas the dot in
+      # Ruby doesn't match "\n" by default, and even with the 'm' flag set on
+      # the root, subexps might still exclude "\n" like so: /.(?-m:.)./m
+
       def convert_alternatives
         kept_any_previous_branch = nil
 
diff --git a/lib/js_regex/converter/property_converter.rb b/lib/js_regex/converter/property_converter.rb
@@ -12,16 +12,11 @@ module Converter
     # codepoints matched by the property and build a set string from them.
     #
     class PropertyConverter < JsRegex::Converter::Base
-      # A map of normalized Ruby property names to names supported by ES2018+.
-      def self.map
-        @map ||= File.read("#{__dir__}/property_map.csv").scan(/(.+),(.+)/).to_h
-      end
-
       private
 
       def convert_data
         if context.es_2018_or_higher? &&
-            (prop_name_in_js = self.class.map[subtype.to_s.tr('_', '')])
+            (prop_name_in_js = Utils::Properties.name_in_js(subtype))
           context.enable_u_option
           "\\#{expression.negative? ? 'P' : 'p'}{#{prop_name_in_js}}"
         else
diff --git a/lib/js_regex/converter/set_converter.rb b/lib/js_regex/converter/set_converter.rb
@@ -1,8 +1,6 @@
 # frozen_string_literal: true
 
 require_relative 'base'
-require_relative 'escape_converter'
-require_relative 'type_converter'
 require 'character_set'
 
 class JsRegex
@@ -50,37 +48,36 @@ def simple_convert_child(exp)
             simple_convert_child(op) or return false
           end.join('-')
         when :type
-          TypeConverter.directly_compatible?(exp, context) &&
-            exp.text
+          Utils::CharTypes.directly_compatible?(exp) && exp.text
         when :escape
           return exp.text if SET_SPECIFIC_ESCAPES_PATTERN.match?(exp.text)
 
           case exp.token
           when *CONVERTIBLE_ESCAPE_TOKENS
-            EscapeConverter.new.convert(exp, context)
+            JsRegex::Converter.convert(exp, context)
           when :literal
             exp.char.ord <= 0xFFFF &&
-              LiteralConverter.escape_incompatible_bmp_literals(exp.char)
+              Utils::Literals.escape_incompatible_bmp_literals(exp.char)
           end
         end
       end
 
       def simple_convert_literal_child(exp)
         if !context.u? &&
-           exp.text =~ LiteralConverter::ASTRAL_PLANE_CODEPOINT_PATTERN &&
+           exp.text =~ Utils::Literals::ASTRAL_PLANE_CODEPOINT_PATTERN &&
            !context.enable_u_option
           false
         elsif SET_LITERALS_REQUIRING_ESCAPE_PATTERN.match?(exp.text)
           "\\#{exp.text}"
         else
-          LiteralConverter.escape_incompatible_bmp_literals(exp.text)
+          Utils::Literals.escape_incompatible_bmp_literals(exp.text)
         end
       end
 
       SET_LITERALS_REQUIRING_ESCAPE_PATTERN = Regexp.union(%w<( ) [ ] { } / - |>)
       SET_SPECIFIC_ESCAPES_PATTERN = /[\^\-]/.freeze
       CONVERTIBLE_ESCAPE_TOKENS = %i[control meta_sequence bell escape octal] +
-        EscapeConverter::ESCAPES_SHARED_BY_RUBY_AND_JS
+        Utils::Escapes::ESCAPES_SHARED_BY_RUBY_AND_JS
 
       def full_recalculation
         # Fetch codepoints as if the set was case-sensitive, then re-add
diff --git a/lib/js_regex/converter/type_converter.rb b/lib/js_regex/converter/type_converter.rb
@@ -8,51 +8,6 @@ module Converter
     # Template class implementation.
     #
     class TypeConverter < JsRegex::Converter::Base
-      HEX_EXPANSION              = '[0-9A-Fa-f]'
-      NONHEX_EXPANSION           = '[^0-9A-Fa-f]'
-      I_MODE_HEX_EXPANSION       = '[0-9A-F]'
-      I_MODE_NONHEX_EXPANSION    = '[^0-9A-F]'
-      LINEBREAK_EXPANSION        = '(?:\r\n|[\n\v\f\r\u0085\u2028\u2029])'
-      ES2018_HEX_EXPANSION       = '\p{AHex}'
-      ES2018_NONHEX_EXPANSION    = '\P{AHex}'
-      # partially taken from https://unicode.org/reports/tr51/#EBNF_and_Regex
-      ES2018_XGRAPHEME_EXPANSION = <<-'REGEXP'.gsub(/\s+/, '')
-        (?:
-          \r\n
-        |
-          \p{RI}\p{RI}
-        |
-          \p{Emoji}
-          (?:
-            \p{EMod}
-          |
-            \uFE0F\u20E3?
-          |
-            [\u{E0020}-\u{E007E}]+\u{E007F}
-          )?
-          (?:
-            \u200D
-            (?:
-              \p{RI}\p{RI}
-            |
-              \p{Emoji}(?:\p{EMod}|\uFE0F\u20E3?|[\u{E0020}-\u{E007E}]+\u{E007F})?
-            )
-          )*
-        |
-          [\P{M}\P{Lm}](?:\u200d|\p{M}|\p{Lm}|\p{Emoji_Modifier})*
-        )
-      REGEXP
-
-
-      def self.directly_compatible?(expression, _context = nil)
-        case expression.token
-        when :space, :nonspace
-          !expression.ascii_classes?
-        when :digit, :nondigit, :word, :nonword
-          !expression.unicode_classes?
-        end
-      end
-
       private
 
       def convert_data
@@ -62,10 +17,12 @@ def convert_data
         when :linebreak then linebreak_expansion
         when :xgrapheme then xgrapheme
         when :digit, :space, :word
-          return pass_through if self.class.directly_compatible?(expression)
+          return pass_through if Utils::CharTypes.directly_compatible?(expression)
+
           set_substitution
         when :nondigit, :nonspace, :nonword
-          return pass_through if self.class.directly_compatible?(expression)
+          return pass_through if Utils::CharTypes.directly_compatible?(expression)
+
           negative_set_substitution
         else
           warn_of_unsupported_feature
@@ -82,6 +39,10 @@ def hex_expansion
         end
       end
 
+      ES2018_HEX_EXPANSION = '\p{AHex}'
+      I_MODE_HEX_EXPANSION = '[0-9A-F]'
+      HEX_EXPANSION        = '[0-9A-Fa-f]'
+
       def nonhex_expansion
         if context.es_2018_or_higher? && context.enable_u_option
           ES2018_NONHEX_EXPANSION
@@ -92,10 +53,16 @@ def nonhex_expansion
         end
       end
 
+      NONHEX_EXPANSION        = '[^0-9A-Fa-f]'
+      I_MODE_NONHEX_EXPANSION = '[^0-9A-F]'
+      ES2018_NONHEX_EXPANSION = '\P{AHex}'
+
       def linebreak_expansion
         wrap_in_backrefed_lookahead(LINEBREAK_EXPANSION)
       end
 
+      LINEBREAK_EXPANSION = '(?:\r\n|[\n\v\f\r\u0085\u2028\u2029])'
+
       def negative_set_substitution
         # ::of_expression returns an inverted set for negative expressions,
         # so we need to un-invert before wrapping in [^ and ]. Kinda lame.
@@ -117,6 +84,34 @@ def xgrapheme
           warn_of_unsupported_feature
         end
       end
+
+      # partially taken from https://unicode.org/reports/tr51/#EBNF_and_Regex
+      ES2018_XGRAPHEME_EXPANSION = <<-'REGEXP'.gsub(/\s+/, '')
+        (?:
+          \r\n
+        |
+          \p{RI}\p{RI}
+        |
+          \p{Emoji}
+          (?:
+            \p{EMod}
+          |
+            \uFE0F\u20E3?
+          |
+            [\u{E0020}-\u{E007E}]+\u{E007F}
+          )?
+          (?:
+            \u200D
+            (?:
+              \p{RI}\p{RI}
+            |
+              \p{Emoji}(?:\p{EMod}|\uFE0F\u20E3?|[\u{E0020}-\u{E007E}]+\u{E007F})?
+            )
+          )*
+        |
+          [\P{M}\P{Lm}](?:\u200d|\p{M}|\p{Lm}|\p{Emoji_Modifier})*
+        )
+      REGEXP
     end
   end
 end
diff --git a/lib/js_regex/utils.rb b/lib/js_regex/utils.rb
@@ -0,0 +1,4 @@
+require_relative 'utils/char_types'
+require_relative 'utils/escapes'
+require_relative 'utils/literals'
+require_relative 'utils/properties'
diff --git a/lib/js_regex/utils/char_types.rb b/lib/js_regex/utils/char_types.rb
@@ -0,0 +1,18 @@
+# frozen_string_literal: true
+
+class JsRegex
+  module Utils
+    module CharTypes # shared helper for character type expressions, called by the type and set converters
+      def self.directly_compatible?(expression) # true when callers may emit the type token unchanged (pass-through)
+        case expression.token # decision depends only on the token and the expression's class flags
+        when :space, :nonspace
+          !expression.ascii_classes? # not compatible when the expression reports ASCII classes
+        when :digit, :nondigit, :word, :nonword
+          !expression.unicode_classes? # not compatible when the expression reports Unicode classes
+        else # :hex, :nonhex, :linebreak, :xgrapheme
+          false # explicit false instead of an implicit nil for every other token
+        end
+      end
+    end
+  end
+end
diff --git a/lib/js_regex/utils/escapes.rb b/lib/js_regex/utils/escapes.rb
diff --git a/lib/js_regex/utils/literals.rb b/lib/js_regex/utils/literals.rb
diff --git a/lib/js_regex/utils/properties.rb b/lib/js_regex/utils/properties.rb
diff --git a/lib/js_regex/utils/property_map.csv b/lib/js_regex/utils/property_map.csv
diff --git a/tasks/build_prop_map.rake b/tasks/build_prop_map.rake

Original file line number	Diff line number	Diff line change
`@@ -1,5 +1,8 @@`
`1`	`1`	`# frozen_string_literal: true`
`2`	`2`
	`3`	`+require_relative '../node'`
	`4`	`+require_relative '../utils'`
	`5`	`+`
`3`	`6`	`class JsRegex`
`4`	`7`	`module Converter`
`5`	`8`	`#`