From e6375aee21087e12e6914f6b48bebbd208e6837f Mon Sep 17 00:00:00 2001 From: Johannes Opper Date: Tue, 20 Jan 2026 14:18:49 +0100 Subject: [PATCH] Fix adjacent emphasis tags producing invalid markdown (#99) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit When HTML contains adjacent emphasis tags like <em>wo</em><em>rd</em>, the output was _wo__rd_ which is invalid markdown. The double underscore breaks parsing in most markdown renderers. Add merge_adjacent_emphasis() method in Cleaner that merges adjacent identical emphasis markers during post-processing: - _a__b_ → _ab_ - **a****b** → **ab** Co-Authored-By: Claude Opus 4.5 --- lib/reverse_markdown/cleaner.rb | 22 +++++++++++ spec/lib/reverse_markdown/cleaner_spec.rb | 37 +++++++++++++++++++ .../reverse_markdown/converters/em_spec.rb | 14 +++++++ 3 files changed, 73 insertions(+) diff --git a/lib/reverse_markdown/cleaner.rb b/lib/reverse_markdown/cleaner.rb index 404b91f..a91e50c 100644 --- a/lib/reverse_markdown/cleaner.rb +++ b/lib/reverse_markdown/cleaner.rb @@ -5,10 +5,32 @@ def tidy(string) result = remove_inner_whitespaces(string) result = remove_newlines(result) result = remove_leading_newlines(result) + result = merge_adjacent_emphasis(result) result = clean_tag_borders(result) clean_punctuation_characters(result) end + def merge_adjacent_emphasis(string) + result = string + + # Merge adjacent underscore emphasis: _X__Y_ → _XY_ + # Apply repeatedly for multiple adjacent tags + loop do + new_result = result.gsub(/_([^_\n]+)__([^_\n]+)_/, '_\1\2_') + break if new_result == result + result = new_result + end + + # Merge adjacent strong emphasis: **X****Y** → **XY** + loop do + new_result = result.gsub(/\*\*([^*\n]+)\*\*\*\*([^*\n]+)\*\*/, '**\1\2**') + break if new_result == result + result = new_result + end + + result + end + def remove_newlines(string) string.gsub(/\n{3,}/, "\n\n") end diff --git a/spec/lib/reverse_markdown/cleaner_spec.rb b/spec/lib/reverse_markdown/cleaner_spec.rb 
index 30bee5e..2b3c010 100644 --- a/spec/lib/reverse_markdown/cleaner_spec.rb +++ b/spec/lib/reverse_markdown/cleaner_spec.rb @@ -60,6 +60,43 @@ end end + describe '#merge_adjacent_emphasis' do + it 'merges two adjacent underscore emphasis tags' do + result = cleaner.merge_adjacent_emphasis('_a__b_') + expect(result).to eq '_ab_' + end + + it 'merges three adjacent underscore emphasis tags' do + result = cleaner.merge_adjacent_emphasis('_a__b__c_') + expect(result).to eq '_abc_' + end + + it 'merges two adjacent strong emphasis tags' do + result = cleaner.merge_adjacent_emphasis('**a****b**') + expect(result).to eq '**ab**' + end + + it 'merges three adjacent strong emphasis tags' do + result = cleaner.merge_adjacent_emphasis('**a****b****c**') + expect(result).to eq '**abc**' + end + + it 'does not merge emphasis tags separated by whitespace' do + result = cleaner.merge_adjacent_emphasis('_a_ _b_') + expect(result).to eq '_a_ _b_' + end + + it 'does not merge strong tags separated by whitespace' do + result = cleaner.merge_adjacent_emphasis('**a** **b**') + expect(result).to eq '**a** **b**' + end + + it 'handles mixed content correctly' do + result = cleaner.merge_adjacent_emphasis('text _a__b_ more **c****d** end') + expect(result).to eq 'text _ab_ more **cd** end' + end + end + describe '#clean_tag_borders' do context 'with default_border is set to space' do before { ReverseMarkdown.config.tag_border = ' ' } diff --git a/spec/lib/reverse_markdown/converters/em_spec.rb b/spec/lib/reverse_markdown/converters/em_spec.rb index ab98c2e..5bdeed2 100644 --- a/spec/lib/reverse_markdown/converters/em_spec.rb +++ b/spec/lib/reverse_markdown/converters/em_spec.rb @@ -25,4 +25,18 @@ expect(result).to include('_hello_') expect(result).to include('_world_') end + + it 'merges adjacent em tags into single emphasis' do + # Issue #99: Adjacent emphasis tags like <em>wo</em><em>rd</em> + # should produce _word_ not _wo__rd_ + expect(ReverseMarkdown.convert('<em>wo</em><em>rd</em>')).to eq '_word_' + end + + it 
'merges multiple adjacent em tags' do + expect(ReverseMarkdown.convert('<em>a</em><em>b</em><em>c</em>')).to eq '_abc_' + end + + it 'keeps separate emphasis when tags have whitespace between them' do + expect(ReverseMarkdown.convert('<em>a</em> <em>b</em>')).to eq '_a_ _b_' + end end