From 9095dedb8fb44df393171c464a188d9c1fecd6f8 Mon Sep 17 00:00:00 2001 From: Johannes Opper Date: Tue, 20 Jan 2026 09:06:09 +0100 Subject: [PATCH] Fix whitespace collapsing between inline elements Preserve trailing newlines as spaces when there's following inline content. This properly handles HTML whitespace collapsing for cases like `A\nB` which should render as "A B". The fix recursively traverses parent nodes to detect following content even when text is deeply nested inside inline elements. Fixes #34 Co-Authored-By: Claude Opus 4.5 --- lib/reverse_markdown/converters/text.rb | 43 +++++++++++++++++-- spec/components/basic_spec.rb | 22 ++++++++++ spec/components/from_the_wild_spec.rb | 2 +- .../reverse_markdown/converters/text_spec.rb | 2 +- 4 files changed, 64 insertions(+), 5 deletions(-) diff --git a/lib/reverse_markdown/converters/text.rb b/lib/reverse_markdown/converters/text.rb index 27e40b1..fa03ee3 100644 --- a/lib/reverse_markdown/converters/text.rb +++ b/lib/reverse_markdown/converters/text.rb @@ -11,12 +11,20 @@ def convert(node, options = {}) private + INLINE_ELEMENTS = [:a, :abbr, :b, :bdi, :bdo, :cite, :code, :data, :del, + :dfn, :em, :i, :ins, :kbd, :mark, :q, :rp, :rt, :ruby, + :s, :samp, :small, :span, :strong, :sub, :sup, :time, + :u, :var, :wbr, :font, :tt].freeze + def treat_empty(node) parent = node.parent.name.to_sym if [:ol, :ul].include?(parent) # Otherwise the identation is broken '' elsif node.text == ' ' # Regular whitespace text node ' ' + elsif INLINE_ELEMENTS.include?(parent) && node.text =~ /\n/ + # Preserve newlines between inline elements as space (HTML whitespace collapsing) + ' ' else '' end @@ -25,7 +33,7 @@ def treat_empty(node) def treat_text(node) text = node.text text = preserve_nbsp(text) - text = remove_border_newlines(text) + text = remove_border_newlines(text, node) text = remove_inner_newlines(text) text = escape_keychars(text) @@ -43,8 +51,37 @@ def preserve_tags(text) text.gsub(/[<>]/, '>' => '\>', '<' => '\<') end 
- def remove_border_newlines(text) - text.gsub(/\A\n+/, '').gsub(/\n+\z/, '') + def remove_border_newlines(text, node) + result = text.gsub(/\A\n+/, '') + # Only convert trailing newlines to space if there's following inline content + # This handles HTML whitespace collapsing between inline elements + if has_following_inline_content?(node) + result.gsub(/\n+\z/, ' ') + else + result.gsub(/\n+\z/, '') + end + end + + def has_following_inline_content?(node) + # Check if node has a following sibling that is inline content + sibling = node.next_sibling + while sibling + if sibling.text? + return true unless sibling.text.strip.empty? + elsif INLINE_ELEMENTS.include?(sibling.name.to_sym) + return true + else + # Block element - no space needed before it + return false + end + sibling = sibling.next_sibling + end + + # Recursively check if inline parent has following content + parent = node.parent + return false unless INLINE_ELEMENTS.include?(parent.name.to_sym) + + has_following_inline_content?(parent) end def remove_inner_newlines(text) diff --git a/spec/components/basic_spec.rb b/spec/components/basic_spec.rb index 53cc5cf..6295f16 100644 --- a/spec/components/basic_spec.rb +++ b/spec/components/basic_spec.rb @@ -40,4 +40,26 @@ it { is_expected.to match /before hr \n\* \* \*\n after hr/ } it { is_expected.to match /section 1\n ?\nsection 2/ } + + describe 'whitespace handling between inline elements' do + it 'preserves whitespace (including newlines) between spans' do + input = "Hello\nWorld" + result = ReverseMarkdown.convert(input) + expect(result).to eq "Hello World" + end + + it 'preserves whitespace between inline elements in paragraphs' do + input = "

Hello\nWorld

" + result = ReverseMarkdown.convert(input) + expect(result).to eq "Hello World\n\n" + end + + it 'preserves whitespace between nested inline elements' do + # The text "A" is nested inside <em> inside <span>, but <span> has a following sibling + # This requires traversing up through parent nodes to find following content + input = "

A\nB

" + result = ReverseMarkdown.convert(input) + expect(result).to eq "_A_ B\n\n" + end + end end diff --git a/spec/components/from_the_wild_spec.rb b/spec/components/from_the_wild_spec.rb index 821b21a..762a674 100644 --- a/spec/components/from_the_wild_spec.rb +++ b/spec/components/from_the_wild_spec.rb @@ -6,7 +6,7 @@ subject { ReverseMarkdown.convert(input) } it "should make sense of strong-crazy markup (as seen in the wild)" do - expect(subject).to include "**. \n \\*\\*\\* intentcast** : logo design \n **.**\n\n" + expect(subject).to include "**. \n \\*\\*\\* intentcast** : logo design \n **.**\n\n" end it "should not over escape * or _" do diff --git a/spec/lib/reverse_markdown/converters/text_spec.rb b/spec/lib/reverse_markdown/converters/text_spec.rb index 8dd7125..1bc6159 100644 --- a/spec/lib/reverse_markdown/converters/text_spec.rb +++ b/spec/lib/reverse_markdown/converters/text_spec.rb @@ -22,7 +22,7 @@ expect(result).to eq 'foo bar' end - it 'removes trailing newlines' do + it 'removes trailing newlines when no following content' do input = node_for("

foo bar\n\n

") result = converter.convert(input) expect(result).to eq 'foo bar'