From 84dbf483e6619150dbeda50f3c652a317c084fcd Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20R=C3=B6nnqvist?= Date: Fri, 5 Dec 2025 11:15:34 +0100 Subject: [PATCH 1/3] Add test about parsing inline HTML except for comments --- Sources/DocCHTML/MarkdownRenderer.swift | 12 ++++- .../DocCHTMLTests/MarkdownRendererTests.swift | 48 +++++++++++++++++++ 2 files changed, 58 insertions(+), 2 deletions(-) diff --git a/Sources/DocCHTML/MarkdownRenderer.swift b/Sources/DocCHTML/MarkdownRenderer.swift index 8ee60d8db..9ea7f7c38 100644 --- a/Sources/DocCHTML/MarkdownRenderer.swift +++ b/Sources/DocCHTML/MarkdownRenderer.swift @@ -695,7 +695,10 @@ package struct MarkdownRenderer { } // Next, check if its empty element (for example `
` or `
`) that's complete on its own. - if let parsed = try? XMLElement(xmlString: rawHTML) { + + // On non-Darwin platforms, `XMLElement(xmlString:)` sometimes crashes for certain invalid / incomplete XML string. + // To minimize the risk of this happening, don't try to parse the XML string as an empty HTML element unless it ends with "/>" + if rawHTML.hasSuffix("/>"), let parsed = try? XMLElement(xmlString: rawHTML) { children.append(parsed) continue } @@ -703,6 +706,8 @@ package struct MarkdownRenderer { // This could be an HTML element with content or it could be invalid HTML. // Don't modify `elements` until we know that we've parsed a valid HTML element. var copy = elements + let tagName = rawHTML.dropFirst().prefix(while: \.isLetter) + let expectedClosingTag = "" // Gradually check a longer and longer series of markup elements to see if they form a valid HTML element. inner: while !copy.isEmpty, let next = copy.first as? any InlineMarkup { @@ -714,7 +719,10 @@ package struct MarkdownRenderer { } rawHTML += next.format() - if let parsed = try? XMLElement(xmlString: rawHTML) { + if let maybeClosingHTML = next as? InlineHTML, + maybeClosingHTML.rawHTML == expectedClosingTag, + let parsed = try? XMLElement(xmlString: rawHTML) + { children.append(parsed) // Include the valid HTML element in the output. elements = copy // Skip over all the elements that were used to create that HTML element. continue outer diff --git a/Tests/DocCHTMLTests/MarkdownRendererTests.swift b/Tests/DocCHTMLTests/MarkdownRendererTests.swift index 0b0071617..2f8eeec8d 100644 --- a/Tests/DocCHTMLTests/MarkdownRendererTests.swift +++ b/Tests/DocCHTMLTests/MarkdownRendererTests.swift @@ -548,6 +548,54 @@ struct MarkdownRendererTests { ) } + @Test + func testParsesAndPreservesHTMLExceptComments() { + assert( + rendering: "This is a formatted paragraph.", + matches: "

This is a formatted paragraph.

" + ) + + assert( + rendering: "This
is a formattedparagraph.", + matches: "

This
is a formatted paragraph.

" + ) + + assert( + rendering: "This is a custom formatted paragraph.", + matches: "

This is a custom formatted paragraph.

" + ) + + // This markup doesn't properly close its opening tag (it uses a mismatched closing tag). + // In this case we drop both tags but not their content in between. This matches what DocC does for inline HTML with regard to the Render JSON output. + assert( + rendering: "This is a custom formatted paragraph.", + matches: "

This is a custom formatted paragraph.

" + ) + + // Any content _within_ HTML tags in the markdown isn't parsed as markdown content. + assert( + rendering: "This is a custom **not** formatted paragraph.", + matches: "

This is a custom **not** formatted paragraph.

" + ) + + assert( + rendering: """ +
+ Some summary + +

Some longer description

+
+ + """, + matches: """ +
+ Some summary +

Some longer description

+
+ """ + ) + } + private func assert( rendering markdownContent: String, elementToReturn: LinkedElement? = nil, From e540ba39f94c4e7cca081b604c000fa26fd28ab4 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?David=20R=C3=B6nnqvist?= Date: Fri, 5 Dec 2025 11:50:39 +0100 Subject: [PATCH 2/3] Extract inner HTML parsing code into a private function --- Sources/DocCHTML/MarkdownRenderer.swift | 77 +++++++++++++------------ 1 file changed, 40 insertions(+), 37 deletions(-) diff --git a/Sources/DocCHTML/MarkdownRenderer.swift b/Sources/DocCHTML/MarkdownRenderer.swift index 9ea7f7c38..00d55f1ee 100644 --- a/Sources/DocCHTML/MarkdownRenderer.swift +++ b/Sources/DocCHTML/MarkdownRenderer.swift @@ -675,66 +675,69 @@ package struct MarkdownRenderer { // - An empty element like `
` or `
` that's complete on its own. // - An element with children like `Something` that needs to be created out of multiple markup elements. // - // FIXME: See if this can be extracted into 2 private functions to make the code easier to read. // Because it may take multiple markdown elements to create an HTML element, we pop elements rather than iterating - var elements = Array(container) - outer: while !elements.isEmpty { - let element = elements.removeFirst() - - guard let start = element as? InlineHTML else { + var remainder = Array(container)[...] + while let element = remainder.popFirst() { + guard let openingHTML = element as? InlineHTML else { // If the markup _isn't_ inline HTML we can simply visit it to transform it. children.append(visit(element)) continue } // Otherwise, we need to determine how long this markdown element it. - var rawHTML = start.rawHTML + let rawHTML = openingHTML.rawHTML + // Simply skip any HTML/XML comments. guard !rawHTML.hasPrefix("