From ef71d306432af8e97802005bb9affa135e609dec Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 6 Mar 2026 23:06:44 +0000
Subject: [PATCH 1/6] Initial plan
From 3450b7650f99af203448313fc849b3ad42a21bbe Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 6 Mar 2026 23:12:06 +0000
Subject: [PATCH 2/6] chore: initial plan for HTML selection preservation
Co-authored-by: kraftbj <88897+kraftbj@users.noreply.github.com>
---
package-lock.json | 4 ++++
1 file changed, 4 insertions(+)
diff --git a/package-lock.json b/package-lock.json
index fcada17..df79f71 100644
--- a/package-lock.json
+++ b/package-lock.json
@@ -34,6 +34,10 @@
"grunt-checktextdomain": "^1.0.1",
"grunt-wp-i18n": "^1.0.4",
"terser": "^5.44.1"
+ },
+ "engines": {
+ "node": ">=20.10.0",
+ "npm": ">=10.2.3"
}
},
"node_modules/@ampproject/remapping": {
From 818d998488001b809f1cde846e56615a8f57c112 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Fri, 6 Mar 2026 23:21:39 +0000
Subject: [PATCH 3/6] fix: preserve HTML formatting when capturing bookmarklet
text selection
- Bookmarklet now captures selected HTML using getRangeAt()/cloneContents()
and sends it as `sel_html` alongside the plain-text `s` field
- Added htmlToBlocks() utility that converts HTML to Gutenberg block markup
(handles headings h1-h6, paragraphs, ul/ol lists with nesting, blockquotes,
and pre/code blocks; strips scripts/event handlers for safety)
- buildSuggestedContent() now renders HTML selections as formatted blocks
instead of plain-text in a quote block
- App.js postMessage handler passes sel_html through to content builder
- Added 20 new unit tests for htmlToBlocks and updated buildSuggestedContent
- Added bookmarklet test for HTML selection capture
- Rebuilt minified bookmarklet
Co-authored-by: kraftbj <88897+kraftbj@users.noreply.github.com>
---
assets/bookmarklet.js | 18 +-
assets/bookmarklet.min.js | 2 +-
src/App.js | 18 +-
src/components/BlockTransformShortcuts.js | 6 +-
src/utils/html-parser.js | 240 +++++++++++++++++++++-
src/utils/index.js | 1 +
tests/bookmarklet/bookmarklet.test.js | 24 +++
tests/utils/html-parser.test.js | 224 ++++++++++++++++++++
8 files changed, 520 insertions(+), 13 deletions(-)
create mode 100644 tests/utils/html-parser.test.js
diff --git a/assets/bookmarklet.js b/assets/bookmarklet.js
index 234cda1..3380029 100644
--- a/assets/bookmarklet.js
+++ b/assets/bookmarklet.js
@@ -15,7 +15,7 @@
encURI = window.encodeURIComponent,
head = document.getElementsByTagName( 'head' )[0],
target = '_press_this_app',
- windowWidth, windowHeight, selection,
+ windowWidth, windowHeight, selection, selectionHtml,
metas, links, content, images, iframes, img, scripts,
scrapedData = {},
popup;
@@ -32,7 +32,16 @@
}
if ( window.getSelection ) {
- selection = window.getSelection() + '';
+ var sel = window.getSelection();
+ if ( sel && sel.rangeCount > 0 ) {
+ selection = sel.toString();
+ // Capture HTML to preserve formatting (bold, lists, headings, etc.).
+ var range = sel.getRangeAt( 0 );
+ var fragment = range.cloneContents();
+ var tempDiv = document.createElement( 'div' );
+ tempDiv.appendChild( fragment );
+ selectionHtml = tempDiv.innerHTML;
+ }
} else if ( document.getSelection ) {
selection = document.getSelection() + '';
} else if ( document.selection ) {
@@ -299,6 +308,11 @@
add( 's', selection );
}
+ // Add HTML selection to preserve formatting (bold, lists, headings, etc.).
+ if ( selectionHtml && selectionHtml !== selection ) {
+ add( 'sel_html', selectionHtml );
+ }
+
/**
* Send scraped data to the Press This popup via postMessage.
* Uses polling to wait for the popup to be ready.
diff --git a/assets/bookmarklet.min.js b/assets/bookmarklet.min.js
index 00bb55c..c43d75b 100644
--- a/assets/bookmarklet.min.js
+++ b/assets/bookmarklet.min.js
@@ -1 +1 @@
-!function(e,t,i,a){var n,o,r,l,c,s,g,m,d,f,h,u=e.encodeURIComponent,p=t.getElementsByTagName("head")[0],y={};if(a)if(i.match(/^https?:/)){a+="&u="+u(i),e.getSelection?r=e.getSelection()+"":t.getSelection?r=t.getSelection()+"":t.selection&&(r=t.selection.createRange().text||""),a+="&buster="+(new Date).getTime(),a+="&pm=1",n=(n=e.outerWidth||t.documentElement.clientWidth||600)<800||n>5e3?600:.7*n,o=(o=e.outerHeight||t.documentElement.clientHeight||700)<800||o>3e3?700:.9*o,T("pt_version",11),l=p.getElementsByTagName("meta")||[];for(var v=0;v200);v++){var b=l[v],O=b.getAttribute("name"),_=b.getAttribute("property"),x=b.getAttribute("content");x&&(O?T("_meta["+O+"]",x):_&&(T("_meta["+_+"]",x),"og:video"!==_&&"og:video:url"!==_&&"og:video:secure_url"!==_||T("_og_video[]",x)))}c=p.getElementsByTagName("link")||[];for(var E=0;E=50);E++){var A=c[E],w=A.getAttribute("rel");"canonical"!==w&&"icon"!==w&&"shortlink"!==w||T("_links["+w+"]",A.getAttribute("href")),"alternate"===w&&"x-default"===A.getAttribute("hreflang")&&T("_links[alternate_canonical]",A.getAttribute("href"))}!function(){f=t.querySelectorAll('script[type="application/ld+json"]');for(var e=0;e=100);N++)(d=g[N]).src.indexOf("avatar")>-1||d.className.indexOf("avatar")>-1||d.width&&d.width<256||d.height&&d.height<128||d.src&&0!==d.src.indexOf("data:")&&T("_images[]",d.src);m=t.body.getElementsByTagName("iframe")||[];for(var j=0;j=50);j++){var B=m[j].src;B&&"about:blank"!==B&&(B.indexOf("jetpack-comment")>-1||B.indexOf("disqus.com")>-1||B.indexOf("facebook.com/plugins")>-1||B.indexOf("platform.twitter.com/widgets")>-1||B.indexOf("google.com/recaptcha")>-1||B.indexOf("googletagmanager.com")>-1||B.indexOf("doubleclick.net")>-1||B.indexOf("googlesyndication.com")>-1||B.indexOf("amazon-adsystem.com")>-1||B.indexOf("quantserve.com")>-1||B.indexOf("scorecardresearch.com")>-1||B.indexOf("addthis.com")>-1||B.indexOf("sharethis.com")>-1||B.indexOf("addtoany.com")>-1||T("_embeds[]",B))}var 
k,P;t.title&&T("t",t.title),r&&T("s",r),h=e.open(a,"_press_this_app","location,resizable,scrollbars,width="+n+",height="+o),k=0,P=a.match(/^https?:\/\/[^\/]+/)[0],setTimeout(function e(){if(k++,h&&!h.closed){try{h.postMessage({type:"press-this-data",version:11,data:y},P)}catch(e){}k<50&&setTimeout(e,100)}},200)}else top.location.href=a;function T(e,t){if(null!=t&&""!==t){var i=e.match(/^(.+)\[\]$/);if(i){var a=i[1];return y[a]||(y[a]=[]),void y[a].push(t)}var n=e.match(/^(.+)\[(.+)\]$/);if(n){var o=n[1],r=n[2];return y[o]||(y[o]={}),void(y[o][r]=t)}y[e]=t}}function S(e){if(e&&"object"==typeof e){var t=e["@type"];if("VideoObject"===t&&(e.embedUrl&&T("_embeds[]",e.embedUrl),e.contentUrl&&!e.embedUrl&&T("_embeds[]",e.contentUrl)),"Article"!==t&&"WebPage"!==t&&"NewsArticle"!==t&&"BlogPosting"!==t||(e.mainEntityOfPage&&"string"==typeof e.mainEntityOfPage?T("_jsonld[canonical]",e.mainEntityOfPage):e.mainEntityOfPage&&e.mainEntityOfPage["@id"]&&T("_jsonld[canonical]",e.mainEntityOfPage["@id"]),e.headline&&T("_jsonld[headline]",e.headline),e.description&&T("_jsonld[description]",e.description)),e.image){var i="";"string"==typeof e.image?i=e.image:e.image.url?i=e.image.url:Array.isArray(e.image)&&e.image[0]&&(i="string"==typeof e.image[0]?e.image[0]:e.image[0].url),i&&T("_jsonld[image]",i)}}}}(window,document,top.location.href,window.pt_url);
\ No newline at end of file
+!function(e,t,i,a){var n,o,r,l,c,s,g,d,m,f,h,p,u=e.encodeURIComponent,y=t.getElementsByTagName("head")[0],v={};if(a)if(i.match(/^https?:/)){if(a+="&u="+u(i),e.getSelection){var b=e.getSelection();if(b&&b.rangeCount>0){r=b.toString();var O=b.getRangeAt(0).cloneContents(),_=t.createElement("div");_.appendChild(O),l=_.innerHTML}}else t.getSelection?r=t.getSelection()+"":t.selection&&(r=t.selection.createRange().text||"");a+="&buster="+(new Date).getTime(),a+="&pm=1",n=(n=e.outerWidth||t.documentElement.clientWidth||600)<800||n>5e3?600:.7*n,o=(o=e.outerHeight||t.documentElement.clientHeight||700)<800||o>3e3?700:.9*o,q("pt_version",11),c=y.getElementsByTagName("meta")||[];for(var x=0;x200);x++){var E=c[x],A=E.getAttribute("name"),w=E.getAttribute("property"),N=E.getAttribute("content");N&&(A?q("_meta["+A+"]",N):w&&(q("_meta["+w+"]",N),"og:video"!==w&&"og:video:url"!==w&&"og:video:secure_url"!==w||q("_og_video[]",N)))}s=y.getElementsByTagName("link")||[];for(var j=0;j=50);j++){var B=s[j],T=B.getAttribute("rel");"canonical"!==T&&"icon"!==T&&"shortlink"!==T||q("_links["+T+"]",B.getAttribute("href")),"alternate"===T&&"x-default"===B.getAttribute("hreflang")&&q("_links[alternate_canonical]",B.getAttribute("href"))}!function(){h=t.querySelectorAll('script[type="application/ld+json"]');for(var e=0;e=100);k++)(f=d[k]).src.indexOf("avatar")>-1||f.className.indexOf("avatar")>-1||f.width&&f.width<256||f.height&&f.height<128||f.src&&0!==f.src.indexOf("data:")&&q("_images[]",f.src);m=t.body.getElementsByTagName("iframe")||[];for(var P=0;P=50);P++){var 
C=m[P].src;C&&"about:blank"!==C&&(C.indexOf("jetpack-comment")>-1||C.indexOf("disqus.com")>-1||C.indexOf("facebook.com/plugins")>-1||C.indexOf("platform.twitter.com/widgets")>-1||C.indexOf("google.com/recaptcha")>-1||C.indexOf("googletagmanager.com")>-1||C.indexOf("doubleclick.net")>-1||C.indexOf("googlesyndication.com")>-1||C.indexOf("amazon-adsystem.com")>-1||C.indexOf("quantserve.com")>-1||C.indexOf("scorecardresearch.com")>-1||C.indexOf("addthis.com")>-1||C.indexOf("sharethis.com")>-1||C.indexOf("addtoany.com")>-1||q("_embeds[]",C))}var S,U;t.title&&q("t",t.title),r&&q("s",r),l&&l!==r&&q("sel_html",l),p=e.open(a,"_press_this_app","location,resizable,scrollbars,width="+n+",height="+o),S=0,U=a.match(/^https?:\/\/[^\/]+/)[0],setTimeout(function e(){if(S++,p&&!p.closed){try{p.postMessage({type:"press-this-data",version:11,data:v},U)}catch(e){}S<50&&setTimeout(e,100)}},200)}else top.location.href=a;function q(e,t){if(null!=t&&""!==t){var i=e.match(/^(.+)\[\]$/);if(i){var a=i[1];return v[a]||(v[a]=[]),void v[a].push(t)}var n=e.match(/^(.+)\[(.+)\]$/);if(n){var o=n[1],r=n[2];return v[o]||(v[o]={}),void(v[o][r]=t)}v[e]=t}}function H(e){if(e&&"object"==typeof e){var t=e["@type"];if("VideoObject"===t&&(e.embedUrl&&q("_embeds[]",e.embedUrl),e.contentUrl&&!e.embedUrl&&q("_embeds[]",e.contentUrl)),"Article"!==t&&"WebPage"!==t&&"NewsArticle"!==t&&"BlogPosting"!==t||(e.mainEntityOfPage&&"string"==typeof e.mainEntityOfPage?q("_jsonld[canonical]",e.mainEntityOfPage):e.mainEntityOfPage&&e.mainEntityOfPage["@id"]&&q("_jsonld[canonical]",e.mainEntityOfPage["@id"]),e.headline&&q("_jsonld[headline]",e.headline),e.description&&q("_jsonld[description]",e.description)),e.image){var i="";"string"==typeof e.image?i=e.image:e.image.url?i=e.image.url:Array.isArray(e.image)&&e.image[0]&&(i="string"==typeof e.image[0]?e.image[0]:e.image[0].url),i&&q("_jsonld[image]",i)}}}}(window,document,top.location.href,window.pt_url);
\ No newline at end of file
diff --git a/src/App.js b/src/App.js
index 32ba56f..9b9bb6a 100644
--- a/src/App.js
+++ b/src/App.js
@@ -221,12 +221,17 @@ export default function App() {
// Build suggested content from bookmarklet metadata.
// Extract description from meta tags.
const meta = messageData._meta || {};
- const description =
- messageData.s || // User selection takes priority.
- meta[ 'twitter:description' ] ||
- meta[ 'og:description' ] ||
- meta.description ||
- '';
+
+ // HTML selection takes highest priority (preserves formatting).
+ // Plain-text selection is a fallback, then meta descriptions.
+ const selectionHtml = messageData.sel_html || '';
+ const description = selectionHtml
+ ? '' // HTML selection is used directly; skip plain-text fallback.
+ : messageData.s || // Plain-text user selection.
+ meta[ 'twitter:description' ] ||
+ meta[ 'og:description' ] ||
+ meta.description ||
+ '';
const title =
messageData.t ||
@@ -243,6 +248,7 @@ export default function App() {
const suggestedContent = buildSuggestedContentFromMetadata( {
title,
description,
+ selectionHtml,
siteName: meta[ 'og:site_name' ] || '',
canonical,
url: receivedSourceUrl,
diff --git a/src/components/BlockTransformShortcuts.js b/src/components/BlockTransformShortcuts.js
index a61a39b..3f3cf20 100644
--- a/src/components/BlockTransformShortcuts.js
+++ b/src/components/BlockTransformShortcuts.js
@@ -137,7 +137,11 @@ export default function BlockTransformShortcuts() {
if ( innerBlocks.length > 0 ) {
// Replace the quote with its inner blocks directly.
const replacementBlocks = innerBlocks.map( ( inner ) =>
- createBlock( inner.name, { ...inner.attributes }, inner.innerBlocks )
+ createBlock(
+ inner.name,
+ { ...inner.attributes },
+ inner.innerBlocks
+ )
);
replaceBlocks( currentClientId, replacementBlocks );
} else {
diff --git a/src/utils/html-parser.js b/src/utils/html-parser.js
index 2b1f82d..2f3033a 100644
--- a/src/utils/html-parser.js
+++ b/src/utils/html-parser.js
@@ -447,6 +447,234 @@ function getCanonical( doc, meta ) {
return linkCanonical?.getAttribute( 'href' ) || meta[ 'og:url' ] || '';
}
+/**
+ * Sanitize an element's descendants in place by removing unsafe markup.
+ *
+ * Removes script, style, object, embed and iframe descendants, strips every
+ * attribute whose name starts with "on", and drops href/src values that use
+ * a javascript: URL. All other elements and attributes are left untouched.
+ *
+ * @param {Element} element DOM element to sanitize (in-place).
+ */
+function sanitizeInlineContent( element ) {
+ // Remove script, style, object, embed and iframe elements.
+ const dangerous = element.querySelectorAll(
+ 'script, style, object, embed, iframe'
+ );
+ dangerous.forEach( ( el ) => el.remove() );
+
+ // Strip event handlers and javascript: URLs from all descendant elements.
+ const allEls = element.querySelectorAll( '*' );
+ allEls.forEach( ( el ) => {
+ // Remove every attribute whose name starts with "on" (event handlers).
+ Array.from( el.attributes ).forEach( ( attr ) => {
+ if ( attr.name.startsWith( 'on' ) ) {
+ el.removeAttribute( attr.name );
+ }
+ } );
+
+ // Strip javascript: URLs from href and src (no other scheme is checked).
+ const href = el.getAttribute( 'href' );
+ if ( href && /^\s*javascript:/i.test( href ) ) {
+ el.removeAttribute( 'href' );
+ }
+ const src = el.getAttribute( 'src' );
+ if ( src && /^\s*javascript:/i.test( src ) ) {
+ el.removeAttribute( 'src' );
+ }
+ } );
+}
+
+/**
+ * Convert a list element (ul/ol) to Gutenberg list block markup.
+ *
+ * Every direct <li> child becomes a wp:list-item block. Lists that are direct
+ * children of an <li> are converted recursively and emitted inside that item,
+ * after the item's own sanitized inline content. Child nodes of the list that
+ * are not <li> elements are ignored.
+ *
+ * @param {Element} listEl  The list element (ul or ol).
+ * @param {boolean} ordered Whether this is an ordered list.
+ * @return {string} Gutenberg list block markup.
+ */
+function listElementToBlock( listEl, ordered ) {
+	const tag = ordered ? 'ol' : 'ul';
+	// Serialized block attributes; only ordered lists carry one.
+	const attr = ordered ? ' {"ordered":true}' : '';
+	let items = '';
+
+	listEl.childNodes.forEach( ( child ) => {
+		if (
+			child.nodeType !== 1 /* ELEMENT_NODE */ ||
+			child.tagName.toLowerCase() !== 'li'
+		) {
+			return;
+		}
+
+		// Clone to work with it non-destructively.
+		const li = child.cloneNode( true );
+
+		// Handle only direct-child nested lists within the li to avoid double-processing.
+		let nestedBlocks = '';
+		Array.from( li.children ).forEach( ( childEl ) => {
+			const childTag = childEl.tagName.toLowerCase();
+			if ( childTag === 'ul' || childTag === 'ol' ) {
+				const isOrdered = childTag === 'ol';
+				nestedBlocks += '\n' + listElementToBlock( childEl, isOrdered );
+				// Detach the nested list so it is not repeated in the item's innerHTML.
+				childEl.remove();
+			}
+		} );
+
+		sanitizeInlineContent( li );
+		const liContent = li.innerHTML.trim();
+
+		items += `<!-- wp:list-item -->\n<li>${ liContent }${ nestedBlocks }</li>\n<!-- /wp:list-item -->\n`;
+	} );
+
+	return `<!-- wp:list${ attr } -->\n<${ tag } class="wp-block-list">\n${ items }</${ tag }>\n<!-- /wp:list -->\n\n`;
+}
+
+/**
+ * Convert an HTML string to Gutenberg block markup.
+ *
+ * Only the top-level nodes of the parsed HTML are inspected:
+ * - h1-h6 become heading blocks, ul/ol become list blocks, blockquote becomes
+ *   a quote block (its contents are converted recursively) and pre becomes a
+ *   code block built from its escaped text content.
+ * - p, div and figure become paragraph blocks holding their sanitized inner
+ *   HTML.
+ * - Text and any other (inline) elements are buffered and emitted together as
+ *   one paragraph block when the next block-level element, or the end of the
+ *   input, is reached.
+ * Top-level script/style elements are skipped; nested markup is cleaned with
+ * sanitizeInlineContent().
+ *
+ * @param {string} html HTML string to convert.
+ * @return {string} Gutenberg block markup string, or empty string if no content.
+ */
+export function htmlToBlocks( html ) {
+	if ( ! html || typeof html !== 'string' ) {
+		return '';
+	}
+
+	const parser = new DOMParser();
+	// Wrap in an explicit <body> so every parsed node ends up in doc.body.
+	const doc = parser.parseFromString(
+		`<body>${ html }</body>`,
+		'text/html'
+	);
+	const body = doc.body;
+
+	let blocks = '';
+	let inlineBuffer = '';
+
+	/**
+	 * Flush any accumulated inline/text content as a paragraph block.
+	 */
+	function flushInlineBuffer() {
+		const trimmed = inlineBuffer.trim();
+		if ( trimmed ) {
+			blocks += `<!-- wp:paragraph -->\n<p>${ trimmed }</p>\n<!-- /wp:paragraph -->\n\n`;
+		}
+		inlineBuffer = '';
+	}
+
+	/**
+	 * Set of block-level tag names that start a new block.
+	 */
+	const BLOCK_TAGS = new Set( [
+		'p',
+		'ul',
+		'ol',
+		'h1',
+		'h2',
+		'h3',
+		'h4',
+		'h5',
+		'h6',
+		'blockquote',
+		'pre',
+		'figure',
+		'div',
+	] );
+
+	body.childNodes.forEach( ( node ) => {
+		if ( node.nodeType === 3 /* TEXT_NODE */ ) {
+			const text = node.textContent;
+			// Keep whitespace-only text once inline content is pending so that
+			// adjacent inline siblings ("<b>a</b> <i>b</i>") are not glued
+			// together. Leading/trailing whitespace is trimmed on flush.
+			if ( text.trim() || inlineBuffer ) {
+				inlineBuffer += escapeHtml( text );
+			}
+			return;
+		}
+
+		if ( node.nodeType !== 1 /* ELEMENT_NODE */ ) {
+			return;
+		}
+
+		const tag = node.tagName.toLowerCase();
+
+		// Skip dangerous elements.
+		if ( tag === 'script' || tag === 'style' ) {
+			return;
+		}
+
+		if ( ! BLOCK_TAGS.has( tag ) ) {
+			// Inline element – add to buffer.
+			const clone = node.cloneNode( true );
+			const tempEl = doc.createElement( 'div' );
+			tempEl.appendChild( clone );
+			sanitizeInlineContent( tempEl );
+			inlineBuffer += tempEl.innerHTML;
+			return;
+		}
+
+		// We're about to emit a block – flush any pending inline content first.
+		flushInlineBuffer();
+
+		// Headings.
+		if ( /^h[1-6]$/.test( tag ) ) {
+			// Second character of the tag name is the heading level digit.
+			const level = tag[ 1 ];
+			const clone = node.cloneNode( true );
+			const tempEl = doc.createElement( 'div' );
+			tempEl.appendChild( clone );
+			sanitizeInlineContent( tempEl );
+			blocks += `<!-- wp:heading {"level":${ level }} -->\n<${ tag } class="wp-block-heading">${ tempEl.firstChild.innerHTML }</${ tag }>\n<!-- /wp:heading -->\n\n`;
+			return;
+		}
+
+		// Lists.
+		if ( tag === 'ul' || tag === 'ol' ) {
+			blocks += listElementToBlock( node, tag === 'ol' );
+			return;
+		}
+
+		// Blockquotes.
+		if ( tag === 'blockquote' ) {
+			const clone = node.cloneNode( true );
+			const tempEl = doc.createElement( 'div' );
+			tempEl.appendChild( clone );
+			sanitizeInlineContent( tempEl );
+			// Quote contents are themselves blocks, so convert them recursively.
+			const inner = htmlToBlocks( tempEl.firstChild.innerHTML );
+			const innerBlocks =
+				inner ||
+				`<!-- wp:paragraph -->\n<p>${ escapeHtml(
+					node.textContent.trim()
+				) }</p>\n<!-- /wp:paragraph -->\n`;
+			blocks += `<!-- wp:quote -->\n<blockquote class="wp-block-quote">${ innerBlocks }</blockquote>\n<!-- /wp:quote -->\n\n`;
+			return;
+		}
+
+		// Preformatted / code blocks.
+		if ( tag === 'pre' ) {
+			// Prefer the nested <code> element's text when there is one.
+			const codeEl = node.querySelector( 'code' );
+			const codeContent = escapeHtml( ( codeEl || node ).textContent );
+			blocks += `<!-- wp:code -->\n<pre class="wp-block-code"><code>${ codeContent }</code></pre>\n<!-- /wp:code -->\n\n`;
+			return;
+		}
+
+		// Paragraphs and generic block elements (div, figure, etc.).
+		const clone = node.cloneNode( true );
+		const tempEl = doc.createElement( 'div' );
+		tempEl.appendChild( clone );
+		sanitizeInlineContent( tempEl );
+		const innerHtml = tempEl.firstChild.innerHTML.trim();
+		if ( innerHtml ) {
+			blocks += `<!-- wp:paragraph -->\n<p>${ innerHtml }</p>\n<!-- /wp:paragraph -->\n\n`;
+		}
+	} );
+
+	// Flush any remaining inline content.
+	flushInlineBuffer();
+
+	return blocks.trim();
+}
+
/**
* Build suggested content from server-returned metadata.
*
@@ -478,9 +706,15 @@ ${ escapeHtml( sourceUrl ) }
`;
}
- // Add quote block with description if available.
- // Escape description.
- if ( data.description ) {
+ // Add HTML selection as formatted blocks when available.
+ // Falls back to a plain-text quote block for meta descriptions.
+ if ( data.selectionHtml ) {
+ const selectionBlocks = htmlToBlocks( data.selectionHtml );
+ if ( selectionBlocks ) {
+ content += selectionBlocks + '\n\n';
+ }
+ } else if ( data.description ) {
+ // Escape description.
content += `
${ escapeHtml( data.description ) }
diff --git a/src/utils/index.js b/src/utils/index.js
index 4843855..e2650ac 100644
--- a/src/utils/index.js
+++ b/src/utils/index.js
@@ -8,6 +8,7 @@ export {
parseHtmlMetadata,
buildSuggestedContent,
buildSuggestedContentFromMetadata,
+ htmlToBlocks,
escapeHtml,
escapeAttr,
} from './html-parser';
diff --git a/tests/bookmarklet/bookmarklet.test.js b/tests/bookmarklet/bookmarklet.test.js
index 2e0263e..c1353eb 100644
--- a/tests/bookmarklet/bookmarklet.test.js
+++ b/tests/bookmarklet/bookmarklet.test.js
@@ -207,6 +207,30 @@ describe( 'Bookmarklet Functionality', () => {
).toBe( true );
} );
+	test( 'HTML selection capture preserves formatting', () => {
+		// The range of the current selection is read via getRangeAt().
+		expect( bookmarkletSource ).toContain( 'getRangeAt' );
+
+		// cloneContents() extracts the selected DOM fragment.
+		expect( bookmarkletSource ).toContain( 'cloneContents' );
+
+		// innerHTML serialises the selection as HTML.
+		expect( bookmarkletSource ).toContain( 'innerHTML' );
+
+		// sel_html is sent alongside the plain-text selection. A bare substring
+		// match already covers both the single- and double-quoted spellings.
+		expect( bookmarkletSource ).toContain( 'sel_html' );
+
+		// sel_html is added via the add() helper (with or without inner spacing).
+		expect( bookmarkletSource ).toMatch( /add\(\s*'sel_html'/ );
+	} );
+
test( 'Enhanced data extraction - Open Graph video', () => {
// Check for og:video detection.
expect( bookmarkletSource ).toContain( 'og:video' );
diff --git a/tests/utils/html-parser.test.js b/tests/utils/html-parser.test.js
new file mode 100644
index 0000000..25434ee
--- /dev/null
+++ b/tests/utils/html-parser.test.js
@@ -0,0 +1,224 @@
+/**
+ * Tests for HTML parser utilities, especially htmlToBlocks.
+ *
+ * @package press-this
+ */
+
+import { htmlToBlocks, buildSuggestedContent } from '../../src/utils/html-parser';
+
+describe( 'htmlToBlocks', () => {
+ test( 'returns empty string for empty input', () => {
+ expect( htmlToBlocks( '' ) ).toBe( '' );
+ expect( htmlToBlocks( null ) ).toBe( '' );
+ expect( htmlToBlocks( undefined ) ).toBe( '' );
+ } );
+
+ test( 'converts a paragraph to a paragraph block', () => {
+ const result = htmlToBlocks( 'Hello world
' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( 'Hello world
' );
+ expect( result ).toContain( '' );
+ } );
+
+ test( 'converts plain text to a paragraph block', () => {
+ const result = htmlToBlocks( 'Plain text content' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( 'Plain text content' );
+ } );
+
+ test( 'converts h1-h6 headings to heading blocks', () => {
+ [ 1, 2, 3, 4, 5, 6 ].forEach( ( level ) => {
+ const result = htmlToBlocks( `Heading ${ level }` );
+ expect( result ).toContain( `` );
+ expect( result ).toContain( `` );
+ expect( result ).toContain( `Heading ${ level }` );
+ expect( result ).toContain( '' );
+ } );
+ } );
+
+ test( 'converts unordered list to list block', () => {
+ const result = htmlToBlocks( '' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( '- Item 1
' );
+ expect( result ).toContain( '- Item 2
' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( '' );
+ // Should NOT use ordered list attribute.
+ expect( result ).not.toContain( '"ordered":true' );
+ } );
+
+ test( 'converts ordered list to list block with ordered attribute', () => {
+ const result = htmlToBlocks( '- First
- Second
' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( '- First
' );
+ expect( result ).toContain( '' );
+ } );
+
+ test( 'converts nested lists correctly', () => {
+ const html = '';
+ const result = htmlToBlocks( html );
+ // Outer list.
+ expect( result ).toContain( '' );
+ // Nested list should also be wrapped.
+ const listCount = ( result.match( //g ) || [] ).length;
+ expect( listCount ).toBeGreaterThanOrEqual( 2 );
+ expect( result ).toContain( 'Child item' );
+ expect( result ).toContain( 'Parent' );
+ } );
+
+ test( 'preserves inline formatting in list items', () => {
+ const html = '';
+ const result = htmlToBlocks( html );
+ expect( result ).toContain( 'Bold item' );
+ expect( result ).toContain( 'Italic item' );
+ } );
+
+ test( 'converts blockquote to quote block', () => {
+ const result = htmlToBlocks( 'Quote text
' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( 'Quote text' );
+ expect( result ).toContain( '' );
+ } );
+
+ test( 'converts pre/code to code block', () => {
+ const result = htmlToBlocks( 'const x = 1;
' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( 'const x = 1;' );
+ expect( result ).toContain( '' );
+ } );
+
+ test( 'handles mix of headings and lists (issue example)', () => {
+ const html = `In Unreal Engine: How to Choose
+
+ - Unreal Engine allows developers to switch:
+
+ - Deferred Rendering: Default for most projects.
+ - Forward Rendering: For VR projects.
+
+
+
+Each technique aligns with different requirements.
`;
+
+ const result = htmlToBlocks( html );
+
+ // Heading should be converted.
+ expect( result ).toContain( '' );
+ expect( result ).toContain( 'In Unreal Engine: How to Choose' );
+
+ // List should be converted.
+ expect( result ).toContain( '' );
+ expect( result ).toContain( 'Deferred Rendering' );
+ expect( result ).toContain( 'Forward Rendering' );
+
+ // Paragraph should be converted.
+ expect( result ).toContain( '' );
+ expect( result ).toContain( 'Each technique aligns' );
+ } );
+
+ test( 'strips script elements for safety', () => {
+ const html = 'Safe text
More text
';
+ const result = htmlToBlocks( html );
+ expect( result ).toContain( 'Safe text' );
+ expect( result ).not.toContain( 'alert' );
+ expect( result ).not.toContain( '">Link
';
+ const result = htmlToBlocks( html );
+ expect( result ).toContain( 'Link' );
+ expect( result ).not.toContain( 'data:' );
+ } );
+
+ test( 'strips vbscript: href for safety', () => {
+ const html = 'Link
';
+ const result = htmlToBlocks( html );
+ expect( result ).toContain( 'Link' );
+ expect( result ).not.toContain( 'vbscript:' );
+ } );
+
+ test( 'unwraps non-allowlist elements but keeps their text', () => {
+ //