diff --git a/assets/bookmarklet.js b/assets/bookmarklet.js
index 234cda1..145e91a 100644
--- a/assets/bookmarklet.js
+++ b/assets/bookmarklet.js
@@ -15,7 +15,7 @@
encURI = window.encodeURIComponent,
head = document.getElementsByTagName( 'head' )[0],
target = '_press_this_app',
- windowWidth, windowHeight, selection,
+ windowWidth, windowHeight, selection, selectionHtml,
metas, links, content, images, iframes, img, scripts,
scrapedData = {},
popup;
@@ -32,7 +32,22 @@
}
if ( window.getSelection ) {
- selection = window.getSelection() + '';
+ var sel = window.getSelection();
+ if ( sel && sel.rangeCount > 0 ) {
+ selection = sel.toString();
+ // Capture HTML to preserve formatting (bold, lists, headings, etc.).
+ // Wrapped in try-catch: cloneContents() can throw DOMException in
+ // some browsers or unusual DOM states (e.g. cross-shadow-DOM ranges).
+ try {
+ var range = sel.getRangeAt( 0 );
+ var fragment = range.cloneContents();
+ var tempDiv = document.createElement( 'div' );
+ tempDiv.appendChild( fragment );
+ selectionHtml = tempDiv.innerHTML;
+ } catch ( e ) {
+ // HTML capture failed; plain-text selection is still available.
+ }
+ }
} else if ( document.getSelection ) {
selection = document.getSelection() + '';
} else if ( document.selection ) {
@@ -299,6 +314,11 @@
add( 's', selection );
}
+ // Add HTML selection to preserve formatting (bold, lists, headings, etc.).
+ if ( selectionHtml ) {
+ add( 'sel_html', selectionHtml );
+ }
+
/**
* Send scraped data to the Press This popup via postMessage.
* Uses polling to wait for the popup to be ready.
diff --git a/assets/bookmarklet.min.js b/assets/bookmarklet.min.js
index 00bb55c..515f14e 100644
--- a/assets/bookmarklet.min.js
+++ b/assets/bookmarklet.min.js
@@ -1 +1 @@
-!function(e,t,i,a){var n,o,r,l,c,s,g,m,d,f,h,u=e.encodeURIComponent,p=t.getElementsByTagName("head")[0],y={};if(a)if(i.match(/^https?:/)){a+="&u="+u(i),e.getSelection?r=e.getSelection()+"":t.getSelection?r=t.getSelection()+"":t.selection&&(r=t.selection.createRange().text||""),a+="&buster="+(new Date).getTime(),a+="&pm=1",n=(n=e.outerWidth||t.documentElement.clientWidth||600)<800||n>5e3?600:.7*n,o=(o=e.outerHeight||t.documentElement.clientHeight||700)<800||o>3e3?700:.9*o,T("pt_version",11),l=p.getElementsByTagName("meta")||[];for(var v=0;v200);v++){var b=l[v],O=b.getAttribute("name"),_=b.getAttribute("property"),x=b.getAttribute("content");x&&(O?T("_meta["+O+"]",x):_&&(T("_meta["+_+"]",x),"og:video"!==_&&"og:video:url"!==_&&"og:video:secure_url"!==_||T("_og_video[]",x)))}c=p.getElementsByTagName("link")||[];for(var E=0;E=50);E++){var A=c[E],w=A.getAttribute("rel");"canonical"!==w&&"icon"!==w&&"shortlink"!==w||T("_links["+w+"]",A.getAttribute("href")),"alternate"===w&&"x-default"===A.getAttribute("hreflang")&&T("_links[alternate_canonical]",A.getAttribute("href"))}!function(){f=t.querySelectorAll('script[type="application/ld+json"]');for(var e=0;e=100);N++)(d=g[N]).src.indexOf("avatar")>-1||d.className.indexOf("avatar")>-1||d.width&&d.width<256||d.height&&d.height<128||d.src&&0!==d.src.indexOf("data:")&&T("_images[]",d.src);m=t.body.getElementsByTagName("iframe")||[];for(var j=0;j=50);j++){var B=m[j].src;B&&"about:blank"!==B&&(B.indexOf("jetpack-comment")>-1||B.indexOf("disqus.com")>-1||B.indexOf("facebook.com/plugins")>-1||B.indexOf("platform.twitter.com/widgets")>-1||B.indexOf("google.com/recaptcha")>-1||B.indexOf("googletagmanager.com")>-1||B.indexOf("doubleclick.net")>-1||B.indexOf("googlesyndication.com")>-1||B.indexOf("amazon-adsystem.com")>-1||B.indexOf("quantserve.com")>-1||B.indexOf("scorecardresearch.com")>-1||B.indexOf("addthis.com")>-1||B.indexOf("sharethis.com")>-1||B.indexOf("addtoany.com")>-1||T("_embeds[]",B))}var 
k,P;t.title&&T("t",t.title),r&&T("s",r),h=e.open(a,"_press_this_app","location,resizable,scrollbars,width="+n+",height="+o),k=0,P=a.match(/^https?:\/\/[^\/]+/)[0],setTimeout(function e(){if(k++,h&&!h.closed){try{h.postMessage({type:"press-this-data",version:11,data:y},P)}catch(e){}k<50&&setTimeout(e,100)}},200)}else top.location.href=a;function T(e,t){if(null!=t&&""!==t){var i=e.match(/^(.+)\[\]$/);if(i){var a=i[1];return y[a]||(y[a]=[]),void y[a].push(t)}var n=e.match(/^(.+)\[(.+)\]$/);if(n){var o=n[1],r=n[2];return y[o]||(y[o]={}),void(y[o][r]=t)}y[e]=t}}function S(e){if(e&&"object"==typeof e){var t=e["@type"];if("VideoObject"===t&&(e.embedUrl&&T("_embeds[]",e.embedUrl),e.contentUrl&&!e.embedUrl&&T("_embeds[]",e.contentUrl)),"Article"!==t&&"WebPage"!==t&&"NewsArticle"!==t&&"BlogPosting"!==t||(e.mainEntityOfPage&&"string"==typeof e.mainEntityOfPage?T("_jsonld[canonical]",e.mainEntityOfPage):e.mainEntityOfPage&&e.mainEntityOfPage["@id"]&&T("_jsonld[canonical]",e.mainEntityOfPage["@id"]),e.headline&&T("_jsonld[headline]",e.headline),e.description&&T("_jsonld[description]",e.description)),e.image){var i="";"string"==typeof e.image?i=e.image:e.image.url?i=e.image.url:Array.isArray(e.image)&&e.image[0]&&(i="string"==typeof e.image[0]?e.image[0]:e.image[0].url),i&&T("_jsonld[image]",i)}}}}(window,document,top.location.href,window.pt_url);
\ No newline at end of file
+!function(e,t,i,a){var n,o,r,l,c,s,g,d,m,f,h,p,u=e.encodeURIComponent,y=t.getElementsByTagName("head")[0],v={};if(a)if(i.match(/^https?:/)){if(a+="&u="+u(i),e.getSelection){var b=e.getSelection();if(b&&b.rangeCount>0){r=b.toString();try{var O=b.getRangeAt(0).cloneContents(),_=t.createElement("div");_.appendChild(O),l=_.innerHTML}catch(e){}}}else t.getSelection?r=t.getSelection()+"":t.selection&&(r=t.selection.createRange().text||"");a+="&buster="+(new Date).getTime(),a+="&pm=1",n=(n=e.outerWidth||t.documentElement.clientWidth||600)<800||n>5e3?600:.7*n,o=(o=e.outerHeight||t.documentElement.clientHeight||700)<800||o>3e3?700:.9*o,q("pt_version",11),c=y.getElementsByTagName("meta")||[];for(var x=0;x200);x++){var E=c[x],A=E.getAttribute("name"),w=E.getAttribute("property"),N=E.getAttribute("content");N&&(A?q("_meta["+A+"]",N):w&&(q("_meta["+w+"]",N),"og:video"!==w&&"og:video:url"!==w&&"og:video:secure_url"!==w||q("_og_video[]",N)))}s=y.getElementsByTagName("link")||[];for(var j=0;j=50);j++){var B=s[j],T=B.getAttribute("rel");"canonical"!==T&&"icon"!==T&&"shortlink"!==T||q("_links["+T+"]",B.getAttribute("href")),"alternate"===T&&"x-default"===B.getAttribute("hreflang")&&q("_links[alternate_canonical]",B.getAttribute("href"))}!function(){h=t.querySelectorAll('script[type="application/ld+json"]');for(var e=0;e=100);k++)(f=d[k]).src.indexOf("avatar")>-1||f.className.indexOf("avatar")>-1||f.width&&f.width<256||f.height&&f.height<128||f.src&&0!==f.src.indexOf("data:")&&q("_images[]",f.src);m=t.body.getElementsByTagName("iframe")||[];for(var P=0;P=50);P++){var 
C=m[P].src;C&&"about:blank"!==C&&(C.indexOf("jetpack-comment")>-1||C.indexOf("disqus.com")>-1||C.indexOf("facebook.com/plugins")>-1||C.indexOf("platform.twitter.com/widgets")>-1||C.indexOf("google.com/recaptcha")>-1||C.indexOf("googletagmanager.com")>-1||C.indexOf("doubleclick.net")>-1||C.indexOf("googlesyndication.com")>-1||C.indexOf("amazon-adsystem.com")>-1||C.indexOf("quantserve.com")>-1||C.indexOf("scorecardresearch.com")>-1||C.indexOf("addthis.com")>-1||C.indexOf("sharethis.com")>-1||C.indexOf("addtoany.com")>-1||q("_embeds[]",C))}var S,U;t.title&&q("t",t.title),r&&q("s",r),l&&q("sel_html",l),p=e.open(a,"_press_this_app","location,resizable,scrollbars,width="+n+",height="+o),S=0,U=a.match(/^https?:\/\/[^\/]+/)[0],setTimeout(function e(){if(S++,p&&!p.closed){try{p.postMessage({type:"press-this-data",version:11,data:v},U)}catch(e){}S<50&&setTimeout(e,100)}},200)}else top.location.href=a;function q(e,t){if(null!=t&&""!==t){var i=e.match(/^(.+)\[\]$/);if(i){var a=i[1];return v[a]||(v[a]=[]),void v[a].push(t)}var n=e.match(/^(.+)\[(.+)\]$/);if(n){var o=n[1],r=n[2];return v[o]||(v[o]={}),void(v[o][r]=t)}v[e]=t}}function H(e){if(e&&"object"==typeof e){var t=e["@type"];if("VideoObject"===t&&(e.embedUrl&&q("_embeds[]",e.embedUrl),e.contentUrl&&!e.embedUrl&&q("_embeds[]",e.contentUrl)),"Article"!==t&&"WebPage"!==t&&"NewsArticle"!==t&&"BlogPosting"!==t||(e.mainEntityOfPage&&"string"==typeof e.mainEntityOfPage?q("_jsonld[canonical]",e.mainEntityOfPage):e.mainEntityOfPage&&e.mainEntityOfPage["@id"]&&q("_jsonld[canonical]",e.mainEntityOfPage["@id"]),e.headline&&q("_jsonld[headline]",e.headline),e.description&&q("_jsonld[description]",e.description)),e.image){var i="";"string"==typeof e.image?i=e.image:e.image.url?i=e.image.url:Array.isArray(e.image)&&e.image[0]&&(i="string"==typeof e.image[0]?e.image[0]:e.image[0].url),i&&q("_jsonld[image]",i)}}}}(window,document,top.location.href,window.pt_url);
\ No newline at end of file
diff --git a/src/App.js b/src/App.js
index 32ba56f..85f4a2d 100644
--- a/src/App.js
+++ b/src/App.js
@@ -221,8 +221,13 @@ export default function App() {
// Build suggested content from bookmarklet metadata.
// Extract description from meta tags.
const meta = messageData._meta || {};
+
+ // HTML selection takes highest priority (preserves formatting).
+ // Always compute plain-text description as a fallback; buildSuggestedContent
+ // will use it if htmlToBlocks() produces no blocks from selectionHtml.
+ const selectionHtml = messageData.sel_html || '';
const description =
- messageData.s || // User selection takes priority.
+ messageData.s || // Plain-text user selection.
meta[ 'twitter:description' ] ||
meta[ 'og:description' ] ||
meta.description ||
@@ -243,6 +248,7 @@ export default function App() {
const suggestedContent = buildSuggestedContentFromMetadata( {
title,
description,
+ selectionHtml,
siteName: meta[ 'og:site_name' ] || '',
canonical,
url: receivedSourceUrl,
diff --git a/src/components/BlockTransformShortcuts.js b/src/components/BlockTransformShortcuts.js
index a61a39b..3f3cf20 100644
--- a/src/components/BlockTransformShortcuts.js
+++ b/src/components/BlockTransformShortcuts.js
@@ -137,7 +137,11 @@ export default function BlockTransformShortcuts() {
if ( innerBlocks.length > 0 ) {
// Replace the quote with its inner blocks directly.
const replacementBlocks = innerBlocks.map( ( inner ) =>
- createBlock( inner.name, { ...inner.attributes }, inner.innerBlocks )
+ createBlock(
+ inner.name,
+ { ...inner.attributes },
+ inner.innerBlocks
+ )
);
replaceBlocks( currentClientId, replacementBlocks );
} else {
diff --git a/src/utils/html-parser.js b/src/utils/html-parser.js
index 2b1f82d..d4b83ed 100644
--- a/src/utils/html-parser.js
+++ b/src/utils/html-parser.js
@@ -447,14 +447,290 @@ function getCanonical( doc, meta ) {
return linkCanonical?.getAttribute( 'href' ) || meta[ 'og:url' ] || '';
}
+/**
+ * Tag names that survive inline sanitization; any other element is unwrapped.
+ */
+const INLINE_TAG_ALLOWLIST = new Set( [
+	'strong',
+	'em',
+	'b',
+	'i',
+	'u',
+	's',
+	'a',
+	'code',
+	'mark',
+	'sub',
+	'sup',
+	'span',
+	'br',
+] );
+
+/**
+ * URL protocols (in the normalized form reported by the URL parser) that must
+ * never be kept in an `href`.
+ */
+const BLOCKED_HREF_PROTOCOLS = new Set( [
+	'javascript:',
+	'data:',
+	'vbscript:',
+] );
+
+/**
+ * Decide whether an `href` value has to be dropped.
+ *
+ * The value is classified with the URL parser rather than a regular
+ * expression, because browsers ignore tabs/newlines inside a URL and leading
+ * control characters: `java\tscript:alert(1)` is executed as `javascript:`
+ * even though it does not literally start with that scheme.
+ *
+ * @param {string} value Raw attribute value.
+ * @return {boolean} True when the link uses a blocked scheme or cannot be parsed.
+ */
+function isBlockedHref( value ) {
+	try {
+		// The base only lets relative links ("/path", "#hash") parse; it is
+		// never written back to the attribute.
+		const { protocol } = new URL( value, 'https://base.invalid/' );
+		return BLOCKED_HREF_PROTOCOLS.has( protocol );
+	} catch ( error ) {
+		// Fail closed: a link the parser rejects is not worth keeping.
+		return true;
+	}
+}
+
+/**
+ * Sanitize inline HTML, keeping only a safe allowlist of formatting elements.
+ *
+ * Allowlisted inline elements: strong, em, b, i, u, s, a (href only),
+ * code, mark, sub, sup, span, br.
+ * All other elements are unwrapped (their children are preserved).
+ * Dangerous elements (script, style, object, embed, iframe) are fully removed.
+ * All attributes are stripped except `href` on `<a>` elements.
+ * URI schemes `javascript:`, `data:`, and `vbscript:` are blocked in `href`,
+ * including spellings obfuscated with whitespace or control characters.
+ *
+ * @param {Element} element DOM element to sanitize (in-place).
+ */
+function sanitizeInlineContent( element ) {
+	// Completely remove dangerous elements (do not preserve children).
+	element
+		.querySelectorAll( 'script, style, object, embed, iframe' )
+		.forEach( ( el ) => el.remove() );
+
+	// Iterate over a static snapshot — we may mutate the DOM while iterating.
+	Array.from( element.querySelectorAll( '*' ) ).forEach( ( el ) => {
+		const tagName = el.tagName.toLowerCase();
+
+		if ( ! INLINE_TAG_ALLOWLIST.has( tagName ) ) {
+			// Unwrap: replace the element with its child nodes.
+			el.replaceWith( ...el.childNodes );
+			return;
+		}
+
+		// Strip every attribute except a safe `href` on links.
+		Array.from( el.attributes ).forEach( ( { name, value } ) => {
+			const isLinkTarget = tagName === 'a' && name === 'href';
+			if ( ! isLinkTarget || isBlockedHref( value ) ) {
+				el.removeAttribute( name );
+			}
+		} );
+	} );
+}
+
+/**
+ * Convert a list element (ul/ol) to Gutenberg list block markup.
+ *
+ * Every direct `<li>` child becomes a `wp:list-item` block. Lists nested
+ * directly inside an item are converted recursively and emitted inside that
+ * item's `<li>`, after its own inline content. Item content is passed through
+ * sanitizeInlineContent() before it is serialized.
+ *
+ * @param {Element} listEl  The list element (ul or ol).
+ * @param {boolean} ordered Whether this is an ordered list.
+ * @param {number}  depth   Current recursion depth (default 0; max 10).
+ * @return {string} Gutenberg list block markup, or an empty string once the
+ *                  nesting limit is exceeded.
+ */
+function listElementToBlock( listEl, ordered, depth = 0 ) {
+	if ( depth > 10 ) {
+		return '';
+	}
+
+	const tag = ordered ? 'ol' : 'ul';
+	// Serialized block attributes; unordered lists need none.
+	const attr = ordered ? ' {"ordered":true}' : '';
+	let items = '';
+
+	listEl.childNodes.forEach( ( child ) => {
+		if (
+			child.nodeType !== 1 /* ELEMENT_NODE */ ||
+			child.tagName.toLowerCase() !== 'li'
+		) {
+			return;
+		}
+
+		// Work on a clone so detaching nested lists leaves the input untouched.
+		const li = child.cloneNode( true );
+
+		// Only direct-child lists are handled here; deeper ones are picked up
+		// by the recursive call, which avoids processing them twice.
+		let nestedBlocks = '';
+		Array.from( li.children ).forEach( ( childEl ) => {
+			const childTag = childEl.tagName.toLowerCase();
+			if ( childTag === 'ul' || childTag === 'ol' ) {
+				const isOrdered = childTag === 'ol';
+				nestedBlocks +=
+					'\n' + listElementToBlock( childEl, isOrdered, depth + 1 );
+				childEl.remove();
+			}
+		} );
+
+		sanitizeInlineContent( li );
+		const liContent = li.innerHTML.trim();
+
+		items += `<!-- wp:list-item -->\n<li>${ liContent }${ nestedBlocks }</li>\n<!-- /wp:list-item -->\n`;
+	} );
+
+	return `<!-- wp:list${ attr } -->\n<${ tag } class="wp-block-list">\n${ items }</${ tag }>\n<!-- /wp:list -->\n\n`;
+}
+
+/**
+ * Convert an HTML string to Gutenberg block markup.
+ *
+ * Handles block-level elements: paragraphs, headings (h1-h6), unordered and
+ * ordered lists with nesting, blockquotes, and preformatted/code blocks.
+ * Inline elements are filtered through an allowlist (strong, em, b, i, u, s,
+ * a[href], code, mark, sub, sup, span, br). All other attributes are stripped.
+ *
+ * Runs of top-level text and inline elements are collected into a single
+ * paragraph block. Other block-level tags (div, figure) are flattened into a
+ * paragraph as well. Blockquote content is converted recursively so that
+ * paragraphs, lists and headings inside a quote stay separate blocks.
+ *
+ * @param {string} html  HTML string to convert.
+ * @param {number} depth Internal recursion depth; callers should omit this.
+ * @return {string} Gutenberg block markup string, or empty string if no content.
+ */
+export function htmlToBlocks( html, depth = 0 ) {
+	if ( ! html || typeof html !== 'string' ) {
+		return '';
+	}
+
+	// The explicit <body> keeps leading <script>/<style>/<meta> elements in
+	// the body instead of letting the parser hoist them into <head>.
+	const doc = new DOMParser().parseFromString(
+		`<body>${ html }</body>`,
+		'text/html'
+	);
+	const body = doc.body;
+
+	/**
+	 * Serialize already-safe inline HTML as a paragraph block.
+	 *
+	 * @param {string} innerHtml Escaped or sanitized paragraph content.
+	 * @return {string} Paragraph block markup.
+	 */
+	const paragraphBlock = ( innerHtml ) =>
+		`<!-- wp:paragraph -->\n<p>${ innerHtml }</p>\n<!-- /wp:paragraph -->\n\n`;
+
+	/**
+	 * Sanitize a copy of a node and return the resulting HTML.
+	 *
+	 * For a block-level node the outer tag is unwrapped by the sanitizer, so
+	 * the result is its sanitized inner content; an allowlisted inline node
+	 * keeps its own tag.
+	 *
+	 * @param {Node} node Node to sanitize (left untouched).
+	 * @return {string} Sanitized HTML.
+	 */
+	const sanitizedHtmlOf = ( node ) => {
+		const wrapper = doc.createElement( 'div' );
+		wrapper.appendChild( node.cloneNode( true ) );
+		sanitizeInlineContent( wrapper );
+		return wrapper.innerHTML;
+	};
+
+	// Guard against deeply nested blockquote structures: past the limit the
+	// remaining content is reduced to its text instead of recursing further.
+	if ( depth > 5 ) {
+		body.querySelectorAll( 'script, style' ).forEach( ( el ) =>
+			el.remove()
+		);
+		const text = body.textContent.trim();
+		return text ? paragraphBlock( escapeHtml( text ) ).trim() : '';
+	}
+
+	let blocks = '';
+	let inlineBuffer = '';
+
+	/**
+	 * Flush any accumulated inline/text content as a paragraph block.
+	 */
+	function flushInlineBuffer() {
+		const trimmed = inlineBuffer.trim();
+		if ( trimmed ) {
+			blocks += paragraphBlock( trimmed );
+		}
+		inlineBuffer = '';
+	}
+
+	/**
+	 * Set of block-level tag names that start a new block.
+	 */
+	const BLOCK_TAGS = new Set( [
+		'p',
+		'ul',
+		'ol',
+		'h1',
+		'h2',
+		'h3',
+		'h4',
+		'h5',
+		'h6',
+		'blockquote',
+		'pre',
+		'figure',
+		'div',
+	] );
+
+	body.childNodes.forEach( ( node ) => {
+		if ( node.nodeType === 3 /* TEXT_NODE */ ) {
+			// Whitespace-only nodes are kept too: they are what separates
+			// adjacent inline elements ("<b>a</b> <i>b</i>"). The buffer is
+			// trimmed on flush, so whitespace between blocks never yields an
+			// empty paragraph.
+			inlineBuffer += escapeHtml( node.textContent );
+			return;
+		}
+
+		if ( node.nodeType !== 1 /* ELEMENT_NODE */ ) {
+			return;
+		}
+
+		const tag = node.tagName.toLowerCase();
+
+		// Skip dangerous elements.
+		if ( tag === 'script' || tag === 'style' ) {
+			return;
+		}
+
+		if ( ! BLOCK_TAGS.has( tag ) ) {
+			// Inline element – add to buffer.
+			inlineBuffer += sanitizedHtmlOf( node );
+			return;
+		}
+
+		// We're about to emit a block – flush any pending inline content first.
+		flushInlineBuffer();
+
+		// Headings.
+		if ( /^h[1-6]$/.test( tag ) ) {
+			const level = tag[ 1 ];
+			const headingHtml = sanitizedHtmlOf( node ).trim();
+			if ( headingHtml ) {
+				blocks += `<!-- wp:heading {"level":${ level }} -->\n<${ tag } class="wp-block-heading">${ headingHtml }</${ tag }>\n<!-- /wp:heading -->\n\n`;
+			}
+			return;
+		}
+
+		// Lists.
+		if ( tag === 'ul' || tag === 'ol' ) {
+			blocks += listElementToBlock( node, tag === 'ol' );
+			return;
+		}
+
+		// Blockquotes.
+		if ( tag === 'blockquote' ) {
+			// Recurse on the raw inner HTML: the nested call sanitizes each
+			// child itself. Sanitizing first would unwrap every block-level
+			// child and merge the quote into one run of text.
+			const inner = htmlToBlocks( node.innerHTML, depth + 1 );
+			const innerBlocks =
+				inner ||
+				paragraphBlock( escapeHtml( node.textContent.trim() ) ).trim();
+			blocks += `<!-- wp:quote -->\n<blockquote class="wp-block-quote">${ innerBlocks }</blockquote>\n<!-- /wp:quote -->\n\n`;
+			return;
+		}
+
+		// Preformatted / code blocks.
+		if ( tag === 'pre' ) {
+			// Prefer the inner <code> element so its wrapper is not duplicated;
+			// only text is kept, any markup inside is dropped.
+			const codeEl = node.querySelector( 'code' );
+			const codeContent = escapeHtml( ( codeEl || node ).textContent );
+			blocks += `<!-- wp:code -->\n<pre class="wp-block-code"><code>${ codeContent }</code></pre>\n<!-- /wp:code -->\n\n`;
+			return;
+		}
+
+		// Paragraphs and generic block elements (div, figure, etc.).
+		const innerHtml = sanitizedHtmlOf( node ).trim();
+		if ( innerHtml ) {
+			blocks += paragraphBlock( innerHtml );
+		}
+	} );
+
+	// Flush any remaining inline content.
+	flushInlineBuffer();
+
+	return blocks.trim();
+}
+
/**
* Build suggested content from server-returned metadata.
*
* All dynamic content is escaped.
* Uses pre-sanitized server data instead of client-side parsing.
*
- * @param {Object} data Server-returned metadata object.
- * @param {string} sourceUrl Original source URL.
+ * When `data.selectionHtml` is provided the selected HTML is converted to
+ * formatted Gutenberg blocks via `htmlToBlocks`. If the conversion produces
+ * no blocks (e.g. the selection contained only script tags), the function
+ * falls back to `data.description` in a plain-text quote block.
+ *
+ * @param {Object} data Server-returned metadata object.
+ * @param {string} data.selectionHtml Optional HTML string of the user's selection.
+ * @param {string} data.description Optional plain-text description / meta excerpt.
+ * @param {string} data.title Optional page title.
+ * @param {string} data.siteName Optional site name.
+ * @param {string} sourceUrl Original source URL.
* @return {string} Gutenberg block content.
*/
export function buildSuggestedContent( data, sourceUrl ) {
@@ -478,9 +754,17 @@ ${ escapeHtml( sourceUrl ) }
`;
}
- // Add quote block with description if available.
- // Escape description.
- if ( data.description ) {
+ // Prefer formatted HTML selection; fall back to plain-text description.
+ let selectionBlocks = '';
+ if ( data.selectionHtml ) {
+ selectionBlocks = htmlToBlocks( data.selectionHtml );
+ }
+
+ if ( selectionBlocks ) {
+ // HTML selection converted successfully to blocks.
+ content += selectionBlocks + '\n\n';
+ } else if ( data.description ) {
+ // No HTML selection (or it produced no blocks) – use plain-text quote.
content += `
${ escapeHtml( data.description ) }
diff --git a/src/utils/index.js b/src/utils/index.js
index 4843855..e2650ac 100644
--- a/src/utils/index.js
+++ b/src/utils/index.js
@@ -8,6 +8,7 @@ export {
parseHtmlMetadata,
buildSuggestedContent,
buildSuggestedContentFromMetadata,
+ htmlToBlocks,
escapeHtml,
escapeAttr,
} from './html-parser';
diff --git a/tests/bookmarklet/bookmarklet.test.js b/tests/bookmarklet/bookmarklet.test.js
index 2e0263e..c1353eb 100644
--- a/tests/bookmarklet/bookmarklet.test.js
+++ b/tests/bookmarklet/bookmarklet.test.js
@@ -207,6 +207,30 @@ describe( 'Bookmarklet Functionality', () => {
).toBe( true );
} );
+	test( 'HTML selection capture preserves formatting', () => {
+		// The selection range is read and its contents cloned, so the markup
+		// (not just the text) of the selection is available.
+		expect( bookmarkletSource ).toContain( 'getRangeAt' );
+		expect( bookmarkletSource ).toContain( 'cloneContents' );
+
+		// The cloned fragment is serialised as HTML via innerHTML.
+		expect( bookmarkletSource ).toContain( 'innerHTML' );
+
+		// sel_html is sent alongside the plain-text selection through the
+		// add() helper. One pattern covers any spacing inside the call and
+		// also proves the key itself is present.
+		expect( bookmarkletSource ).toMatch( /add\(\s*'sel_html'/ );
+	} );
+
test( 'Enhanced data extraction - Open Graph video', () => {
// Check for og:video detection.
expect( bookmarkletSource ).toContain( 'og:video' );
diff --git a/tests/e2e/html-selection.spec.js b/tests/e2e/html-selection.spec.js
new file mode 100644
index 0000000..0978396
--- /dev/null
+++ b/tests/e2e/html-selection.spec.js
@@ -0,0 +1,381 @@
+/**
+ * HTML Selection Preservation E2E Tests
+ *
+ * Verifies that HTML-formatted text selections sent via postMessage
+ * are correctly converted to Gutenberg blocks in the editor.
+ *
+ * These tests bypass the bookmarklet and send postMessage directly,
+ * testing the real code path: App.js handler → htmlToBlocks() → block rendering.
+ */
+const { test, expect } = require( './utils/auth' );
+
+/**
+ * Send a postMessage to the Press This editor with test data.
+ *
+ * @param {import('@playwright/test').Page} page Playwright page.
+ * @param {Object} overrides Fields to override in the message data.
+ */
+async function sendPostMessage( page, overrides = {} ) {
+ const defaults = {
+ t: 'Test Page Title',
+ s: '',
+ sel_html: '',
+ u: 'https://example.com/test',
+ _meta: {},
+ _links: {},
+ _images: [],
+ _embeds: [],
+ };
+ await page.evaluate( ( data ) => {
+ window.postMessage(
+ {
+ type: 'press-this-data',
+ version: '1.0.0',
+ data,
+ },
+ '*'
+ );
+ }, { ...defaults, ...overrides } );
+}
+
+/**
+ * Wait for Gutenberg blocks to appear in the editor content area.
+ *
+ * @param {import('@playwright/test').Page} page Playwright page.
+ */
+async function waitForBlocks( page ) {
+ await page
+ .locator( '.press-this-editor__content [data-type]' )
+ .first()
+ .waitFor( { timeout: 10000 } );
+}
+
+/**
+ * Navigate to Press This in postMessage mode and wait for the editor.
+ *
+ * @param {import('@playwright/test').Page} page Playwright page.
+ */
+async function loadEditor( page ) {
+ await page.goto( '/wp-admin/press-this.php?pm=1' );
+ await page
+ .locator( '.press-this-editor__content' )
+ .waitFor( { timeout: 10000 } );
+}
+
+const editorContent = '.press-this-editor__content';
+
+test.describe( 'HTML Selection Preservation', () => {
+ test.describe( 'Core Formatting', () => {
+ test( 'bold and italic inline formatting', async ( {
+ loggedInPage: page,
+ } ) => {
+ await loadEditor( page );
+ await sendPostMessage( page, {
+ sel_html:
+ 'This has bold and italic text.
',
+ } );
+ await waitForBlocks( page );
+
+ const paragraph = page.locator(
+ `${ editorContent } [data-type="core/paragraph"]`
+ );
+ await expect( paragraph.first() ).toContainText( 'bold' );
+ await expect( paragraph.first() ).toContainText( 'italic' );
+
+ // Verify the formatting tags are preserved in the rendered HTML.
+ const html = await paragraph.first().innerHTML();
+ expect( html ).toContain( '' );
+ expect( html ).toContain( '' );
+ } );
+
+ test( 'headings at correct levels', async ( {
+ loggedInPage: page,
+ } ) => {
+ await loadEditor( page );
+ await sendPostMessage( page, {
+ sel_html: 'Heading Two
Heading Three
',
+ } );
+ await waitForBlocks( page );
+
+ const headings = page.locator(
+ `${ editorContent } [data-type="core/heading"]`
+ );
+ await expect( headings ).toHaveCount( 2 );
+ await expect( headings.nth( 0 ) ).toContainText(
+ 'Heading Two'
+ );
+ await expect( headings.nth( 1 ) ).toContainText(
+ 'Heading Three'
+ );
+ } );
+
+ test( 'unordered list', async ( { loggedInPage: page } ) => {
+ await loadEditor( page );
+ await sendPostMessage( page, {
+ sel_html:
+ '',
+ } );
+ await waitForBlocks( page );
+
+ const list = page.locator(
+ `${ editorContent } [data-type="core/list"]`
+ );
+ await expect( list.first() ).toBeVisible();
+ await expect( list.first() ).toContainText( 'Item A' );
+ await expect( list.first() ).toContainText( 'Item B' );
+ } );
+
+ test( 'ordered list', async ( { loggedInPage: page } ) => {
+ await loadEditor( page );
+ await sendPostMessage( page, {
+ sel_html:
+ '- First
- Second
',
+ } );
+ await waitForBlocks( page );
+
+ const list = page.locator(
+ `${ editorContent } [data-type="core/list"]`
+ );
+ await expect( list.first() ).toBeVisible();
+ await expect( list.first() ).toContainText( 'First' );
+ await expect( list.first() ).toContainText( 'Second' );
+ } );
+
+ test( 'blockquote', async ( { loggedInPage: page } ) => {
+ await loadEditor( page );
+ await sendPostMessage( page, {
+ sel_html:
+ 'A quoted passage.
',
+ } );
+ await waitForBlocks( page );
+
+ const quote = page.locator(
+ `${ editorContent } [data-type="core/quote"]`
+ );
+ await expect( quote ).toBeVisible();
+ await expect( quote ).toContainText( 'A quoted passage.' );
+ } );
+
+ test( 'code block', async ( { loggedInPage: page } ) => {
+ await loadEditor( page );
+ await sendPostMessage( page, {
+ sel_html: 'const x = 42;
',
+ } );
+ await waitForBlocks( page );
+
+ const code = page.locator(
+ `${ editorContent } [data-type="core/code"]`
+ );
+ await expect( code ).toBeVisible();
+ await expect( code ).toContainText( 'const x = 42;' );
+ } );
+
+ test( 'mixed content preserves all block types', async ( {
+ loggedInPage: page,
+ } ) => {
+ await loadEditor( page );
+ await sendPostMessage( page, {
+ sel_html: [
+ 'Getting Started
',
+ 'Follow these steps:
',
+ '- Install dependencies
- Run the server
',
+ 'npm start
',
+ ].join( '' ),
+ } );
+ await waitForBlocks( page );
+
+ await expect(
+ page.locator(
+ `${ editorContent } [data-type="core/heading"]`
+ )
+ ).toBeVisible();
+ await expect(
+ page.locator(
+ `${ editorContent } [data-type="core/paragraph"]`
+ ).first()
+ ).toContainText( 'steps' );
+ await expect(
+ page.locator(
+ `${ editorContent } [data-type="core/list"]`
+ )
+ ).toBeVisible();
+ await expect(
+ page.locator(
+ `${ editorContent } [data-type="core/code"]`
+ )
+ ).toBeVisible();
+ } );
+ } );
+
+ test.describe( 'Backward Compatibility', () => {
+ test( 'plain text selection without sel_html produces quote block', async ( {
+ loggedInPage: page,
+ } ) => {
+ await loadEditor( page );
+ await sendPostMessage( page, {
+ s: 'This is a plain text selection that should appear in a quote block.',
+ sel_html: '',
+ } );
+ await waitForBlocks( page );
+
+ const quote = page.locator(
+ `${ editorContent } [data-type="core/quote"]`
+ );
+ await expect( quote ).toBeVisible();
+ await expect( quote ).toContainText(
+ 'plain text selection'
+ );
+ } );
+
+ test( 'no selection populates title and source only', async ( {
+ loggedInPage: page,
+ } ) => {
+ await loadEditor( page );
+ await sendPostMessage( page, {
+ t: 'Just a Title',
+ s: '',
+ sel_html: '',
+ } );
+ await waitForBlocks( page );
+
+ // Title should be populated.
+ const titleInput = page.getByLabel( 'Post title' );
+ await expect( titleInput ).toHaveValue( 'Just a Title', {
+ timeout: 10000,
+ } );
+
+ // Source attribution should exist.
+ await expect(
+ page
+ .locator( `${ editorContent }` )
+ .getByText( 'Source:' )
+ ).toBeVisible();
+
+ // No quote block should appear.
+ await expect(
+ page.locator(
+ `${ editorContent } [data-type="core/quote"]`
+ )
+ ).toHaveCount( 0 );
+ } );
+ } );
+
+ test.describe( 'Security', () => {
+ test( 'script tags are stripped', async ( {
+ loggedInPage: page,
+ } ) => {
+ await loadEditor( page );
+ await sendPostMessage( page, {
+ sel_html:
+ 'Safe text
',
+ } );
+ await waitForBlocks( page );
+
+ await expect(
+ page.locator( `${ editorContent }` )
+ ).toContainText( 'Safe text' );
+ await expect(
+ page.locator( `${ editorContent }` )
+ ).not.toContainText( 'alert' );
+ } );
+
+ test( 'dangerous attributes are stripped', async ( {
+ loggedInPage: page,
+ } ) => {
+ await loadEditor( page );
+ await sendPostMessage( page, {
+ sel_html:
+ 'Styled text
',
+ } );
+ await waitForBlocks( page );
+
+ const paragraph = page.locator(
+ `${ editorContent } [data-type="core/paragraph"]`
+ );
+ await expect( paragraph.first() ).toContainText(
+ 'Styled text'
+ );
+
+ // Verify no dangerous attributes leaked into the rich text content.
+ const hasUnsafeAttrs = await paragraph.first().evaluate( ( el ) => {
+ const p = el.querySelector( 'p, [role="document"]' );
+ if ( ! p ) {
+ return false;
+ }
+ return (
+ p.hasAttribute( 'onclick' ) ||
+ p.hasAttribute( 'style' ) ||
+ p.getAttribute( 'class' )?.includes( 'danger' )
+ );
+ } );
+ expect( hasUnsafeAttrs ).toBe( false );
+ } );
+
+ test( 'javascript: and data: URI schemes are blocked', async ( {
+ loggedInPage: page,
+ } ) => {
+ await loadEditor( page );
+ await sendPostMessage( page, {
+ sel_html:
+ 'Bad link and Good link
',
+ } );
+ await waitForBlocks( page );
+
+ // The safe link should be present.
+ await expect(
+ page.locator(
+ `${ editorContent } a[href="https://safe.example"]`
+ )
+ ).toBeVisible();
+
+ // No javascript: href should exist in the editor content.
+ const hasJsHref = await page
+ .locator( `${ editorContent }` )
+ .evaluate( ( el ) => {
+ return el.querySelector( 'a[href^="javascript:"]' ) !== null;
+ } );
+ expect( hasJsHref ).toBe( false );
+ } );
+ } );
+
+ test.describe( 'Edge Cases', () => {
+ test( 'empty sel_html falls back to plain-text description', async ( {
+ loggedInPage: page,
+ } ) => {
+ await loadEditor( page );
+ await sendPostMessage( page, {
+ sel_html: '',
+ s: 'Fallback description text.',
+ } );
+ await waitForBlocks( page );
+
+ // htmlToBlocks returns empty for script-only input,
+ // so the plain-text fallback should be used in a quote block.
+ const quote = page.locator(
+ `${ editorContent } [data-type="core/quote"]`
+ );
+ await expect( quote ).toBeVisible();
+ await expect( quote ).toContainText(
+ 'Fallback description text.'
+ );
+ } );
+
+ test( 'special characters are not double-escaped', async ( {
+ loggedInPage: page,
+ } ) => {
+ await loadEditor( page );
+ await sendPostMessage( page, {
+ sel_html: 'Tom & Jerry
',
+ } );
+ await waitForBlocks( page );
+
+ const paragraph = page.locator(
+ `${ editorContent } [data-type="core/paragraph"]`
+ );
+			// Should render as "Tom & Jerry", not "Tom &amp; Jerry".
+ await expect( paragraph.first() ).toContainText(
+ 'Tom & Jerry'
+ );
+ } );
+ } );
+} );
diff --git a/tests/utils/html-parser.test.js b/tests/utils/html-parser.test.js
index c42c68f..26c982f 100644
--- a/tests/utils/html-parser.test.js
+++ b/tests/utils/html-parser.test.js
@@ -1,5 +1,5 @@
/**
- * Tests for HTML Parser utilities.
+ * Tests for HTML parser utilities.
*
* @package press-this
*/
@@ -7,6 +7,7 @@
import {
escapeHtml,
escapeAttr,
+ htmlToBlocks,
parseHtmlMetadata,
buildSuggestedContent,
buildSuggestedContentFromMetadata,
@@ -84,6 +85,214 @@ describe( 'escapeAttr', () => {
} );
} );
+describe( 'htmlToBlocks', () => {
+ test( 'returns empty string for empty input', () => {
+ expect( htmlToBlocks( '' ) ).toBe( '' );
+ expect( htmlToBlocks( null ) ).toBe( '' );
+ expect( htmlToBlocks( undefined ) ).toBe( '' );
+ } );
+
+ test( 'converts a paragraph to a paragraph block', () => {
+ const result = htmlToBlocks( 'Hello world
' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( 'Hello world
' );
+ expect( result ).toContain( '' );
+ } );
+
+ test( 'converts plain text to a paragraph block', () => {
+ const result = htmlToBlocks( 'Plain text content' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( 'Plain text content' );
+ } );
+
+ test( 'converts h1-h6 headings to heading blocks', () => {
+ [ 1, 2, 3, 4, 5, 6 ].forEach( ( level ) => {
+ const result = htmlToBlocks( `Heading ${ level }` );
+ expect( result ).toContain( `` );
+ expect( result ).toContain( `` );
+ expect( result ).toContain( `Heading ${ level }` );
+ expect( result ).toContain( '' );
+ } );
+ } );
+
+ test( 'converts unordered list to list block', () => {
+ const result = htmlToBlocks( '' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( '- Item 1
' );
+ expect( result ).toContain( '- Item 2
' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( '' );
+ // Should NOT use ordered list attribute.
+ expect( result ).not.toContain( '"ordered":true' );
+ } );
+
+ test( 'converts ordered list to list block with ordered attribute', () => {
+ const result = htmlToBlocks( '- First
- Second
' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( '- First
' );
+ expect( result ).toContain( '' );
+ } );
+
+ test( 'converts nested lists correctly', () => {
+ const html = '';
+ const result = htmlToBlocks( html );
+ // Outer list.
+ expect( result ).toContain( '' );
+ // Nested list should also be wrapped.
+ const listCount = ( result.match( //g ) || [] ).length;
+ expect( listCount ).toBeGreaterThanOrEqual( 2 );
+ expect( result ).toContain( 'Child item' );
+ expect( result ).toContain( 'Parent' );
+ } );
+
+ test( 'preserves inline formatting in list items', () => {
+ const html = '';
+ const result = htmlToBlocks( html );
+ expect( result ).toContain( 'Bold item' );
+ expect( result ).toContain( 'Italic item' );
+ } );
+
+ test( 'converts blockquote to quote block', () => {
+ const result = htmlToBlocks( 'Quote text
' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( 'Quote text' );
+ expect( result ).toContain( '' );
+ } );
+
+ test( 'converts pre/code to code block', () => {
+ const result = htmlToBlocks( 'const x = 1;
' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( '' );
+ expect( result ).toContain( 'const x = 1;' );
+ expect( result ).toContain( '' );
+ } );
+
+ test( 'handles mix of headings and lists (issue example)', () => {
+ const html = `In Unreal Engine: How to Choose
+
+ - Unreal Engine allows developers to switch:
+
+ - Deferred Rendering: Default for most projects.
+ - Forward Rendering: For VR projects.
+
+
+
+Each technique aligns with different requirements.
`;
+
+ const result = htmlToBlocks( html );
+
+ // Heading should be converted.
+ expect( result ).toContain( '' );
+ expect( result ).toContain( 'In Unreal Engine: How to Choose' );
+
+ // List should be converted.
+ expect( result ).toContain( '' );
+ expect( result ).toContain( 'Deferred Rendering' );
+ expect( result ).toContain( 'Forward Rendering' );
+
+ // Paragraph should be converted.
+ expect( result ).toContain( '' );
+ expect( result ).toContain( 'Each technique aligns' );
+ } );
+
+ test( 'strips script elements for safety', () => {
+ const html = 'Safe text
More text
';
+ const result = htmlToBlocks( html );
+ expect( result ).toContain( 'Safe text' );
+ expect( result ).not.toContain( 'alert' );
+ expect( result ).not.toContain( '">Link
';
+ const result = htmlToBlocks( html );
+ expect( result ).toContain( 'Link' );
+ expect( result ).not.toContain( 'data:' );
+ } );
+
+ test( 'strips vbscript: href for safety', () => {
+ const html = 'Link
';
+ const result = htmlToBlocks( html );
+ expect( result ).toContain( 'Link' );
+ expect( result ).not.toContain( 'vbscript:' );
+ } );
+
+ test( 'unwraps non-allowlist elements but keeps their text', () => {
+ //