diff --git a/assets/bookmarklet.js b/assets/bookmarklet.js index 234cda1..145e91a 100644 --- a/assets/bookmarklet.js +++ b/assets/bookmarklet.js @@ -15,7 +15,7 @@ encURI = window.encodeURIComponent, head = document.getElementsByTagName( 'head' )[0], target = '_press_this_app', - windowWidth, windowHeight, selection, + windowWidth, windowHeight, selection, selectionHtml, metas, links, content, images, iframes, img, scripts, scrapedData = {}, popup; @@ -32,7 +32,22 @@ } if ( window.getSelection ) { - selection = window.getSelection() + ''; + var sel = window.getSelection(); + if ( sel && sel.rangeCount > 0 ) { + selection = sel.toString(); + // Capture HTML to preserve formatting (bold, lists, headings, etc.). + // Wrapped in try-catch: cloneContents() can throw DOMException in + // some browsers or unusual DOM states (e.g. cross-shadow-DOM ranges). + try { + var range = sel.getRangeAt( 0 ); + var fragment = range.cloneContents(); + var tempDiv = document.createElement( 'div' ); + tempDiv.appendChild( fragment ); + selectionHtml = tempDiv.innerHTML; + } catch ( e ) { + // HTML capture failed; plain-text selection is still available. + } + } } else if ( document.getSelection ) { selection = document.getSelection() + ''; } else if ( document.selection ) { @@ -299,6 +314,11 @@ add( 's', selection ); } + // Add HTML selection to preserve formatting (bold, lists, headings, etc.). + if ( selectionHtml ) { + add( 'sel_html', selectionHtml ); + } + /** * Send scraped data to the Press This popup via postMessage. * Uses polling to wait for the popup to be ready. diff --git a/assets/bookmarklet.min.js b/assets/bookmarklet.min.js index 00bb55c..515f14e 100644 --- a/assets/bookmarklet.min.js +++ b/assets/bookmarklet.min.js @@ -1 +1 @@ -!function(e,t,i,a){var n,o,r,l,c,s,g,m,d,f,h,u=e.encodeURIComponent,p=t.getElementsByTagName("head")[0],y={};if(a)if(i.match(/^https?:/)){a+="&u="+u(i),e.getSelection?r=e.getSelection()+"":t.getSelection?r=t.getSelection()+"":t.selection&&(r=t.selection.createRange().text||""),a+="&buster="+(new Date).getTime(),a+="&pm=1",n=(n=e.outerWidth||t.documentElement.clientWidth||600)<800||n>5e3?600:.7*n,o=(o=e.outerHeight||t.documentElement.clientHeight||700)<800||o>3e3?700:.9*o,T("pt_version",11),l=p.getElementsByTagName("meta")||[];for(var v=0;v200);v++){var b=l[v],O=b.getAttribute("name"),_=b.getAttribute("property"),x=b.getAttribute("content");x&&(O?T("_meta["+O+"]",x):_&&(T("_meta["+_+"]",x),"og:video"!==_&&"og:video:url"!==_&&"og:video:secure_url"!==_||T("_og_video[]",x)))}c=p.getElementsByTagName("link")||[];for(var E=0;E=50);E++){var A=c[E],w=A.getAttribute("rel");"canonical"!==w&&"icon"!==w&&"shortlink"!==w||T("_links["+w+"]",A.getAttribute("href")),"alternate"===w&&"x-default"===A.getAttribute("hreflang")&&T("_links[alternate_canonical]",A.getAttribute("href"))}!function(){f=t.querySelectorAll('script[type="application/ld+json"]');for(var e=0;e=100);N++)(d=g[N]).src.indexOf("avatar")>-1||d.className.indexOf("avatar")>-1||d.width&&d.width<256||d.height&&d.height<128||d.src&&0!==d.src.indexOf("data:")&&T("_images[]",d.src);m=t.body.getElementsByTagName("iframe")||[];for(var j=0;j=50);j++){var B=m[j].src;B&&"about:blank"!==B&&(B.indexOf("jetpack-comment")>-1||B.indexOf("disqus.com")>-1||B.indexOf("facebook.com/plugins")>-1||B.indexOf("platform.twitter.com/widgets")>-1||B.indexOf("google.com/recaptcha")>-1||B.indexOf("googletagmanager.com")>-1||B.indexOf("doubleclick.net")>-1||B.indexOf("googlesyndication.com")>-1||B.indexOf("amazon-adsystem.com")>-1||B.indexOf("quantserve.com")>-1||B.indexOf("scorecardresearch.com")>-1||B.indexOf("addthis.com")>-1||B.indexOf("sharethis.com")>-1||B.indexOf("addtoany.com")>-1||T("_embeds[]",B))}var k,P;t.title&&T("t",t.title),r&&T("s",r),h=e.open(a,"_press_this_app","location,resizable,scrollbars,width="+n+",height="+o),k=0,P=a.match(/^https?:\/\/[^\/]+/)[0],setTimeout(function e(){if(k++,h&&!h.closed){try{h.postMessage({type:"press-this-data",version:11,data:y},P)}catch(e){}k<50&&setTimeout(e,100)}},200)}else top.location.href=a;function T(e,t){if(null!=t&&""!==t){var i=e.match(/^(.+)\[\]$/);if(i){var a=i[1];return y[a]||(y[a]=[]),void y[a].push(t)}var n=e.match(/^(.+)\[(.+)\]$/);if(n){var o=n[1],r=n[2];return y[o]||(y[o]={}),void(y[o][r]=t)}y[e]=t}}function S(e){if(e&&"object"==typeof e){var t=e["@type"];if("VideoObject"===t&&(e.embedUrl&&T("_embeds[]",e.embedUrl),e.contentUrl&&!e.embedUrl&&T("_embeds[]",e.contentUrl)),"Article"!==t&&"WebPage"!==t&&"NewsArticle"!==t&&"BlogPosting"!==t||(e.mainEntityOfPage&&"string"==typeof e.mainEntityOfPage?T("_jsonld[canonical]",e.mainEntityOfPage):e.mainEntityOfPage&&e.mainEntityOfPage["@id"]&&T("_jsonld[canonical]",e.mainEntityOfPage["@id"]),e.headline&&T("_jsonld[headline]",e.headline),e.description&&T("_jsonld[description]",e.description)),e.image){var i="";"string"==typeof e.image?i=e.image:e.image.url?i=e.image.url:Array.isArray(e.image)&&e.image[0]&&(i="string"==typeof e.image[0]?e.image[0]:e.image[0].url),i&&T("_jsonld[image]",i)}}}}(window,document,top.location.href,window.pt_url); \ No newline at end of file +!function(e,t,i,a){var n,o,r,l,c,s,g,d,m,f,h,p,u=e.encodeURIComponent,y=t.getElementsByTagName("head")[0],v={};if(a)if(i.match(/^https?:/)){if(a+="&u="+u(i),e.getSelection){var b=e.getSelection();if(b&&b.rangeCount>0){r=b.toString();try{var O=b.getRangeAt(0).cloneContents(),_=t.createElement("div");_.appendChild(O),l=_.innerHTML}catch(e){}}}else t.getSelection?r=t.getSelection()+"":t.selection&&(r=t.selection.createRange().text||"");a+="&buster="+(new Date).getTime(),a+="&pm=1",n=(n=e.outerWidth||t.documentElement.clientWidth||600)<800||n>5e3?600:.7*n,o=(o=e.outerHeight||t.documentElement.clientHeight||700)<800||o>3e3?700:.9*o,q("pt_version",11),c=y.getElementsByTagName("meta")||[];for(var x=0;x200);x++){var E=c[x],A=E.getAttribute("name"),w=E.getAttribute("property"),N=E.getAttribute("content");N&&(A?q("_meta["+A+"]",N):w&&(q("_meta["+w+"]",N),"og:video"!==w&&"og:video:url"!==w&&"og:video:secure_url"!==w||q("_og_video[]",N)))}s=y.getElementsByTagName("link")||[];for(var j=0;j=50);j++){var B=s[j],T=B.getAttribute("rel");"canonical"!==T&&"icon"!==T&&"shortlink"!==T||q("_links["+T+"]",B.getAttribute("href")),"alternate"===T&&"x-default"===B.getAttribute("hreflang")&&q("_links[alternate_canonical]",B.getAttribute("href"))}!function(){h=t.querySelectorAll('script[type="application/ld+json"]');for(var e=0;e=100);k++)(f=d[k]).src.indexOf("avatar")>-1||f.className.indexOf("avatar")>-1||f.width&&f.width<256||f.height&&f.height<128||f.src&&0!==f.src.indexOf("data:")&&q("_images[]",f.src);m=t.body.getElementsByTagName("iframe")||[];for(var P=0;P=50);P++){var C=m[P].src;C&&"about:blank"!==C&&(C.indexOf("jetpack-comment")>-1||C.indexOf("disqus.com")>-1||C.indexOf("facebook.com/plugins")>-1||C.indexOf("platform.twitter.com/widgets")>-1||C.indexOf("google.com/recaptcha")>-1||C.indexOf("googletagmanager.com")>-1||C.indexOf("doubleclick.net")>-1||C.indexOf("googlesyndication.com")>-1||C.indexOf("amazon-adsystem.com")>-1||C.indexOf("quantserve.com")>-1||C.indexOf("scorecardresearch.com")>-1||C.indexOf("addthis.com")>-1||C.indexOf("sharethis.com")>-1||C.indexOf("addtoany.com")>-1||q("_embeds[]",C))}var S,U;t.title&&q("t",t.title),r&&q("s",r),l&&q("sel_html",l),p=e.open(a,"_press_this_app","location,resizable,scrollbars,width="+n+",height="+o),S=0,U=a.match(/^https?:\/\/[^\/]+/)[0],setTimeout(function e(){if(S++,p&&!p.closed){try{p.postMessage({type:"press-this-data",version:11,data:v},U)}catch(e){}S<50&&setTimeout(e,100)}},200)}else top.location.href=a;function q(e,t){if(null!=t&&""!==t){var i=e.match(/^(.+)\[\]$/);if(i){var a=i[1];return v[a]||(v[a]=[]),void v[a].push(t)}var n=e.match(/^(.+)\[(.+)\]$/);if(n){var o=n[1],r=n[2];return v[o]||(v[o]={}),void(v[o][r]=t)}v[e]=t}}function H(e){if(e&&"object"==typeof e){var t=e["@type"];if("VideoObject"===t&&(e.embedUrl&&q("_embeds[]",e.embedUrl),e.contentUrl&&!e.embedUrl&&q("_embeds[]",e.contentUrl)),"Article"!==t&&"WebPage"!==t&&"NewsArticle"!==t&&"BlogPosting"!==t||(e.mainEntityOfPage&&"string"==typeof e.mainEntityOfPage?q("_jsonld[canonical]",e.mainEntityOfPage):e.mainEntityOfPage&&e.mainEntityOfPage["@id"]&&q("_jsonld[canonical]",e.mainEntityOfPage["@id"]),e.headline&&q("_jsonld[headline]",e.headline),e.description&&q("_jsonld[description]",e.description)),e.image){var i="";"string"==typeof e.image?i=e.image:e.image.url?i=e.image.url:Array.isArray(e.image)&&e.image[0]&&(i="string"==typeof e.image[0]?e.image[0]:e.image[0].url),i&&q("_jsonld[image]",i)}}}}(window,document,top.location.href,window.pt_url); \ No newline at end of file diff --git a/src/App.js b/src/App.js index 32ba56f..85f4a2d 100644 --- a/src/App.js +++ b/src/App.js @@ -221,8 +221,13 @@ export default function App() { // Build suggested content from bookmarklet metadata. // Extract description from meta tags. const meta = messageData._meta || {}; + + // HTML selection takes highest priority (preserves formatting). + // Always compute plain-text description as a fallback; buildSuggestedContent + // will use it if htmlToBlocks() produces no blocks from selectionHtml. + const selectionHtml = messageData.sel_html || ''; const description = - messageData.s || // User selection takes priority. + messageData.s || // Plain-text user selection. meta[ 'twitter:description' ] || meta[ 'og:description' ] || meta.description || @@ -243,6 +248,7 @@ export default function App() { const suggestedContent = buildSuggestedContentFromMetadata( { title, description, + selectionHtml, siteName: meta[ 'og:site_name' ] || '', canonical, url: receivedSourceUrl, diff --git a/src/components/BlockTransformShortcuts.js b/src/components/BlockTransformShortcuts.js index a61a39b..3f3cf20 100644 --- a/src/components/BlockTransformShortcuts.js +++ b/src/components/BlockTransformShortcuts.js @@ -137,7 +137,11 @@ export default function BlockTransformShortcuts() { if ( innerBlocks.length > 0 ) { // Replace the quote with its inner blocks directly. const replacementBlocks = innerBlocks.map( ( inner ) => - createBlock( inner.name, { ...inner.attributes }, inner.innerBlocks ) + createBlock( + inner.name, + { ...inner.attributes }, + inner.innerBlocks + ) ); replaceBlocks( currentClientId, replacementBlocks ); } else { diff --git a/src/utils/html-parser.js b/src/utils/html-parser.js index 2b1f82d..d4b83ed 100644 --- a/src/utils/html-parser.js +++ b/src/utils/html-parser.js @@ -447,14 +447,290 @@ function getCanonical( doc, meta ) { return linkCanonical?.getAttribute( 'href' ) || meta[ 'og:url' ] || ''; } +/** + * Sanitize inline HTML, keeping only a safe allowlist of formatting elements. + * + * Allowlisted inline elements: strong, em, b, i, u, s, a (href only), + * code, mark, sub, sup, span, br. + * All other elements are unwrapped (their children are preserved). + * Dangerous elements (script, style, object, embed, iframe) are fully removed. + * All attributes are stripped except `href` on `` elements. + * URI schemes `javascript:`, `data:`, and `vbscript:` are blocked in `href`. + * + * @param {Element} element DOM element to sanitize (in-place). + */ +function sanitizeInlineContent( element ) { + // Completely remove dangerous elements (do not preserve children). + const dangerous = element.querySelectorAll( + 'script, style, object, embed, iframe' + ); + dangerous.forEach( ( el ) => el.remove() ); + + const ALLOWED_INLINE = new Set( [ + 'strong', + 'em', + 'b', + 'i', + 'u', + 's', + 'a', + 'code', + 'mark', + 'sub', + 'sup', + 'span', + 'br', + ] ); + + // Iterate over a static snapshot — we may mutate the DOM while iterating. + Array.from( element.querySelectorAll( '*' ) ).forEach( ( el ) => { + const tagName = el.tagName.toLowerCase(); + + if ( ! ALLOWED_INLINE.has( tagName ) ) { + // Unwrap: replace the element with its child nodes. + el.replaceWith( ...el.childNodes ); + return; + } + + // Strip all attributes except allowed ones per element. + Array.from( el.attributes ).forEach( ( attr ) => { + if ( tagName === 'a' && attr.name === 'href' ) { + // Block dangerous URI schemes. + if ( /^\s*(javascript|data|vbscript):/i.test( attr.value ) ) { + el.removeAttribute( attr.name ); + } + } else { + el.removeAttribute( attr.name ); + } + } ); + } ); +} + +/** + * Convert a list element (ul/ol) to Gutenberg list block markup. + * + * @param {Element} listEl The list element (ul or ol). + * @param {boolean} ordered Whether this is an ordered list. + * @param {number} depth Current recursion depth (default 0; max 10). + * @return {string} Gutenberg list block markup. + */ +function listElementToBlock( listEl, ordered, depth = 0 ) { + if ( depth > 10 ) { + return ''; + } + + const tag = ordered ? 'ol' : 'ul'; + const attr = ordered ? ' {"ordered":true}' : ''; + let items = ''; + + listEl.childNodes.forEach( ( child ) => { + if ( + child.nodeType !== 1 /* ELEMENT_NODE */ || + child.tagName.toLowerCase() !== 'li' + ) { + return; + } + + // Clone to work with it non-destructively. + const li = child.cloneNode( true ); + + // Handle only direct-child nested lists within the li to avoid double-processing. + let nestedBlocks = ''; + Array.from( li.children ).forEach( ( childEl ) => { + const childTag = childEl.tagName.toLowerCase(); + if ( childTag === 'ul' || childTag === 'ol' ) { + const isOrdered = childTag === 'ol'; + nestedBlocks += + '\n' + listElementToBlock( childEl, isOrdered, depth + 1 ); + childEl.remove(); + } + } ); + + sanitizeInlineContent( li ); + const liContent = li.innerHTML.trim(); + + items += `\n
  • ${ liContent }${ nestedBlocks }
  • \n\n`; + } ); + + return `\n<${ tag } class="wp-block-list">\n${ items }\n\n\n`; +} + +/** + * Convert an HTML string to Gutenberg block markup. + * + * Handles block-level elements: paragraphs, headings (h1-h6), unordered and + * ordered lists with nesting, blockquotes, and preformatted/code blocks. + * Inline elements are filtered through an allowlist (strong, em, b, i, u, s, + * a[href], code, mark, sub, sup, span, br). All other attributes are stripped. + * + * Falls back to a paragraph block for unrecognised or purely inline content. + * + * @param {string} html HTML string to convert. + * @param {number} depth Internal recursion depth; callers should omit this. + * @return {string} Gutenberg block markup string, or empty string if no content. + */ +export function htmlToBlocks( html, depth = 0 ) { + if ( ! html || typeof html !== 'string' ) { + return ''; + } + + // Guard against deeply nested blockquote structures. + if ( depth > 5 ) { + return `\n

    ${ escapeHtml( + html + ) }

    \n\n`; + } + + const parser = new DOMParser(); + const doc = parser.parseFromString( `${ html }`, 'text/html' ); + const body = doc.body; + + let blocks = ''; + let inlineBuffer = ''; + + /** + * Flush any accumulated inline/text content as a paragraph block. + */ + function flushInlineBuffer() { + const trimmed = inlineBuffer.trim(); + if ( trimmed ) { + blocks += `\n

    ${ trimmed }

    \n\n\n`; + } + inlineBuffer = ''; + } + + /** + * Set of block-level tag names that start a new block. + */ + const BLOCK_TAGS = new Set( [ + 'p', + 'ul', + 'ol', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'blockquote', + 'pre', + 'figure', + 'div', + ] ); + + body.childNodes.forEach( ( node ) => { + if ( node.nodeType === 3 /* TEXT_NODE */ ) { + const text = node.textContent; + // Accumulate non-empty text into the inline buffer. + if ( text.trim() ) { + inlineBuffer += escapeHtml( text ); + } + return; + } + + if ( node.nodeType !== 1 /* ELEMENT_NODE */ ) { + return; + } + + const tag = node.tagName.toLowerCase(); + + // Skip dangerous elements. + if ( tag === 'script' || tag === 'style' ) { + return; + } + + if ( ! BLOCK_TAGS.has( tag ) ) { + // Inline element – add to buffer. + const clone = node.cloneNode( true ); + const tempEl = doc.createElement( 'div' ); + tempEl.appendChild( clone ); + sanitizeInlineContent( tempEl ); + inlineBuffer += tempEl.innerHTML; + return; + } + + // We're about to emit a block – flush any pending inline content first. + flushInlineBuffer(); + + // Headings. + if ( /^h[1-6]$/.test( tag ) ) { + const level = tag[ 1 ]; + const clone = node.cloneNode( true ); + const tempEl = doc.createElement( 'div' ); + tempEl.appendChild( clone ); + sanitizeInlineContent( tempEl ); + // After sanitization, block-level wrappers (h-tags) are unwrapped by + // the allowlist; tempEl.innerHTML holds the sanitized inner content. + blocks += `\n<${ tag } class="wp-block-heading">${ tempEl.innerHTML }\n\n\n`; + return; + } + + // Lists. + if ( tag === 'ul' || tag === 'ol' ) { + blocks += listElementToBlock( node, tag === 'ol' ); + return; + } + + // Blockquotes. + if ( tag === 'blockquote' ) { + const clone = node.cloneNode( true ); + const tempEl = doc.createElement( 'div' ); + tempEl.appendChild( clone ); + sanitizeInlineContent( tempEl ); + const inner = htmlToBlocks( tempEl.innerHTML, depth + 1 ); + const innerBlocks = + inner || + `\n

    ${ escapeHtml( + node.textContent.trim() + ) }

    \n\n`; + blocks += `\n
    ${ innerBlocks }
    \n\n\n`; + return; + } + + // Preformatted / code blocks. + if ( tag === 'pre' ) { + const codeEl = node.querySelector( 'code' ); + const codeContent = escapeHtml( ( codeEl || node ).textContent ); + blocks += `\n
    ${ codeContent }
    \n\n\n`; + return; + } + + // Paragraphs and generic block elements (div, figure, etc.). + const clone = node.cloneNode( true ); + const tempEl = doc.createElement( 'div' ); + tempEl.appendChild( clone ); + sanitizeInlineContent( tempEl ); + // tempEl.innerHTML holds the sanitized inner content after any outer + // block-level wrapper has been unwrapped by the allowlist sanitizer. + const innerHtml = tempEl.innerHTML.trim(); + if ( innerHtml ) { + blocks += `\n

    ${ innerHtml }

    \n\n\n`; + } + } ); + + // Flush any remaining inline content. + flushInlineBuffer(); + + return blocks.trim(); +} + /** * Build suggested content from server-returned metadata. * * All dynamic content is escaped. * Uses pre-sanitized server data instead of client-side parsing. * - * @param {Object} data Server-returned metadata object. - * @param {string} sourceUrl Original source URL. + * When `data.selectionHtml` is provided the selected HTML is converted to + * formatted Gutenberg blocks via `htmlToBlocks`. If the conversion produces + * no blocks (e.g. the selection contained only script tags), the function + * falls back to `data.description` in a plain-text quote block. + * + * @param {Object} data Server-returned metadata object. + * @param {string} data.selectionHtml Optional HTML string of the user's selection. + * @param {string} data.description Optional plain-text description / meta excerpt. + * @param {string} data.title Optional page title. + * @param {string} data.siteName Optional site name. + * @param {string} sourceUrl Original source URL. * @return {string} Gutenberg block content. */ export function buildSuggestedContent( data, sourceUrl ) { @@ -478,9 +754,17 @@ ${ escapeHtml( sourceUrl ) } `; } - // Add quote block with description if available. - // Escape description. - if ( data.description ) { + // Prefer formatted HTML selection; fall back to plain-text description. + let selectionBlocks = ''; + if ( data.selectionHtml ) { + selectionBlocks = htmlToBlocks( data.selectionHtml ); + } + + if ( selectionBlocks ) { + // HTML selection converted successfully to blocks. + content += selectionBlocks + '\n\n'; + } else if ( data.description ) { + // No HTML selection (or it produced no blocks) – use plain-text quote. content += `

    ${ escapeHtml( data.description ) }

    diff --git a/src/utils/index.js b/src/utils/index.js index 4843855..e2650ac 100644 --- a/src/utils/index.js +++ b/src/utils/index.js @@ -8,6 +8,7 @@ export { parseHtmlMetadata, buildSuggestedContent, buildSuggestedContentFromMetadata, + htmlToBlocks, escapeHtml, escapeAttr, } from './html-parser'; diff --git a/tests/bookmarklet/bookmarklet.test.js b/tests/bookmarklet/bookmarklet.test.js index 2e0263e..c1353eb 100644 --- a/tests/bookmarklet/bookmarklet.test.js +++ b/tests/bookmarklet/bookmarklet.test.js @@ -207,6 +207,30 @@ describe( 'Bookmarklet Functionality', () => { ).toBe( true ); } ); + test( 'HTML selection capture preserves formatting', () => { + // Check for getRangeAt usage to capture the selection range. + expect( bookmarkletSource ).toContain( 'getRangeAt' ); + + // Check for cloneContents to extract selected DOM fragment. + expect( bookmarkletSource ).toContain( 'cloneContents' ); + + // Check for innerHTML to serialise the selection as HTML. + expect( bookmarkletSource ).toContain( 'innerHTML' ); + + // Check that sel_html is sent alongside plain-text selection. + expect( + bookmarkletSource.includes( "'sel_html'" ) || + bookmarkletSource.includes( '"sel_html"' ) || + bookmarkletSource.includes( 'sel_html' ) + ).toBe( true ); + + // sel_html is added via the add() helper. + expect( + bookmarkletSource.includes( "add( 'sel_html'" ) || + bookmarkletSource.includes( "add('sel_html'" ) + ).toBe( true ); + } ); + test( 'Enhanced data extraction - Open Graph video', () => { // Check for og:video detection. expect( bookmarkletSource ).toContain( 'og:video' ); diff --git a/tests/e2e/html-selection.spec.js b/tests/e2e/html-selection.spec.js new file mode 100644 index 0000000..0978396 --- /dev/null +++ b/tests/e2e/html-selection.spec.js @@ -0,0 +1,381 @@ +/** + * HTML Selection Preservation E2E Tests + * + * Verifies that HTML-formatted text selections sent via postMessage + * are correctly converted to Gutenberg blocks in the editor. + * + * These tests bypass the bookmarklet and send postMessage directly, + * testing the real code path: App.js handler → htmlToBlocks() → block rendering. + */ +const { test, expect } = require( './utils/auth' ); + +/** + * Send a postMessage to the Press This editor with test data. + * + * @param {import('@playwright/test').Page} page Playwright page. + * @param {Object} overrides Fields to override in the message data. + */ +async function sendPostMessage( page, overrides = {} ) { + const defaults = { + t: 'Test Page Title', + s: '', + sel_html: '', + u: 'https://example.com/test', + _meta: {}, + _links: {}, + _images: [], + _embeds: [], + }; + await page.evaluate( ( data ) => { + window.postMessage( + { + type: 'press-this-data', + version: '1.0.0', + data, + }, + '*' + ); + }, { ...defaults, ...overrides } ); +} + +/** + * Wait for Gutenberg blocks to appear in the editor content area. + * + * @param {import('@playwright/test').Page} page Playwright page. + */ +async function waitForBlocks( page ) { + await page + .locator( '.press-this-editor__content [data-type]' ) + .first() + .waitFor( { timeout: 10000 } ); +} + +/** + * Navigate to Press This in postMessage mode and wait for the editor. + * + * @param {import('@playwright/test').Page} page Playwright page. + */ +async function loadEditor( page ) { + await page.goto( '/wp-admin/press-this.php?pm=1' ); + await page + .locator( '.press-this-editor__content' ) + .waitFor( { timeout: 10000 } ); +} + +const editorContent = '.press-this-editor__content'; + +test.describe( 'HTML Selection Preservation', () => { + test.describe( 'Core Formatting', () => { + test( 'bold and italic inline formatting', async ( { + loggedInPage: page, + } ) => { + await loadEditor( page ); + await sendPostMessage( page, { + sel_html: + '

    This has bold and italic text.

    ', + } ); + await waitForBlocks( page ); + + const paragraph = page.locator( + `${ editorContent } [data-type="core/paragraph"]` + ); + await expect( paragraph.first() ).toContainText( 'bold' ); + await expect( paragraph.first() ).toContainText( 'italic' ); + + // Verify the formatting tags are preserved in the rendered HTML. + const html = await paragraph.first().innerHTML(); + expect( html ).toContain( '' ); + expect( html ).toContain( '' ); + } ); + + test( 'headings at correct levels', async ( { + loggedInPage: page, + } ) => { + await loadEditor( page ); + await sendPostMessage( page, { + sel_html: '

    Heading Two

    Heading Three

    ', + } ); + await waitForBlocks( page ); + + const headings = page.locator( + `${ editorContent } [data-type="core/heading"]` + ); + await expect( headings ).toHaveCount( 2 ); + await expect( headings.nth( 0 ) ).toContainText( + 'Heading Two' + ); + await expect( headings.nth( 1 ) ).toContainText( + 'Heading Three' + ); + } ); + + test( 'unordered list', async ( { loggedInPage: page } ) => { + await loadEditor( page ); + await sendPostMessage( page, { + sel_html: + '
    • Item A
    • Item B
    ', + } ); + await waitForBlocks( page ); + + const list = page.locator( + `${ editorContent } [data-type="core/list"]` + ); + await expect( list.first() ).toBeVisible(); + await expect( list.first() ).toContainText( 'Item A' ); + await expect( list.first() ).toContainText( 'Item B' ); + } ); + + test( 'ordered list', async ( { loggedInPage: page } ) => { + await loadEditor( page ); + await sendPostMessage( page, { + sel_html: + '
    1. First
    2. Second
    ', + } ); + await waitForBlocks( page ); + + const list = page.locator( + `${ editorContent } [data-type="core/list"]` + ); + await expect( list.first() ).toBeVisible(); + await expect( list.first() ).toContainText( 'First' ); + await expect( list.first() ).toContainText( 'Second' ); + } ); + + test( 'blockquote', async ( { loggedInPage: page } ) => { + await loadEditor( page ); + await sendPostMessage( page, { + sel_html: + '

    A quoted passage.

    ', + } ); + await waitForBlocks( page ); + + const quote = page.locator( + `${ editorContent } [data-type="core/quote"]` + ); + await expect( quote ).toBeVisible(); + await expect( quote ).toContainText( 'A quoted passage.' ); + } ); + + test( 'code block', async ( { loggedInPage: page } ) => { + await loadEditor( page ); + await sendPostMessage( page, { + sel_html: '
    const x = 42;
    ', + } ); + await waitForBlocks( page ); + + const code = page.locator( + `${ editorContent } [data-type="core/code"]` + ); + await expect( code ).toBeVisible(); + await expect( code ).toContainText( 'const x = 42;' ); + } ); + + test( 'mixed content preserves all block types', async ( { + loggedInPage: page, + } ) => { + await loadEditor( page ); + await sendPostMessage( page, { + sel_html: [ + '

    Getting Started

    ', + '

    Follow these steps:

    ', + '
    • Install dependencies
    • Run the server
    ', + '
    npm start
    ', + ].join( '' ), + } ); + await waitForBlocks( page ); + + await expect( + page.locator( + `${ editorContent } [data-type="core/heading"]` + ) + ).toBeVisible(); + await expect( + page.locator( + `${ editorContent } [data-type="core/paragraph"]` + ).first() + ).toContainText( 'steps' ); + await expect( + page.locator( + `${ editorContent } [data-type="core/list"]` + ) + ).toBeVisible(); + await expect( + page.locator( + `${ editorContent } [data-type="core/code"]` + ) + ).toBeVisible(); + } ); + } ); + + test.describe( 'Backward Compatibility', () => { + test( 'plain text selection without sel_html produces quote block', async ( { + loggedInPage: page, + } ) => { + await loadEditor( page ); + await sendPostMessage( page, { + s: 'This is a plain text selection that should appear in a quote block.', + sel_html: '', + } ); + await waitForBlocks( page ); + + const quote = page.locator( + `${ editorContent } [data-type="core/quote"]` + ); + await expect( quote ).toBeVisible(); + await expect( quote ).toContainText( + 'plain text selection' + ); + } ); + + test( 'no selection populates title and source only', async ( { + loggedInPage: page, + } ) => { + await loadEditor( page ); + await sendPostMessage( page, { + t: 'Just a Title', + s: '', + sel_html: '', + } ); + await waitForBlocks( page ); + + // Title should be populated. + const titleInput = page.getByLabel( 'Post title' ); + await expect( titleInput ).toHaveValue( 'Just a Title', { + timeout: 10000, + } ); + + // Source attribution should exist. + await expect( + page + .locator( `${ editorContent }` ) + .getByText( 'Source:' ) + ).toBeVisible(); + + // No quote block should appear. + await expect( + page.locator( + `${ editorContent } [data-type="core/quote"]` + ) + ).toHaveCount( 0 ); + } ); + } ); + + test.describe( 'Security', () => { + test( 'script tags are stripped', async ( { + loggedInPage: page, + } ) => { + await loadEditor( page ); + await sendPostMessage( page, { + sel_html: + '

    Safe text

    ', + } ); + await waitForBlocks( page ); + + await expect( + page.locator( `${ editorContent }` ) + ).toContainText( 'Safe text' ); + await expect( + page.locator( `${ editorContent }` ) + ).not.toContainText( 'alert' ); + } ); + + test( 'dangerous attributes are stripped', async ( { + loggedInPage: page, + } ) => { + await loadEditor( page ); + await sendPostMessage( page, { + sel_html: + '

    Styled text

    ', + } ); + await waitForBlocks( page ); + + const paragraph = page.locator( + `${ editorContent } [data-type="core/paragraph"]` + ); + await expect( paragraph.first() ).toContainText( + 'Styled text' + ); + + // Verify no dangerous attributes leaked into the rich text content. + const hasUnsafeAttrs = await paragraph.first().evaluate( ( el ) => { + const p = el.querySelector( 'p, [role="document"]' ); + if ( ! p ) { + return false; + } + return ( + p.hasAttribute( 'onclick' ) || + p.hasAttribute( 'style' ) || + p.getAttribute( 'class' )?.includes( 'danger' ) + ); + } ); + expect( hasUnsafeAttrs ).toBe( false ); + } ); + + test( 'javascript: and data: URI schemes are blocked', async ( { + loggedInPage: page, + } ) => { + await loadEditor( page ); + await sendPostMessage( page, { + sel_html: + '

    Bad link and Good link

    ', + } ); + await waitForBlocks( page ); + + // The safe link should be present. + await expect( + page.locator( + `${ editorContent } a[href="https://safe.example"]` + ) + ).toBeVisible(); + + // No javascript: href should exist in the editor content. + const hasJsHref = await page + .locator( `${ editorContent }` ) + .evaluate( ( el ) => { + return el.querySelector( 'a[href^="javascript:"]' ) !== null; + } ); + expect( hasJsHref ).toBe( false ); + } ); + } ); + + test.describe( 'Edge Cases', () => { + test( 'empty sel_html falls back to plain-text description', async ( { + loggedInPage: page, + } ) => { + await loadEditor( page ); + await sendPostMessage( page, { + sel_html: '', + s: 'Fallback description text.', + } ); + await waitForBlocks( page ); + + // htmlToBlocks returns empty for script-only input, + // so the plain-text fallback should be used in a quote block. + const quote = page.locator( + `${ editorContent } [data-type="core/quote"]` + ); + await expect( quote ).toBeVisible(); + await expect( quote ).toContainText( + 'Fallback description text.' + ); + } ); + + test( 'special characters are not double-escaped', async ( { + loggedInPage: page, + } ) => { + await loadEditor( page ); + await sendPostMessage( page, { + sel_html: '

    Tom & Jerry

    ', + } ); + await waitForBlocks( page ); + + const paragraph = page.locator( + `${ editorContent } [data-type="core/paragraph"]` + ); + // Should render as "Tom & Jerry", not "Tom & Jerry". + await expect( paragraph.first() ).toContainText( + 'Tom & Jerry' + ); + } ); + } ); +} ); diff --git a/tests/utils/html-parser.test.js b/tests/utils/html-parser.test.js index c42c68f..26c982f 100644 --- a/tests/utils/html-parser.test.js +++ b/tests/utils/html-parser.test.js @@ -1,5 +1,5 @@ /** - * Tests for HTML Parser utilities. + * Tests for HTML parser utilities. * * @package press-this */ @@ -7,6 +7,7 @@ import { escapeHtml, escapeAttr, + htmlToBlocks, parseHtmlMetadata, buildSuggestedContent, buildSuggestedContentFromMetadata, @@ -84,6 +85,214 @@ describe( 'escapeAttr', () => { } ); } ); +describe( 'htmlToBlocks', () => { + test( 'returns empty string for empty input', () => { + expect( htmlToBlocks( '' ) ).toBe( '' ); + expect( htmlToBlocks( null ) ).toBe( '' ); + expect( htmlToBlocks( undefined ) ).toBe( '' ); + } ); + + test( 'converts a paragraph to a paragraph block', () => { + const result = htmlToBlocks( '

    Hello world

    ' ); + expect( result ).toContain( '' ); + expect( result ).toContain( '

    Hello world

    ' ); + expect( result ).toContain( '' ); + } ); + + test( 'converts plain text to a paragraph block', () => { + const result = htmlToBlocks( 'Plain text content' ); + expect( result ).toContain( '' ); + expect( result ).toContain( 'Plain text content' ); + } ); + + test( 'converts h1-h6 headings to heading blocks', () => { + [ 1, 2, 3, 4, 5, 6 ].forEach( ( level ) => { + const result = htmlToBlocks( `Heading ${ level }` ); + expect( result ).toContain( `` ); + expect( result ).toContain( `` ); + expect( result ).toContain( `Heading ${ level }` ); + expect( result ).toContain( '' ); + } ); + } ); + + test( 'converts unordered list to list block', () => { + const result = htmlToBlocks( '
    • Item 1
    • Item 2
    ' ); + expect( result ).toContain( '' ); + expect( result ).toContain( '
      ' ); + expect( result ).toContain( '' ); + expect( result ).toContain( '
    • Item 1
    • ' ); + expect( result ).toContain( '
    • Item 2
    • ' ); + expect( result ).toContain( '' ); + expect( result ).toContain( '' ); + // Should NOT use ordered list attribute. + expect( result ).not.toContain( '"ordered":true' ); + } ); + + test( 'converts ordered list to list block with ordered attribute', () => { + const result = htmlToBlocks( '
      1. First
      2. Second
      ' ); + expect( result ).toContain( '' ); + expect( result ).toContain( '
        ' ); + expect( result ).toContain( '' ); + expect( result ).toContain( '
      1. First
      2. ' ); + expect( result ).toContain( '' ); + } ); + + test( 'converts nested lists correctly', () => { + const html = '
        • Parent
          • Child item
        '; + const result = htmlToBlocks( html ); + // Outer list. + expect( result ).toContain( '' ); + // Nested list should also be wrapped. + const listCount = ( result.match( //g ) || [] ).length; + expect( listCount ).toBeGreaterThanOrEqual( 2 ); + expect( result ).toContain( 'Child item' ); + expect( result ).toContain( 'Parent' ); + } ); + + test( 'preserves inline formatting in list items', () => { + const html = '
        • Bold item
        • Italic item
        '; + const result = htmlToBlocks( html ); + expect( result ).toContain( 'Bold item' ); + expect( result ).toContain( 'Italic item' ); + } ); + + test( 'converts blockquote to quote block', () => { + const result = htmlToBlocks( '

        Quote text

        ' ); + expect( result ).toContain( '' ); + expect( result ).toContain( '
        ' ); + expect( result ).toContain( 'Quote text' ); + expect( result ).toContain( '' ); + } ); + + test( 'converts pre/code to code block', () => { + const result = htmlToBlocks( '
        const x = 1;
        ' ); + expect( result ).toContain( '' ); + expect( result ).toContain( '
        ' );
        +		expect( result ).toContain( 'const x = 1;' );
        +		expect( result ).toContain( '' );
        +	} );
        +
        +	test( 'handles mix of headings and lists (issue example)', () => {
        +		const html = `

        In Unreal Engine: How to Choose

        +
          +
        • Unreal Engine allows developers to switch: +
            +
          • Deferred Rendering: Default for most projects.
          • +
          • Forward Rendering: For VR projects.
          • +
          +
        • +
        +

        Each technique aligns with different requirements.

        `; + + const result = htmlToBlocks( html ); + + // Heading should be converted. + expect( result ).toContain( '' ); + expect( result ).toContain( 'In Unreal Engine: How to Choose' ); + + // List should be converted. + expect( result ).toContain( '' ); + expect( result ).toContain( 'Deferred Rendering' ); + expect( result ).toContain( 'Forward Rendering' ); + + // Paragraph should be converted. + expect( result ).toContain( '' ); + expect( result ).toContain( 'Each technique aligns' ); + } ); + + test( 'strips script elements for safety', () => { + const html = '

        Safe text

        More text

        '; + const result = htmlToBlocks( html ); + expect( result ).toContain( 'Safe text' ); + expect( result ).not.toContain( 'alert' ); + expect( result ).not.toContain( '">Link

        '; + const result = htmlToBlocks( html ); + expect( result ).toContain( 'Link' ); + expect( result ).not.toContain( 'data:' ); + } ); + + test( 'strips vbscript: href for safety', () => { + const html = '

        Link

        '; + const result = htmlToBlocks( html ); + expect( result ).toContain( 'Link' ); + expect( result ).not.toContain( 'vbscript:' ); + } ); + + test( 'unwraps non-allowlist elements but keeps their text', () => { + //
        ,