From ef71d306432af8e97802005bb9affa135e609dec Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 6 Mar 2026 23:06:44 +0000 Subject: [PATCH 1/6] Initial plan From 3450b7650f99af203448313fc849b3ad42a21bbe Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 6 Mar 2026 23:12:06 +0000 Subject: [PATCH 2/6] chore: initial plan for HTML selection preservation Co-authored-by: kraftbj <88897+kraftbj@users.noreply.github.com> --- package-lock.json | 4 ++++ 1 file changed, 4 insertions(+) diff --git a/package-lock.json b/package-lock.json index fcada17..df79f71 100644 --- a/package-lock.json +++ b/package-lock.json @@ -34,6 +34,10 @@ "grunt-checktextdomain": "^1.0.1", "grunt-wp-i18n": "^1.0.4", "terser": "^5.44.1" + }, + "engines": { + "node": ">=20.10.0", + "npm": ">=10.2.3" } }, "node_modules/@ampproject/remapping": { From 818d998488001b809f1cde846e56615a8f57c112 Mon Sep 17 00:00:00 2001 From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com> Date: Fri, 6 Mar 2026 23:21:39 +0000 Subject: [PATCH 3/6] fix: preserve HTML formatting when capturing bookmarklet text selection - Bookmarklet now captures selected HTML using getRangeAt()/cloneContents() and sends it as `sel_html` alongside the plain-text `s` field - Added htmlToBlocks() utility that converts HTML to Gutenberg block markup (handles headings h1-h6, paragraphs, ul/ol lists with nesting, blockquotes, and pre/code blocks; strips scripts/event handlers for safety) - buildSuggestedContent() now renders HTML selections as formatted blocks instead of plain-text in a quote block - App.js postMessage handler passes sel_html through to content builder - Added 20 new unit tests for htmlToBlocks and updated buildSuggestedContent - Added bookmarklet test for HTML selection capture - Rebuilt minified bookmarklet Co-authored-by: kraftbj <88897+kraftbj@users.noreply.github.com> --- assets/bookmarklet.js | 18 +- assets/bookmarklet.min.js | 2 +- src/App.js | 18 +- src/components/BlockTransformShortcuts.js | 6 +- src/utils/html-parser.js | 240 +++++++++++++++++++++- src/utils/index.js | 1 + tests/bookmarklet/bookmarklet.test.js | 24 +++ tests/utils/html-parser.test.js | 224 ++++++++++++++++++++ 8 files changed, 520 insertions(+), 13 deletions(-) create mode 100644 tests/utils/html-parser.test.js diff --git a/assets/bookmarklet.js b/assets/bookmarklet.js index 234cda1..3380029 100644 --- a/assets/bookmarklet.js +++ b/assets/bookmarklet.js @@ -15,7 +15,7 @@ encURI = window.encodeURIComponent, head = document.getElementsByTagName( 'head' )[0], target = '_press_this_app', - windowWidth, windowHeight, selection, + windowWidth, windowHeight, selection, selectionHtml, metas, links, content, images, iframes, img, scripts, scrapedData = {}, popup; @@ -32,7 +32,16 @@ } if ( window.getSelection ) { - selection = window.getSelection() + ''; + var sel = window.getSelection(); + if ( sel && sel.rangeCount > 0 ) { + selection = sel.toString(); + // Capture HTML to preserve formatting (bold, lists, headings, etc.). + var range = sel.getRangeAt( 0 ); + var fragment = range.cloneContents(); + var tempDiv = document.createElement( 'div' ); + tempDiv.appendChild( fragment ); + selectionHtml = tempDiv.innerHTML; + } } else if ( document.getSelection ) { selection = document.getSelection() + ''; } else if ( document.selection ) { @@ -299,6 +308,11 @@ add( 's', selection ); } + // Add HTML selection to preserve formatting (bold, lists, headings, etc.). + if ( selectionHtml && selectionHtml !== selection ) { + add( 'sel_html', selectionHtml ); + } + /** * Send scraped data to the Press This popup via postMessage. * Uses polling to wait for the popup to be ready. diff --git a/assets/bookmarklet.min.js b/assets/bookmarklet.min.js index 00bb55c..c43d75b 100644 --- a/assets/bookmarklet.min.js +++ b/assets/bookmarklet.min.js @@ -1 +1 @@ -!function(e,t,i,a){var n,o,r,l,c,s,g,m,d,f,h,u=e.encodeURIComponent,p=t.getElementsByTagName("head")[0],y={};if(a)if(i.match(/^https?:/)){a+="&u="+u(i),e.getSelection?r=e.getSelection()+"":t.getSelection?r=t.getSelection()+"":t.selection&&(r=t.selection.createRange().text||""),a+="&buster="+(new Date).getTime(),a+="&pm=1",n=(n=e.outerWidth||t.documentElement.clientWidth||600)<800||n>5e3?600:.7*n,o=(o=e.outerHeight||t.documentElement.clientHeight||700)<800||o>3e3?700:.9*o,T("pt_version",11),l=p.getElementsByTagName("meta")||[];for(var v=0;v200);v++){var b=l[v],O=b.getAttribute("name"),_=b.getAttribute("property"),x=b.getAttribute("content");x&&(O?T("_meta["+O+"]",x):_&&(T("_meta["+_+"]",x),"og:video"!==_&&"og:video:url"!==_&&"og:video:secure_url"!==_||T("_og_video[]",x)))}c=p.getElementsByTagName("link")||[];for(var E=0;E=50);E++){var A=c[E],w=A.getAttribute("rel");"canonical"!==w&&"icon"!==w&&"shortlink"!==w||T("_links["+w+"]",A.getAttribute("href")),"alternate"===w&&"x-default"===A.getAttribute("hreflang")&&T("_links[alternate_canonical]",A.getAttribute("href"))}!function(){f=t.querySelectorAll('script[type="application/ld+json"]');for(var e=0;e=100);N++)(d=g[N]).src.indexOf("avatar")>-1||d.className.indexOf("avatar")>-1||d.width&&d.width<256||d.height&&d.height<128||d.src&&0!==d.src.indexOf("data:")&&T("_images[]",d.src);m=t.body.getElementsByTagName("iframe")||[];for(var j=0;j=50);j++){var B=m[j].src;B&&"about:blank"!==B&&(B.indexOf("jetpack-comment")>-1||B.indexOf("disqus.com")>-1||B.indexOf("facebook.com/plugins")>-1||B.indexOf("platform.twitter.com/widgets")>-1||B.indexOf("google.com/recaptcha")>-1||B.indexOf("googletagmanager.com")>-1||B.indexOf("doubleclick.net")>-1||B.indexOf("googlesyndication.com")>-1||B.indexOf("amazon-adsystem.com")>-1||B.indexOf("quantserve.com")>-1||B.indexOf("scorecardresearch.com")>-1||B.indexOf("addthis.com")>-1||B.indexOf("sharethis.com")>-1||B.indexOf("addtoany.com")>-1||T("_embeds[]",B))}var k,P;t.title&&T("t",t.title),r&&T("s",r),h=e.open(a,"_press_this_app","location,resizable,scrollbars,width="+n+",height="+o),k=0,P=a.match(/^https?:\/\/[^\/]+/)[0],setTimeout(function e(){if(k++,h&&!h.closed){try{h.postMessage({type:"press-this-data",version:11,data:y},P)}catch(e){}k<50&&setTimeout(e,100)}},200)}else top.location.href=a;function T(e,t){if(null!=t&&""!==t){var i=e.match(/^(.+)\[\]$/);if(i){var a=i[1];return y[a]||(y[a]=[]),void y[a].push(t)}var n=e.match(/^(.+)\[(.+)\]$/);if(n){var o=n[1],r=n[2];return y[o]||(y[o]={}),void(y[o][r]=t)}y[e]=t}}function S(e){if(e&&"object"==typeof e){var t=e["@type"];if("VideoObject"===t&&(e.embedUrl&&T("_embeds[]",e.embedUrl),e.contentUrl&&!e.embedUrl&&T("_embeds[]",e.contentUrl)),"Article"!==t&&"WebPage"!==t&&"NewsArticle"!==t&&"BlogPosting"!==t||(e.mainEntityOfPage&&"string"==typeof e.mainEntityOfPage?T("_jsonld[canonical]",e.mainEntityOfPage):e.mainEntityOfPage&&e.mainEntityOfPage["@id"]&&T("_jsonld[canonical]",e.mainEntityOfPage["@id"]),e.headline&&T("_jsonld[headline]",e.headline),e.description&&T("_jsonld[description]",e.description)),e.image){var i="";"string"==typeof e.image?i=e.image:e.image.url?i=e.image.url:Array.isArray(e.image)&&e.image[0]&&(i="string"==typeof e.image[0]?e.image[0]:e.image[0].url),i&&T("_jsonld[image]",i)}}}}(window,document,top.location.href,window.pt_url); \ No newline at end of file +!function(e,t,i,a){var n,o,r,l,c,s,g,d,m,f,h,p,u=e.encodeURIComponent,y=t.getElementsByTagName("head")[0],v={};if(a)if(i.match(/^https?:/)){if(a+="&u="+u(i),e.getSelection){var b=e.getSelection();if(b&&b.rangeCount>0){r=b.toString();var O=b.getRangeAt(0).cloneContents(),_=t.createElement("div");_.appendChild(O),l=_.innerHTML}}else t.getSelection?r=t.getSelection()+"":t.selection&&(r=t.selection.createRange().text||"");a+="&buster="+(new Date).getTime(),a+="&pm=1",n=(n=e.outerWidth||t.documentElement.clientWidth||600)<800||n>5e3?600:.7*n,o=(o=e.outerHeight||t.documentElement.clientHeight||700)<800||o>3e3?700:.9*o,q("pt_version",11),c=y.getElementsByTagName("meta")||[];for(var x=0;x200);x++){var E=c[x],A=E.getAttribute("name"),w=E.getAttribute("property"),N=E.getAttribute("content");N&&(A?q("_meta["+A+"]",N):w&&(q("_meta["+w+"]",N),"og:video"!==w&&"og:video:url"!==w&&"og:video:secure_url"!==w||q("_og_video[]",N)))}s=y.getElementsByTagName("link")||[];for(var j=0;j=50);j++){var B=s[j],T=B.getAttribute("rel");"canonical"!==T&&"icon"!==T&&"shortlink"!==T||q("_links["+T+"]",B.getAttribute("href")),"alternate"===T&&"x-default"===B.getAttribute("hreflang")&&q("_links[alternate_canonical]",B.getAttribute("href"))}!function(){h=t.querySelectorAll('script[type="application/ld+json"]');for(var e=0;e=100);k++)(f=d[k]).src.indexOf("avatar")>-1||f.className.indexOf("avatar")>-1||f.width&&f.width<256||f.height&&f.height<128||f.src&&0!==f.src.indexOf("data:")&&q("_images[]",f.src);m=t.body.getElementsByTagName("iframe")||[];for(var P=0;P=50);P++){var C=m[P].src;C&&"about:blank"!==C&&(C.indexOf("jetpack-comment")>-1||C.indexOf("disqus.com")>-1||C.indexOf("facebook.com/plugins")>-1||C.indexOf("platform.twitter.com/widgets")>-1||C.indexOf("google.com/recaptcha")>-1||C.indexOf("googletagmanager.com")>-1||C.indexOf("doubleclick.net")>-1||C.indexOf("googlesyndication.com")>-1||C.indexOf("amazon-adsystem.com")>-1||C.indexOf("quantserve.com")>-1||C.indexOf("scorecardresearch.com")>-1||C.indexOf("addthis.com")>-1||C.indexOf("sharethis.com")>-1||C.indexOf("addtoany.com")>-1||q("_embeds[]",C))}var S,U;t.title&&q("t",t.title),r&&q("s",r),l&&l!==r&&q("sel_html",l),p=e.open(a,"_press_this_app","location,resizable,scrollbars,width="+n+",height="+o),S=0,U=a.match(/^https?:\/\/[^\/]+/)[0],setTimeout(function e(){if(S++,p&&!p.closed){try{p.postMessage({type:"press-this-data",version:11,data:v},U)}catch(e){}S<50&&setTimeout(e,100)}},200)}else top.location.href=a;function q(e,t){if(null!=t&&""!==t){var i=e.match(/^(.+)\[\]$/);if(i){var a=i[1];return v[a]||(v[a]=[]),void v[a].push(t)}var n=e.match(/^(.+)\[(.+)\]$/);if(n){var o=n[1],r=n[2];return v[o]||(v[o]={}),void(v[o][r]=t)}v[e]=t}}function H(e){if(e&&"object"==typeof e){var t=e["@type"];if("VideoObject"===t&&(e.embedUrl&&q("_embeds[]",e.embedUrl),e.contentUrl&&!e.embedUrl&&q("_embeds[]",e.contentUrl)),"Article"!==t&&"WebPage"!==t&&"NewsArticle"!==t&&"BlogPosting"!==t||(e.mainEntityOfPage&&"string"==typeof e.mainEntityOfPage?q("_jsonld[canonical]",e.mainEntityOfPage):e.mainEntityOfPage&&e.mainEntityOfPage["@id"]&&q("_jsonld[canonical]",e.mainEntityOfPage["@id"]),e.headline&&q("_jsonld[headline]",e.headline),e.description&&q("_jsonld[description]",e.description)),e.image){var i="";"string"==typeof e.image?i=e.image:e.image.url?i=e.image.url:Array.isArray(e.image)&&e.image[0]&&(i="string"==typeof e.image[0]?e.image[0]:e.image[0].url),i&&q("_jsonld[image]",i)}}}}(window,document,top.location.href,window.pt_url); \ No newline at end of file diff --git a/src/App.js b/src/App.js index 32ba56f..9b9bb6a 100644 --- a/src/App.js +++ b/src/App.js @@ -221,12 +221,17 @@ export default function App() { // Build suggested content from bookmarklet metadata. // Extract description from meta tags. const meta = messageData._meta || {}; - const description = - messageData.s || // User selection takes priority. - meta[ 'twitter:description' ] || - meta[ 'og:description' ] || - meta.description || - ''; + + // HTML selection takes highest priority (preserves formatting). + // Plain-text selection is a fallback, then meta descriptions. + const selectionHtml = messageData.sel_html || ''; + const description = selectionHtml + ? '' // HTML selection is used directly; skip plain-text fallback. + : messageData.s || // Plain-text user selection. + meta[ 'twitter:description' ] || + meta[ 'og:description' ] || + meta.description || + ''; const title = messageData.t || @@ -243,6 +248,7 @@ export default function App() { const suggestedContent = buildSuggestedContentFromMetadata( { title, description, + selectionHtml, siteName: meta[ 'og:site_name' ] || '', canonical, url: receivedSourceUrl, diff --git a/src/components/BlockTransformShortcuts.js b/src/components/BlockTransformShortcuts.js index a61a39b..3f3cf20 100644 --- a/src/components/BlockTransformShortcuts.js +++ b/src/components/BlockTransformShortcuts.js @@ -137,7 +137,11 @@ export default function BlockTransformShortcuts() { if ( innerBlocks.length > 0 ) { // Replace the quote with its inner blocks directly. const replacementBlocks = innerBlocks.map( ( inner ) => - createBlock( inner.name, { ...inner.attributes }, inner.innerBlocks ) + createBlock( + inner.name, + { ...inner.attributes }, + inner.innerBlocks + ) ); replaceBlocks( currentClientId, replacementBlocks ); } else { diff --git a/src/utils/html-parser.js b/src/utils/html-parser.js index 2b1f82d..2f3033a 100644 --- a/src/utils/html-parser.js +++ b/src/utils/html-parser.js @@ -447,6 +447,234 @@ function getCanonical( doc, meta ) { return linkCanonical?.getAttribute( 'href' ) || meta[ 'og:url' ] || ''; } +/** + * Sanitize inline HTML, keeping only safe formatting elements. + * + * Strips script/style elements and event handler attributes. + * Preserves safe inline elements: strong, em, b, i, u, s, a (href only), + * code, mark, sub, sup, span, br. + * + * @param {Element} element DOM element to sanitize (in-place). + */ +function sanitizeInlineContent( element ) { + // Remove script and style elements. + const dangerous = element.querySelectorAll( + 'script, style, object, embed, iframe' + ); + dangerous.forEach( ( el ) => el.remove() ); + + // Strip event handlers and javascript: hrefs from all elements. + const allEls = element.querySelectorAll( '*' ); + allEls.forEach( ( el ) => { + // Remove all event handler attributes. + Array.from( el.attributes ).forEach( ( attr ) => { + if ( attr.name.startsWith( 'on' ) ) { + el.removeAttribute( attr.name ); + } + } ); + + // Strip javascript: URLs from href and src. + const href = el.getAttribute( 'href' ); + if ( href && /^\s*javascript:/i.test( href ) ) { + el.removeAttribute( 'href' ); + } + const src = el.getAttribute( 'src' ); + if ( src && /^\s*javascript:/i.test( src ) ) { + el.removeAttribute( 'src' ); + } + } ); +} + +/** + * Convert a list element (ul/ol) to Gutenberg list block markup. + * + * @param {Element} listEl The list element (ul or ol). + * @param {boolean} ordered Whether this is an ordered list. + * @return {string} Gutenberg list block markup. + */ +function listElementToBlock( listEl, ordered ) { + const tag = ordered ? 'ol' : 'ul'; + const attr = ordered ? ' {"ordered":true}' : ''; + let items = ''; + + listEl.childNodes.forEach( ( child ) => { + if ( + child.nodeType !== 1 /* ELEMENT_NODE */ || + child.tagName.toLowerCase() !== 'li' + ) { + return; + } + + // Clone to work with it non-destructively. + const li = child.cloneNode( true ); + + // Handle only direct-child nested lists within the li to avoid double-processing. + let nestedBlocks = ''; + Array.from( li.children ).forEach( ( childEl ) => { + const childTag = childEl.tagName.toLowerCase(); + if ( childTag === 'ul' || childTag === 'ol' ) { + const isOrdered = childTag === 'ol'; + nestedBlocks += '\n' + listElementToBlock( childEl, isOrdered ); + childEl.remove(); + } + } ); + + sanitizeInlineContent( li ); + const liContent = li.innerHTML.trim(); + + items += `\n
  • ${ liContent }${ nestedBlocks }
  • \n\n`; + } ); + + return `\n<${ tag } class="wp-block-list">\n${ items }\n\n\n`; +} + +/** + * Convert an HTML string to Gutenberg block markup. + * + * Handles block-level elements: paragraphs, headings (h1-h6), unordered and + * ordered lists with nesting, blockquotes, and preformatted/code blocks. + * Inline elements (strong, em, a, code, etc.) are preserved within blocks. + * Script/style elements and event handlers are stripped for safety. + * + * Falls back to a paragraph block for unrecognised or purely inline content. + * + * @param {string} html HTML string to convert. + * @return {string} Gutenberg block markup string, or empty string if no content. + */ +export function htmlToBlocks( html ) { + if ( ! html || typeof html !== 'string' ) { + return ''; + } + + const parser = new DOMParser(); + const doc = parser.parseFromString( `${ html }`, 'text/html' ); + const body = doc.body; + + let blocks = ''; + let inlineBuffer = ''; + + /** + * Flush any accumulated inline/text content as a paragraph block. + */ + function flushInlineBuffer() { + const trimmed = inlineBuffer.trim(); + if ( trimmed ) { + blocks += `\n

    ${ trimmed }

    \n\n\n`; + } + inlineBuffer = ''; + } + + /** + * Set of block-level tag names that start a new block. + */ + const BLOCK_TAGS = new Set( [ + 'p', + 'ul', + 'ol', + 'h1', + 'h2', + 'h3', + 'h4', + 'h5', + 'h6', + 'blockquote', + 'pre', + 'figure', + 'div', + ] ); + + body.childNodes.forEach( ( node ) => { + if ( node.nodeType === 3 /* TEXT_NODE */ ) { + const text = node.textContent; + // Accumulate non-empty text into the inline buffer. + if ( text.trim() ) { + inlineBuffer += escapeHtml( text ); + } + return; + } + + if ( node.nodeType !== 1 /* ELEMENT_NODE */ ) { + return; + } + + const tag = node.tagName.toLowerCase(); + + // Skip dangerous elements. + if ( tag === 'script' || tag === 'style' ) { + return; + } + + if ( ! BLOCK_TAGS.has( tag ) ) { + // Inline element – add to buffer. + const clone = node.cloneNode( true ); + const tempEl = doc.createElement( 'div' ); + tempEl.appendChild( clone ); + sanitizeInlineContent( tempEl ); + inlineBuffer += tempEl.innerHTML; + return; + } + + // We're about to emit a block – flush any pending inline content first. + flushInlineBuffer(); + + // Headings. + if ( /^h[1-6]$/.test( tag ) ) { + const level = tag[ 1 ]; + const clone = node.cloneNode( true ); + const tempEl = doc.createElement( 'div' ); + tempEl.appendChild( clone ); + sanitizeInlineContent( tempEl ); + blocks += `\n<${ tag } class="wp-block-heading">${ tempEl.firstChild.innerHTML }\n\n\n`; + return; + } + + // Lists. + if ( tag === 'ul' || tag === 'ol' ) { + blocks += listElementToBlock( node, tag === 'ol' ); + return; + } + + // Blockquotes. + if ( tag === 'blockquote' ) { + const clone = node.cloneNode( true ); + const tempEl = doc.createElement( 'div' ); + tempEl.appendChild( clone ); + sanitizeInlineContent( tempEl ); + const inner = htmlToBlocks( tempEl.firstChild.innerHTML ); + const innerBlocks = + inner || + `\n

    ${ escapeHtml( + node.textContent.trim() + ) }

    \n\n`; + blocks += `\n
    ${ innerBlocks }
    \n\n\n`; + return; + } + + // Preformatted / code blocks. + if ( tag === 'pre' ) { + const codeEl = node.querySelector( 'code' ); + const codeContent = escapeHtml( ( codeEl || node ).textContent ); + blocks += `\n
    ${ codeContent }
    \n\n\n`; + return; + } + + // Paragraphs and generic block elements (div, figure, etc.). + const clone = node.cloneNode( true ); + const tempEl = doc.createElement( 'div' ); + tempEl.appendChild( clone ); + sanitizeInlineContent( tempEl ); + const innerHtml = tempEl.firstChild.innerHTML.trim(); + if ( innerHtml ) { + blocks += `\n

    ${ innerHtml }

    \n\n\n`; + } + } ); + + // Flush any remaining inline content. + flushInlineBuffer(); + + return blocks.trim(); +} + /** * Build suggested content from server-returned metadata. * @@ -478,9 +706,15 @@ ${ escapeHtml( sourceUrl ) } `; } - // Add quote block with description if available. - // Escape description. - if ( data.description ) { + // Add HTML selection as formatted blocks when available. + // Falls back to a plain-text quote block for meta descriptions. + if ( data.selectionHtml ) { + const selectionBlocks = htmlToBlocks( data.selectionHtml ); + if ( selectionBlocks ) { + content += selectionBlocks + '\n\n'; + } + } else if ( data.description ) { + // Escape description. content += `

    ${ escapeHtml( data.description ) }

    diff --git a/src/utils/index.js b/src/utils/index.js index 4843855..e2650ac 100644 --- a/src/utils/index.js +++ b/src/utils/index.js @@ -8,6 +8,7 @@ export { parseHtmlMetadata, buildSuggestedContent, buildSuggestedContentFromMetadata, + htmlToBlocks, escapeHtml, escapeAttr, } from './html-parser'; diff --git a/tests/bookmarklet/bookmarklet.test.js b/tests/bookmarklet/bookmarklet.test.js index 2e0263e..c1353eb 100644 --- a/tests/bookmarklet/bookmarklet.test.js +++ b/tests/bookmarklet/bookmarklet.test.js @@ -207,6 +207,30 @@ describe( 'Bookmarklet Functionality', () => { ).toBe( true ); } ); + test( 'HTML selection capture preserves formatting', () => { + // Check for getRangeAt usage to capture the selection range. + expect( bookmarkletSource ).toContain( 'getRangeAt' ); + + // Check for cloneContents to extract selected DOM fragment. + expect( bookmarkletSource ).toContain( 'cloneContents' ); + + // Check for innerHTML to serialise the selection as HTML. + expect( bookmarkletSource ).toContain( 'innerHTML' ); + + // Check that sel_html is sent alongside plain-text selection. + expect( + bookmarkletSource.includes( "'sel_html'" ) || + bookmarkletSource.includes( '"sel_html"' ) || + bookmarkletSource.includes( 'sel_html' ) + ).toBe( true ); + + // sel_html is added via the add() helper. + expect( + bookmarkletSource.includes( "add( 'sel_html'" ) || + bookmarkletSource.includes( "add('sel_html'" ) + ).toBe( true ); + } ); + test( 'Enhanced data extraction - Open Graph video', () => { // Check for og:video detection. expect( bookmarkletSource ).toContain( 'og:video' ); diff --git a/tests/utils/html-parser.test.js b/tests/utils/html-parser.test.js new file mode 100644 index 0000000..25434ee --- /dev/null +++ b/tests/utils/html-parser.test.js @@ -0,0 +1,224 @@ +/** + * Tests for HTML parser utilities, especially htmlToBlocks. + * + * @package press-this + */ + +import { htmlToBlocks, buildSuggestedContent } from '../../src/utils/html-parser'; + +describe( 'htmlToBlocks', () => { + test( 'returns empty string for empty input', () => { + expect( htmlToBlocks( '' ) ).toBe( '' ); + expect( htmlToBlocks( null ) ).toBe( '' ); + expect( htmlToBlocks( undefined ) ).toBe( '' ); + } ); + + test( 'converts a paragraph to a paragraph block', () => { + const result = htmlToBlocks( '

    Hello world

    ' ); + expect( result ).toContain( '' ); + expect( result ).toContain( '

    Hello world

    ' ); + expect( result ).toContain( '' ); + } ); + + test( 'converts plain text to a paragraph block', () => { + const result = htmlToBlocks( 'Plain text content' ); + expect( result ).toContain( '' ); + expect( result ).toContain( 'Plain text content' ); + } ); + + test( 'converts h1-h6 headings to heading blocks', () => { + [ 1, 2, 3, 4, 5, 6 ].forEach( ( level ) => { + const result = htmlToBlocks( `Heading ${ level }` ); + expect( result ).toContain( `` ); + expect( result ).toContain( `` ); + expect( result ).toContain( `Heading ${ level }` ); + expect( result ).toContain( '' ); + } ); + } ); + + test( 'converts unordered list to list block', () => { + const result = htmlToBlocks( '
    • Item 1
    • Item 2
    ' ); + expect( result ).toContain( '' ); + expect( result ).toContain( '
      ' ); + expect( result ).toContain( '' ); + expect( result ).toContain( '
    • Item 1
    • ' ); + expect( result ).toContain( '
    • Item 2
    • ' ); + expect( result ).toContain( '' ); + expect( result ).toContain( '' ); + // Should NOT use ordered list attribute. + expect( result ).not.toContain( '"ordered":true' ); + } ); + + test( 'converts ordered list to list block with ordered attribute', () => { + const result = htmlToBlocks( '
      1. First
      2. Second
      ' ); + expect( result ).toContain( '' ); + expect( result ).toContain( '
        ' ); + expect( result ).toContain( '' ); + expect( result ).toContain( '
      1. First
      2. ' ); + expect( result ).toContain( '' ); + } ); + + test( 'converts nested lists correctly', () => { + const html = '
        • Parent
          • Child item
        '; + const result = htmlToBlocks( html ); + // Outer list. + expect( result ).toContain( '' ); + // Nested list should also be wrapped. + const listCount = ( result.match( //g ) || [] ).length; + expect( listCount ).toBeGreaterThanOrEqual( 2 ); + expect( result ).toContain( 'Child item' ); + expect( result ).toContain( 'Parent' ); + } ); + + test( 'preserves inline formatting in list items', () => { + const html = '
        • Bold item
        • Italic item
        '; + const result = htmlToBlocks( html ); + expect( result ).toContain( 'Bold item' ); + expect( result ).toContain( 'Italic item' ); + } ); + + test( 'converts blockquote to quote block', () => { + const result = htmlToBlocks( '

        Quote text

        ' ); + expect( result ).toContain( '' ); + expect( result ).toContain( '
        ' ); + expect( result ).toContain( 'Quote text' ); + expect( result ).toContain( '' ); + } ); + + test( 'converts pre/code to code block', () => { + const result = htmlToBlocks( '
        const x = 1;
        ' ); + expect( result ).toContain( '' ); + expect( result ).toContain( '
        ' );
        +		expect( result ).toContain( 'const x = 1;' );
        +		expect( result ).toContain( '' );
        +	} );
        +
        +	test( 'handles mix of headings and lists (issue example)', () => {
        +		const html = `

        In Unreal Engine: How to Choose

        +
          +
        • Unreal Engine allows developers to switch: +
            +
          • Deferred Rendering: Default for most projects.
          • +
          • Forward Rendering: For VR projects.
          • +
          +
        • +
        +

        Each technique aligns with different requirements.

        `; + + const result = htmlToBlocks( html ); + + // Heading should be converted. + expect( result ).toContain( '' ); + expect( result ).toContain( 'In Unreal Engine: How to Choose' ); + + // List should be converted. + expect( result ).toContain( '' ); + expect( result ).toContain( 'Deferred Rendering' ); + expect( result ).toContain( 'Forward Rendering' ); + + // Paragraph should be converted. + expect( result ).toContain( '' ); + expect( result ).toContain( 'Each technique aligns' ); + } ); + + test( 'strips script elements for safety', () => { + const html = '

        Safe text

        More text

        '; + const result = htmlToBlocks( html ); + expect( result ).toContain( 'Safe text' ); + expect( result ).not.toContain( 'alert' ); + expect( result ).not.toContain( '">Link

        '; + const result = htmlToBlocks( html ); + expect( result ).toContain( 'Link' ); + expect( result ).not.toContain( 'data:' ); + } ); + + test( 'strips vbscript: href for safety', () => { + const html = '

        Link

        '; + const result = htmlToBlocks( html ); + expect( result ).toContain( 'Link' ); + expect( result ).not.toContain( 'vbscript:' ); + } ); + + test( 'unwraps non-allowlist elements but keeps their text', () => { + //
        ,