diff --git a/package-lock.json b/package-lock.json index a0378728b..bf7686c30 100644 --- a/package-lock.json +++ b/package-lock.json @@ -55157,7 +55157,7 @@ }, "packages/spacecat-shared-html-analyzer": { "name": "@adobe/spacecat-shared-html-analyzer", - "version": "1.0.1", + "version": "1.1.0", "license": "Apache-2.0", "dependencies": { "cheerio": "^1.0.0-rc.12" @@ -63985,7 +63985,7 @@ }, "packages/spacecat-shared-rum-api-client": { "name": "@adobe/spacecat-shared-rum-api-client", - "version": "2.38.6", + "version": "2.38.7", "license": "Apache-2.0", "dependencies": { "@adobe/fetch": "4.2.3", diff --git a/packages/spacecat-shared-html-analyzer/package.json b/packages/spacecat-shared-html-analyzer/package.json index 6836fc919..2f56f7ef6 100644 --- a/packages/spacecat-shared-html-analyzer/package.json +++ b/packages/spacecat-shared-html-analyzer/package.json @@ -1,6 +1,6 @@ { "name": "@adobe/spacecat-shared-html-analyzer", - "version": "1.0.1", + "version": "1.1.0", "description": "Analyze HTML content visibility for AI crawlers and citations - compare static HTML vs fully rendered content", "type": "module", "engines": { diff --git a/packages/spacecat-shared-html-analyzer/rollup.config.js b/packages/spacecat-shared-html-analyzer/rollup.config.js index 59596537e..1d7773557 100644 --- a/packages/spacecat-shared-html-analyzer/rollup.config.js +++ b/packages/spacecat-shared-html-analyzer/rollup.config.js @@ -12,6 +12,21 @@ import { nodeResolve } from '@rollup/plugin-node-resolve'; import terser from '@rollup/plugin-terser'; +import { readFileSync } from 'fs'; + +// Read package.json version +const pkg = JSON.parse(readFileSync('./package.json', 'utf-8')); + +// Simple plugin to inject package version +const injectVersion = () => ({ + name: 'inject-version', + transform(code, id) { + if (id.endsWith('browser-entry.js')) { + return code.replace('__PACKAGE_VERSION__', pkg.version); + } + return null; + }, +}); export default { input: 'src/browser-entry.js', // Special browser entry point @@ -35,6 +50,7 @@ export default { }, ], plugins: [ + injectVersion(), // Inject package version nodeResolve({ browser: true, // Use browser field in package.json preferBuiltins: false, // Don't include Node.js built-ins diff --git a/packages/spacecat-shared-html-analyzer/src/browser-entry.js b/packages/spacecat-shared-html-analyzer/src/browser-entry.js index f5d67abec..d9e637eb5 100644 --- a/packages/spacecat-shared-html-analyzer/src/browser-entry.js +++ b/packages/spacecat-shared-html-analyzer/src/browser-entry.js @@ -66,8 +66,8 @@ const HTMLAnalyzer = { formatNumberToK, isBrowser, - // Version info - version: '1.0.0', + // Version info (replaced during build from package.json) + version: '__PACKAGE_VERSION__', buildFor: 'chrome-extension', }; diff --git a/packages/spacecat-shared-html-analyzer/src/html-filter.js b/packages/spacecat-shared-html-analyzer/src/html-filter.js index 38b1740ac..2894d21c0 100644 --- a/packages/spacecat-shared-html-analyzer/src/html-filter.js +++ b/packages/spacecat-shared-html-analyzer/src/html-filter.js @@ -52,6 +52,35 @@ const COOKIE_KEYWORDS = new Set([ 'accept all', 'reject all', 'manage preferences', ]); +const COOKIE_BANNER_CLASS_SELECTORS = [ + '.cc-banner', '.cc-grower', '.consent-banner', '.cookie-banner', + '.privacy-banner', '.gdpr-banner', '.cookie-consent', '.privacy-consent', + '.cookie-notice', '.privacy-notice', '.cookie-policy', '.privacy-policy', + '.cookie-bar', '.privacy-bar', '.consent-bar', '.gdpr-bar', + '.cookie-popup', '.privacy-popup', '.consent-popup', '.gdpr-popup', + '.cookie-modal', '.privacy-modal', '.consent-modal', '.gdpr-modal', + '.cookie-overlay', '.privacy-overlay', '.consent-overlay', '.gdpr-overlay', +]; + +const COOKIE_BANNER_ID_SELECTORS = [ + '#cookie-banner', '#privacy-banner', '#consent-banner', '#gdpr-banner', + '#cookie-notice', '#privacy-notice', '#cookie-consent', '#privacy-consent', + '#cookie-bar', '#privacy-bar', '#consent-bar', '#gdpr-bar', '#cookiemgmt', + '#cookie-popup', '#privacy-popup', '#consent-popup', '#gdpr-popup', + '#onetrust-consent-sdk', '#onetrust-banner-sdk', +]; + +const COOKIE_BANNER_ARIA_SELECTORS = [ + '[role="dialog"][aria-label="Consent Banner"]', + '[role="dialog"][aria-label*="cookie" i]', + '[role="dialog"][aria-label*="privacy" i]', + '[role="dialog"][aria-label*="consent" i]', + '[role="alertdialog"][aria-label*="cookie" i]', + '[role="alertdialog"][aria-label*="privacy" i]', + '[aria-describedby*="cookie" i]', + '[aria-describedby*="privacy" i]', +]; + /** * Validates if an element is likely a cookie banner based on text content * Optimized: Set lookup + early exit for common keywords (3x faster) @@ -73,35 +102,12 @@ function isCookieBannerElement(element) { * Uses multiple strategies to identify genuine cookie consent banners */ function removeCookieBanners(element) { - const classBasedSelectors = [ - '.cc-banner', '.cc-grower', '.consent-banner', '.cookie-banner', - '.privacy-banner', '.gdpr-banner', '.cookie-consent', '.privacy-consent', - '.cookie-notice', '.privacy-notice', '.cookie-policy', '.privacy-policy', - '.cookie-bar', '.privacy-bar', '.consent-bar', '.gdpr-bar', - '.cookie-popup', '.privacy-popup', '.consent-popup', '.gdpr-popup', - '.cookie-modal', '.privacy-modal', '.consent-modal', '.gdpr-modal', - '.cookie-overlay', '.privacy-overlay', '.consent-overlay', '.gdpr-overlay', - ]; - - const idBasedSelectors = [ - '#cookie-banner', '#privacy-banner', '#consent-banner', '#gdpr-banner', - '#cookie-notice', '#privacy-notice', '#cookie-consent', '#privacy-consent', - '#cookie-bar', '#privacy-bar', '#consent-bar', '#gdpr-bar', - '#cookie-popup', '#privacy-popup', '#consent-popup', '#gdpr-popup', - ]; - - const ariaSelectors = [ - '[role="dialog"][aria-label*="cookie" i]', - '[role="dialog"][aria-label*="privacy" i]', - '[role="dialog"][aria-label*="consent" i]', - '[role="alertdialog"][aria-label*="cookie" i]', - '[role="alertdialog"][aria-label*="privacy" i]', - '[aria-describedby*="cookie" i]', - '[aria-describedby*="privacy" i]', - ]; - // Combine all selectors - const allSelectors = [...classBasedSelectors, ...idBasedSelectors, ...ariaSelectors]; + const allSelectors = [ + ...COOKIE_BANNER_CLASS_SELECTORS, + ...COOKIE_BANNER_ID_SELECTORS, + ...COOKIE_BANNER_ARIA_SELECTORS, + ]; // Apply class/ID/ARIA based detection with text validation allSelectors.forEach((selector) => { @@ -132,35 +138,12 @@ export function filterNavigationAndFooterBrowser(element) { * @param {CheerioAPI} $ - Cheerio instance */ function removeCookieBannersCheerio($) { - const classBasedSelectors = [ - '.cc-banner', '.cc-grower', '.consent-banner', '.cookie-banner', - '.privacy-banner', '.gdpr-banner', '.cookie-consent', '.privacy-consent', - '.cookie-notice', '.privacy-notice', '.cookie-policy', '.privacy-policy', - '.cookie-bar', '.privacy-bar', '.consent-bar', '.gdpr-bar', - '.cookie-popup', '.privacy-popup', '.consent-popup', '.gdpr-popup', - '.cookie-modal', '.privacy-modal', '.consent-modal', '.gdpr-modal', - '.cookie-overlay', '.privacy-overlay', '.consent-overlay', '.gdpr-overlay', - ]; - - const idBasedSelectors = [ - '#cookie-banner', '#privacy-banner', '#consent-banner', '#gdpr-banner', - '#cookie-notice', '#privacy-notice', '#cookie-consent', '#privacy-consent', - '#cookie-bar', '#privacy-bar', '#consent-bar', '#gdpr-bar', - '#cookie-popup', '#privacy-popup', '#consent-popup', '#gdpr-popup', - ]; - - const ariaSelectors = [ - '[role="dialog"][aria-label*="cookie" i]', - '[role="dialog"][aria-label*="privacy" i]', - '[role="dialog"][aria-label*="consent" i]', - '[role="alertdialog"][aria-label*="cookie" i]', - '[role="alertdialog"][aria-label*="privacy" i]', - '[aria-describedby*="cookie" i]', - '[aria-describedby*="privacy" i]', - ]; - // Combine all selectors for efficient removal - const allSelectors = [...classBasedSelectors, ...idBasedSelectors, ...ariaSelectors]; + const allSelectors = [ + ...COOKIE_BANNER_CLASS_SELECTORS, + ...COOKIE_BANNER_ID_SELECTORS, + ...COOKIE_BANNER_ARIA_SELECTORS, + ]; // Apply class/ID/ARIA based detection with text validation allSelectors.forEach((selector) => { @@ -204,28 +187,70 @@ function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText) { const parser = new DOMParser(); // eslint-disable-line no-undef const doc = parser.parseFromString(htmlContent, 'text/html'); - // Get the body element, if it doesn't exist, use the entire document - const bodyElement = doc.body || doc.documentElement; - - // Always remove script, style, noscript, template elements - bodyElement.querySelectorAll('script,style,noscript,template').forEach((n) => n.remove()); + // Process the entire document to capture JSON-LD in both head and body + const documentElement = doc.documentElement || doc; + + // Remove script elements except JSON-LD, also remove style, noscript, template + documentElement.querySelectorAll('script').forEach((n) => { + // Preserve JSON-LD structured data scripts by converting them to code blocks + if (n.type === 'application/ld+json') { + const jsonContent = n.textContent || n.innerText || ''; + if (jsonContent.trim()) { + try { + // Parse and re-stringify JSON to ensure consistent formatting + // Handle both single and double quoted JSON + const cleanJsonContent = jsonContent.trim(); + // Try to fix common JSON issues like single quotes + const startsValid = cleanJsonContent.startsWith('{') + || cleanJsonContent.startsWith('['); + const endsValid = cleanJsonContent.endsWith('}') + || cleanJsonContent.endsWith(']'); + + if (!startsValid || !endsValid) { + throw new Error('Not valid JSON structure'); + } + + const parsedJson = JSON.parse(cleanJsonContent); + const formattedJson = JSON.stringify(parsedJson, null, 2); + + // Create a pre/code block to preserve JSON-LD for markdown conversion + const codeBlock = document.createElement('pre'); // eslint-disable-line no-undef + const code = document.createElement('code'); // eslint-disable-line no-undef + code.className = 'ld-json'; + code.textContent = formattedJson; + codeBlock.appendChild(code); + n.parentNode.insertBefore(codeBlock, n); + } catch (e) { + // If JSON parsing fails, fall back to original content + const codeBlock = document.createElement('pre'); // eslint-disable-line no-undef + const code = document.createElement('code'); // eslint-disable-line no-undef + code.className = 'ld-json'; + code.textContent = jsonContent.trim(); + codeBlock.appendChild(code); + n.parentNode.insertBefore(codeBlock, n); + } + } + } + n.remove(); + }); + documentElement.querySelectorAll('style,noscript,template').forEach((n) => n.remove()); // Remove all media elements (images, videos, audio, etc.) to keep only text - bodyElement.querySelectorAll('img,video,audio,picture,svg,canvas,embed,object,iframe') - .forEach((n) => n.remove()); + const mediaSelector = 'img,video,audio,picture,svg,canvas,embed,object,iframe'; + documentElement.querySelectorAll(mediaSelector).forEach((n) => n.remove()); // Remove consent banners with intelligent detection - removeCookieBanners(bodyElement); + removeCookieBanners(documentElement); // Conditionally remove navigation and footer elements if (ignoreNavFooter) { - filterNavigationAndFooterBrowser(bodyElement); + filterNavigationAndFooterBrowser(documentElement); } if (returnText) { - return (bodyElement && bodyElement.textContent) ? bodyElement.textContent : ''; + return (documentElement && documentElement.textContent) ? documentElement.textContent : ''; } - return bodyElement.outerHTML; + return documentElement.outerHTML; } /** @@ -245,8 +270,41 @@ async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText) { const $ = cheerio.load(htmlContent); - // Always remove script, style, noscript, template tags - $('script, style, noscript, template').remove(); + // Remove script except JSON-LD structured data, also remove style, noscript, template + $('script').each(function processScript() { + // Preserve JSON-LD structured data scripts by converting them to code blocks + if ($(this).attr('type') === 'application/ld+json') { + const jsonContent = $(this).text().trim(); + if (jsonContent) { + try { + // Parse and re-stringify JSON to ensure consistent formatting + // Handle both single and double quoted JSON + const cleanJsonContent = jsonContent; + const startsValid = cleanJsonContent.startsWith('{') + || cleanJsonContent.startsWith('['); + const endsValid = cleanJsonContent.endsWith('}') + || cleanJsonContent.endsWith(']'); + + if (!startsValid || !endsValid) { + throw new Error('Not valid JSON structure'); + } + + const parsedJson = JSON.parse(cleanJsonContent); + const formattedJson = JSON.stringify(parsedJson, null, 2); + const codeBlock = `
${formattedJson}`;
+ $(this).before(codeBlock);
+ } catch (e) {
+ // If JSON parsing fails, fall back to original content
+ const codeBlock = `${jsonContent}`;
+ $(this).before(codeBlock);
+ }
+ }
+ $(this).remove();
+ } else {
+ $(this).remove();
+ }
+ });
+ $('style, noscript, template').remove();
// Remove all media elements (images, videos, audio, etc.) to keep only text
$('img, video, audio, picture, svg, canvas, embed, object, iframe').remove();