Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 2 additions & 2 deletions package-lock.json

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

2 changes: 1 addition & 1 deletion packages/spacecat-shared-html-analyzer/package.json
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
{
"name": "@adobe/spacecat-shared-html-analyzer",
"version": "1.0.1",
"version": "1.1.0",
"description": "Analyze HTML content visibility for AI crawlers and citations - compare static HTML vs fully rendered content",
"type": "module",
"engines": {
Expand Down
16 changes: 16 additions & 0 deletions packages/spacecat-shared-html-analyzer/rollup.config.js
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,21 @@

import { nodeResolve } from '@rollup/plugin-node-resolve';
import terser from '@rollup/plugin-terser';
import { readFileSync } from 'fs';

// Load this package's manifest once, when the Rollup config is evaluated, so that
// `pkg.version` can be injected into the bundle by the inject-version plugin.
// NOTE: the relative path is resolved against the current working directory, so the
// build is expected to be started from the package root.
const pkg = JSON.parse(readFileSync('./package.json', 'utf-8'));

/**
 * Rollup plugin that stamps the package version into the browser entry point.
 *
 * Every occurrence of the `__PACKAGE_VERSION__` placeholder in a module whose id
 * ends with `browser-entry.js` is replaced with `pkg.version`. All other modules
 * are left untouched.
 *
 * @returns {{name: string, transform: function(string, string): (string|null)}}
 *   A Rollup plugin object exposing a `transform` hook.
 */
const injectVersion = () => ({
  name: 'inject-version',
  transform(code, id) {
    if (!id.endsWith('browser-entry.js')) {
      // Returning null tells Rollup this plugin did not change the module.
      return null;
    }
    // Replace all occurrences (String#replace with a string pattern only handles the
    // first one). A function replacer keeps any `$` in the version string literal
    // instead of letting it be interpreted as a replacement pattern.
    return code.replaceAll('__PACKAGE_VERSION__', () => pkg.version);
  },
});

export default {
input: 'src/browser-entry.js', // Special browser entry point
Expand All @@ -35,6 +50,7 @@ export default {
},
],
plugins: [
injectVersion(), // Inject package version
nodeResolve({
browser: true, // Use browser field in package.json
preferBuiltins: false, // Don't include Node.js built-ins
Expand Down
4 changes: 2 additions & 2 deletions packages/spacecat-shared-html-analyzer/src/browser-entry.js
Original file line number Diff line number Diff line change
Expand Up @@ -66,8 +66,8 @@ const HTMLAnalyzer = {
formatNumberToK,
isBrowser,

// Version info
version: '1.0.0',
// Version info (replaced during build from package.json)
version: '__PACKAGE_VERSION__',
buildFor: 'chrome-extension',
};

Expand Down
196 changes: 127 additions & 69 deletions packages/spacecat-shared-html-analyzer/src/html-filter.js
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,35 @@ const COOKIE_KEYWORDS = new Set([
'accept all', 'reject all', 'manage preferences',
]);

// Shared CSS selector lists used to locate candidate cookie/consent banners.
// They are defined once at module level so that both removeCookieBanners (browser DOM)
// and removeCookieBannersCheerio (Node) spread the same lists into their selector set.

// Class-based selectors: common class names for cookie, privacy, consent and GDPR
// banners, bars, popups, modals and overlays.
const COOKIE_BANNER_CLASS_SELECTORS = [
'.cc-banner', '.cc-grower', '.consent-banner', '.cookie-banner',
'.privacy-banner', '.gdpr-banner', '.cookie-consent', '.privacy-consent',
'.cookie-notice', '.privacy-notice', '.cookie-policy', '.privacy-policy',
'.cookie-bar', '.privacy-bar', '.consent-bar', '.gdpr-bar',
'.cookie-popup', '.privacy-popup', '.consent-popup', '.gdpr-popup',
'.cookie-modal', '.privacy-modal', '.consent-modal', '.gdpr-modal',
'.cookie-overlay', '.privacy-overlay', '.consent-overlay', '.gdpr-overlay',
];

// ID-based selectors: common element ids for the same kinds of banners, plus
// '#cookiemgmt' and the '#onetrust-*' ids (presumably the OneTrust consent SDK containers).
const COOKIE_BANNER_ID_SELECTORS = [
'#cookie-banner', '#privacy-banner', '#consent-banner', '#gdpr-banner',
'#cookie-notice', '#privacy-notice', '#cookie-consent', '#privacy-consent',
'#cookie-bar', '#privacy-bar', '#consent-bar', '#gdpr-bar', '#cookiemgmt',
'#cookie-popup', '#privacy-popup', '#consent-popup', '#gdpr-popup',
'#onetrust-consent-sdk', '#onetrust-banner-sdk',
];

// ARIA-based selectors: dialogs/alert dialogs whose aria-label, or any element whose
// aria-describedby, mentions cookie/privacy/consent. The trailing `i` makes the
// attribute substring match case-insensitive.
// NOTE(review): the exact-match "Consent Banner" entry looks like it is already covered
// by the case-insensitive `*="consent" i` entry below — confirm whether it is still needed.
const COOKIE_BANNER_ARIA_SELECTORS = [
'[role="dialog"][aria-label="Consent Banner"]',
'[role="dialog"][aria-label*="cookie" i]',
'[role="dialog"][aria-label*="privacy" i]',
'[role="dialog"][aria-label*="consent" i]',
'[role="alertdialog"][aria-label*="cookie" i]',
'[role="alertdialog"][aria-label*="privacy" i]',
'[aria-describedby*="cookie" i]',
'[aria-describedby*="privacy" i]',
];

/**
* Validates if an element is likely a cookie banner based on text content
* Optimized: Set lookup + early exit for common keywords (3x faster)
Expand All @@ -73,35 +102,12 @@ function isCookieBannerElement(element) {
* Uses multiple strategies to identify genuine cookie consent banners
*/
function removeCookieBanners(element) {
const classBasedSelectors = [
'.cc-banner', '.cc-grower', '.consent-banner', '.cookie-banner',
'.privacy-banner', '.gdpr-banner', '.cookie-consent', '.privacy-consent',
'.cookie-notice', '.privacy-notice', '.cookie-policy', '.privacy-policy',
'.cookie-bar', '.privacy-bar', '.consent-bar', '.gdpr-bar',
'.cookie-popup', '.privacy-popup', '.consent-popup', '.gdpr-popup',
'.cookie-modal', '.privacy-modal', '.consent-modal', '.gdpr-modal',
'.cookie-overlay', '.privacy-overlay', '.consent-overlay', '.gdpr-overlay',
];

const idBasedSelectors = [
'#cookie-banner', '#privacy-banner', '#consent-banner', '#gdpr-banner',
'#cookie-notice', '#privacy-notice', '#cookie-consent', '#privacy-consent',
'#cookie-bar', '#privacy-bar', '#consent-bar', '#gdpr-bar',
'#cookie-popup', '#privacy-popup', '#consent-popup', '#gdpr-popup',
];

const ariaSelectors = [
'[role="dialog"][aria-label*="cookie" i]',
'[role="dialog"][aria-label*="privacy" i]',
'[role="dialog"][aria-label*="consent" i]',
'[role="alertdialog"][aria-label*="cookie" i]',
'[role="alertdialog"][aria-label*="privacy" i]',
'[aria-describedby*="cookie" i]',
'[aria-describedby*="privacy" i]',
];

// Combine all selectors
const allSelectors = [...classBasedSelectors, ...idBasedSelectors, ...ariaSelectors];
const allSelectors = [
...COOKIE_BANNER_CLASS_SELECTORS,
...COOKIE_BANNER_ID_SELECTORS,
...COOKIE_BANNER_ARIA_SELECTORS,
];

// Apply class/ID/ARIA based detection with text validation
allSelectors.forEach((selector) => {
Expand Down Expand Up @@ -132,35 +138,12 @@ export function filterNavigationAndFooterBrowser(element) {
* @param {CheerioAPI} $ - Cheerio instance
*/
function removeCookieBannersCheerio($) {
const classBasedSelectors = [
'.cc-banner', '.cc-grower', '.consent-banner', '.cookie-banner',
'.privacy-banner', '.gdpr-banner', '.cookie-consent', '.privacy-consent',
'.cookie-notice', '.privacy-notice', '.cookie-policy', '.privacy-policy',
'.cookie-bar', '.privacy-bar', '.consent-bar', '.gdpr-bar',
'.cookie-popup', '.privacy-popup', '.consent-popup', '.gdpr-popup',
'.cookie-modal', '.privacy-modal', '.consent-modal', '.gdpr-modal',
'.cookie-overlay', '.privacy-overlay', '.consent-overlay', '.gdpr-overlay',
];

const idBasedSelectors = [
'#cookie-banner', '#privacy-banner', '#consent-banner', '#gdpr-banner',
'#cookie-notice', '#privacy-notice', '#cookie-consent', '#privacy-consent',
'#cookie-bar', '#privacy-bar', '#consent-bar', '#gdpr-bar',
'#cookie-popup', '#privacy-popup', '#consent-popup', '#gdpr-popup',
];

const ariaSelectors = [
'[role="dialog"][aria-label*="cookie" i]',
'[role="dialog"][aria-label*="privacy" i]',
'[role="dialog"][aria-label*="consent" i]',
'[role="alertdialog"][aria-label*="cookie" i]',
'[role="alertdialog"][aria-label*="privacy" i]',
'[aria-describedby*="cookie" i]',
'[aria-describedby*="privacy" i]',
];

// Combine all selectors for efficient removal
const allSelectors = [...classBasedSelectors, ...idBasedSelectors, ...ariaSelectors];
const allSelectors = [
...COOKIE_BANNER_CLASS_SELECTORS,
...COOKIE_BANNER_ID_SELECTORS,
...COOKIE_BANNER_ARIA_SELECTORS,
];

// Apply class/ID/ARIA based detection with text validation
allSelectors.forEach((selector) => {
Expand Down Expand Up @@ -204,28 +187,70 @@ function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText) {
const parser = new DOMParser(); // eslint-disable-line no-undef
const doc = parser.parseFromString(htmlContent, 'text/html');

// Get the body element, if it doesn't exist, use the entire document
const bodyElement = doc.body || doc.documentElement;

// Always remove script, style, noscript, template elements
bodyElement.querySelectorAll('script,style,noscript,template').forEach((n) => n.remove());
// Process the entire document to capture JSON-LD in both head and body
const documentElement = doc.documentElement || doc;

// Remove script elements except JSON-LD, also remove style, noscript, template
documentElement.querySelectorAll('script').forEach((n) => {
// Preserve JSON-LD structured data scripts by converting them to code blocks
if (n.type === 'application/ld+json') {
const jsonContent = n.textContent || n.innerText || '';
if (jsonContent.trim()) {
try {
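// Parse and re-stringify the JSON to get consistent, pretty-printed formatting.
// Content that is not strict JSON (e.g. single-quoted strings) is not repaired here:
// it fails the check or the parse below and is preserved verbatim by the catch branch.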
const cleanJsonContent = jsonContent.trim();
// Cheap structural sanity check before parsing: the content must start with
// '{' or '[' and end with '}' or ']'.
const startsValid = cleanJsonContent.startsWith('{')
|| cleanJsonContent.startsWith('[');
const endsValid = cleanJsonContent.endsWith('}')
|| cleanJsonContent.endsWith(']');

if (!startsValid || !endsValid) {
throw new Error('Not valid JSON structure');
}

const parsedJson = JSON.parse(cleanJsonContent);
const formattedJson = JSON.stringify(parsedJson, null, 2);

// Create a pre/code block to preserve JSON-LD for markdown conversion
const codeBlock = document.createElement('pre'); // eslint-disable-line no-undef
const code = document.createElement('code'); // eslint-disable-line no-undef
code.className = 'ld-json';
code.textContent = formattedJson;
codeBlock.appendChild(code);
n.parentNode.insertBefore(codeBlock, n);
} catch (e) {
// If JSON parsing fails, fall back to original content
const codeBlock = document.createElement('pre'); // eslint-disable-line no-undef
const code = document.createElement('code'); // eslint-disable-line no-undef
code.className = 'ld-json';
code.textContent = jsonContent.trim();
codeBlock.appendChild(code);
n.parentNode.insertBefore(codeBlock, n);
}
}
}
n.remove();
});
documentElement.querySelectorAll('style,noscript,template').forEach((n) => n.remove());

// Remove all media elements (images, videos, audio, etc.) to keep only text
bodyElement.querySelectorAll('img,video,audio,picture,svg,canvas,embed,object,iframe')
.forEach((n) => n.remove());
const mediaSelector = 'img,video,audio,picture,svg,canvas,embed,object,iframe';
documentElement.querySelectorAll(mediaSelector).forEach((n) => n.remove());

// Remove consent banners with intelligent detection
removeCookieBanners(bodyElement);
removeCookieBanners(documentElement);

// Conditionally remove navigation and footer elements
if (ignoreNavFooter) {
filterNavigationAndFooterBrowser(bodyElement);
filterNavigationAndFooterBrowser(documentElement);
}

if (returnText) {
return (bodyElement && bodyElement.textContent) ? bodyElement.textContent : '';
return (documentElement && documentElement.textContent) ? documentElement.textContent : '';
}
return bodyElement.outerHTML;
return documentElement.outerHTML;
}

/**
Expand All @@ -245,8 +270,41 @@ async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText) {

const $ = cheerio.load(htmlContent);

// Always remove script, style, noscript, template tags
$('script, style, noscript, template').remove();
// Remove script except JSON-LD structured data, also remove style, noscript, template
$('script').each(function processScript() {
// Preserve JSON-LD structured data scripts by converting them to code blocks
if ($(this).attr('type') === 'application/ld+json') {
const jsonContent = $(this).text().trim();
if (jsonContent) {
try {
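// Parse and re-stringify the JSON to get consistent, pretty-printed formatting.
// Content that is not strict JSON (e.g. single-quoted strings) is not repaired here:
// it fails the check or the parse below and is preserved verbatim by the catch branch.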
const cleanJsonContent = jsonContent;
const startsValid = cleanJsonContent.startsWith('{')
|| cleanJsonContent.startsWith('[');
const endsValid = cleanJsonContent.endsWith('}')
|| cleanJsonContent.endsWith(']');

if (!startsValid || !endsValid) {
throw new Error('Not valid JSON structure');
}

const parsedJson = JSON.parse(cleanJsonContent);
const formattedJson = JSON.stringify(parsedJson, null, 2);
const codeBlock = `<pre><code class="ld-json">${formattedJson}</code></pre>`;
$(this).before(codeBlock);
} catch (e) {
// If JSON parsing fails, fall back to original content
const codeBlock = `<pre><code class="ld-json">${jsonContent}</code></pre>`;
$(this).before(codeBlock);
}
}
$(this).remove();
} else {
$(this).remove();
}
});
$('style, noscript, template').remove();

// Remove all media elements (images, videos, audio, etc.) to keep only text
$('img, video, audio, picture, svg, canvas, embed, object, iframe').remove();
Expand Down