Skip to content

Commit 9f993fe

Browse files
authored
fix: json-ld inclusion, cc banner updates (#1054)
## Changes

- Updated the list of cookie consent banner selectors
- Included JSON-LD in content gains analysis and metrics
- Dynamic loading of the version via package.json

Please ensure your pull request adheres to the following guidelines:

- [ ] make sure to link the related issues in this description
- [ ] when merging / squashing, make sure the fixed issue references are visible in the commits, for easy compilation of release notes

## Related Issues

Thanks for contributing!
1 parent 388d077 commit 9f993fe

File tree

5 files changed

+148
-74
lines changed

5 files changed

+148
-74
lines changed

package-lock.json

Lines changed: 2 additions & 2 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

packages/spacecat-shared-html-analyzer/package.json

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,6 @@
11
{
22
"name": "@adobe/spacecat-shared-html-analyzer",
3-
"version": "1.0.1",
3+
"version": "1.1.0",
44
"description": "Analyze HTML content visibility for AI crawlers and citations - compare static HTML vs fully rendered content",
55
"type": "module",
66
"engines": {

packages/spacecat-shared-html-analyzer/rollup.config.js

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,21 @@
1212

1313
import { nodeResolve } from '@rollup/plugin-node-resolve';
1414
import terser from '@rollup/plugin-terser';
15+
import { readFileSync } from 'fs';
16+
17+
// Read this package's package.json so its version can be injected at build time.
// NOTE: './package.json' is resolved against the process working directory,
// not against this config file, so the build must be started from the package root.
18+
const pkg = JSON.parse(readFileSync('./package.json', 'utf-8'));
19+
20+
// Simple plugin to inject package version
21+
/**
 * Rollup plugin that stamps the package version into the browser entry point.
 *
 * Every `__PACKAGE_VERSION__` placeholder found in a module whose id ends with
 * `browser-entry.js` is replaced with `pkg.version`. All other modules are
 * left untouched.
 *
 * @returns {{name: string, transform: function(string, string): (string|null)}}
 *   A Rollup plugin object exposing a `transform` hook.
 */
const injectVersion = () => ({
  name: 'inject-version',
  transform(code, id) {
    if (!id.endsWith('browser-entry.js')) {
      // null signals that this plugin did not transform the module.
      return null;
    }
    // replaceAll substitutes every placeholder, not just the first one.
    // The function replacer inserts the version literally, so `$` sequences
    // in a version string are never interpreted as replacement patterns.
    return code.replaceAll('__PACKAGE_VERSION__', () => pkg.version);
  },
});
1530

1631
export default {
1732
input: 'src/browser-entry.js', // Special browser entry point
@@ -35,6 +50,7 @@ export default {
3550
},
3651
],
3752
plugins: [
53+
injectVersion(), // Inject package version
3854
nodeResolve({
3955
browser: true, // Use browser field in package.json
4056
preferBuiltins: false, // Don't include Node.js built-ins

packages/spacecat-shared-html-analyzer/src/browser-entry.js

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -66,8 +66,8 @@ const HTMLAnalyzer = {
6666
formatNumberToK,
6767
isBrowser,
6868

69-
// Version info
70-
version: '1.0.0',
69+
// Version info (replaced during build from package.json)
70+
version: '__PACKAGE_VERSION__',
7171
buildFor: 'chrome-extension',
7272
};
7373

packages/spacecat-shared-html-analyzer/src/html-filter.js

Lines changed: 127 additions & 69 deletions
Original file line numberDiff line numberDiff line change
@@ -52,6 +52,35 @@ const COOKIE_KEYWORDS = new Set([
5252
'accept all', 'reject all', 'manage preferences',
5353
]);
5454

55+
// CSS class selectors for candidate cookie / consent / privacy / GDPR banner containers
// (banner, notice, bar, popup, modal and overlay variants).
// Spread into the combined selector list built by removeCookieBanners and
// removeCookieBannersCheerio.
const COOKIE_BANNER_CLASS_SELECTORS = [
56+
'.cc-banner', '.cc-grower', '.consent-banner', '.cookie-banner',
57+
'.privacy-banner', '.gdpr-banner', '.cookie-consent', '.privacy-consent',
58+
'.cookie-notice', '.privacy-notice', '.cookie-policy', '.privacy-policy',
59+
'.cookie-bar', '.privacy-bar', '.consent-bar', '.gdpr-bar',
60+
'.cookie-popup', '.privacy-popup', '.consent-popup', '.gdpr-popup',
61+
'.cookie-modal', '.privacy-modal', '.consent-modal', '.gdpr-modal',
62+
'.cookie-overlay', '.privacy-overlay', '.consent-overlay', '.gdpr-overlay',
63+
];
64+
65+
// CSS id selectors for candidate cookie / consent / privacy / GDPR banner containers,
// including the '#cookiemgmt' and '#onetrust-*' ids.
// Spread into the combined selector list built by removeCookieBanners and
// removeCookieBannersCheerio.
const COOKIE_BANNER_ID_SELECTORS = [
66+
'#cookie-banner', '#privacy-banner', '#consent-banner', '#gdpr-banner',
67+
'#cookie-notice', '#privacy-notice', '#cookie-consent', '#privacy-consent',
68+
'#cookie-bar', '#privacy-bar', '#consent-bar', '#gdpr-bar', '#cookiemgmt',
69+
'#cookie-popup', '#privacy-popup', '#consent-popup', '#gdpr-popup',
70+
'#onetrust-consent-sdk', '#onetrust-banner-sdk',
71+
];
72+
73+
// ARIA-based attribute selectors for candidate cookie / consent / privacy dialogs.
// The first entry is an exact, case-sensitive match on aria-label; the remaining
// entries are substring matches (*=) made case-insensitive by the trailing `i` flag.
// Spread into the combined selector list built by removeCookieBanners and
// removeCookieBannersCheerio.
const COOKIE_BANNER_ARIA_SELECTORS = [
74+
'[role="dialog"][aria-label="Consent Banner"]',
75+
'[role="dialog"][aria-label*="cookie" i]',
76+
'[role="dialog"][aria-label*="privacy" i]',
77+
'[role="dialog"][aria-label*="consent" i]',
78+
'[role="alertdialog"][aria-label*="cookie" i]',
79+
'[role="alertdialog"][aria-label*="privacy" i]',
80+
'[aria-describedby*="cookie" i]',
81+
'[aria-describedby*="privacy" i]',
82+
];
83+
5584
/**
5685
* Validates if an element is likely a cookie banner based on text content
5786
* Optimized: Set lookup + early exit for common keywords (3x faster)
@@ -73,35 +102,12 @@ function isCookieBannerElement(element) {
73102
* Uses multiple strategies to identify genuine cookie consent banners
74103
*/
75104
function removeCookieBanners(element) {
76-
const classBasedSelectors = [
77-
'.cc-banner', '.cc-grower', '.consent-banner', '.cookie-banner',
78-
'.privacy-banner', '.gdpr-banner', '.cookie-consent', '.privacy-consent',
79-
'.cookie-notice', '.privacy-notice', '.cookie-policy', '.privacy-policy',
80-
'.cookie-bar', '.privacy-bar', '.consent-bar', '.gdpr-bar',
81-
'.cookie-popup', '.privacy-popup', '.consent-popup', '.gdpr-popup',
82-
'.cookie-modal', '.privacy-modal', '.consent-modal', '.gdpr-modal',
83-
'.cookie-overlay', '.privacy-overlay', '.consent-overlay', '.gdpr-overlay',
84-
];
85-
86-
const idBasedSelectors = [
87-
'#cookie-banner', '#privacy-banner', '#consent-banner', '#gdpr-banner',
88-
'#cookie-notice', '#privacy-notice', '#cookie-consent', '#privacy-consent',
89-
'#cookie-bar', '#privacy-bar', '#consent-bar', '#gdpr-bar',
90-
'#cookie-popup', '#privacy-popup', '#consent-popup', '#gdpr-popup',
91-
];
92-
93-
const ariaSelectors = [
94-
'[role="dialog"][aria-label*="cookie" i]',
95-
'[role="dialog"][aria-label*="privacy" i]',
96-
'[role="dialog"][aria-label*="consent" i]',
97-
'[role="alertdialog"][aria-label*="cookie" i]',
98-
'[role="alertdialog"][aria-label*="privacy" i]',
99-
'[aria-describedby*="cookie" i]',
100-
'[aria-describedby*="privacy" i]',
101-
];
102-
103105
// Combine all selectors
104-
const allSelectors = [...classBasedSelectors, ...idBasedSelectors, ...ariaSelectors];
106+
const allSelectors = [
107+
...COOKIE_BANNER_CLASS_SELECTORS,
108+
...COOKIE_BANNER_ID_SELECTORS,
109+
...COOKIE_BANNER_ARIA_SELECTORS,
110+
];
105111

106112
// Apply class/ID/ARIA based detection with text validation
107113
allSelectors.forEach((selector) => {
@@ -132,35 +138,12 @@ export function filterNavigationAndFooterBrowser(element) {
132138
* @param {CheerioAPI} $ - Cheerio instance
133139
*/
134140
function removeCookieBannersCheerio($) {
135-
const classBasedSelectors = [
136-
'.cc-banner', '.cc-grower', '.consent-banner', '.cookie-banner',
137-
'.privacy-banner', '.gdpr-banner', '.cookie-consent', '.privacy-consent',
138-
'.cookie-notice', '.privacy-notice', '.cookie-policy', '.privacy-policy',
139-
'.cookie-bar', '.privacy-bar', '.consent-bar', '.gdpr-bar',
140-
'.cookie-popup', '.privacy-popup', '.consent-popup', '.gdpr-popup',
141-
'.cookie-modal', '.privacy-modal', '.consent-modal', '.gdpr-modal',
142-
'.cookie-overlay', '.privacy-overlay', '.consent-overlay', '.gdpr-overlay',
143-
];
144-
145-
const idBasedSelectors = [
146-
'#cookie-banner', '#privacy-banner', '#consent-banner', '#gdpr-banner',
147-
'#cookie-notice', '#privacy-notice', '#cookie-consent', '#privacy-consent',
148-
'#cookie-bar', '#privacy-bar', '#consent-bar', '#gdpr-bar',
149-
'#cookie-popup', '#privacy-popup', '#consent-popup', '#gdpr-popup',
150-
];
151-
152-
const ariaSelectors = [
153-
'[role="dialog"][aria-label*="cookie" i]',
154-
'[role="dialog"][aria-label*="privacy" i]',
155-
'[role="dialog"][aria-label*="consent" i]',
156-
'[role="alertdialog"][aria-label*="cookie" i]',
157-
'[role="alertdialog"][aria-label*="privacy" i]',
158-
'[aria-describedby*="cookie" i]',
159-
'[aria-describedby*="privacy" i]',
160-
];
161-
162141
// Combine all selectors for efficient removal
163-
const allSelectors = [...classBasedSelectors, ...idBasedSelectors, ...ariaSelectors];
142+
const allSelectors = [
143+
...COOKIE_BANNER_CLASS_SELECTORS,
144+
...COOKIE_BANNER_ID_SELECTORS,
145+
...COOKIE_BANNER_ARIA_SELECTORS,
146+
];
164147

165148
// Apply class/ID/ARIA based detection with text validation
166149
allSelectors.forEach((selector) => {
@@ -204,28 +187,70 @@ function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText) {
204187
const parser = new DOMParser(); // eslint-disable-line no-undef
205188
const doc = parser.parseFromString(htmlContent, 'text/html');
206189

207-
// Get the body element, if it doesn't exist, use the entire document
208-
const bodyElement = doc.body || doc.documentElement;
209-
210-
// Always remove script, style, noscript, template elements
211-
bodyElement.querySelectorAll('script,style,noscript,template').forEach((n) => n.remove());
190+
// Process the entire document to capture JSON-LD in both head and body
191+
const documentElement = doc.documentElement || doc;
192+
193+
// Remove script elements except JSON-LD, also remove style, noscript, template
194+
documentElement.querySelectorAll('script').forEach((n) => {
195+
// Preserve JSON-LD structured data scripts by converting them to code blocks
196+
if (n.type === 'application/ld+json') {
197+
const jsonContent = n.textContent || n.innerText || '';
198+
if (jsonContent.trim()) {
199+
try {
200+
// Parse and re-stringify JSON to ensure consistent formatting
201+
// Only strict JSON is accepted; any parse failure is handled by the catch block below
202+
const cleanJsonContent = jsonContent.trim();
203+
// Cheap structural pre-check: the payload must start and end like a JSON object or array
204+
const startsValid = cleanJsonContent.startsWith('{')
205+
|| cleanJsonContent.startsWith('[');
206+
const endsValid = cleanJsonContent.endsWith('}')
207+
|| cleanJsonContent.endsWith(']');
208+
209+
if (!startsValid || !endsValid) {
210+
throw new Error('Not valid JSON structure');
211+
}
212+
213+
const parsedJson = JSON.parse(cleanJsonContent);
214+
const formattedJson = JSON.stringify(parsedJson, null, 2);
215+
216+
// Create a pre/code block to preserve JSON-LD for markdown conversion
217+
const codeBlock = document.createElement('pre'); // eslint-disable-line no-undef
218+
const code = document.createElement('code'); // eslint-disable-line no-undef
219+
code.className = 'ld-json';
220+
code.textContent = formattedJson;
221+
codeBlock.appendChild(code);
222+
n.parentNode.insertBefore(codeBlock, n);
223+
} catch (e) {
224+
// If JSON parsing fails, fall back to original content
225+
const codeBlock = document.createElement('pre'); // eslint-disable-line no-undef
226+
const code = document.createElement('code'); // eslint-disable-line no-undef
227+
code.className = 'ld-json';
228+
code.textContent = jsonContent.trim();
229+
codeBlock.appendChild(code);
230+
n.parentNode.insertBefore(codeBlock, n);
231+
}
232+
}
233+
}
234+
n.remove();
235+
});
236+
documentElement.querySelectorAll('style,noscript,template').forEach((n) => n.remove());
212237

213238
// Remove all media elements (images, videos, audio, etc.) to keep only text
214-
bodyElement.querySelectorAll('img,video,audio,picture,svg,canvas,embed,object,iframe')
215-
.forEach((n) => n.remove());
239+
const mediaSelector = 'img,video,audio,picture,svg,canvas,embed,object,iframe';
240+
documentElement.querySelectorAll(mediaSelector).forEach((n) => n.remove());
216241

217242
// Remove consent banners with intelligent detection
218-
removeCookieBanners(bodyElement);
243+
removeCookieBanners(documentElement);
219244

220245
// Conditionally remove navigation and footer elements
221246
if (ignoreNavFooter) {
222-
filterNavigationAndFooterBrowser(bodyElement);
247+
filterNavigationAndFooterBrowser(documentElement);
223248
}
224249

225250
if (returnText) {
226-
return (bodyElement && bodyElement.textContent) ? bodyElement.textContent : '';
251+
return (documentElement && documentElement.textContent) ? documentElement.textContent : '';
227252
}
228-
return bodyElement.outerHTML;
253+
return documentElement.outerHTML;
229254
}
230255

231256
/**
@@ -245,8 +270,41 @@ async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText) {
245270

246271
const $ = cheerio.load(htmlContent);
247272

248-
// Always remove script, style, noscript, template tags
249-
$('script, style, noscript, template').remove();
273+
// Remove script except JSON-LD structured data, also remove style, noscript, template
274+
$('script').each(function processScript() {
275+
// Preserve JSON-LD structured data scripts by converting them to code blocks
276+
if ($(this).attr('type') === 'application/ld+json') {
277+
const jsonContent = $(this).text().trim();
278+
if (jsonContent) {
279+
try {
280+
// Parse and re-stringify JSON to ensure consistent formatting
281+
// Only strict JSON is accepted; any parse failure is handled by the catch block below
282+
const cleanJsonContent = jsonContent;
283+
const startsValid = cleanJsonContent.startsWith('{')
284+
|| cleanJsonContent.startsWith('[');
285+
const endsValid = cleanJsonContent.endsWith('}')
286+
|| cleanJsonContent.endsWith(']');
287+
288+
if (!startsValid || !endsValid) {
289+
throw new Error('Not valid JSON structure');
290+
}
291+
292+
const parsedJson = JSON.parse(cleanJsonContent);
293+
const formattedJson = JSON.stringify(parsedJson, null, 2);
294+
const codeBlock = `<pre><code class="ld-json">${formattedJson}</code></pre>`;
295+
$(this).before(codeBlock);
296+
} catch (e) {
297+
// If JSON parsing fails, fall back to original content
298+
const codeBlock = `<pre><code class="ld-json">${jsonContent}</code></pre>`;
299+
$(this).before(codeBlock);
300+
}
301+
}
302+
$(this).remove();
303+
} else {
304+
$(this).remove();
305+
}
306+
});
307+
$('style, noscript, template').remove();
250308

251309
// Remove all media elements (images, videos, audio, etc.) to keep only text
252310
$('img, video, audio, picture, svg, canvas, embed, object, iframe').remove();

0 commit comments

Comments
 (0)