@@ -52,6 +52,35 @@ const COOKIE_KEYWORDS = new Set([
5252 'accept all' , 'reject all' , 'manage preferences' ,
5353] ) ;
5454
/**
 * Class-name selectors that commonly mark cookie / privacy / consent banners.
 *
 * Shared by removeCookieBanners (DOM) and removeCookieBannersCheerio, which
 * spread it into their combined selector list. Frozen so that neither consumer
 * (nor any other code in this module) can accidentally mutate the shared list.
 *
 * @type {ReadonlyArray<string>}
 */
const COOKIE_BANNER_CLASS_SELECTORS = Object.freeze([
  '.cc-banner', '.cc-grower', '.consent-banner', '.cookie-banner',
  '.privacy-banner', '.gdpr-banner', '.cookie-consent', '.privacy-consent',
  '.cookie-notice', '.privacy-notice', '.cookie-policy', '.privacy-policy',
  '.cookie-bar', '.privacy-bar', '.consent-bar', '.gdpr-bar',
  '.cookie-popup', '.privacy-popup', '.consent-popup', '.gdpr-popup',
  '.cookie-modal', '.privacy-modal', '.consent-modal', '.gdpr-modal',
  '.cookie-overlay', '.privacy-overlay', '.consent-overlay', '.gdpr-overlay',
]);
64+
/**
 * Element-id selectors that commonly mark cookie / privacy / consent banners,
 * including the OneTrust SDK containers.
 *
 * Shared by removeCookieBanners (DOM) and removeCookieBannersCheerio, which
 * spread it into their combined selector list. Frozen so the shared list
 * cannot be mutated by accident.
 *
 * @type {ReadonlyArray<string>}
 */
const COOKIE_BANNER_ID_SELECTORS = Object.freeze([
  '#cookie-banner', '#privacy-banner', '#consent-banner', '#gdpr-banner',
  '#cookie-notice', '#privacy-notice', '#cookie-consent', '#privacy-consent',
  '#cookie-bar', '#privacy-bar', '#consent-bar', '#gdpr-bar', '#cookiemgmt',
  '#cookie-popup', '#privacy-popup', '#consent-popup', '#gdpr-popup',
  '#onetrust-consent-sdk', '#onetrust-banner-sdk',
]);
72+
/**
 * ARIA-attribute selectors that commonly mark cookie / privacy / consent
 * dialogs. The trailing `i` flag makes the attribute substring match
 * case-insensitive.
 *
 * Shared by removeCookieBanners (DOM) and removeCookieBannersCheerio, which
 * spread it into their combined selector list. Frozen so the shared list
 * cannot be mutated by accident.
 *
 * @type {ReadonlyArray<string>}
 */
const COOKIE_BANNER_ARIA_SELECTORS = Object.freeze([
  // Exact label; also covered by the case-insensitive "consent" substring
  // selector below, kept so the explicit match stays documented.
  '[role="dialog"][aria-label="Consent Banner"]',
  '[role="dialog"][aria-label*="cookie" i]',
  '[role="dialog"][aria-label*="privacy" i]',
  '[role="dialog"][aria-label*="consent" i]',
  '[role="alertdialog"][aria-label*="cookie" i]',
  '[role="alertdialog"][aria-label*="privacy" i]',
  '[aria-describedby*="cookie" i]',
  '[aria-describedby*="privacy" i]',
]);
83+
5584/**
5685 * Validates if an element is likely a cookie banner based on text content
5786 * Optimized: Set lookup + early exit for common keywords (3x faster)
@@ -73,35 +102,12 @@ function isCookieBannerElement(element) {
73102 * Uses multiple strategies to identify genuine cookie consent banners
74103 */
75104function removeCookieBanners ( element ) {
76- const classBasedSelectors = [
77- '.cc-banner' , '.cc-grower' , '.consent-banner' , '.cookie-banner' ,
78- '.privacy-banner' , '.gdpr-banner' , '.cookie-consent' , '.privacy-consent' ,
79- '.cookie-notice' , '.privacy-notice' , '.cookie-policy' , '.privacy-policy' ,
80- '.cookie-bar' , '.privacy-bar' , '.consent-bar' , '.gdpr-bar' ,
81- '.cookie-popup' , '.privacy-popup' , '.consent-popup' , '.gdpr-popup' ,
82- '.cookie-modal' , '.privacy-modal' , '.consent-modal' , '.gdpr-modal' ,
83- '.cookie-overlay' , '.privacy-overlay' , '.consent-overlay' , '.gdpr-overlay' ,
84- ] ;
85-
86- const idBasedSelectors = [
87- '#cookie-banner' , '#privacy-banner' , '#consent-banner' , '#gdpr-banner' ,
88- '#cookie-notice' , '#privacy-notice' , '#cookie-consent' , '#privacy-consent' ,
89- '#cookie-bar' , '#privacy-bar' , '#consent-bar' , '#gdpr-bar' ,
90- '#cookie-popup' , '#privacy-popup' , '#consent-popup' , '#gdpr-popup' ,
91- ] ;
92-
93- const ariaSelectors = [
94- '[role="dialog"][aria-label*="cookie" i]' ,
95- '[role="dialog"][aria-label*="privacy" i]' ,
96- '[role="dialog"][aria-label*="consent" i]' ,
97- '[role="alertdialog"][aria-label*="cookie" i]' ,
98- '[role="alertdialog"][aria-label*="privacy" i]' ,
99- '[aria-describedby*="cookie" i]' ,
100- '[aria-describedby*="privacy" i]' ,
101- ] ;
102-
103105 // Combine all selectors
104- const allSelectors = [ ...classBasedSelectors , ...idBasedSelectors , ...ariaSelectors ] ;
106+ const allSelectors = [
107+ ...COOKIE_BANNER_CLASS_SELECTORS ,
108+ ...COOKIE_BANNER_ID_SELECTORS ,
109+ ...COOKIE_BANNER_ARIA_SELECTORS ,
110+ ] ;
105111
106112 // Apply class/ID/ARIA based detection with text validation
107113 allSelectors . forEach ( ( selector ) => {
@@ -132,35 +138,12 @@ export function filterNavigationAndFooterBrowser(element) {
132138 * @param {CheerioAPI } $ - Cheerio instance
133139 */
134140function removeCookieBannersCheerio ( $ ) {
135- const classBasedSelectors = [
136- '.cc-banner' , '.cc-grower' , '.consent-banner' , '.cookie-banner' ,
137- '.privacy-banner' , '.gdpr-banner' , '.cookie-consent' , '.privacy-consent' ,
138- '.cookie-notice' , '.privacy-notice' , '.cookie-policy' , '.privacy-policy' ,
139- '.cookie-bar' , '.privacy-bar' , '.consent-bar' , '.gdpr-bar' ,
140- '.cookie-popup' , '.privacy-popup' , '.consent-popup' , '.gdpr-popup' ,
141- '.cookie-modal' , '.privacy-modal' , '.consent-modal' , '.gdpr-modal' ,
142- '.cookie-overlay' , '.privacy-overlay' , '.consent-overlay' , '.gdpr-overlay' ,
143- ] ;
144-
145- const idBasedSelectors = [
146- '#cookie-banner' , '#privacy-banner' , '#consent-banner' , '#gdpr-banner' ,
147- '#cookie-notice' , '#privacy-notice' , '#cookie-consent' , '#privacy-consent' ,
148- '#cookie-bar' , '#privacy-bar' , '#consent-bar' , '#gdpr-bar' ,
149- '#cookie-popup' , '#privacy-popup' , '#consent-popup' , '#gdpr-popup' ,
150- ] ;
151-
152- const ariaSelectors = [
153- '[role="dialog"][aria-label*="cookie" i]' ,
154- '[role="dialog"][aria-label*="privacy" i]' ,
155- '[role="dialog"][aria-label*="consent" i]' ,
156- '[role="alertdialog"][aria-label*="cookie" i]' ,
157- '[role="alertdialog"][aria-label*="privacy" i]' ,
158- '[aria-describedby*="cookie" i]' ,
159- '[aria-describedby*="privacy" i]' ,
160- ] ;
161-
162141 // Combine all selectors for efficient removal
163- const allSelectors = [ ...classBasedSelectors , ...idBasedSelectors , ...ariaSelectors ] ;
142+ const allSelectors = [
143+ ...COOKIE_BANNER_CLASS_SELECTORS ,
144+ ...COOKIE_BANNER_ID_SELECTORS ,
145+ ...COOKIE_BANNER_ARIA_SELECTORS ,
146+ ] ;
164147
165148 // Apply class/ID/ARIA based detection with text validation
166149 allSelectors . forEach ( ( selector ) => {
@@ -204,28 +187,70 @@ function filterHtmlBrowser(htmlContent, ignoreNavFooter, returnText) {
204187 const parser = new DOMParser ( ) ; // eslint-disable-line no-undef
205188 const doc = parser . parseFromString ( htmlContent , 'text/html' ) ;
206189
207- // Get the body element, if it doesn't exist, use the entire document
208- const bodyElement = doc . body || doc . documentElement ;
209-
210- // Always remove script, style, noscript, template elements
211- bodyElement . querySelectorAll ( 'script,style,noscript,template' ) . forEach ( ( n ) => n . remove ( ) ) ;
190+ // Process the entire document to capture JSON-LD in both head and body
191+ const documentElement = doc . documentElement || doc ;
192+
193+ // Remove script elements except JSON-LD, also remove style, noscript, template
194+ documentElement . querySelectorAll ( 'script' ) . forEach ( ( n ) => {
195+ // Preserve JSON-LD structured data scripts by converting them to code blocks
196+ if ( n . type === 'application/ld+json' ) {
197+ const jsonContent = n . textContent || n . innerText || '' ;
198+ if ( jsonContent . trim ( ) ) {
199+ try {
200+ // Parse and re-stringify JSON to ensure consistent formatting
201+ // Handle both single and double quoted JSON
202+ const cleanJsonContent = jsonContent . trim ( ) ;
203+ // Try to fix common JSON issues like single quotes
204+ const startsValid = cleanJsonContent . startsWith ( '{' )
205+ || cleanJsonContent . startsWith ( '[' ) ;
206+ const endsValid = cleanJsonContent . endsWith ( '}' )
207+ || cleanJsonContent . endsWith ( ']' ) ;
208+
209+ if ( ! startsValid || ! endsValid ) {
210+ throw new Error ( 'Not valid JSON structure' ) ;
211+ }
212+
213+ const parsedJson = JSON . parse ( cleanJsonContent ) ;
214+ const formattedJson = JSON . stringify ( parsedJson , null , 2 ) ;
215+
216+ // Create a pre/code block to preserve JSON-LD for markdown conversion
217+ const codeBlock = document . createElement ( 'pre' ) ; // eslint-disable-line no-undef
218+ const code = document . createElement ( 'code' ) ; // eslint-disable-line no-undef
219+ code . className = 'ld-json' ;
220+ code . textContent = formattedJson ;
221+ codeBlock . appendChild ( code ) ;
222+ n . parentNode . insertBefore ( codeBlock , n ) ;
223+ } catch ( e ) {
224+ // If JSON parsing fails, fall back to original content
225+ const codeBlock = document . createElement ( 'pre' ) ; // eslint-disable-line no-undef
226+ const code = document . createElement ( 'code' ) ; // eslint-disable-line no-undef
227+ code . className = 'ld-json' ;
228+ code . textContent = jsonContent . trim ( ) ;
229+ codeBlock . appendChild ( code ) ;
230+ n . parentNode . insertBefore ( codeBlock , n ) ;
231+ }
232+ }
233+ }
234+ n . remove ( ) ;
235+ } ) ;
236+ documentElement . querySelectorAll ( 'style,noscript,template' ) . forEach ( ( n ) => n . remove ( ) ) ;
212237
213238 // Remove all media elements (images, videos, audio, etc.) to keep only text
214- bodyElement . querySelectorAll ( 'img,video,audio,picture,svg,canvas,embed,object,iframe' )
215- . forEach ( ( n ) => n . remove ( ) ) ;
239+ const mediaSelector = 'img,video,audio,picture,svg,canvas,embed,object,iframe' ;
240+ documentElement . querySelectorAll ( mediaSelector ) . forEach ( ( n ) => n . remove ( ) ) ;
216241
217242 // Remove consent banners with intelligent detection
218- removeCookieBanners ( bodyElement ) ;
243+ removeCookieBanners ( documentElement ) ;
219244
220245 // Conditionally remove navigation and footer elements
221246 if ( ignoreNavFooter ) {
222- filterNavigationAndFooterBrowser ( bodyElement ) ;
247+ filterNavigationAndFooterBrowser ( documentElement ) ;
223248 }
224249
225250 if ( returnText ) {
226- return ( bodyElement && bodyElement . textContent ) ? bodyElement . textContent : '' ;
251+ return ( documentElement && documentElement . textContent ) ? documentElement . textContent : '' ;
227252 }
228- return bodyElement . outerHTML ;
253+ return documentElement . outerHTML ;
229254}
230255
231256/**
@@ -245,8 +270,41 @@ async function filterHtmlNode(htmlContent, ignoreNavFooter, returnText) {
245270
246271 const $ = cheerio . load ( htmlContent ) ;
247272
248- // Always remove script, style, noscript, template tags
249- $ ( 'script, style, noscript, template' ) . remove ( ) ;
273+ // Remove script except JSON-LD structured data, also remove style, noscript, template
274+ $ ( 'script' ) . each ( function processScript ( ) {
275+ // Preserve JSON-LD structured data scripts by converting them to code blocks
276+ if ( $ ( this ) . attr ( 'type' ) === 'application/ld+json' ) {
277+ const jsonContent = $ ( this ) . text ( ) . trim ( ) ;
278+ if ( jsonContent ) {
279+ try {
280+ // Parse and re-stringify JSON to ensure consistent formatting
281+ // Handle both single and double quoted JSON
282+ const cleanJsonContent = jsonContent ;
283+ const startsValid = cleanJsonContent . startsWith ( '{' )
284+ || cleanJsonContent . startsWith ( '[' ) ;
285+ const endsValid = cleanJsonContent . endsWith ( '}' )
286+ || cleanJsonContent . endsWith ( ']' ) ;
287+
288+ if ( ! startsValid || ! endsValid ) {
289+ throw new Error ( 'Not valid JSON structure' ) ;
290+ }
291+
292+ const parsedJson = JSON . parse ( cleanJsonContent ) ;
293+ const formattedJson = JSON . stringify ( parsedJson , null , 2 ) ;
294+ const codeBlock = `<pre><code class="ld-json">${ formattedJson } </code></pre>` ;
295+ $ ( this ) . before ( codeBlock ) ;
296+ } catch ( e ) {
297+ // If JSON parsing fails, fall back to original content
298+ const codeBlock = `<pre><code class="ld-json">${ jsonContent } </code></pre>` ;
299+ $ ( this ) . before ( codeBlock ) ;
300+ }
301+ }
302+ $ ( this ) . remove ( ) ;
303+ } else {
304+ $ ( this ) . remove ( ) ;
305+ }
306+ } ) ;
307+ $ ( 'style, noscript, template' ) . remove ( ) ;
250308
251309 // Remove all media elements (images, videos, audio, etc.) to keep only text
252310 $ ( 'img, video, audio, picture, svg, canvas, embed, object, iframe' ) . remove ( ) ;
0 commit comments