93 changes: 84 additions & 9 deletions scripts/generate-md-exports.mjs
@@ -1,11 +1,12 @@
#!/usr/bin/env node
// testing cache
/* eslint-disable no-console */
import {ListObjectsV2Command, PutObjectCommand, S3Client} from '@aws-sdk/client-s3';
import imgLinks from '@pondorasti/remark-img-links';
import {selectAll} from 'hast-util-select';
import {createHash} from 'node:crypto';
import {createReadStream, createWriteStream, existsSync} from 'node:fs';
import {mkdir, opendir, readFile, rm, writeFile} from 'node:fs/promises';
import {mkdir, opendir, readdir, readFile, rm, writeFile} from 'node:fs/promises';
import {cpus} from 'node:os';
import * as path from 'node:path';
import {compose, Readable} from 'node:stream';
@@ -91,12 +92,15 @@
console.log(`💰 Cache directory: ${CACHE_DIR}`);
const noCache = !existsSync(CACHE_DIR);
if (noCache) {
console.log(`ℹ️ No cache directory found, this will take a while...`);
console.log(`ℹ️ No cache directory found, creating fresh cache...`);
await mkdir(CACHE_DIR, {recursive: true});
} else {
console.log(`✅ Cache directory exists, will attempt to use cached files`);
}

// On a 16-core machine, 8 workers were optimal (and slightly faster than 16)
const numWorkers = Math.max(Math.floor(cpus().length / 2), 2);
// Use 75% of CPU cores for optimal performance
const numWorkers = Math.max(Math.floor(cpus().length * 0.75), 2);
console.log(`⚙️ Using ${numWorkers} workers for ${cpus().length} CPU cores`);
const workerTasks = new Array(numWorkers).fill(null).map(() => []);

let existingFilesOnR2 = null;
@@ -196,13 +200,60 @@

const md5 = data => createHash('md5').update(data).digest('hex');

// Initialize debug counter
genMDFromHTML.debugCount = 0;

async function genMDFromHTML(source, target, {cacheDir, noCache}) {
const leanHTML = (await readFile(source, {encoding: 'utf8'}))
// Remove all script tags, as they are not needed in markdown
// and they are not stable across builds, causing cache misses
.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');
const rawHTML = await readFile(source, {encoding: 'utf8'});

// Debug: Log first 3 files to understand what's being removed
const shouldDebug = genMDFromHTML.debugCount < 3;
if (shouldDebug) {
genMDFromHTML.debugCount++;
const fileName = path.basename(source);
console.log(`\n🔍 DEBUG: Processing ${fileName}`);
console.log(`📏 Raw HTML length: ${rawHTML.length} chars`);

// Extract what we're removing to see if it's stable
const scripts = rawHTML.match(/<script[^>]*src="[^"]*"/gi);
const links = rawHTML.match(/<link[^>]*>/gi);

console.log(`📦 Found ${scripts?.length || 0} script tags with src`);
if (scripts && scripts.length > 0) {
console.log(` First 3: ${scripts.slice(0, 3).join(', ')}`);
}
console.log(`🔗 Found ${links?.length || 0} link tags`);
if (links && links.length > 0) {
console.log(` First 3: ${links.slice(0, 3).join(', ')}`);
}
}

// Normalize HTML to make cache keys deterministic across builds
// Remove elements that change between builds but don't affect markdown output
const leanHTML = rawHTML
// Remove all script tags (build IDs, chunk hashes, Vercel injections)
.replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
Comment on lines +233 to +235

Check failure: Code scanning / CodeQL
Incomplete multi-character sanitization (High)

This string may still contain <script, which may cause an HTML element injection vulnerability.
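To illustrate the first alert, a minimal reproduction (hypothetical payload, using the regex from this diff): the removal is a single pass, so nested markers can reassemble into a live tag.

const re = /<script[^>]*>[\s\S]*?<\/script>/gi;

// Hypothetical payload: both inner '<script></script>' pairs are removed,
// and the leftover fragments reassemble into a complete script tag.
const nested = '<scr<script></script>ipt>alert(1)</scr<script></script>ipt>';
console.log(nested.replace(re, ''));
// -> '<script>alert(1)</script>'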

Check failure: Code scanning / CodeQL
Bad HTML filtering regexp (High)

This regular expression does not match script end tags like </script >.
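And to illustrate the second alert (again a hypothetical input): whitespace before the closing '>' is valid HTML for an end tag, but the pattern requires a literal '</script>', so the payload survives untouched.

const re = /<script[^>]*>[\s\S]*?<\/script>/gi;

const spaced = '<script>alert(1)</script >';
console.log(spaced.replace(re, ''));
// -> '<script>alert(1)</script >' (unchanged; a browser would still execute it)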

Copilot Autofix (AI, 2 days ago)

The best way to fix this problem is to use a proper HTML parser to remove unwanted tags (such as <script>, <link>, and <meta>), rather than relying on regular expressions. This provides more robust handling of HTML's intricacies, such as extra whitespace, unusual attribute formatting, and invalid but tolerated browser syntax. Since the script already imports rehype-parse (for parsing HTML to a syntax tree) and other tools from the unified/rehype ecosystem, the fix can use these existing libraries.

Specifically, instead of using .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '') (and similar regex for <link> and <meta>), we should parse the HTML into an AST, programmatically remove the unwanted nodes, and then serialize the AST back to HTML for further processing. This fix should be applied within the genMDFromHTML function, replacing the leanHTML construction (lines 233–242) with parser-based routines.

No new dependencies are needed since rehype-parse, unist-util-remove, and related packages are already imported. We'll need to use unified().use(rehypeParse, {fragment: true}) to parse the HTML, use remove(tree, test) from unist-util-remove to strip undesired nodes, and a rehype serializer (e.g., rehype-stringify) to convert the AST back to HTML. If not already available, we should add a rehype-stringify import.


Suggested changeset 1: scripts/generate-md-exports.mjs

Autofix patch. Run the following command in your local git repository to apply this patch:
cat << 'EOF' | git apply
diff --git a/scripts/generate-md-exports.mjs b/scripts/generate-md-exports.mjs
--- a/scripts/generate-md-exports.mjs
+++ b/scripts/generate-md-exports.mjs
@@ -26,6 +26,7 @@
 import remarkStringify from 'remark-stringify';
 import {unified} from 'unified';
 import {remove} from 'unist-util-remove';
+import rehypeStringify from 'rehype-stringify';
 
 const DOCS_ORIGIN = 'https://docs.sentry.io';
 const CACHE_VERSION = 3;
@@ -230,17 +231,44 @@
 
   // Normalize HTML to make cache keys deterministic across builds
   // Remove elements that change between builds but don't affect markdown output
-  const leanHTML = rawHTML
-    // Remove all script tags (build IDs, chunk hashes, Vercel injections)
-    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
-    // Remove link tags for stylesheets and preloads (chunk hashes change)
-    .replace(/<link[^>]*>/gi, '')
-    // Remove meta tags that might have build-specific content
-    .replace(/<meta name="next-size-adjust"[^>]*>/gi, '')
-    // Remove data attributes that Next.js/Vercel add (build IDs, etc.)
-    .replace(/\s+data-next-[a-z-]+="[^"]*"/gi, '')
-    .replace(/\s+data-nextjs-[a-z-]+="[^"]*"/gi, '');
+  // Remove all <script>, <link>, and next-size-adjust <meta> tags, as well as data-* attributes, using an HTML parser.
+  const parsedHtmlTree = unified()
+    .use(rehypeParse, {fragment: true})
+    .parse(rawHTML);
 
+  // Remove unwanted elements using unist-util-remove
+  // Remove <script> tags
+  remove(parsedHtmlTree, (node) => node.type === 'element' && node.tagName === 'script');
+  // Remove <link> tags
+  remove(parsedHtmlTree, (node) => node.type === 'element' && node.tagName === 'link');
+  // Remove <meta name="next-size-adjust" ...>
+  remove(parsedHtmlTree, (node) =>
+    node.type === 'element' &&
+    node.tagName === 'meta' &&
+    node.properties &&
+    node.properties.name === 'next-size-adjust'
+  );
+  // Remove data-next-* and data-nextjs-* attributes from all elements
+  function cleanseDataAttrs(node) {
+    if (node && node.type === 'element' && node.properties) {
+      Object.keys(node.properties).forEach((key) => {
+        if (/^data-next(-|js-)/.test(key)) {
+          delete node.properties[key];
+        }
+      });
+    }
+    if (node.children) {
+      node.children.forEach(cleanseDataAttrs);
+    }
+  }
+  cleanseDataAttrs(parsedHtmlTree);
+
+  // Convert AST back to HTML
+  const leanHTML = unified()
+    .use(() => (tree) => tree) // identity plugin since tree already processed
+    .use(rehypeStringify)
+    .stringify(parsedHtmlTree);
+
   if (shouldDebug) {
     console.log(
       `✂️  Lean HTML length: ${leanHTML.length} chars (removed ${rawHTML.length - leanHTML.length} chars)`
EOF
Unable to commit as this autofix suggestion is now outdated
// Remove link tags for stylesheets and preloads (chunk hashes change)
.replace(/<link[^>]*>/gi, '')
// Remove meta tags that might have build-specific content
.replace(/<meta name="next-size-adjust"[^>]*>/gi, '')
// Remove data attributes that Next.js/Vercel add (build IDs, etc.)
.replace(/\s+data-next-[a-z-]+="[^"]*"/gi, '')
.replace(/\s+data-nextjs-[a-z-]+="[^"]*"/gi, '');

if (shouldDebug) {
console.log(
`✂️ Lean HTML length: ${leanHTML.length} chars (removed ${rawHTML.length - leanHTML.length} chars)`
);
}

const cacheKey = `v${CACHE_VERSION}_${md5(leanHTML)}`;
const cacheFile = path.join(cacheDir, cacheKey);

if (shouldDebug) {
console.log(`🔑 Cache key: ${cacheKey}`);
}

if (!noCache) {
try {
const data = await text(
@@ -214,6 +265,17 @@
} catch (err) {
if (err.code !== 'ENOENT') {
console.warn(`Error using cache file ${cacheFile}:`, err);
} else if (shouldDebug) {
// Cache miss on debug file - show what's in cache
console.log(`❌ Cache miss! Looking for: ${cacheKey}`);
try {
const allCacheFiles = await readdir(cacheDir);
const v3Files = allCacheFiles.filter(f => f.startsWith('v3_')).slice(0, 3);
console.log(` Existing v3 files in cache:`);
v3Files.forEach(f => console.log(` - ${f}`));
} catch (e) {
console.log(` Could not read cache dir: ${e.message}`);
}
Bug: Debug Code and Logging Statements Left In

Debug code and logging statements have been accidentally committed. This includes:

  1. Debug counter initialization at line 203: genMDFromHTML.debugCount = 0;
  2. Debug logging for the first 3 files processed (lines 209-228): logs raw HTML length, script tags, link tags
  3. Debug logging showing HTML compression stats (lines 235-237)
  4. Debug logging showing cache key (lines 242-244)
  5. Debug logging showing cache directory contents on miss (lines 257-268): lists existing v3 cache files

These debug statements were likely left in during investigation of build cache behavior and should be removed before production deployment. They will produce unnecessary console output and logging overhead during normal builds.


}
}
}
@@ -311,8 +373,11 @@
const s3Client = getS3Client();
const failedTasks = [];
let cacheMisses = [];
let cacheHits = 0;
Bug: Stream Duplication Needed for Parallel Pipelines

The same readable stream (reader) is being used in two separate pipelines simultaneously. In Node.js, a readable stream can only be consumed once. When the first pipeline consumes the stream, it will end it, causing the second pipeline to fail with no data. This should be fixed by creating two separate readable streams: const reader1 = Readable.from(data); const reader2 = Readable.from(data); and using reader1 in the first pipeline and reader2 in the second.
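A minimal sketch of that fix (the function name and the two compressed destinations are assumptions, not the PR's actual code): materialize the payload once, then give each pipeline its own Readable.

import {createWriteStream} from 'node:fs';
import {Readable} from 'node:stream';
import {pipeline} from 'node:stream/promises';
import {createBrotliCompress, createGzip} from 'node:zlib';

// Sketch only: each pipeline consumes its own stream, so neither
// starves the other (a Readable can only be consumed once).
async function writeCompressedCopies(data, basePath) {
  const reader1 = Readable.from(data);
  const reader2 = Readable.from(data);
  await Promise.all([
    pipeline(reader1, createGzip(), createWriteStream(`${basePath}.gz`)),
    pipeline(reader2, createBrotliCompress(), createWriteStream(`${basePath}.br`)),
  ]);
}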


let r2CacheMisses = [];
console.log(`🤖 Worker[${id}]: Starting to process ${tasks.length} files...`);
console.log(
`🤖 Worker[${id}]: Starting to process ${tasks.length} files... (noCache=${noCache})`
);
for (const {sourcePath, targetPath, relativePath, r2Hash} of tasks) {
try {
const {data, cacheHit} = await genMDFromHTML(sourcePath, targetPath, {
@@ -321,6 +386,8 @@
});
if (!cacheHit) {
cacheMisses.push(relativePath);
} else {
cacheHits++;
}

if (r2Hash !== null) {
@@ -336,6 +403,14 @@
}
}
const success = tasks.length - failedTasks.length;

// Log cache statistics
const cacheHitRate = ((cacheHits / tasks.length) * 100).toFixed(1);
const cacheMissRate = ((cacheMisses.length / tasks.length) * 100).toFixed(1);
Bug: Zero Tasks Cause NaN Cache Rates

Division by zero when tasks.length is 0. If a worker is assigned zero tasks (which can happen when there are fewer files than workers), the cache hit rate and miss rate calculations will produce NaN values, because 0/0 evaluates to NaN in JavaScript. This would result in "NaN%" being logged to the console.
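A guard along these lines would avoid the NaN (a sketch, not the committed fix):

// Avoid 0/0 (NaN) when a worker was assigned no tasks
const total = tasks.length;
const cacheHitRate = total > 0 ? ((cacheHits / total) * 100).toFixed(1) : '0.0';
const cacheMissRate = total > 0 ? ((cacheMisses.length / total) * 100).toFixed(1) : '0.0';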


console.log(
`📊 Worker[${id}]: Cache stats - ${cacheHits}/${tasks.length} hits (${cacheHitRate}%), ${cacheMisses.length} misses (${cacheMissRate}%)`
);

if (r2CacheMisses.length / tasks.length > 0.1) {
console.warn(
`⚠️ Worker[${id}]: More than 10% of files had a different hash on R2 with the generation process.`
151 changes: 122 additions & 29 deletions src/mdx.ts
@@ -65,6 +65,80 @@ if (process.env.CI) {
mkdirSync(CACHE_DIR, {recursive: true});
}

// Cache registry hash per worker to avoid recomputing for every file
let cachedRegistryHash: Promise<string> | null = null;
async function getRegistryHashWithRetry(
maxRetries = 3,
initialDelayMs = 1000
): Promise<string> {
let lastError: Error | null = null;
let delayMs = initialDelayMs;
for (let attempt = 0; attempt <= maxRetries; attempt++) {
try {
const [apps, packages] = await Promise.all([
getAppRegistry(),
getPackageRegistry(),
]);
return md5(JSON.stringify({apps, packages}));
} catch (err) {
lastError = err as Error;
if (attempt < maxRetries) {
const currentDelay = delayMs;
// eslint-disable-next-line no-console
console.warn(
`Failed to fetch registry (attempt ${attempt + 1}/${maxRetries + 1}): ${lastError.message}. Retrying in ${currentDelay}ms...`
);
await new Promise(resolve => setTimeout(resolve, currentDelay));
delayMs *= 2; // Exponential backoff
}
}
}
throw new Error(
`Failed to fetch registry after ${maxRetries + 1} attempts: ${lastError?.message}`
);
}

function getRegistryHash(): Promise<string> {
if (!cachedRegistryHash) {
cachedRegistryHash = getRegistryHashWithRetry().catch(err => {
// Clear cache on error to allow retry on next call
cachedRegistryHash = null;
throw err;
});
}
return cachedRegistryHash;
}

// Track cache statistics per worker (silent tracking)
const cacheStats = {
registryHits: 0,
registryMisses: 0,
uniqueRegistryFiles: new Set<string>(),
};

// Log summary at end
function logCacheSummary() {
const total = cacheStats.registryHits + cacheStats.registryMisses;
if (total === 0) {
return;
}

const hitRate = ((cacheStats.registryHits / total) * 100).toFixed(1);
const uniqueFiles = cacheStats.uniqueRegistryFiles.size;

// eslint-disable-next-line no-console
console.log(
`📊 [MDX Cache] ${cacheStats.registryHits}/${total} registry files cached (${hitRate}% hit rate, ${uniqueFiles} unique files)`
);
}

// Log final summary when worker exits
if (typeof process !== 'undefined') {
process.on('beforeExit', () => {
logCacheSummary();
});
}

const md5 = (data: BinaryLike) => createHash('md5').update(data).digest('hex');

async function readCacheFile<T>(file: string): Promise<T> {
@@ -209,6 +283,7 @@ export async function getDevDocsFrontMatterUncached(): Promise<FrontMatter[]> {
)
)
).filter(isNotNil);

return frontMatters;
}

@@ -396,6 +471,7 @@ async function getAllFilesFrontMatter(): Promise<FrontMatter[]> {
);
}
}

return allFrontMatter;
}

@@ -531,45 +607,61 @@ export async function getFileBySlug(slug: string): Promise<SlugFile> {
const outdir = path.join(root, 'public', 'mdx-images');
await mkdir(outdir, {recursive: true});

// If the file contains content that depends on the Release Registry (such as an SDK's latest version), avoid using the cache for that file, i.e. always rebuild it.
// This is because the content from the registry might have changed since the last time the file was cached.
// If a new component that injects content from the registry is introduced, it should be added to the patterns below.
const skipCache =
// Check if file depends on Release Registry
const dependsOnRegistry =
source.includes('@inject') ||
Member: Not sure if this @inject thing was related to the registry

source.includes('<PlatformSDKPackageName') ||
source.includes('<LambdaLayerDetail');

if (process.env.CI) {
if (skipCache) {
// eslint-disable-next-line no-console
console.info(
`Not using cached version of ${sourcePath}, as its content depends on the Release Registry`
);
// Build cache key from source content
const sourceHash = md5(source);

// For files that depend on registry, include registry version in cache key
// This prevents serving stale content when registry is updated
if (dependsOnRegistry) {
// Get registry hash (cached per worker to avoid redundant fetches)
// If this fails, the build will fail - registry is required for these files
const registryHash = await getRegistryHash();
cacheKey = `${sourceHash}-${registryHash}`;
} else {
cacheKey = md5(source);
cacheFile = path.join(CACHE_DIR, `${cacheKey}.br`);
assetsCacheDir = path.join(CACHE_DIR, cacheKey);
// Regular files without registry dependencies
cacheKey = sourceHash;
}

try {
const [cached, _] = await Promise.all([
readCacheFile<SlugFile>(cacheFile),
cp(assetsCacheDir, outdir, {recursive: true}),
]);
return cached;
} catch (err) {
if (
err.code !== 'ENOENT' &&
err.code !== 'ABORT_ERR' &&
err.code !== 'Z_BUF_ERROR'
) {
// If cache is corrupted, ignore and proceed
// eslint-disable-next-line no-console
console.warn(`Failed to read MDX cache: ${cacheFile}`, err);
}
cacheFile = path.join(CACHE_DIR, `${cacheKey}.br`);
assetsCacheDir = path.join(CACHE_DIR, cacheKey);

try {
const [cached, _] = await Promise.all([
readCacheFile<SlugFile>(cacheFile),
cp(assetsCacheDir, outdir, {recursive: true}),
]);
// Track cache hit silently
if (dependsOnRegistry) {
cacheStats.registryHits++;
cacheStats.uniqueRegistryFiles.add(sourcePath);
}
return cached;
} catch (err) {
if (
err.code !== 'ENOENT' &&
err.code !== 'ABORT_ERR' &&
err.code !== 'Z_BUF_ERROR'
) {
// If cache is corrupted, ignore and proceed
// eslint-disable-next-line no-console
console.warn(`Failed to read MDX cache: ${cacheFile}`, err);
}
}
}

// Track cache miss silently
if (process.env.CI && dependsOnRegistry) {
cacheStats.registryMisses++;
cacheStats.uniqueRegistryFiles.add(sourcePath);
}
Bug: Cache Miss Tracking Fails in Non-CI Environments

The cache miss tracking for registry-dependent files occurs unconditionally, even when process.env.CI is false and the cache system is not being used. The condition should be if (process.env.CI && dependsOnRegistry) instead of just if (dependsOnRegistry) to avoid recording false cache misses when outside of CI environments. This causes misleading cache statistics when the caching system isn't active.



process.env.ESBUILD_BINARY_PATH = path.join(
root,
'node_modules',
@@ -700,7 +792,8 @@ export async function getFileBySlug(slug: string): Promise<SlugFile> {
},
};

if (assetsCacheDir && cacheFile && !skipCache) {
// Save to cache if we have a cache key (we now cache everything, including registry-dependent files)
if (assetsCacheDir && cacheFile && cacheKey) {
await cp(assetsCacheDir, outdir, {recursive: true});
writeCacheFile(cacheFile, JSON.stringify(resultObj)).catch(e => {
// eslint-disable-next-line no-console