diff --git a/scripts/generate-md-exports.mjs b/scripts/generate-md-exports.mjs
index 229a700691e29..20d40bd9d35ff 100644
--- a/scripts/generate-md-exports.mjs
+++ b/scripts/generate-md-exports.mjs
@@ -1,11 +1,12 @@
 #!/usr/bin/env node
+// testing cache
 /* eslint-disable no-console */
 import {ListObjectsV2Command, PutObjectCommand, S3Client} from '@aws-sdk/client-s3';
 import imgLinks from '@pondorasti/remark-img-links';
 import {selectAll} from 'hast-util-select';
 import {createHash} from 'node:crypto';
 import {createReadStream, createWriteStream, existsSync} from 'node:fs';
-import {mkdir, opendir, readFile, rm, writeFile} from 'node:fs/promises';
+import {mkdir, opendir, readdir, readFile, rm, writeFile} from 'node:fs/promises';
 import {cpus} from 'node:os';
 import * as path from 'node:path';
 import {compose, Readable} from 'node:stream';
@@ -91,12 +92,15 @@ async function createWork() {
   console.log(`šŸ’° Cache directory: ${CACHE_DIR}`);
   const noCache = !existsSync(CACHE_DIR);
   if (noCache) {
-    console.log(`ā„¹ļø No cache directory found, this will take a while...`);
+    console.log(`ā„¹ļø No cache directory found, creating fresh cache...`);
     await mkdir(CACHE_DIR, {recursive: true});
+  } else {
+    console.log(`āœ… Cache directory exists, will attempt to use cached files`);
   }
 
-  // On a 16-core machine, 8 workers were optimal (and slightly faster than 16)
-  const numWorkers = Math.max(Math.floor(cpus().length / 2), 2);
+  // Use 75% of CPU cores, with a floor of 2 workers
+  const numWorkers = Math.max(Math.floor(cpus().length * 0.75), 2);
+  console.log(`āš™ļø Using ${numWorkers} workers for ${cpus().length} CPU cores`);
   const workerTasks = new Array(numWorkers).fill(null).map(() => []);
 
   let existingFilesOnR2 = null;
@@ -196,13 +200,60 @@ async function createWork() {
 
 const md5 = data => createHash('md5').update(data).digest('hex');
 
+// Initialize debug counter
+genMDFromHTML.debugCount = 0;
+
 async function genMDFromHTML(source, target, {cacheDir, noCache}) {
-  const leanHTML = (await readFile(source, {encoding: 'utf8'}))
-    // Remove all script tags, as they are not needed in markdown
-    // and they are not stable across builds, causing cache misses
-    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '');
+  const rawHTML = await readFile(source, {encoding: 'utf8'});
+
+  // Debug: Log first 3 files to understand what's being removed
+  const shouldDebug = genMDFromHTML.debugCount < 3;
+  if (shouldDebug) {
+    genMDFromHTML.debugCount++;
+    const fileName = path.basename(source);
+    console.log(`\nšŸ” DEBUG: Processing ${fileName}`);
+    console.log(`šŸ“ Raw HTML length: ${rawHTML.length} chars`);
+
+    // Extract what we're removing to see if it's stable
+    const scripts = rawHTML.match(/<script[^>]*src="[^"]*"/gi);
+    const links = rawHTML.match(/<link[^>]*>/gi);
+
+    console.log(`šŸ“¦ Found ${scripts?.length || 0} script tags with src`);
+    if (scripts && scripts.length > 0) {
+      console.log(`   First 3: ${scripts.slice(0, 3).join(', ')}`);
+    }
+    console.log(`šŸ”— Found ${links?.length || 0} link tags`);
+    if (links && links.length > 0) {
+      console.log(`   First 3: ${links.slice(0, 3).join(', ')}`);
+    }
+  }
+
+  // Normalize HTML to make cache keys deterministic across builds
+  // Remove elements that change between builds but don't affect markdown output
+  const leanHTML = rawHTML
+    // Remove all script tags (build IDs, chunk hashes, Vercel injections)
+    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '')
+    // Remove link tags for stylesheets and preloads (chunk hashes change)
+    .replace(/<link[^>]*>/gi, '')
+    // Remove meta tags that might have build-specific content
+    .replace(/<meta[^>]*>/gi, '')
+    // Remove data attributes that Next.js/Vercel add (build IDs, etc.)
+    .replace(/\s+data-next-[a-z-]+="[^"]*"/gi, '')
+    .replace(/\s+data-nextjs-[a-z-]+="[^"]*"/gi, '');
+
+  if (shouldDebug) {
+    console.log(
+      `āœ‚ļø Lean HTML length: ${leanHTML.length} chars (removed ${rawHTML.length - leanHTML.length} chars)`
+    );
+  }
+
   const cacheKey = `v${CACHE_VERSION}_${md5(leanHTML)}`;
   const cacheFile = path.join(cacheDir, cacheKey);
+
+  if (shouldDebug) {
+    console.log(`šŸ”‘ Cache key: ${cacheKey}`);
+  }
+
   if (!noCache) {
     try {
       const data = await text(
@@ -214,6 +265,17 @@ async function genMDFromHTML(source, target, {cacheDir, noCache}) {
     } catch (err) {
       if (err.code !== 'ENOENT') {
         console.warn(`Error using cache file ${cacheFile}:`, err);
+      } else if (shouldDebug) {
+        // Cache miss on debug file - show what's in cache
+        console.log(`āŒ Cache miss! Looking for: ${cacheKey}`);
+        try {
+          const allCacheFiles = await readdir(cacheDir);
+          const sampleFiles = allCacheFiles.filter(f => f.startsWith(`v${CACHE_VERSION}_`)).slice(0, 3);
+          console.log(`   Existing v${CACHE_VERSION}_ files in cache:`);
+          sampleFiles.forEach(f => console.log(`   - ${f}`));
+        } catch (e) {
+          console.log(`   Could not read cache dir: ${e.message}`);
+        }
       }
     }
   }
@@ -311,8 +373,11 @@ async function processTaskList({id, tasks, cacheDir, noCache}) {
   const s3Client = getS3Client();
   const failedTasks = [];
   let cacheMisses = [];
+  let cacheHits = 0;
   let r2CacheMisses = [];
-  console.log(`šŸ¤– Worker[${id}]: Starting to process ${tasks.length} files...`);
+  console.log(
+    `šŸ¤– Worker[${id}]: Starting to process ${tasks.length} files... (noCache=${noCache})`
+  );
   for (const {sourcePath, targetPath, relativePath, r2Hash} of tasks) {
     try {
       const {data, cacheHit} = await genMDFromHTML(sourcePath, targetPath, {
@@ -321,6 +386,8 @@
       });
       if (!cacheHit) {
         cacheMisses.push(relativePath);
+      } else {
+        cacheHits++;
       }
 
       if (r2Hash !== null) {
@@ -336,6 +403,14 @@
     }
   }
   const success = tasks.length - failedTasks.length;
+
+  // Log cache statistics
+  const cacheHitRate = ((cacheHits / tasks.length) * 100).toFixed(1);
+  const cacheMissRate = ((cacheMisses.length / tasks.length) * 100).toFixed(1);
+  console.log(
+    `šŸ“Š Worker[${id}]: Cache stats - ${cacheHits}/${tasks.length} hits (${cacheHitRate}%), ${cacheMisses.length} misses (${cacheMissRate}%)`
+  );
+
   if (r2CacheMisses.length / tasks.length > 0.1) {
     console.warn(
       `āš ļø Worker[${id}]: More than 10% of files had a different hash on R2 with the generation process.`
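The normalization pass above is what makes the export cache key build-stable: hashing the raw HTML would miss on every deploy, because script/link chunk hashes change even when the page content does not. A minimal standalone sketch of the keying idea — `normalizeHTML` and the `CACHE_VERSION = 3` value are assumptions for illustration, mirroring the replace chain in the patch:

```ts
import {createHash} from 'node:crypto';

// Assumed value; the script prefixes keys with `v${CACHE_VERSION}_` (v3_ in the debug output).
const CACHE_VERSION = 3;

// Hypothetical mirror of the patch's replace chain.
function normalizeHTML(html: string): string {
  return html
    .replace(/<script[^>]*>[\s\S]*?<\/script>/gi, '') // build IDs, chunk hashes
    .replace(/<link[^>]*>/gi, '') // stylesheet/preload chunk hashes
    .replace(/<meta[^>]*>/gi, '') // build-specific metadata
    .replace(/\s+data-next-[a-z-]+="[^"]*"/gi, '')
    .replace(/\s+data-nextjs-[a-z-]+="[^"]*"/gi, '');
}

const cacheKey = (html: string) =>
  `v${CACHE_VERSION}_${createHash('md5').update(normalizeHTML(html)).digest('hex')}`;

// Two builds differing only in chunk hashes map to the same key:
const buildA = '<p>Docs</p><script src="/chunks/app.abc123.js"></script>';
const buildB = '<p>Docs</p><script src="/chunks/app.def456.js"></script>';
console.log(cacheKey(buildA) === cacheKey(buildB)); // true
```

The stripped elements can no longer invalidate the cache when they change, which stays safe as long as the same `leanHTML` string is what feeds the markdown conversion, so key and output cannot drift apart.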
diff --git a/src/mdx.ts b/src/mdx.ts
index 169bd8492cbb5..d4908beb296c6 100644
--- a/src/mdx.ts
+++ b/src/mdx.ts
@@ -65,6 +65,80 @@ if (process.env.CI) {
   mkdirSync(CACHE_DIR, {recursive: true});
 }
 
+// Cache registry hash per worker to avoid recomputing for every file
+let cachedRegistryHash: Promise<string> | null = null;
+async function getRegistryHashWithRetry(
+  maxRetries = 3,
+  initialDelayMs = 1000
+): Promise<string> {
+  let lastError: Error | null = null;
+  let delayMs = initialDelayMs;
+  for (let attempt = 0; attempt <= maxRetries; attempt++) {
+    try {
+      const [apps, packages] = await Promise.all([
+        getAppRegistry(),
+        getPackageRegistry(),
+      ]);
+      return md5(JSON.stringify({apps, packages}));
+    } catch (err) {
+      lastError = err as Error;
+      if (attempt < maxRetries) {
+        const currentDelay = delayMs;
+        // eslint-disable-next-line no-console
+        console.warn(
+          `Failed to fetch registry (attempt ${attempt + 1}/${maxRetries + 1}): ${lastError.message}. Retrying in ${currentDelay}ms...`
+        );
+        await new Promise(resolve => setTimeout(resolve, currentDelay));
+        delayMs *= 2; // Exponential backoff
+      }
+    }
+  }
+  throw new Error(
+    `Failed to fetch registry after ${maxRetries + 1} attempts: ${lastError?.message}`
+  );
+}
+
+function getRegistryHash(): Promise<string> {
+  if (!cachedRegistryHash) {
+    cachedRegistryHash = getRegistryHashWithRetry().catch(err => {
+      // Clear cache on error to allow retry on next call
+      cachedRegistryHash = null;
+      throw err;
+    });
+  }
+  return cachedRegistryHash;
+}
+
+// Track cache statistics per worker (silent tracking)
+const cacheStats = {
+  registryHits: 0,
+  registryMisses: 0,
+  uniqueRegistryFiles: new Set<string>(),
+};
+
+// Log summary at end
+function logCacheSummary() {
+  const total = cacheStats.registryHits + cacheStats.registryMisses;
+  if (total === 0) {
+    return;
+  }
+
+  const hitRate = ((cacheStats.registryHits / total) * 100).toFixed(1);
+  const uniqueFiles = cacheStats.uniqueRegistryFiles.size;
+
+  // eslint-disable-next-line no-console
+  console.log(
+    `šŸ“Š [MDX Cache] ${cacheStats.registryHits}/${total} registry files cached (${hitRate}% hit rate, ${uniqueFiles} unique files)`
+  );
+}
+
+// Log final summary when worker exits
+if (typeof process !== 'undefined') {
+  process.on('beforeExit', () => {
+    logCacheSummary();
+  });
+}
+
 const md5 = (data: BinaryLike) => createHash('md5').update(data).digest('hex');
 
 async function readCacheFile<T>(file: string): Promise<T> {
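`getRegistryHash` above is the classic memoize-the-promise pattern: concurrent callers share a single in-flight fetch, and clearing the slot on rejection keeps a transient failure from being cached for the lifetime of the worker. The same idea, generalized — the `memoizeAsync` helper name is hypothetical, standing in for what the patch inlines:

```ts
// Generalized sketch of the pattern used by getRegistryHash().
function memoizeAsync<T>(fn: () => Promise<T>): () => Promise<T> {
  let cached: Promise<T> | null = null;
  return () => {
    if (!cached) {
      cached = fn().catch(err => {
        cached = null; // don't cache rejections; the next call retries
        throw err;
      });
    }
    return cached;
  };
}

// Example: three concurrent callers trigger a single underlying fetch.
let fetches = 0;
const getHash = memoizeAsync(async () => {
  fetches++;
  return 'deadbeef'; // stand-in for getRegistryHashWithRetry()
});
await Promise.all([getHash(), getHash(), getHash()]);
console.log(fetches); // 1
```

Caching the promise rather than the resolved value is the key design choice: it deduplicates work even while the first fetch is still pending.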
@@ -209,6 +283,7 @@ export async function getDevDocsFrontMatterUncached(): Promise<FrontMatter[]> {
       )
     )
   ).filter(isNotNil);
+
   return frontMatters;
 }
 
@@ -396,6 +471,7 @@ async function getAllFilesFrontMatter(): Promise<FrontMatter[]> {
       );
     }
   }
+
   return allFrontMatter;
 }
 
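One caveat on the `beforeExit` summary hook added earlier: Node only emits `beforeExit` when the event loop drains naturally; it is not emitted on `process.exit()` or on uncaught exceptions, so the cache summary can be silently skipped on those paths:

```ts
process.on('beforeExit', () => {
  console.log('šŸ“Š summary runs here'); // only when the event loop drains
});

// process.exit(0); // uncommenting this would terminate without emitting 'beforeExit'
```

If the build tears workers down with `process.exit()`, calling `logCacheSummary()` explicitly at the end of the worker entry point would be the safer placement.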
@@ -531,45 +607,61 @@ export async function getFileBySlug(slug: string): Promise<SlugFile> {
   const outdir = path.join(root, 'public', 'mdx-images');
   await mkdir(outdir, {recursive: true});
 
-  // If the file contains content that depends on the Release Registry (such as an SDK's latest version), avoid using the cache for that file, i.e. always rebuild it.
-  // This is because the content from the registry might have changed since the last time the file was cached.
-  // If a new component that injects content from the registry is introduced, it should be added to the patterns below.
-  const skipCache =
+  // Check if file depends on Release Registry
+  const dependsOnRegistry =
     source.includes('@inject') || source.includes('<PlatformSdkPackageName');
 
   let cacheKey: string | null = null;
   let cacheFile: string | null = null;
   let assetsCacheDir: string | null = null;
 
   if (process.env.CI) {
-    cacheKey = md5(source);
-    cacheFile = path.join(CACHE_DIR, `${cacheKey}.br`);
-    assetsCacheDir = path.join(CACHE_DIR, cacheKey);
-
-    if (!skipCache) {
-      try {
-        const [cached, _] = await Promise.all([
-          readCacheFile<SlugFile>(cacheFile),
-          cp(assetsCacheDir, outdir, {recursive: true}),
-        ]);
-        return cached;
-      } catch (err) {
-        if (
-          err.code !== 'ENOENT' &&
-          err.code !== 'ABORT_ERR' &&
-          err.code !== 'Z_BUF_ERROR'
-        ) {
-          // If cache is corrupted, ignore and proceed
-          // eslint-disable-next-line no-console
-          console.warn(`Failed to read MDX cache: ${cacheFile}`, err);
-        }
-      }
+    // Fold the registry hash into the cache key for registry-dependent files,
+    // so a registry change invalidates their entries instead of bypassing the cache
+    const registryHash = dependsOnRegistry ? await getRegistryHash() : '';
+    cacheKey = md5(source + registryHash);
+    cacheFile = path.join(CACHE_DIR, `${cacheKey}.br`);
+    assetsCacheDir = path.join(CACHE_DIR, cacheKey);
+
+    try {
+      const [cached, _] = await Promise.all([
+        readCacheFile<SlugFile>(cacheFile),
+        cp(assetsCacheDir, outdir, {recursive: true}),
+      ]);
+      // Track cache hit silently
+      if (dependsOnRegistry) {
+        cacheStats.registryHits++;
+        cacheStats.uniqueRegistryFiles.add(sourcePath);
+      }
+      return cached;
+    } catch (err) {
+      if (
+        err.code !== 'ENOENT' &&
+        err.code !== 'ABORT_ERR' &&
+        err.code !== 'Z_BUF_ERROR'
+      ) {
+        // If cache is corrupted, ignore and proceed
+        // eslint-disable-next-line no-console
+        console.warn(`Failed to read MDX cache: ${cacheFile}`, err);
+      }
     }
   }
 
+  // Track cache miss silently
+  if (process.env.CI && dependsOnRegistry) {
+    cacheStats.registryMisses++;
+    cacheStats.uniqueRegistryFiles.add(sourcePath);
+  }
+
   process.env.ESBUILD_BINARY_PATH = path.join(
     root,
     'node_modules',
@@ -700,7 +792,8 @@ export async function getFileBySlug(slug: string): Promise<SlugFile> {
     },
   };
 
-  if (assetsCacheDir && cacheFile && !skipCache) {
+  // Save to cache if we have a cache key (we now cache everything, including registry-dependent files)
+  if (assetsCacheDir && cacheFile && cacheKey) {
     await cp(assetsCacheDir, outdir, {recursive: true});
     writeCacheFile(cacheFile, JSON.stringify(resultObj)).catch(e => {
       // eslint-disable-next-line no-console
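Net effect of the mdx.ts changes: registry-dependent files are no longer rebuilt unconditionally; the registry hash becomes part of the cache key, so a registry change surfaces as an ordinary cache miss and rebuild. A toy illustration of that keying — one plausible shape with illustrative names, not the patch's literal code:

```ts
import {createHash} from 'node:crypto';

const md5 = (s: string) => createHash('md5').update(s).digest('hex');

// Illustrative composite key: registry-dependent sources mix the registry hash in.
function mdxCacheKey(source: string, dependsOnRegistry: boolean, registryHash: string): string {
  return md5(dependsOnRegistry ? source + registryHash : source);
}

const sdkPage = 'Latest version: <PlatformSdkPackageName />';
console.log(mdxCacheKey(sdkPage, true, 'registry-state-1')); // key A
console.log(mdxCacheKey(sdkPage, true, 'registry-state-2')); // key B: miss -> rebuild
console.log(mdxCacheKey('plain prose page', false, 'registry-state-2')); // unaffected by registry
```

While the registry is unchanged, registry-dependent pages now hit the cache instead of rebuilding on every run; the cost is one registry fetch per worker, which the memoized `getRegistryHash()` amortizes across all files.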