diff --git a/scripts/generate-md-exports.mjs b/scripts/generate-md-exports.mjs
index 229a700691e298..090ecfed7a0bc2 100644
--- a/scripts/generate-md-exports.mjs
+++ b/scripts/generate-md-exports.mjs
@@ -5,7 +5,7 @@ import imgLinks from '@pondorasti/remark-img-links';
 import {selectAll} from 'hast-util-select';
 import {createHash} from 'node:crypto';
 import {createReadStream, createWriteStream, existsSync} from 'node:fs';
-import {mkdir, opendir, readFile, rm, writeFile} from 'node:fs/promises';
+import {mkdir, opendir, readdir, readFile, rm, writeFile} from 'node:fs/promises';
 import {cpus} from 'node:os';
 import * as path from 'node:path';
 import {compose, Readable} from 'node:stream';
@@ -58,7 +58,15 @@ async function uploadToCFR2(s3Client, relativePath, data) {
   return;
 }
-function taskFinishHandler({id, success, failedTasks}) {
+function taskFinishHandler(
+  {id, success, failedTasks, usedCacheFiles},
+  allUsedCacheFiles
+) {
+  // Collect cache files used by this worker
+  if (usedCacheFiles) {
+    usedCacheFiles.forEach(file => allUsedCacheFiles.add(file));
+  }
+
   if (failedTasks.length === 0) {
     console.log(`✅ Worker[${id}]: converted ${success} files successfully.`);
     return false;
   }
@@ -95,6 +103,9 @@ async function createWork() {
     await mkdir(CACHE_DIR, {recursive: true});
   }
 
+  // Track which cache files are used during this build
+  const usedCacheFiles = new Set();
+
   // On a 16-core machine, 8 workers were optimal (and slightly faster than 16)
   const numWorkers = Math.max(Math.floor(cpus().length / 2), 2);
   const workerTasks = new Array(numWorkers).fill(null).map(() => []);
@@ -163,7 +174,7 @@
         },
       });
       let hasErrors = false;
-      worker.on('message', data => (hasErrors = taskFinishHandler(data)));
+      worker.on('message', data => (hasErrors = taskFinishHandler(data, usedCacheFiles)));
       worker.on('error', reject);
       worker.on('exit', code => {
         if (code !== 0) {
@@ -175,14 +186,16 @@
     });
   });
   // The main thread can also process tasks -- That's 65% more bullet per bullet! -Cave Johnson
+  const mainThreadUsedFiles = new Set();
   workerPromises.push(
     processTaskList({
       id: workerTasks.length - 1,
       tasks: workerTasks[workerTasks.length - 1],
       cacheDir: CACHE_DIR,
       noCache,
+      usedCacheFiles: mainThreadUsedFiles,
     }).then(data => {
-      if (taskFinishHandler(data)) {
+      if (taskFinishHandler(data, usedCacheFiles)) {
        throw new Error(`Worker[${data.id}] had some errors.`);
      }
    })
@@ -190,13 +203,34 @@ async function createWork() {
   await Promise.all(workerPromises);
 
+  // Clean up unused cache files to prevent unbounded growth
+  if (!noCache) {
+    try {
+      const allFiles = await readdir(CACHE_DIR);
+      let cleanedCount = 0;
+
+      for (const file of allFiles) {
+        if (!usedCacheFiles.has(file)) {
+          await rm(path.join(CACHE_DIR, file), {force: true});
+          cleanedCount++;
+        }
+      }
+
+      if (cleanedCount > 0) {
+        console.log(`🧹 Cleaned up ${cleanedCount} unused cache files`);
+      }
+    } catch (err) {
+      console.warn('Failed to clean unused cache files:', err);
+    }
+  }
+
   console.log(`📄 Generated ${numFiles} markdown files from HTML.`);
   console.log('✅ Markdown export generation complete!');
 }
 
 const md5 = data => createHash('md5').update(data).digest('hex');
 
-async function genMDFromHTML(source, target, {cacheDir, noCache}) {
+async function genMDFromHTML(source, target, {cacheDir, noCache, usedCacheFiles}) {
   const leanHTML = (await readFile(source, {encoding: 'utf8'}))
     // Remove all script tags, as they are not needed in markdown
     // and they are not stable across builds, causing cache misses
@@ -210,6 +244,11 @@ async function genMDFromHTML(source, target, {cacheDir, noCache}) {
     );
     await writeFile(target, data, {encoding: 'utf8'});
 
+    // Track that we used this cache file
+    if (usedCacheFiles) {
+      usedCacheFiles.add(cacheKey);
+    }
+
     return {cacheHit: true, data};
   } catch (err) {
     if (err.code !== 'ENOENT') {
@@ -304,10 +343,20 @@ async function genMDFromHTML(source, target, {cacheDir, noCache}) {
     ).catch(err => console.warn('Error writing cache file:', err)),
   ]);
 
+  // Track that we created this cache file
+  if (usedCacheFiles) {
+    usedCacheFiles.add(cacheKey);
+  }
+
   return {cacheHit: false, data};
 }
 
-async function processTaskList({id, tasks, cacheDir, noCache}) {
+async function processTaskList({id, tasks, cacheDir, noCache, usedCacheFiles}) {
+  // Workers don't receive usedCacheFiles in workerData, so create a new Set
+  if (!usedCacheFiles) {
+    usedCacheFiles = new Set();
+  }
+
   const s3Client = getS3Client();
   const failedTasks = [];
   let cacheMisses = [];
@@ -318,6 +367,7 @@ async function processTaskList({id, tasks, cacheDir, noCache}) {
       const {data, cacheHit} = await genMDFromHTML(sourcePath, targetPath, {
         cacheDir,
         noCache,
+        usedCacheFiles,
       });
       if (!cacheHit) {
         cacheMisses.push(relativePath);
       }
@@ -357,6 +407,7 @@ async function processTaskList({id, tasks, cacheDir, noCache}) {
     id,
     success,
     failedTasks,
+    usedCacheFiles: Array.from(usedCacheFiles),
   };
 }
diff --git a/src/mdx.ts b/src/mdx.ts
index 169bd8492cbb55..4419145af7e343 100644
--- a/src/mdx.ts
+++ b/src/mdx.ts
@@ -4,7 +4,7 @@ import yaml from 'js-yaml';
 import {bundleMDX} from 'mdx-bundler';
 import {BinaryLike, createHash} from 'node:crypto';
 import {createReadStream, createWriteStream, mkdirSync} from 'node:fs';
-import {access, cp, mkdir, opendir, readFile} from 'node:fs/promises';
+import {access, cp, mkdir, opendir, readFile, rm, stat} from 'node:fs/promises';
 import path from 'node:path';
 // @ts-expect-error ts(2305) -- For some reason "compose" is not recognized in the types
 import {compose, Readable} from 'node:stream';
@@ -63,10 +63,104 @@ const CACHE_COMPRESS_LEVEL = 4;
 const CACHE_DIR = path.join(root, '.next', 'cache', 'mdx-bundler');
 if (process.env.CI) {
   mkdirSync(CACHE_DIR, {recursive: true});
+
+  // Clean up old cache files in background to prevent unbounded growth
+  // Delete any file not accessed in the last 24 hours (meaning it wasn't used in recent builds)
+  // This runs once per worker process and doesn't block the build
+  (async () => {
+    try {
+      const MAX_CACHE_AGE_MS = 24 * 60 * 60 * 1000; // 24 hours
+      const now = Date.now();
+      let cleanedCount = 0;
+
+      const dir = await opendir(CACHE_DIR);
+
+      for await (const dirent of dir) {
+        if (!dirent.isFile() && !dirent.isDirectory()) {
+          continue;
+        }
+
+        const itemPath = path.join(CACHE_DIR, dirent.name);
+        try {
+          const stats = await stat(itemPath);
+          const age = now - stats.atimeMs; // Time since last access
+
+          if (age > MAX_CACHE_AGE_MS) {
+            await rm(itemPath, {recursive: true, force: true});
+            cleanedCount++;
+          }
+        } catch (err) {
+          // Skip items we can't stat/delete
+        }
+      }
+
+      if (cleanedCount > 0) {
+        // eslint-disable-next-line no-console
+        console.log(`🧹 MDX cache: Cleaned up ${cleanedCount} unused items (>24h)`);
+      }
+    } catch (err) {
+      // Silently fail - cache cleanup is not critical
+    }
+  })();
 }
 
 const md5 = (data: BinaryLike) => createHash('md5').update(data).digest('hex');
 
+// Worker-level registry cache to avoid fetching multiple times per worker
+let cachedRegistryHash: Promise<string> | null = null;
+
+/**
+ * Fetch registry data and compute its hash, with retry logic and exponential backoff.
+ * Retries up to maxRetries times with exponential backoff starting at initialDelayMs.
+ */
+async function getRegistryHashWithRetry(
+  maxRetries = 3,
+  initialDelayMs = 1000
+): Promise<string> {
+  let lastError: Error | null = null;
+
+  for (let attempt = 0; attempt <= maxRetries; attempt++) {
+    try {
+      const [apps, packages] = await Promise.all([
+        getAppRegistry(),
+        getPackageRegistry(),
+      ]);
+      return md5(JSON.stringify({apps, packages}));
+    } catch (err) {
+      lastError = err as Error;
+
+      if (attempt < maxRetries) {
+        const delay = initialDelayMs * Math.pow(2, attempt);
+        // eslint-disable-next-line no-console
+        console.warn(
+          `Failed to fetch registry (attempt ${attempt + 1}/${maxRetries + 1}). Retrying in ${delay}ms...`,
+          err
+        );
+        await new Promise(resolve => setTimeout(resolve, delay));
+      }
+    }
+  }
+
+  throw lastError || new Error('Failed to fetch registry after all retries');
+}
+
+/**
+ * Get the registry hash, using cached value if available.
+ * This ensures we only fetch the registry once per worker process.
+ */
+function getRegistryHash(): Promise<string> {
+  if (!cachedRegistryHash) {
+    // eslint-disable-next-line no-console
+    console.info('Fetching registry hash for the first time in this worker');
+    cachedRegistryHash = getRegistryHashWithRetry().catch(err => {
+      // Reset cache on error so next call will retry
+      cachedRegistryHash = null;
+      throw err;
+    });
+  }
+  return cachedRegistryHash;
+}
+
 async function readCacheFile<T>(file: string): Promise<T> {
   const reader = createReadStream(file);
   const decompressor = createBrotliDecompress();
@@ -531,22 +625,40 @@ export async function getFileBySlug(slug: string): Promise<SlugFile> {
   const outdir = path.join(root, 'public', 'mdx-images');
   await mkdir(outdir, {recursive: true});
 
-  // If the file contains content that depends on the Release Registry (such as an SDK's latest version), avoid using the cache for that file, i.e. always rebuild it.
-  // This is because the content from the registry might have changed since the last time the file was cached.
-  // If a new component that injects content from the registry is introduced, it should be added to the patterns below.
-  const skipCache =
+  // Detect if file contains content that depends on the Release Registry
+  // If it does, we include the registry hash in the cache key so the cache
+  // is invalidated when the registry changes.
+  const dependsOnRegistry =
     source.includes('@inject') || source.includes('
     },
   };
 
-  if (assetsCacheDir && cacheFile && !skipCache) {
+  if (assetsCacheDir && cacheFile && cacheKey) {
     await cp(assetsCacheDir, outdir, {recursive: true});
     writeCacheFile(cacheFile, JSON.stringify(resultObj)).catch(e => {
       // eslint-disable-next-line no-console