scripts/generate-md-exports.mjs (63 changes: 57 additions & 6 deletions)
@@ -5,7 +5,7 @@ import imgLinks from '@pondorasti/remark-img-links';
import {selectAll} from 'hast-util-select';
import {createHash} from 'node:crypto';
import {createReadStream, createWriteStream, existsSync} from 'node:fs';
-import {mkdir, opendir, readFile, rm, writeFile} from 'node:fs/promises';
+import {mkdir, opendir, readdir, readFile, rm, writeFile} from 'node:fs/promises';
import {cpus} from 'node:os';
import * as path from 'node:path';
import {compose, Readable} from 'node:stream';
@@ -58,7 +58,15 @@ async function uploadToCFR2(s3Client, relativePath, data) {
return;
}

-function taskFinishHandler({id, success, failedTasks}) {
+function taskFinishHandler(
+{id, success, failedTasks, usedCacheFiles},
+allUsedCacheFiles
+) {
// Collect cache files used by this worker
if (usedCacheFiles) {
usedCacheFiles.forEach(file => allUsedCacheFiles.add(file));
}

if (failedTasks.length === 0) {
console.log(`✅ Worker[${id}]: converted ${success} files successfully.`);
return false;
@@ -95,6 +103,9 @@ async function createWork() {
await mkdir(CACHE_DIR, {recursive: true});
}

// Track which cache files are used during this build
const usedCacheFiles = new Set();

// On a 16-core machine, 8 workers were optimal (and slightly faster than 16)
const numWorkers = Math.max(Math.floor(cpus().length / 2), 2);
const workerTasks = new Array(numWorkers).fill(null).map(() => []);
@@ -163,7 +174,7 @@ async function createWork() {
},
});
let hasErrors = false;
-worker.on('message', data => (hasErrors = taskFinishHandler(data)));
+worker.on('message', data => (hasErrors = taskFinishHandler(data, usedCacheFiles)));
worker.on('error', reject);
worker.on('exit', code => {
if (code !== 0) {
@@ -175,28 +186,51 @@
});
});
// The main thread can also process tasks -- That's 65% more bullet per bullet! -Cave Johnson
const mainThreadUsedFiles = new Set();
workerPromises.push(
processTaskList({
id: workerTasks.length - 1,
tasks: workerTasks[workerTasks.length - 1],
cacheDir: CACHE_DIR,
noCache,
usedCacheFiles: mainThreadUsedFiles,
}).then(data => {
-if (taskFinishHandler(data)) {
+if (taskFinishHandler(data, usedCacheFiles)) {
throw new Error(`Worker[${data.id}] had some errors.`);
}
})
);

await Promise.all(workerPromises);

// Clean up unused cache files to prevent unbounded growth
if (!noCache) {
try {
const allFiles = await readdir(CACHE_DIR);
let cleanedCount = 0;

for (const file of allFiles) {
if (!usedCacheFiles.has(file)) {
await rm(path.join(CACHE_DIR, file), {force: true});
cleanedCount++;
}
}

if (cleanedCount > 0) {
console.log(`🧹 Cleaned up ${cleanedCount} unused cache files`);
}
} catch (err) {
console.warn('Failed to clean unused cache files:', err);
}
}

console.log(`📄 Generated ${numFiles} markdown files from HTML.`);
console.log('✅ Markdown export generation complete!');
}

const md5 = data => createHash('md5').update(data).digest('hex');

-async function genMDFromHTML(source, target, {cacheDir, noCache}) {
+async function genMDFromHTML(source, target, {cacheDir, noCache, usedCacheFiles}) {
const leanHTML = (await readFile(source, {encoding: 'utf8'}))
// Remove all script tags, as they are not needed in markdown
// and they are not stable across builds, causing cache misses
@@ -210,6 +244,11 @@ async function genMDFromHTML(source, target, {cacheDir, noCache}) {
);
await writeFile(target, data, {encoding: 'utf8'});

// Track that we used this cache file
if (usedCacheFiles) {
usedCacheFiles.add(cacheKey);
}

return {cacheHit: true, data};
} catch (err) {
if (err.code !== 'ENOENT') {
@@ -304,10 +343,20 @@ async function genMDFromHTML(source, target, {cacheDir, noCache}) {
).catch(err => console.warn('Error writing cache file:', err)),
]);

// Track that we created this cache file
if (usedCacheFiles) {
usedCacheFiles.add(cacheKey);
}

return {cacheHit: false, data};
}

-async function processTaskList({id, tasks, cacheDir, noCache}) {
+async function processTaskList({id, tasks, cacheDir, noCache, usedCacheFiles}) {

Bug: Cache Miss Handling Fails

The cache cleanup logic immediately deletes newly created cache files. When genMDFromHTML generates a cache file due to a miss, it doesn't add the file's key to the usedCacheFiles set. This prevents the cache from being effective for those files in subsequent builds.

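A minimal sketch of the fix, assuming genMDFromHTML receives the usedCacheFiles set as it does in this diff: register the key whenever a cache file is written, not only on cache hits, so the cleanup pass never deletes a file created during the current build.

// In genMDFromHTML, right before the cache-miss return:
if (usedCacheFiles) {
  usedCacheFiles.add(cacheKey); // mark the freshly written file as used this build
}
return {cacheHit: false, data};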

// Workers don't receive usedCacheFiles in workerData, so create a new Set
if (!usedCacheFiles) {
usedCacheFiles = new Set();
}

const s3Client = getS3Client();
const failedTasks = [];
let cacheMisses = [];
@@ -318,6 +367,7 @@ async function processTaskList({id, tasks, cacheDir, noCache}) {
const {data, cacheHit} = await genMDFromHTML(sourcePath, targetPath, {
cacheDir,
noCache,
usedCacheFiles,

Bug: Worker Initialization Missing Cache Set

Worker threads are initialized without a usedCacheFiles set in their workerData. This causes usedCacheFiles to be undefined within the worker's processTaskList context, leading to a TypeError when genMDFromHTML attempts to call usedCacheFiles.add() on a cache hit.

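Because workerData and worker messages are structured-cloned, a Set passed from the main thread would only be a copy; mutations made inside the worker would never reach the parent. One hedged way to wire this up, matching the defensive default below: each worker builds its own set and ships the keys back in its completion message (note the Array.from in processTaskList's return value).

// Hypothetical worker entry (assumes the script re-executes itself as the worker):
import {parentPort, workerData} from 'node:worker_threads';

if (parentPort) {
  const result = await processTaskList({...workerData, usedCacheFiles: new Set()});
  // result.usedCacheFiles is already a plain array, so the parent's
  // taskFinishHandler can merge it into its aggregate set
  parentPort.postMessage(result);
}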

});
if (!cacheHit) {
cacheMisses.push(relativePath);
@@ -357,6 +407,7 @@ async function processTaskList({id, tasks, cacheDir, noCache}) {
id,
success,
failedTasks,
usedCacheFiles: Array.from(usedCacheFiles),
};
}

src/mdx.ts (136 changes: 124 additions & 12 deletions)
@@ -4,7 +4,7 @@ import yaml from 'js-yaml';
import {bundleMDX} from 'mdx-bundler';
import {BinaryLike, createHash} from 'node:crypto';
import {createReadStream, createWriteStream, mkdirSync} from 'node:fs';
-import {access, cp, mkdir, opendir, readFile} from 'node:fs/promises';
+import {access, cp, mkdir, opendir, readFile, rm, stat} from 'node:fs/promises';
import path from 'node:path';
// @ts-expect-error ts(2305) -- For some reason "compose" is not recognized in the types
import {compose, Readable} from 'node:stream';
@@ -63,10 +63,104 @@ const CACHE_COMPRESS_LEVEL = 4;
const CACHE_DIR = path.join(root, '.next', 'cache', 'mdx-bundler');
if (process.env.CI) {
mkdirSync(CACHE_DIR, {recursive: true});

// Clean up old cache files in background to prevent unbounded growth
// Delete any file not accessed in the last 24 hours (meaning it wasn't used in recent builds)
// This runs once per worker process and doesn't block the build
(async () => {
try {
const MAX_CACHE_AGE_MS = 24 * 60 * 60 * 1000; // 24 hours
const now = Date.now();
let cleanedCount = 0;

const dir = await opendir(CACHE_DIR);

for await (const dirent of dir) {
if (!dirent.isFile() && !dirent.isDirectory()) {
continue;
}

const itemPath = path.join(CACHE_DIR, dirent.name);
try {
const stats = await stat(itemPath);
const age = now - stats.atimeMs; // Time since last access

if (age > MAX_CACHE_AGE_MS) {
await rm(itemPath, {recursive: true, force: true});
cleanedCount++;
}
} catch (err) {
// Skip items we can't stat/delete
}
}

if (cleanedCount > 0) {
// eslint-disable-next-line no-console
console.log(`🧹 MDX cache: Cleaned up ${cleanedCount} unused items (>24h)`);
}
} catch (err) {
// Silently fail - cache cleanup is not critical
}
})();
}

const md5 = (data: BinaryLike) => createHash('md5').update(data).digest('hex');

// Worker-level registry cache to avoid fetching multiple times per worker
let cachedRegistryHash: Promise<string> | null = null;

/**
* Fetch registry data and compute its hash, with retry logic and exponential backoff.
* Retries up to maxRetries times with exponential backoff starting at initialDelayMs.
*/
async function getRegistryHashWithRetry(
maxRetries = 3,
initialDelayMs = 1000
): Promise<string> {
let lastError: Error | null = null;

for (let attempt = 0; attempt <= maxRetries; attempt++) {
try {
const [apps, packages] = await Promise.all([
getAppRegistry(),
getPackageRegistry(),
]);
return md5(JSON.stringify({apps, packages}));
} catch (err) {
lastError = err as Error;

if (attempt < maxRetries) {
const delay = initialDelayMs * Math.pow(2, attempt);
// eslint-disable-next-line no-console
console.warn(
`Failed to fetch registry (attempt ${attempt + 1}/${maxRetries + 1}). Retrying in ${delay}ms...`,
err
);
await new Promise(resolve => setTimeout(resolve, delay));
}
}
}

throw lastError || new Error('Failed to fetch registry after all retries');
}
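
// With the defaults (maxRetries = 3, initialDelayMs = 1000), this makes up to
// four attempts, backing off 1000 ms, 2000 ms, then 4000 ms between them, before
// rethrowing the last error.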

/**
* Get the registry hash, using cached value if available.
* This ensures we only fetch the registry once per worker process.
*/
function getRegistryHash(): Promise<string> {
if (!cachedRegistryHash) {
// eslint-disable-next-line no-console
console.info('Fetching registry hash for the first time in this worker');
cachedRegistryHash = getRegistryHashWithRetry().catch(err => {
// Reset cache on error so next call will retry
cachedRegistryHash = null;
throw err;
});
}
return cachedRegistryHash;
}
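
// Illustrative usage, an assumption rather than code from this PR: concurrent
// callers in one worker share the single in-flight promise, and a rejection
// clears the memo so the next call starts a fresh fetch with retries.
async function demoRegistryHashMemoization(): Promise<boolean> {
  const [a, b] = await Promise.all([getRegistryHash(), getRegistryHash()]);
  return a === b; // true, and only one registry fetch served both awaits
}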

async function readCacheFile<T>(file: string): Promise<T> {
const reader = createReadStream(file);
const decompressor = createBrotliDecompress();
@@ -531,22 +625,40 @@ export async function getFileBySlug(slug: string): Promise<SlugFile> {
const outdir = path.join(root, 'public', 'mdx-images');
await mkdir(outdir, {recursive: true});

-// If the file contains content that depends on the Release Registry (such as an SDK's latest version), avoid using the cache for that file, i.e. always rebuild it.
-// This is because the content from the registry might have changed since the last time the file was cached.
-// If a new component that injects content from the registry is introduced, it should be added to the patterns below.
-const skipCache =
+// Detect if file contains content that depends on the Release Registry
+// If it does, we include the registry hash in the cache key so the cache
+// is invalidated when the registry changes.
+const dependsOnRegistry =
source.includes('@inject') ||
source.includes('<PlatformSDKPackageName') ||
source.includes('<LambdaLayerDetail');

if (process.env.CI) {
-if (skipCache) {
-// eslint-disable-next-line no-console
-console.info(
-`Not using cached version of ${sourcePath}, as its content depends on the Release Registry`
-);
+const sourceHash = md5(source);
+
+// Include registry hash in cache key for registry-dependent files
+if (dependsOnRegistry) {
+try {
+const registryHash = await getRegistryHash();
+cacheKey = `${sourceHash}-${registryHash}`;
+// eslint-disable-next-line no-console
+console.info(
+`Using registry-aware cache for ${sourcePath} (registry hash: ${registryHash.slice(0, 8)}...)`
+);
+} catch (err) {
+// If we can't get registry hash, skip cache for this file
+// eslint-disable-next-line no-console
+console.warn(
+`Failed to get registry hash for ${sourcePath}, skipping cache:`,
+err
+);
+cacheKey = null;
+}
} else {
-cacheKey = md5(source);
+cacheKey = sourceHash;
}

+if (cacheKey) {
cacheFile = path.join(CACHE_DIR, `${cacheKey}.br`);
assetsCacheDir = path.join(CACHE_DIR, cacheKey);

@@ -700,7 +812,7 @@ export async function getFileBySlug(slug: string): Promise<SlugFile> {
},
};

-if (assetsCacheDir && cacheFile && !skipCache) {
+if (assetsCacheDir && cacheFile && cacheKey) {
await cp(assetsCacheDir, outdir, {recursive: true});
writeCacheFile(cacheFile, JSON.stringify(resultObj)).catch(e => {
// eslint-disable-next-line no-console