Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import { readFile } from 'node:fs/promises';
import { URL } from 'node:url';
import { promisify } from 'node:util';
import { gunzip as zlibGunzip } from 'node:zlib';

import type { CrawlingContext } from '@crawlee/core';
import type {
Expand Down Expand Up @@ -46,6 +48,8 @@ const SCHEMA = JSON.parse(

// Number of requests enqueued per batch — TODO confirm usage at the enqueue call site.
const REQUESTS_BATCH_SIZE = 25;
// Time budget for sitemap discovery — presumably the crawl proceeds without
// sitemaps once exceeded; verify against the discovery call site.
const SITEMAP_DISCOVERY_TIMEOUT_MILLIS = 30_000;
// MIME types that mark a sitemap response body as gzip-compressed (.xml.gz).
const GZIP_MIME_TYPES = new Set(['application/gzip', 'application/x-gzip']);
// Promise-returning wrapper around the callback-style zlib gunzip.
const gunzip = promisify(zlibGunzip);

// NOTE(review): presumably a threshold for autoscaled-pool throttling when
// the event loop is overloaded — confirm where it is passed to the crawler.
const MAX_EVENT_LOOP_OVERLOADED_RATIO = 0.9;
// Key under which the "request queue already seeded" flag is persisted.
const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';
Expand Down Expand Up @@ -278,6 +282,8 @@ export class CrawlerSetup {
'application/rss+xml',
'application/atom+xml',
'text/plain',
'application/gzip',
'application/x-gzip',
],
requestHandler: this._createRequestHandler(),
preNavigationHooks: [],
Expand Down Expand Up @@ -365,15 +371,18 @@ export class CrawlerSetup {
protected async _handleSitemapRequest(
crawlingContext: HttpCrawlingContext,
) {
const { request, body } = crawlingContext;
const { request, body, contentType } = crawlingContext;

// Make sure that an object containing internal metadata
// is present on every request.
tools.ensureMetaData(request as any);

log.info('Processing sitemap', { url: request.url });
const sitemapContent =
typeof body === 'string' ? body : body.toString('utf8');
const sitemapContent = await this.getSitemapContent(
request.url,
body,
contentType.type,
);
const parsed = parseSitemap(
[{ type: 'raw', content: sitemapContent }],
await this.proxyConfiguration?.newUrl(),
Expand All @@ -383,7 +392,6 @@ export class CrawlerSetup {
httpClient: this.sitemapHttpClient,
},
);

const nestedSitemaps: string[] = [];
const urls: string[] = [];
let scrapedAnyPageUrls = false;
Expand All @@ -400,6 +408,7 @@ export class CrawlerSetup {
await this._enqueueSitemapRequests(nestedSitemaps, crawlingContext);
nestedSitemaps.length = 0;
};

for await (const item of parsed) {
if (!item.originSitemapUrl) {
log.debug('Handling nested sitemap', {
Expand Down Expand Up @@ -541,6 +550,54 @@ export class CrawlerSetup {
};
}

/**
 * Converts a sitemap response body into a UTF-8 string, transparently
 * decompressing gzip payloads (e.g. `sitemap.xml.gz`).
 *
 * @param requestUrl URL the sitemap was fetched from; used both for gzip
 *   detection (`.gz` suffix) and for error messages.
 * @param body Raw response body as delivered by the HTTP client.
 * @param contentType The response content type (bare type, no parameters).
 * @returns The sitemap content decoded as UTF-8.
 * @throws Error When a gzip-detected payload cannot be decompressed.
 */
private async getSitemapContent(
    requestUrl: string,
    body: string | Buffer,
    contentType: string,
): Promise<string> {
    // String bodies were already decoded by the HTTP client.
    if (typeof body === 'string') {
        return body;
    }

    if (!this.isGzippedSitemapContent(requestUrl, body, contentType)) {
        return body.toString('utf8');
    }

    // Some endpoints apply transport gzip on top of .xml.gz payloads, so keep
    // unwrapping while the gzip magic bytes remain. The depth cap guards
    // against a pathological chain of nested archives ("gzip bomb").
    const MAX_GZIP_NESTING = 5;
    try {
        let decompressed = await gunzip(body);
        for (
            let depth = 1;
            depth < MAX_GZIP_NESTING && this.hasGzipMagicBytes(decompressed);
            depth++
        ) {
            decompressed = await gunzip(decompressed);
        }
        return decompressed.toString('utf8');
    } catch (error) {
        // Preserve the underlying error as `cause` for easier debugging,
        // while keeping the original message format for callers/logs.
        throw new Error(
            `Failed to decompress gzipped sitemap ${requestUrl}: ${String(error)}`,
            { cause: error },
        );
    }
}

/**
 * Heuristically decides whether a sitemap body is gzip-compressed by
 * checking, in order: the declared MIME type, the `.gz` URL suffix,
 * and the gzip magic bytes at the start of the payload.
 */
private isGzippedSitemapContent(
    requestUrl: string,
    body: Buffer,
    contentType: string,
): boolean {
    const mimeType = this.normalizeContentType(contentType);
    if (GZIP_MIME_TYPES.has(mimeType)) {
        return true;
    }
    if (requestUrl.endsWith('.gz')) {
        return true;
    }
    return this.hasGzipMagicBytes(body);
}

/**
 * Extracts the bare MIME type from a Content-Type value, dropping any
 * parameters (e.g. `text/plain; charset=utf-8` -> `text/plain`).
 */
private normalizeContentType(contentType: string): string {
    // `split` always yields at least one element, but under
    // `noUncheckedIndexedAccess` the index is typed `string | undefined`;
    // the `?? ''` fallback keeps the declared `string` return type sound
    // without resorting to a non-null assertion.
    return contentType.split(';')[0]?.trim().toLowerCase() ?? '';
}

/** Returns true when the buffer begins with the gzip magic header (0x1f 0x8b). */
private hasGzipMagicBytes(body: Buffer): boolean {
    if (body.length < 2) {
        return false;
    }
    return body[0] === 0x1f && body[1] === 0x8b;
}

private async _enqueuePageRequests(
urls: string[],
{ request, enqueueLinks }: HttpCrawlingContext,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import { gzipSync } from 'node:zlib';

import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';

import { type Input, ProxyRotation } from '../src/internals/consts.js';
import { CrawlerSetup } from '../src/internals/crawler_setup.js';

// Replace the `apify` SDK with a minimal stub so `CrawlerSetup` can be
// constructed without any platform state. Note: `vi.mock` calls are hoisted
// above imports, so the factory must be fully self-contained.
vi.mock('apify', () => ({
Actor: {
// Pretend the actor runs on the Apify platform.
isAtHome: () => true,
getEnv: () => ({}),
// Proxy configuration stub that never yields a proxy URL.
createProxyConfiguration: async () => ({
newUrl: async () => undefined,
}),
// NOTE(review): returns the Error rather than throwing/exiting —
// presumably sufficient for these tests; confirm callers await it.
fail: async (message: string) => new Error(message),
},
}));

/**
 * Builds a minimal valid actor `Input`; individual tests override only the
 * fields they care about via `overrides`.
 */
const createInput = (overrides: Partial<Input> = {}): Input => {
    const defaults: Input = {
        startUrls: [{ url: 'https://example.com' }],
        keepUrlFragments: false,
        respectRobotsTxtFile: true,
        pageFunction: '() => ({})',
        proxyConfiguration: { useApifyProxy: false },
        proxyRotation: ProxyRotation.Recommended,
        maxRequestRetries: 3,
        maxCrawlingDepth: 0,
        debugLog: false,
        customData: {},
    };
    return { ...defaults, ...overrides };
};

describe('CrawlerSetup sitemap content parsing', () => {
    let initializeAsyncSpy: ReturnType<typeof vi.spyOn>;

    // Stub out async initialization so constructing CrawlerSetup stays cheap.
    beforeEach(() => {
        initializeAsyncSpy = vi
            .spyOn(CrawlerSetup.prototype as any, '_initializeAsync')
            .mockResolvedValue(undefined);
    });

    afterEach(() => {
        initializeAsyncSpy.mockRestore();
    });

    /** Invokes the private `getSitemapContent` method on a fresh setup. */
    const readSitemapContent = async (
        url: string,
        body: string | Buffer,
        contentType: string,
    ): Promise<string> => {
        const setup = new CrawlerSetup(createInput());
        return (setup as any).getSitemapContent(url, body, contentType);
    };

    it('keeps plain XML body untouched', async () => {
        const content = await readSitemapContent(
            'https://example.com/sitemap.xml',
            '<urlset></urlset>',
            'application/xml',
        );

        expect(content).toBe('<urlset></urlset>');
    });

    it('decompresses gzip sitemap body by MIME type', async () => {
        const xml =
            '<?xml version="1.0" encoding="UTF-8"?><urlset><url><loc>https://example.com/</loc></url></urlset>';

        const content = await readSitemapContent(
            'https://example.com/sitemap.xml.gz',
            gzipSync(Buffer.from(xml, 'utf8')),
            'application/gzip',
        );

        expect(content).toBe(xml);
    });

    it('keeps plain text body untouched', async () => {
        const content = await readSitemapContent(
            'https://example.com/sitemap.txt',
            Buffer.from('https://example.com/page', 'utf8'),
            'text/plain; charset=utf-8',
        );

        expect(content).toBe('https://example.com/page');
    });

    it('decompresses gzip sitemap body by URL extension', async () => {
        const xml =
            '<?xml version="1.0" encoding="UTF-8"?><urlset><url><loc>https://example.com/blog</loc></url></urlset>';

        const content = await readSitemapContent(
            'https://example.com/sitemap_index.xml.gz',
            gzipSync(Buffer.from(xml, 'utf8')),
            'application/octet-stream',
        );

        expect(content).toBe(xml);
    });

    it('decompresses double-gzipped sitemap body', async () => {
        const xml =
            '<?xml version="1.0" encoding="UTF-8"?><urlset><url><loc>https://example.com/double</loc></url></urlset>';

        const content = await readSitemapContent(
            'https://example.com/sitemap.xml.gz',
            gzipSync(gzipSync(Buffer.from(xml, 'utf8'))),
            'application/gzip',
        );

        expect(content).toBe(xml);
    });
});
Loading