Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -1,5 +1,7 @@
import { readFile } from 'node:fs/promises';
import { URL } from 'node:url';
import { promisify } from 'node:util';
import { gunzip as zlibGunzip } from 'node:zlib';

import type { CrawlingContext } from '@crawlee/core';
import type {
Expand Down Expand Up @@ -46,6 +48,8 @@ const SCHEMA = JSON.parse(

// Number of requests enqueued per batch — TODO confirm usage at the enqueue call site.
const REQUESTS_BATCH_SIZE = 25;
// Time budget for sitemap discovery — presumably the crawl proceeds without
// sitemaps once exceeded; verify against the discovery call site.
const SITEMAP_DISCOVERY_TIMEOUT_MILLIS = 30_000;
// MIME types that mark a sitemap response body as gzip-compressed (.xml.gz).
const GZIP_MIME_TYPES = new Set(['application/gzip', 'application/x-gzip']);
// Promise-returning wrapper around the callback-style zlib gunzip.
const gunzip = promisify(zlibGunzip);

// NOTE(review): presumably a threshold for autoscaled-pool throttling when
// the event loop is overloaded — confirm where it is passed to the crawler.
const MAX_EVENT_LOOP_OVERLOADED_RATIO = 0.9;
// Key under which the "request queue already seeded" flag is persisted.
const REQUEST_QUEUE_INIT_FLAG_KEY = 'REQUEST_QUEUE_INITIALIZED';
Expand Down Expand Up @@ -278,6 +282,8 @@ export class CrawlerSetup {
'application/rss+xml',
'application/atom+xml',
'text/plain',
'application/gzip',
'application/x-gzip',
],
requestHandler: this._createRequestHandler(),
preNavigationHooks: [],
Expand Down Expand Up @@ -365,15 +371,18 @@ export class CrawlerSetup {
protected async _handleSitemapRequest(
crawlingContext: HttpCrawlingContext,
) {
const { request, body } = crawlingContext;
const { request, body, contentType } = crawlingContext;

// Make sure that an object containing internal metadata
// is present on every request.
tools.ensureMetaData(request as any);

log.info('Processing sitemap', { url: request.url });
const sitemapContent =
typeof body === 'string' ? body : body.toString('utf8');
const sitemapContent = await this.getSitemapContent(
request.url,
body,
contentType.type,
);
const parsed = parseSitemap(
[{ type: 'raw', content: sitemapContent }],
await this.proxyConfiguration?.newUrl(),
Expand All @@ -383,7 +392,6 @@ export class CrawlerSetup {
httpClient: this.sitemapHttpClient,
},
);

const nestedSitemaps: string[] = [];
const urls: string[] = [];
let scrapedAnyPageUrls = false;
Expand All @@ -400,6 +408,7 @@ export class CrawlerSetup {
await this._enqueueSitemapRequests(nestedSitemaps, crawlingContext);
nestedSitemaps.length = 0;
};

for await (const item of parsed) {
if (!item.originSitemapUrl) {
log.debug('Handling nested sitemap', {
Expand Down Expand Up @@ -541,6 +550,54 @@ export class CrawlerSetup {
};
}

/**
 * Converts a sitemap response body into a UTF-8 string, transparently
 * decompressing gzip payloads (e.g. `sitemap.xml.gz`).
 *
 * @param requestUrl URL the sitemap was fetched from; used both for gzip
 *   detection (`.gz` suffix) and for error messages.
 * @param body Raw response body as delivered by the HTTP client.
 * @param contentType The response content type (bare type, no parameters).
 * @returns The sitemap content decoded as UTF-8.
 * @throws Error When a gzip-detected payload cannot be decompressed.
 */
private async getSitemapContent(
    requestUrl: string,
    body: string | Buffer,
    contentType: string,
): Promise<string> {
    // String bodies were already decoded by the HTTP client.
    if (typeof body === 'string') {
        return body;
    }

    if (!this.isGzippedSitemapContent(requestUrl, body, contentType)) {
        return body.toString('utf8');
    }

    // Some endpoints apply transport gzip on top of .xml.gz payloads, so keep
    // unwrapping while the gzip magic bytes remain. The depth cap guards
    // against a pathological chain of nested archives ("gzip bomb").
    const MAX_GZIP_NESTING = 5;
    try {
        let decompressed = await gunzip(body);
        for (
            let depth = 1;
            depth < MAX_GZIP_NESTING && this.hasGzipMagicBytes(decompressed);
            depth++
        ) {
            decompressed = await gunzip(decompressed);
        }
        return decompressed.toString('utf8');
    } catch (error) {
        // Preserve the underlying error as `cause` for easier debugging,
        // while keeping the original message format for callers/logs.
        throw new Error(
            `Failed to decompress gzipped sitemap ${requestUrl}: ${String(error)}`,
            { cause: error },
        );
    }
}

/**
 * Heuristically decides whether a sitemap body is gzip-compressed by
 * checking, in order: the declared MIME type, the `.gz` URL suffix,
 * and the gzip magic bytes at the start of the payload.
 */
private isGzippedSitemapContent(
    requestUrl: string,
    body: Buffer,
    contentType: string,
): boolean {
    const mimeType = this.normalizeContentType(contentType);
    if (GZIP_MIME_TYPES.has(mimeType)) {
        return true;
    }
    if (requestUrl.endsWith('.gz')) {
        return true;
    }
    return this.hasGzipMagicBytes(body);
}

/**
 * Extracts the bare MIME type from a Content-Type value, dropping any
 * parameters (e.g. `text/plain; charset=utf-8` -> `text/plain`).
 */
private normalizeContentType(contentType: string): string {
    // `split` always yields at least one element, but under
    // `noUncheckedIndexedAccess` the index is typed `string | undefined`;
    // the `?? ''` fallback keeps the declared `string` return type sound
    // without resorting to a non-null assertion.
    return contentType.split(';')[0]?.trim().toLowerCase() ?? '';
}

/** Returns true when the buffer begins with the gzip magic header (0x1f 0x8b). */
private hasGzipMagicBytes(body: Buffer): boolean {
    if (body.length < 2) {
        return false;
    }
    return body[0] === 0x1f && body[1] === 0x8b;
}

private async _enqueuePageRequests(
urls: string[],
{ request, enqueueLinks }: HttpCrawlingContext,
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,112 @@
import { gzipSync } from 'node:zlib';

import { afterEach, beforeEach, describe, expect, it, vi } from 'vitest';

import { type Input, ProxyRotation } from '../src/internals/consts.js';
import { CrawlerSetup } from '../src/internals/crawler_setup.js';

// Replace the `apify` SDK with a minimal stub so `CrawlerSetup` can be
// constructed without any platform state. Note: `vi.mock` calls are hoisted
// above imports, so the factory must be fully self-contained.
vi.mock('apify', () => ({
Actor: {
// Pretend the actor runs on the Apify platform.
isAtHome: () => true,
getEnv: () => ({}),
// Proxy configuration stub that never yields a proxy URL.
createProxyConfiguration: async () => ({
newUrl: async () => undefined,
}),
// NOTE(review): returns the Error rather than throwing/exiting —
// presumably sufficient for these tests; confirm callers await it.
fail: async (message: string) => new Error(message),
},
}));

/**
 * Builds a minimal valid actor `Input`; individual tests override only the
 * fields they care about via `overrides`.
 */
const createInput = (overrides: Partial<Input> = {}): Input => {
    const defaults: Input = {
        startUrls: [{ url: 'https://example.com' }],
        keepUrlFragments: false,
        respectRobotsTxtFile: true,
        pageFunction: '() => ({})',
        proxyConfiguration: { useApifyProxy: false },
        proxyRotation: ProxyRotation.Recommended,
        maxRequestRetries: 3,
        maxCrawlingDepth: 0,
        debugLog: false,
        customData: {},
    };
    return { ...defaults, ...overrides };
};

describe('CrawlerSetup sitemap content parsing', () => {
    let initializeAsyncSpy: ReturnType<typeof vi.spyOn>;

    // Stub out async initialization so constructing CrawlerSetup stays cheap.
    beforeEach(() => {
        initializeAsyncSpy = vi
            .spyOn(CrawlerSetup.prototype as any, '_initializeAsync')
            .mockResolvedValue(undefined);
    });

    afterEach(() => {
        initializeAsyncSpy.mockRestore();
    });

    /** Invokes the private `getSitemapContent` method on a fresh setup. */
    const readSitemapContent = async (
        url: string,
        body: string | Buffer,
        contentType: string,
    ): Promise<string> => {
        const setup = new CrawlerSetup(createInput());
        return (setup as any).getSitemapContent(url, body, contentType);
    };

    it('keeps plain XML body untouched', async () => {
        const content = await readSitemapContent(
            'https://example.com/sitemap.xml',
            '<urlset></urlset>',
            'application/xml',
        );

        expect(content).toBe('<urlset></urlset>');
    });

    it('decompresses gzip sitemap body by MIME type', async () => {
        const xml =
            '<?xml version="1.0" encoding="UTF-8"?><urlset><url><loc>https://example.com/</loc></url></urlset>';

        const content = await readSitemapContent(
            'https://example.com/sitemap.xml.gz',
            gzipSync(Buffer.from(xml, 'utf8')),
            'application/gzip',
        );

        expect(content).toBe(xml);
    });

    it('keeps plain text body untouched', async () => {
        const content = await readSitemapContent(
            'https://example.com/sitemap.txt',
            Buffer.from('https://example.com/page', 'utf8'),
            'text/plain; charset=utf-8',
        );

        expect(content).toBe('https://example.com/page');
    });

    it('decompresses gzip sitemap body by URL extension', async () => {
        const xml =
            '<?xml version="1.0" encoding="UTF-8"?><urlset><url><loc>https://example.com/blog</loc></url></urlset>';

        const content = await readSitemapContent(
            'https://example.com/sitemap_index.xml.gz',
            gzipSync(Buffer.from(xml, 'utf8')),
            'application/octet-stream',
        );

        expect(content).toBe(xml);
    });

    it('decompresses double-gzipped sitemap body', async () => {
        const xml =
            '<?xml version="1.0" encoding="UTF-8"?><urlset><url><loc>https://example.com/double</loc></url></urlset>';

        const content = await readSitemapContent(
            'https://example.com/sitemap.xml.gz',
            gzipSync(gzipSync(Buffer.from(xml, 'utf8'))),
            'application/gzip',
        );

        expect(content).toBe(xml);
    });
});
Loading