diff --git a/.actor/actor.json b/.actor/actor.json index 2e971df..1e50242 100644 --- a/.actor/actor.json +++ b/.actor/actor.json @@ -7,5 +7,6 @@ "meta": { "templateId": "ts-crawlee-playwright-chrome" }, - "input": "./input_schema.json" + "input": "./input_schema.json", + "webServerSchema": "./openapi.json" } diff --git a/.actor/openapi.json b/.actor/openapi.json new file mode 100644 index 0000000..339c2a1 --- /dev/null +++ b/.actor/openapi.json @@ -0,0 +1,1169 @@ +{ + "openapi": "3.0.3", + "info": { + "title": "SuperScraper API", + "description": "SuperScraper is a unified web scraping API that provides compatibility with multiple scraping services (ScrapingBee, ScrapingAnt, ScraperAPI).\n\n## Features\n- JavaScript rendering with headless browser\n- Screenshot capture (viewport, full page, or specific element)\n- Custom JavaScript execution via scenarios\n- Data extraction with CSS selectors\n- Proxy support (datacenter and residential)\n- Cookie and header forwarding\n- XHR/Fetch request capture\n\n## Response Formats\n- **HTML (default)**: Returns raw HTML content\n- **JSON (json_response=true)**: Returns structured response with metadata\n- **Screenshot**: Returns PNG image when only screenshot is requested\n- **Extracted data**: Returns JSON when extract_rules are provided\n\n## Compatibility\nThis API accepts parameters from multiple scraping services:\n- **ScrapingBee** (primary): All parameters use ScrapingBee naming\n- **ScrapingAnt**: Compatible parameters like `browser`, `js_snippet`, `proxy_type`\n- **ScraperAPI**: Compatible parameters like `render`, `premium`, `binary_target`", + "version": "1.0.0", + "contact": { + "name": "Apify", + "url": "https://apify.com" + }, + "license": { + "name": "ISC" + } + }, + "servers": [ + { + "url": "https://super-scraper.apify.actor", + "description": "Production server" + }, + { + "url": "http://localhost:3000", + "description": "Local development server" + } + ], + "tags": [ + { + "name": "Scraping", + 
"description": "Web scraping operations" + } + ], + "paths": { + "/": { + "get": { + "summary": "Scrape a web page", + "description": "Fetches and processes a web page with optional JavaScript rendering, screenshots, and data extraction.\n\n## Basic Usage\n```\nGET /?url=https://example.com\n```\n\n## With JavaScript Rendering\n```\nGET /?url=https://example.com&render_js=true&wait=2000\n```\n\n## With Data Extraction\n```\nGET /?url=https://example.com&extract_rules={\"title\":{\"selector\":\"h1\",\"type\":\"item\",\"output\":\"@text\"}}\n```\n\n## With Screenshot\n```\nGET /?url=https://example.com&screenshot=true&json_response=true\n```", + "operationId": "scrape", + "tags": [ + "Scraping" + ], + "parameters": [ + { + "name": "url", + "in": "query", + "description": "The URL to scrape. Must be a fully qualified URL including the protocol (http:// or https://).", + "required": true, + "schema": { + "type": "string" + }, + "example": "https://example.com" + }, + { + "name": "render_js", + "in": "query", + "description": "Enable JavaScript rendering using a headless browser. When false, uses a simple HTTP request without browser. Also accepts: `browser` (ScrapingAnt), `render` (ScraperAPI).", + "required": false, + "schema": { + "type": "boolean" + }, + "example": true + }, + { + "name": "device", + "in": "query", + "description": "Device type to emulate. Affects User-Agent and viewport dimensions. Also accepts: `device_type` (ScraperAPI).", + "required": false, + "schema": { + "type": "string", + "enum": [ + "desktop", + "mobile" + ] + }, + "example": "desktop" + }, + { + "name": "window_width", + "in": "query", + "description": "Browser viewport width in pixels. Only applies when render_js is enabled.", + "required": false, + "schema": { + "type": "integer", + "minimum": 100, + "maximum": 3840 + }, + "example": 1920 + }, + { + "name": "window_height", + "in": "query", + "description": "Browser viewport height in pixels. 
Only applies when render_js is enabled.", + "required": false, + "schema": { + "type": "integer", + "minimum": 100, + "maximum": 2160 + }, + "example": 1080 + }, + { + "name": "wait", + "in": "query", + "description": "Time to wait in milliseconds after page load before returning content.", + "required": false, + "schema": { + "type": "integer", + "minimum": 0, + "maximum": 35000 + }, + "example": 1000 + }, + { + "name": "wait_for", + "in": "query", + "description": "CSS selector to wait for before returning content. Useful for SPAs where content loads dynamically. Also accepts: `wait_for_selector` (ScrapingAnt/ScraperAPI).", + "required": false, + "schema": { + "type": "string" + }, + "example": "#main-content" + }, + { + "name": "wait_browser", + "in": "query", + "description": "Browser event to wait for before considering the page loaded.", + "required": false, + "schema": { + "type": "string", + "enum": [ + "load", + "domcontentloaded", + "networkidle" + ] + }, + "example": "networkidle" + }, + { + "name": "screenshot", + "in": "query", + "description": "Take a screenshot of the visible viewport. Returns base64-encoded PNG in json_response mode, or raw binary otherwise.", + "required": false, + "schema": { + "type": "boolean" + }, + "example": true + }, + { + "name": "screenshot_full_page", + "in": "query", + "description": "Take a full-page screenshot (entire scrollable area). Overrides screenshot parameter.", + "required": false, + "schema": { + "type": "boolean" + }, + "example": true + }, + { + "name": "screenshot_selector", + "in": "query", + "description": "CSS selector of element to screenshot. Overrides screenshot and screenshot_full_page parameters.", + "required": false, + "schema": { + "type": "string" + }, + "example": "#hero-image" + }, + { + "name": "extract_rules", + "in": "query", + "description": "JSON object defining extraction rules. Keys are output field names, values define selectors and extraction behavior. 
Returns extracted data as JSON.", + "required": false, + "schema": { + "type": "string" + }, + "example": "{\"title\": {\"selector\": \"h1\", \"type\": \"item\", \"output\": \"@text\"}}" + }, + { + "name": "js_scenario", + "in": "query", + "description": "JSON object defining a sequence of browser actions. Each instruction is an object with action name as key and parameter as value. Actions: wait (ms), wait_for (selector), click (selector), scroll_x/scroll_y (pixels), fill ([selector, value]), wait_browser (load|domcontentloaded|networkidle), evaluate (js code), wait_for_and_click (selector).", + "required": false, + "schema": { + "type": "string" + }, + "example": "{\"instructions\": [{\"click\": \"#load-more\"}, {\"wait\": 1000}]}" + }, + { + "name": "json_response", + "in": "query", + "description": "Return response as JSON with metadata including cookies, headers, XHR requests, and more. Without this, returns raw HTML/binary content.", + "required": false, + "schema": { + "type": "boolean" + }, + "example": true + }, + { + "name": "return_page_source", + "in": "query", + "description": "Return the original page source HTML instead of the rendered DOM. Useful for debugging or when you need the unmodified HTML.", + "required": false, + "schema": { + "type": "boolean" + }, + "example": false + }, + { + "name": "transparent_status_code", + "in": "query", + "description": "Return the actual HTTP status code from the target website instead of 200. Useful for detecting errors or redirects.", + "required": false, + "schema": { + "type": "boolean" + }, + "example": true + }, + { + "name": "forward_headers", + "in": "query", + "description": "Forward custom headers to the target website. 
Headers should be prefixed with \"Spb-\" or \"spb-\" (prefix is stripped before forwarding).", + "required": false, + "schema": { + "type": "boolean" + }, + "example": true + }, + { + "name": "forward_headers_pure", + "in": "query", + "description": "Forward all non-prefixed headers directly to the target website without modification.", + "required": false, + "schema": { + "type": "boolean" + }, + "example": true + }, + { + "name": "cookies", + "in": "query", + "description": "Cookies to send with the request. Format: \"name1=value1;name2=value2\" or JSON array of cookie objects.", + "required": false, + "schema": { + "type": "string" + }, + "example": "session_id=abc123;user=john" + }, + { + "name": "timeout", + "in": "query", + "description": "Maximum time in milliseconds to wait for the page to load. Includes all network requests and JavaScript execution.", + "required": false, + "schema": { + "type": "integer", + "minimum": 1000, + "maximum": 3600000 + }, + "example": 30000 + }, + { + "name": "own_proxy", + "in": "query", + "description": "Use your own proxy server. Format: \"http://user:pass@host:port\" or \"http://host:port\".", + "required": false, + "schema": { + "type": "string" + }, + "example": "http://user:pass@proxy.example.com:8080" + }, + { + "name": "premium_proxy", + "in": "query", + "description": "Use premium residential proxies for better success rates on difficult targets. Also accepts: `stealth_proxy`, `premium` (ScraperAPI), `ultra_premium` (ScraperAPI).", + "required": false, + "schema": { + "type": "boolean" + }, + "example": true + }, + { + "name": "stealth_proxy", + "in": "query", + "description": "Alias for premium_proxy. Use premium residential proxies for better success rates.", + "required": false, + "schema": { + "type": "boolean" + }, + "example": true + }, + { + "name": "country_code", + "in": "query", + "description": "Two-letter ISO country code for geo-targeting. Proxy will use an IP from the specified country. 
Also accepts: `proxy_country` (ScrapingAnt).", + "required": false, + "schema": { + "type": "string" + }, + "example": "US" + }, + { + "name": "block_resources", + "in": "query", + "description": "Block resource types to speed up page loading. Set to true to block common resource types (images, fonts, stylesheets, media).", + "required": false, + "schema": { + "type": "boolean" + }, + "example": true + }, + { + "name": "custom_google", + "in": "query", + "description": "Enable optimizations for scraping Google Search results.", + "required": false, + "schema": { + "type": "boolean" + }, + "example": true + }, + { + "name": "js_snippet", + "in": "query", + "description": "Base64-encoded JavaScript snippet to execute on the page. The script runs after page load but before content extraction. (ScrapingAnt compatible)", + "required": false, + "schema": { + "type": "string" + }, + "example": "Y29uc29sZS5sb2coIkhlbGxvIik=" + }, + { + "name": "proxy_type", + "in": "query", + "description": "Type of proxy to use. \"datacenter\" is faster but may be blocked by some sites. \"residential\" has better success rates. (ScrapingAnt compatible)", + "required": false, + "schema": { + "type": "string", + "enum": [ + "datacenter", + "residential" + ] + }, + "example": "residential" + }, + { + "name": "block_resource", + "in": "query", + "description": "Comma-separated list of resource types to block (e.g., \"image,stylesheet,font\"). More granular than block_resources. (ScrapingAnt compatible)", + "required": false, + "schema": { + "type": "string" + }, + "example": "image,stylesheet,font,media" + }, + { + "name": "binary_target", + "in": "query", + "description": "Fetch binary files (images, PDFs, etc.) instead of HTML. Returns the raw binary content. (ScraperAPI compatible)", + "required": false, + "schema": { + "type": "boolean" + }, + "example": true + }, + { + "name": "keep_headers", + "in": "query", + "description": "Forward all request headers to the target website. 
Similar to forward_headers_pure but may include additional headers. (ScraperAPI compatible)", + "required": false, + "schema": { + "type": "boolean" + }, + "example": true + } + ], + "responses": { + "200": { + "description": "Successful response. Content type depends on request parameters.", + "content": { + "text/html": { + "schema": { + "type": "string", + "description": "Raw HTML content of the page" + } + }, + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/VerboseResult" + }, + { + "type": "object", + "description": "Extracted data (when using extract_rules without json_response)", + "additionalProperties": true + } + ] + } + }, + "image/png": { + "schema": { + "type": "string", + "format": "binary", + "description": "Screenshot image (when screenshot requested without json_response)" + } + }, + "application/octet-stream": { + "schema": { + "type": "string", + "format": "binary", + "description": "Binary file content (when binary_target=true)" + } + } + } + }, + "400": { + "description": "Bad request - missing or invalid parameters", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } + }, + "408": { + "description": "Request timeout - page took too long to load", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } + }, + "500": { + "description": "Internal server error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } + }, + "502": { + "description": "Target website error (when transparent_status_code=true)", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "VerboseResult": { + "type": "object", + "description": "Full response with metadata (returned when json_response=true)", + "properties": { + "body": { + "oneOf": [ + { + 
"type": "string", + "description": "HTML content or extracted data as string" + }, + { + "type": "object", + "description": "Extracted data as JSON object" + } + ], + "description": "Page content or extracted data" + }, + "cookies": { + "type": "array", + "items": { + "type": "object", + "description": "Browser cookie", + "properties": { + "name": { + "type": "string", + "description": "Cookie name" + }, + "value": { + "type": "string", + "description": "Cookie value" + }, + "domain": { + "type": "string", + "description": "Cookie domain" + }, + "path": { + "type": "string", + "description": "Cookie path" + }, + "expires": { + "type": "number", + "description": "Expiration timestamp", + "nullable": true + }, + "httpOnly": { + "type": "boolean", + "description": "HTTP-only flag" + }, + "secure": { + "type": "boolean", + "description": "Secure flag" + }, + "sameSite": { + "type": "string", + "enum": [ + "Strict", + "Lax", + "None" + ], + "description": "SameSite attribute" + } + } + }, + "description": "Cookies set by the page" + }, + "evaluateResults": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Results from evaluate actions in js_scenario" + }, + "jsScenarioReport": { + "oneOf": [ + { + "type": "object", + "description": "Report of JS scenario execution", + "properties": { + "tasks": { + "type": "array", + "items": { + "type": "object", + "description": "Report for a single JS scenario instruction", + "properties": { + "task": { + "type": "string", + "enum": [ + "wait", + "wait_for", + "click", + "scroll_x", + "scroll_y", + "fill", + "wait_browser", + "evaluate" + ], + "description": "The action that was executed" + }, + "params": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "number" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ], + "description": "Parameters passed to the action" + }, + "success": { + "type": "boolean", + "description": "Whether the action succeeded" + }, + "duration": { + 
"type": "number", + "description": "Execution time in milliseconds" + } + }, + "required": [ + "task", + "params", + "success", + "duration" + ] + }, + "description": "Individual task reports" + }, + "taskExecuted": { + "type": "integer", + "description": "Number of tasks executed" + }, + "taskSuccess": { + "type": "integer", + "description": "Number of successful tasks" + }, + "taskFailure": { + "type": "integer", + "description": "Number of failed tasks" + }, + "totalDuration": { + "type": "number", + "description": "Total execution time in milliseconds" + } + }, + "required": [ + "tasks", + "taskExecuted", + "taskSuccess", + "taskFailure", + "totalDuration" + ] + }, + { + "type": "object", + "additionalProperties": false, + "description": "Empty object if no scenario was executed" + } + ], + "description": "JS scenario execution report" + }, + "headers": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "description": "Response headers from the target page" + }, + "type": { + "type": "string", + "enum": [ + "html", + "json", + "file" + ], + "description": "Content type of the response body" + }, + "screenshot": { + "type": "string", + "nullable": true, + "description": "Base64-encoded PNG screenshot (if requested)" + }, + "iframes": { + "type": "array", + "items": { + "type": "object", + "description": "Iframe content data", + "properties": { + "src": { + "type": "string", + "description": "Iframe source URL" + }, + "content": { + "type": "string", + "description": "Iframe HTML content" + } + }, + "required": [ + "src", + "content" + ] + }, + "description": "Content of iframes on the page" + }, + "xhr": { + "type": "array", + "items": { + "type": "object", + "description": "Captured XHR/Fetch request data", + "properties": { + "url": { + "type": "string", + "description": "Request URL" + }, + "statusCode": { + "type": "integer", + "description": 
"HTTP status code" + }, + "method": { + "type": "string", + "description": "HTTP method (GET, POST, etc.)" + }, + "requestHeaders": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Request headers sent" + }, + "headers": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Response headers received" + }, + "body": { + "type": "string", + "description": "Response body" + } + }, + "required": [ + "url", + "statusCode", + "method", + "requestHeaders", + "headers", + "body" + ] + }, + "description": "Captured XHR/Fetch requests made by the page" + }, + "initialStatusCode": { + "type": "integer", + "nullable": true, + "description": "HTTP status code of the initial page request" + }, + "resolvedUrl": { + "type": "string", + "description": "Final URL after any redirects" + }, + "metadata": { + "type": "string", + "description": "Additional metadata (if available)" + } + }, + "required": [ + "body", + "cookies", + "evaluateResults", + "jsScenarioReport", + "headers", + "type", + "screenshot", + "iframes", + "xhr", + "initialStatusCode", + "resolvedUrl" + ] + }, + "ErrorResponse": { + "type": "object", + "description": "Error response", + "properties": { + "errorMessage": { + "type": "string", + "description": "Human-readable error message" + } + }, + "required": [ + "errorMessage" + ] + }, + "Cookie": { + "type": "object", + "description": "Browser cookie", + "properties": { + "name": { + "type": "string", + "description": "Cookie name" + }, + "value": { + "type": "string", + "description": "Cookie value" + }, + "domain": { + "type": "string", + "description": "Cookie domain" + }, + "path": { + "type": "string", + "description": "Cookie path" + }, + "expires": { + "type": "number", + "description": "Expiration timestamp", + "nullable": true + }, + "httpOnly": { + "type": "boolean", + "description": "HTTP-only flag" + }, + "secure": { + "type": "boolean", + "description": "Secure flag" + }, + 
"sameSite": { + "type": "string", + "enum": [ + "Strict", + "Lax", + "None" + ], + "description": "SameSite attribute" + } + } + }, + "XHRRequestData": { + "type": "object", + "description": "Captured XHR/Fetch request data", + "properties": { + "url": { + "type": "string", + "description": "Request URL" + }, + "statusCode": { + "type": "integer", + "description": "HTTP status code" + }, + "method": { + "type": "string", + "description": "HTTP method (GET, POST, etc.)" + }, + "requestHeaders": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Request headers sent" + }, + "headers": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Response headers received" + }, + "body": { + "type": "string", + "description": "Response body" + } + }, + "required": [ + "url", + "statusCode", + "method", + "requestHeaders", + "headers", + "body" + ] + }, + "IFrameData": { + "type": "object", + "description": "Iframe content data", + "properties": { + "src": { + "type": "string", + "description": "Iframe source URL" + }, + "content": { + "type": "string", + "description": "Iframe HTML content" + } + }, + "required": [ + "src", + "content" + ] + }, + "JsScenarioReport": { + "type": "object", + "description": "Report of JS scenario execution", + "properties": { + "tasks": { + "type": "array", + "items": { + "type": "object", + "description": "Report for a single JS scenario instruction", + "properties": { + "task": { + "type": "string", + "enum": [ + "wait", + "wait_for", + "click", + "scroll_x", + "scroll_y", + "fill", + "wait_browser", + "evaluate" + ], + "description": "The action that was executed" + }, + "params": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "number" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ], + "description": "Parameters passed to the action" + }, + "success": { + "type": "boolean", + "description": "Whether the action succeeded" + }, + 
"duration": { + "type": "number", + "description": "Execution time in milliseconds" + } + }, + "required": [ + "task", + "params", + "success", + "duration" + ] + }, + "description": "Individual task reports" + }, + "taskExecuted": { + "type": "integer", + "description": "Number of tasks executed" + }, + "taskSuccess": { + "type": "integer", + "description": "Number of successful tasks" + }, + "taskFailure": { + "type": "integer", + "description": "Number of failed tasks" + }, + "totalDuration": { + "type": "number", + "description": "Total execution time in milliseconds" + } + }, + "required": [ + "tasks", + "taskExecuted", + "taskSuccess", + "taskFailure", + "totalDuration" + ] + }, + "IndividualInstructionReport": { + "type": "object", + "description": "Report for a single JS scenario instruction", + "properties": { + "task": { + "type": "string", + "enum": [ + "wait", + "wait_for", + "click", + "scroll_x", + "scroll_y", + "fill", + "wait_browser", + "evaluate" + ], + "description": "The action that was executed" + }, + "params": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "number" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ], + "description": "Parameters passed to the action" + }, + "success": { + "type": "boolean", + "description": "Whether the action succeeded" + }, + "duration": { + "type": "number", + "description": "Execution time in milliseconds" + } + }, + "required": [ + "task", + "params", + "success", + "duration" + ] + }, + "ExtractRule": { + "type": "object", + "description": "Rule for extracting data from the page", + "properties": { + "selector": { + "type": "string", + "description": "CSS selector to target elements" + }, + "type": { + "type": "string", + "enum": [ + "list", + "item" + ], + "description": "Whether to extract a single item or a list of items" + }, + "output": { + "oneOf": [ + { + "type": "string", + "description": "Attribute name or special value (@text, @html)" + }, + { + "type": "object", 
+ "additionalProperties": { + "$ref": "#/components/schemas/ExtractRule" + }, + "description": "Nested extraction rules" + } + ], + "description": "What to extract from matched elements" + }, + "clean": { + "type": "boolean", + "description": "Whether to clean/trim the extracted text" + } + }, + "required": [ + "selector", + "type", + "output" + ] + }, + "ExtractRules": { + "type": "object", + "description": "Extract rules object. Keys are output field names, values are extraction rules.", + "additionalProperties": { + "type": "object", + "description": "Rule for extracting data from the page", + "properties": { + "selector": { + "type": "string", + "description": "CSS selector to target elements" + }, + "type": { + "type": "string", + "enum": [ + "list", + "item" + ], + "description": "Whether to extract a single item or a list of items" + }, + "output": { + "oneOf": [ + { + "type": "string", + "description": "Attribute name or special value (@text, @html)" + }, + { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ExtractRule" + }, + "description": "Nested extraction rules" + } + ], + "description": "What to extract from matched elements" + }, + "clean": { + "type": "boolean", + "description": "Whether to clean/trim the extracted text" + } + }, + "required": [ + "selector", + "type", + "output" + ] + }, + "example": { + "title": { + "selector": "h1", + "type": "item", + "output": "@text" + }, + "links": { + "selector": "a", + "type": "list", + "output": "@href" + } + } + }, + "JsScenario": { + "type": "object", + "description": "JavaScript scenario to execute on the page. Each instruction is an object with the action name as key and parameter as value.", + "properties": { + "instructions": { + "type": "array", + "items": { + "type": "object", + "description": "Single instruction object where key is the action name and value is the parameter. 
Actions: wait (ms), wait_for (selector), click (selector), scroll_x (pixels), scroll_y (pixels), fill ([selector, value]), wait_browser (load|domcontentloaded|networkidle), evaluate (js code), wait_for_and_click (selector)", + "minProperties": 1, + "maxProperties": 1, + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "number" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "description": "List of actions to perform in order" + }, + "strict": { + "type": "boolean", + "description": "If true, stop execution on first failure. Default is true.", + "default": true + } + }, + "required": [ + "instructions" + ], + "example": { + "instructions": [ + { + "click": "#load-more" + }, + { + "wait": 1000 + }, + { + "scroll_y": 500 + }, + { + "wait_for": ".lazy-content" + } + ], + "strict": false + } + } + } + } +} \ No newline at end of file diff --git a/openapi.json b/openapi.json new file mode 100644 index 0000000..4762eac --- /dev/null +++ b/openapi.json @@ -0,0 +1,1189 @@ +{ + "openapi": "3.0.3", + "info": { + "title": "SuperScraper API", + "description": "SuperScraper is a unified web scraping API that provides compatibility with multiple scraping services (ScrapingBee, ScrapingAnt, ScraperAPI).\n\n## Features\n- JavaScript rendering with headless browser\n- Screenshot capture (viewport, full page, or specific element)\n- Custom JavaScript execution via scenarios\n- Data extraction with CSS selectors\n- Proxy support (datacenter and residential)\n- Cookie and header forwarding\n- XHR/Fetch request capture\n\n## Response Formats\n- **HTML (default)**: Returns raw HTML content\n- **JSON (json_response=true)**: Returns structured response with metadata\n- **Screenshot**: Returns PNG image when only screenshot is requested\n- **Extracted data**: Returns JSON when extract_rules are provided\n\n## Compatibility\nThis API accepts parameters from multiple scraping services:\n- **ScrapingBee** (primary): All 
parameters use ScrapingBee naming\n- **ScrapingAnt**: Compatible parameters like `browser`, `js_snippet`, `proxy_type`\n- **ScraperAPI**: Compatible parameters like `render`, `premium`, `binary_target`", + "version": "1.0.0", + "contact": { + "name": "Apify", + "url": "https://apify.com" + }, + "license": { + "name": "ISC" + } + }, + "servers": [ + { + "url": "https://super-scraper.apify.actor", + "description": "Production server" + }, + { + "url": "http://localhost:3000", + "description": "Local development server" + } + ], + "tags": [ + { + "name": "Scraping", + "description": "Web scraping operations" + } + ], + "paths": { + "/": { + "get": { + "summary": "Scrape a web page", + "description": "Fetches and processes a web page with optional JavaScript rendering, screenshots, and data extraction.\n\n## Basic Usage\n```\nGET /?url=https://example.com\n```\n\n## With JavaScript Rendering\n```\nGET /?url=https://example.com&render_js=true&wait=2000\n```\n\n## With Data Extraction\n```\nGET /?url=https://example.com&extract_rules={\"title\":{\"selector\":\"h1\",\"type\":\"item\",\"output\":\"@text\"}}\n```\n\n## With Screenshot\n```\nGET /?url=https://example.com&screenshot=true&json_response=true\n```", + "operationId": "scrape", + "tags": [ + "Scraping" + ], + "parameters": [ + { + "name": "url", + "in": "query", + "description": "The URL to scrape. Must be a fully qualified URL including the protocol (http:// or https://).", + "required": true, + "schema": { + "type": "string" + }, + "example": "https://example.com" + }, + { + "name": "render_js", + "in": "query", + "description": "Enable JavaScript rendering using a headless browser. When false, uses a simple HTTP request without browser. Also accepts: `browser` (ScrapingAnt), `render` (ScraperAPI).", + "required": false, + "schema": { + "type": "boolean", + "default": true + }, + "example": true + }, + { + "name": "device", + "in": "query", + "description": "Device type to emulate. 
Affects User-Agent and viewport dimensions. Also accepts: `device_type` (ScraperAPI).", + "required": false, + "schema": { + "type": "string", + "enum": [ + "desktop", + "mobile" + ], + "default": "desktop" + }, + "example": "desktop" + }, + { + "name": "window_width", + "in": "query", + "description": "Browser viewport width in pixels. Only applies when render_js is enabled.", + "required": false, + "schema": { + "type": "integer", + "minimum": 100, + "maximum": 3840, + "default": 1920 + }, + "example": 1920 + }, + { + "name": "window_height", + "in": "query", + "description": "Browser viewport height in pixels. Only applies when render_js is enabled.", + "required": false, + "schema": { + "type": "integer", + "minimum": 100, + "maximum": 2160, + "default": 1080 + }, + "example": 1080 + }, + { + "name": "wait", + "in": "query", + "description": "Time to wait in milliseconds after page load before returning content.", + "required": false, + "schema": { + "type": "integer", + "minimum": 0, + "maximum": 35000 + }, + "example": 1000 + }, + { + "name": "wait_for", + "in": "query", + "description": "CSS selector to wait for before returning content. Useful for SPAs where content loads dynamically. Also accepts: `wait_for_selector` (ScrapingAnt/ScraperAPI).", + "required": false, + "schema": { + "type": "string" + }, + "example": "#main-content" + }, + { + "name": "wait_browser", + "in": "query", + "description": "Browser event to wait for before considering the page loaded.", + "required": false, + "schema": { + "type": "string", + "enum": [ + "load", + "domcontentloaded", + "networkidle" + ], + "default": "load" + }, + "example": "networkidle" + }, + { + "name": "screenshot", + "in": "query", + "description": "Take a screenshot of the visible viewport. 
Returns base64-encoded PNG in json_response mode, or raw binary otherwise.", + "required": false, + "schema": { + "type": "boolean", + "default": false + }, + "example": true + }, + { + "name": "screenshot_full_page", + "in": "query", + "description": "Take a full-page screenshot (entire scrollable area). Overrides screenshot parameter.", + "required": false, + "schema": { + "type": "boolean", + "default": false + }, + "example": true + }, + { + "name": "screenshot_selector", + "in": "query", + "description": "CSS selector of element to screenshot. Overrides screenshot and screenshot_full_page parameters.", + "required": false, + "schema": { + "type": "string" + }, + "example": "#hero-image" + }, + { + "name": "extract_rules", + "in": "query", + "description": "JSON object defining extraction rules. Keys are output field names, values define selectors and extraction behavior. Returns extracted data as JSON.", + "required": false, + "schema": { + "type": "string" + }, + "example": "{\"title\": {\"selector\": \"h1\", \"type\": \"item\", \"output\": \"@text\"}}" + }, + { + "name": "js_scenario", + "in": "query", + "description": "JSON object defining a sequence of browser actions. Each instruction is an object with action name as key and parameter as value. Actions: wait (ms), wait_for (selector), click (selector), scroll_x/scroll_y (pixels), fill ([selector, value]), wait_browser (load|domcontentloaded|networkidle), evaluate (js code), wait_for_and_click (selector).", + "required": false, + "schema": { + "type": "string" + }, + "example": "{\"instructions\": [{\"click\": \"#load-more\"}, {\"wait\": 1000}]}" + }, + { + "name": "json_response", + "in": "query", + "description": "Return response as JSON with metadata including cookies, headers, XHR requests, and more. 
Without this, returns raw HTML/binary content.", + "required": false, + "schema": { + "type": "boolean", + "default": false + }, + "example": true + }, + { + "name": "return_page_source", + "in": "query", + "description": "Return the original page source HTML instead of the rendered DOM. Useful for debugging or when you need the unmodified HTML.", + "required": false, + "schema": { + "type": "boolean", + "default": false + }, + "example": false + }, + { + "name": "transparent_status_code", + "in": "query", + "description": "Return the actual HTTP status code from the target website instead of 200. Useful for detecting errors or redirects.", + "required": false, + "schema": { + "type": "boolean", + "default": false + }, + "example": true + }, + { + "name": "forward_headers", + "in": "query", + "description": "Forward custom headers to the target website. Headers should be prefixed with \"Spb-\" or \"spb-\" (prefix is stripped before forwarding).", + "required": false, + "schema": { + "type": "boolean", + "default": false + }, + "example": true + }, + { + "name": "forward_headers_pure", + "in": "query", + "description": "Forward all non-prefixed headers directly to the target website without modification.", + "required": false, + "schema": { + "type": "boolean", + "default": false + }, + "example": true + }, + { + "name": "cookies", + "in": "query", + "description": "Cookies to send with the request. Format: \"name1=value1;name2=value2\" or JSON array of cookie objects.", + "required": false, + "schema": { + "type": "string" + }, + "example": "session_id=abc123;user=john" + }, + { + "name": "timeout", + "in": "query", + "description": "Maximum time in milliseconds to wait for the page to load. 
Includes all network requests and JavaScript execution.", + "required": false, + "schema": { + "type": "integer", + "minimum": 1000, + "maximum": 3600000, + "default": 140000 + }, + "example": 30000 + }, + { + "name": "own_proxy", + "in": "query", + "description": "Use your own proxy server. Format: \"http://user:pass@host:port\" or \"http://host:port\".", + "required": false, + "schema": { + "type": "string" + }, + "example": "http://user:pass@proxy.example.com:8080" + }, + { + "name": "premium_proxy", + "in": "query", + "description": "Use premium residential proxies for better success rates on difficult targets. Also accepts: `stealth_proxy`, `premium` (ScraperAPI), `ultra_premium` (ScraperAPI).", + "required": false, + "schema": { + "type": "boolean", + "default": false + }, + "example": true + }, + { + "name": "stealth_proxy", + "in": "query", + "description": "Alias for premium_proxy. Use premium residential proxies for better success rates.", + "required": false, + "schema": { + "type": "boolean", + "default": false + }, + "example": true + }, + { + "name": "country_code", + "in": "query", + "description": "Two-letter ISO country code for geo-targeting. Proxy will use an IP from the specified country. Also accepts: `proxy_country` (ScrapingAnt).", + "required": false, + "schema": { + "type": "string" + }, + "example": "US" + }, + { + "name": "block_resources", + "in": "query", + "description": "Block resource types to speed up page loading. 
Set to true to block common resource types (images, fonts, stylesheets, media).", + "required": false, + "schema": { + "type": "boolean", + "default": false + }, + "example": true + }, + { + "name": "custom_google", + "in": "query", + "description": "Enable optimizations for scraping Google Search results.", + "required": false, + "schema": { + "type": "boolean", + "default": false + }, + "example": true + }, + { + "name": "js_snippet", + "in": "query", + "description": "Base64-encoded JavaScript snippet to execute on the page. The script runs after page load but before content extraction. (ScrapingAnt compatible)", + "required": false, + "schema": { + "type": "string" + }, + "example": "Y29uc29sZS5sb2coIkhlbGxvIik=" + }, + { + "name": "proxy_type", + "in": "query", + "description": "Type of proxy to use. \"datacenter\" is faster but may be blocked by some sites. \"residential\" has better success rates. (ScrapingAnt compatible)", + "required": false, + "schema": { + "type": "string", + "enum": [ + "datacenter", + "residential" + ], + "default": "datacenter" + }, + "example": "residential" + }, + { + "name": "block_resource", + "in": "query", + "description": "Comma-separated list of resource types to block (e.g., \"image,stylesheet,font\"). More granular than block_resources. (ScrapingAnt compatible)", + "required": false, + "schema": { + "type": "string" + }, + "example": "image,stylesheet,font,media" + }, + { + "name": "binary_target", + "in": "query", + "description": "Fetch binary files (images, PDFs, etc.) instead of HTML. Returns the raw binary content. (ScraperAPI compatible)", + "required": false, + "schema": { + "type": "boolean", + "default": false + }, + "example": true + }, + { + "name": "keep_headers", + "in": "query", + "description": "Forward all request headers to the target website. Similar to forward_headers_pure but may include additional headers. 
(ScraperAPI compatible)", + "required": false, + "schema": { + "type": "boolean", + "default": false + }, + "example": true + } + ], + "responses": { + "200": { + "description": "Successful response. Content type depends on request parameters.", + "content": { + "text/html": { + "schema": { + "type": "string", + "description": "Raw HTML content of the page" + } + }, + "application/json": { + "schema": { + "oneOf": [ + { + "$ref": "#/components/schemas/VerboseResult" + }, + { + "type": "object", + "description": "Extracted data (when using extract_rules without json_response)", + "additionalProperties": true + } + ] + } + }, + "image/png": { + "schema": { + "type": "string", + "format": "binary", + "description": "Screenshot image (when screenshot requested without json_response)" + } + }, + "application/octet-stream": { + "schema": { + "type": "string", + "format": "binary", + "description": "Binary file content (when binary_target=true)" + } + } + } + }, + "400": { + "description": "Bad request - missing or invalid parameters", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } + }, + "408": { + "description": "Request timeout - page took too long to load", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } + }, + "500": { + "description": "Internal server error", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } + }, + "502": { + "description": "Target website error (when transparent_status_code=true)", + "content": { + "application/json": { + "schema": { + "$ref": "#/components/schemas/ErrorResponse" + } + } + } + } + } + } + } + }, + "components": { + "schemas": { + "VerboseResult": { + "type": "object", + "description": "Full response with metadata (returned when json_response=true)", + "properties": { + "body": { + "oneOf": [ + { + "type": "string", + "description": "HTML 
content or extracted data as string" + }, + { + "type": "object", + "description": "Extracted data as JSON object" + } + ], + "description": "Page content or extracted data" + }, + "cookies": { + "type": "array", + "items": { + "type": "object", + "description": "Browser cookie", + "properties": { + "name": { + "type": "string", + "description": "Cookie name" + }, + "value": { + "type": "string", + "description": "Cookie value" + }, + "domain": { + "type": "string", + "description": "Cookie domain" + }, + "path": { + "type": "string", + "description": "Cookie path" + }, + "expires": { + "type": "number", + "description": "Expiration timestamp", + "nullable": true + }, + "httpOnly": { + "type": "boolean", + "description": "HTTP-only flag" + }, + "secure": { + "type": "boolean", + "description": "Secure flag" + }, + "sameSite": { + "type": "string", + "enum": [ + "Strict", + "Lax", + "None" + ], + "description": "SameSite attribute" + } + } + }, + "description": "Cookies set by the page" + }, + "evaluateResults": { + "type": "array", + "items": { + "type": "string" + }, + "description": "Results from evaluate actions in js_scenario" + }, + "jsScenarioReport": { + "oneOf": [ + { + "type": "object", + "description": "Report of JS scenario execution", + "properties": { + "tasks": { + "type": "array", + "items": { + "type": "object", + "description": "Report for a single JS scenario instruction", + "properties": { + "task": { + "type": "string", + "enum": [ + "wait", + "wait_for", + "click", + "scroll_x", + "scroll_y", + "fill", + "wait_browser", + "evaluate" + ], + "description": "The action that was executed" + }, + "params": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "number" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ], + "description": "Parameters passed to the action" + }, + "success": { + "type": "boolean", + "description": "Whether the action succeeded" + }, + "duration": { + "type": "number", + "description": "Execution 
time in milliseconds" + } + }, + "required": [ + "task", + "params", + "success", + "duration" + ] + }, + "description": "Individual task reports" + }, + "taskExecuted": { + "type": "integer", + "description": "Number of tasks executed" + }, + "taskSuccess": { + "type": "integer", + "description": "Number of successful tasks" + }, + "taskFailure": { + "type": "integer", + "description": "Number of failed tasks" + }, + "totalDuration": { + "type": "number", + "description": "Total execution time in milliseconds" + } + }, + "required": [ + "tasks", + "taskExecuted", + "taskSuccess", + "taskFailure", + "totalDuration" + ] + }, + { + "type": "object", + "additionalProperties": false, + "description": "Empty object if no scenario was executed" + } + ], + "description": "JS scenario execution report" + }, + "headers": { + "type": "object", + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + }, + "description": "Response headers from the target page" + }, + "type": { + "type": "string", + "enum": [ + "html", + "json", + "file" + ], + "description": "Content type of the response body" + }, + "screenshot": { + "type": "string", + "nullable": true, + "description": "Base64-encoded PNG screenshot (if requested)" + }, + "iframes": { + "type": "array", + "items": { + "type": "object", + "description": "Iframe content data", + "properties": { + "src": { + "type": "string", + "description": "Iframe source URL" + }, + "content": { + "type": "string", + "description": "Iframe HTML content" + } + }, + "required": [ + "src", + "content" + ] + }, + "description": "Content of iframes on the page" + }, + "xhr": { + "type": "array", + "items": { + "type": "object", + "description": "Captured XHR/Fetch request data", + "properties": { + "url": { + "type": "string", + "description": "Request URL" + }, + "statusCode": { + "type": "integer", + "description": "HTTP status code" + }, + "method": { + "type": 
"string", + "description": "HTTP method (GET, POST, etc.)" + }, + "requestHeaders": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Request headers sent" + }, + "headers": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Response headers received" + }, + "body": { + "type": "string", + "description": "Response body" + } + }, + "required": [ + "url", + "statusCode", + "method", + "requestHeaders", + "headers", + "body" + ] + }, + "description": "Captured XHR/Fetch requests made by the page" + }, + "initialStatusCode": { + "type": "integer", + "nullable": true, + "description": "HTTP status code of the initial page request" + }, + "resolvedUrl": { + "type": "string", + "description": "Final URL after any redirects" + }, + "metadata": { + "type": "string", + "description": "Additional metadata (if available)" + } + }, + "required": [ + "body", + "cookies", + "evaluateResults", + "jsScenarioReport", + "headers", + "type", + "screenshot", + "iframes", + "xhr", + "initialStatusCode", + "resolvedUrl" + ] + }, + "ErrorResponse": { + "type": "object", + "description": "Error response", + "properties": { + "errorMessage": { + "type": "string", + "description": "Human-readable error message" + } + }, + "required": [ + "errorMessage" + ] + }, + "Cookie": { + "type": "object", + "description": "Browser cookie", + "properties": { + "name": { + "type": "string", + "description": "Cookie name" + }, + "value": { + "type": "string", + "description": "Cookie value" + }, + "domain": { + "type": "string", + "description": "Cookie domain" + }, + "path": { + "type": "string", + "description": "Cookie path" + }, + "expires": { + "type": "number", + "description": "Expiration timestamp", + "nullable": true + }, + "httpOnly": { + "type": "boolean", + "description": "HTTP-only flag" + }, + "secure": { + "type": "boolean", + "description": "Secure flag" + }, + "sameSite": { + "type": "string", + "enum": [ + 
"Strict", + "Lax", + "None" + ], + "description": "SameSite attribute" + } + } + }, + "XHRRequestData": { + "type": "object", + "description": "Captured XHR/Fetch request data", + "properties": { + "url": { + "type": "string", + "description": "Request URL" + }, + "statusCode": { + "type": "integer", + "description": "HTTP status code" + }, + "method": { + "type": "string", + "description": "HTTP method (GET, POST, etc.)" + }, + "requestHeaders": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Request headers sent" + }, + "headers": { + "type": "object", + "additionalProperties": { + "type": "string" + }, + "description": "Response headers received" + }, + "body": { + "type": "string", + "description": "Response body" + } + }, + "required": [ + "url", + "statusCode", + "method", + "requestHeaders", + "headers", + "body" + ] + }, + "IFrameData": { + "type": "object", + "description": "Iframe content data", + "properties": { + "src": { + "type": "string", + "description": "Iframe source URL" + }, + "content": { + "type": "string", + "description": "Iframe HTML content" + } + }, + "required": [ + "src", + "content" + ] + }, + "JsScenarioReport": { + "type": "object", + "description": "Report of JS scenario execution", + "properties": { + "tasks": { + "type": "array", + "items": { + "type": "object", + "description": "Report for a single JS scenario instruction", + "properties": { + "task": { + "type": "string", + "enum": [ + "wait", + "wait_for", + "click", + "scroll_x", + "scroll_y", + "fill", + "wait_browser", + "evaluate" + ], + "description": "The action that was executed" + }, + "params": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "number" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ], + "description": "Parameters passed to the action" + }, + "success": { + "type": "boolean", + "description": "Whether the action succeeded" + }, + "duration": { + "type": "number", + "description": 
"Execution time in milliseconds" + } + }, + "required": [ + "task", + "params", + "success", + "duration" + ] + }, + "description": "Individual task reports" + }, + "taskExecuted": { + "type": "integer", + "description": "Number of tasks executed" + }, + "taskSuccess": { + "type": "integer", + "description": "Number of successful tasks" + }, + "taskFailure": { + "type": "integer", + "description": "Number of failed tasks" + }, + "totalDuration": { + "type": "number", + "description": "Total execution time in milliseconds" + } + }, + "required": [ + "tasks", + "taskExecuted", + "taskSuccess", + "taskFailure", + "totalDuration" + ] + }, + "IndividualInstructionReport": { + "type": "object", + "description": "Report for a single JS scenario instruction", + "properties": { + "task": { + "type": "string", + "enum": [ + "wait", + "wait_for", + "click", + "scroll_x", + "scroll_y", + "fill", + "wait_browser", + "evaluate" + ], + "description": "The action that was executed" + }, + "params": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "number" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ], + "description": "Parameters passed to the action" + }, + "success": { + "type": "boolean", + "description": "Whether the action succeeded" + }, + "duration": { + "type": "number", + "description": "Execution time in milliseconds" + } + }, + "required": [ + "task", + "params", + "success", + "duration" + ] + }, + "ExtractRule": { + "type": "object", + "description": "Rule for extracting data from the page", + "properties": { + "selector": { + "type": "string", + "description": "CSS selector to target elements" + }, + "type": { + "type": "string", + "enum": [ + "list", + "item" + ], + "description": "Whether to extract a single item or a list of items" + }, + "output": { + "oneOf": [ + { + "type": "string", + "description": "Attribute name or special value (@text, @html)" + }, + { + "type": "object", + "additionalProperties": { + "$ref": 
"#/components/schemas/ExtractRule" + }, + "description": "Nested extraction rules" + } + ], + "description": "What to extract from matched elements" + }, + "clean": { + "type": "boolean", + "description": "Whether to clean/trim the extracted text" + } + }, + "required": [ + "selector", + "type", + "output" + ] + }, + "ExtractRules": { + "type": "object", + "description": "Extract rules object. Keys are output field names, values are extraction rules.", + "additionalProperties": { + "type": "object", + "description": "Rule for extracting data from the page", + "properties": { + "selector": { + "type": "string", + "description": "CSS selector to target elements" + }, + "type": { + "type": "string", + "enum": [ + "list", + "item" + ], + "description": "Whether to extract a single item or a list of items" + }, + "output": { + "oneOf": [ + { + "type": "string", + "description": "Attribute name or special value (@text, @html)" + }, + { + "type": "object", + "additionalProperties": { + "$ref": "#/components/schemas/ExtractRule" + }, + "description": "Nested extraction rules" + } + ], + "description": "What to extract from matched elements" + }, + "clean": { + "type": "boolean", + "description": "Whether to clean/trim the extracted text" + } + }, + "required": [ + "selector", + "type", + "output" + ] + }, + "example": { + "title": { + "selector": "h1", + "type": "item", + "output": "@text" + }, + "links": { + "selector": "a", + "type": "list", + "output": "@href" + } + } + }, + "JsScenario": { + "type": "object", + "description": "JavaScript scenario to execute on the page. Each instruction is an object with the action name as key and parameter as value.", + "properties": { + "instructions": { + "type": "array", + "items": { + "type": "object", + "description": "Single instruction object where key is the action name and value is the parameter. 
Actions: wait (ms), wait_for (selector), click (selector), scroll_x (pixels), scroll_y (pixels), fill ([selector, value]), wait_browser (load|domcontentloaded|networkidle), evaluate (js code), wait_for_and_click (selector)", + "minProperties": 1, + "maxProperties": 1, + "additionalProperties": { + "oneOf": [ + { + "type": "string" + }, + { + "type": "number" + }, + { + "type": "array", + "items": { + "type": "string" + } + } + ] + } + }, + "description": "List of actions to perform in order" + }, + "strict": { + "type": "boolean", + "description": "If true, stop execution on first failure. Default is true.", + "default": true + } + }, + "required": [ + "instructions" + ], + "example": { + "instructions": [ + { + "click": "#load-more" + }, + { + "wait": 1000 + }, + { + "scroll_y": 500 + }, + { + "wait_for": ".lazy-content" + } + ], + "strict": false + } + } + } + } +} \ No newline at end of file diff --git a/package.json b/package.json index c3efad5..c5f4a95 100644 --- a/package.json +++ b/package.json @@ -30,7 +30,8 @@ "start": "npm run start:dev", "start:prod": "node dist/main.js", "start:dev": "tsx src/main.ts", - "build": "tsc", + "generate:openapi": "tsx scripts/generate-openapi.ts", + "build": "npm run generate:openapi && tsc", "lint": "eslint ./src --ext .ts", "lint:fix": "eslint ./src --ext .ts --fix", "test": "echo \"Error: oops, the actor has no tests yet, sad!\" && exit 1", diff --git a/scripts/generate-openapi.ts b/scripts/generate-openapi.ts new file mode 100644 index 0000000..e18769f --- /dev/null +++ b/scripts/generate-openapi.ts @@ -0,0 +1,243 @@ +#!/usr/bin/env tsx +/** + * OpenAPI specification generator for SuperScraper API + * + * Generates openapi.json from: + * - Parameter enums from src/params.ts + * - Parameter metadata from src/openapi/parameter-metadata.ts + * - Response schemas from src/openapi/response-schemas.ts + */ + +import { writeFileSync } from 'fs'; +import { resolve, dirname } from 'path'; +import { fileURLToPath } from 'url'; + 
/**
 * Convert one entry of the parameter metadata table into an OpenAPI
 * query-parameter object.
 *
 * @param name - Query parameter name; copied verbatim into the result.
 * @param meta - Metadata entry describing the parameter.
 * @returns A parameter object located `in: 'query'`. `required` falls back to
 *          `false` when the metadata does not set it, and `example` is only
 *          present when the metadata defines one.
 */
function buildParameter(name: string, meta: typeof parameterMetadata[string]): ParameterObject {
    const {
        type,
        enum: allowedValues,
        minimum,
        maximum,
        description,
        required,
        example,
    } = meta;

    // `meta.default` is deliberately not copied into the schema: the API applies
    // defaults internally, and advertising them would make UI tools pre-populate
    // optional parameters.
    // Key order (type, enum, minimum, maximum) is kept stable because it is the
    // order in which the keys appear in the serialized specification.
    const schema: SchemaObject = {
        type,
        ...(allowedValues ? { enum: allowedValues } : {}),
        ...(minimum !== undefined ? { minimum } : {}),
        ...(maximum !== undefined ? { maximum } : {}),
    };

    const param: ParameterObject = {
        name,
        in: 'query',
        description,
        required: required ?? false,
        schema,
        // Compared against `undefined` so that falsy examples (`false`, `0`, `''`) are kept.
        ...(example !== undefined ? { example } : {}),
    };

    return param;
}
+ +## Features +- JavaScript rendering with headless browser +- Screenshot capture (viewport, full page, or specific element) +- Custom JavaScript execution via scenarios +- Data extraction with CSS selectors +- Proxy support (datacenter and residential) +- Cookie and header forwarding +- XHR/Fetch request capture + +## Response Formats +- **HTML (default)**: Returns raw HTML content +- **JSON (json_response=true)**: Returns structured response with metadata +- **Screenshot**: Returns PNG image when only screenshot is requested +- **Extracted data**: Returns JSON when extract_rules are provided + +## Compatibility +This API accepts parameters from multiple scraping services: +- **ScrapingBee** (primary): All parameters use ScrapingBee naming +- **ScrapingAnt**: Compatible parameters like \`browser\`, \`js_snippet\`, \`proxy_type\` +- **ScraperAPI**: Compatible parameters like \`render\`, \`premium\`, \`binary_target\``, + version: '1.0.0', + contact: { + name: 'Apify', + url: 'https://apify.com', + }, + license: { + name: 'ISC', + }, + }, + servers: [ + { + url: 'https://super-scraper.apify.actor', + description: 'Production server', + }, + { + url: 'http://localhost:3000', + description: 'Local development server', + }, + ], + tags: [ + { + name: 'Scraping', + description: 'Web scraping operations', + }, + ], + paths: { + '/': { + get: { + summary: 'Scrape a web page', + description: `Fetches and processes a web page with optional JavaScript rendering, screenshots, and data extraction. 
+ +## Basic Usage +\`\`\` +GET /?url=https://example.com +\`\`\` + +## With JavaScript Rendering +\`\`\` +GET /?url=https://example.com&render_js=true&wait=2000 +\`\`\` + +## With Data Extraction +\`\`\` +GET /?url=https://example.com&extract_rules={"title":{"selector":"h1","type":"item","output":"@text"}} +\`\`\` + +## With Screenshot +\`\`\` +GET /?url=https://example.com&screenshot=true&json_response=true +\`\`\``, + operationId: 'scrape', + tags: ['Scraping'], + parameters, + responses: { + '200': { + description: 'Successful response. Content type depends on request parameters.', + content: { + 'text/html': { + schema: { + type: 'string', + description: 'Raw HTML content of the page', + }, + }, + 'application/json': { + schema: { + oneOf: [ + { $ref: '#/components/schemas/VerboseResult' }, + { + type: 'object', + description: 'Extracted data (when using extract_rules without json_response)', + additionalProperties: true, + }, + ], + }, + }, + 'image/png': { + schema: { + type: 'string', + format: 'binary', + description: 'Screenshot image (when screenshot requested without json_response)', + }, + }, + 'application/octet-stream': { + schema: { + type: 'string', + format: 'binary', + description: 'Binary file content (when binary_target=true)', + }, + }, + }, + }, + '400': { + description: 'Bad request - missing or invalid parameters', + content: { + 'application/json': { + schema: { $ref: '#/components/schemas/ErrorResponse' }, + }, + }, + }, + '408': { + description: 'Request timeout - page took too long to load', + content: { + 'application/json': { + schema: { $ref: '#/components/schemas/ErrorResponse' }, + }, + }, + }, + '500': { + description: 'Internal server error', + content: { + 'application/json': { + schema: { $ref: '#/components/schemas/ErrorResponse' }, + }, + }, + }, + '502': { + description: 'Target website error (when transparent_status_code=true)', + content: { + 'application/json': { + schema: { $ref: '#/components/schemas/ErrorResponse' }, + 
/**
 * Script entry point: generate the OpenAPI specification and write it to
 * `.actor/openapi.json`, resolved relative to this script's directory.
 *
 * The document is serialized with 2-space indentation and terminated with a
 * newline. Progress and summary counts (documented parameters and component
 * schemas) are printed to stdout.
 *
 * Any error thrown by `writeFileSync` (e.g. a missing directory) propagates to
 * the caller; nothing is caught here.
 */
function main(): void {
    console.log('Generating OpenAPI specification...');

    const spec = generateOpenAPISpec();
    const outputPath = resolve(__dirname, '../.actor/openapi.json');

    // End the file with a newline so the generated artifact is a well-formed
    // text file and does not show up as "No newline at end of file" in diffs.
    writeFileSync(outputPath, `${JSON.stringify(spec, null, 2)}\n`);

    console.log(`OpenAPI specification written to: ${outputPath}`);
    console.log(`Total parameters documented: ${Object.keys(parameterMetadata).length}`);
    console.log(`Total schemas documented: ${Object.keys(componentSchemas).length}`);
}
Also accepts: `device_type` (ScraperAPI).', + example: 'desktop', + default: 'desktop', + type: 'string', + enum: ['desktop', 'mobile'], + aliases: ['device_type'], + }, + + window_width: { + description: 'Browser viewport width in pixels. Only applies when render_js is enabled.', + example: 1920, + default: 1920, + type: 'integer', + minimum: 100, + maximum: 3840, + }, + + window_height: { + description: 'Browser viewport height in pixels. Only applies when render_js is enabled.', + example: 1080, + default: 1080, + type: 'integer', + minimum: 100, + maximum: 2160, + }, + + // Waiting options + wait: { + description: 'Time to wait in milliseconds after page load before returning content.', + example: 1000, + type: 'integer', + minimum: 0, + maximum: 35000, + }, + + wait_for: { + description: 'CSS selector to wait for before returning content. Useful for SPAs where content loads dynamically. Also accepts: `wait_for_selector` (ScrapingAnt/ScraperAPI).', + example: '#main-content', + type: 'string', + aliases: ['wait_for_selector'], + }, + + wait_browser: { + description: 'Browser event to wait for before considering the page loaded.', + example: 'networkidle', + default: 'load', + type: 'string', + enum: ['load', 'domcontentloaded', 'networkidle'], + }, + + // Screenshot options + screenshot: { + description: 'Take a screenshot of the visible viewport. Returns base64-encoded PNG in json_response mode, or raw binary otherwise.', + example: true, + default: false, + type: 'boolean', + }, + + screenshot_full_page: { + description: 'Take a full-page screenshot (entire scrollable area). Overrides screenshot parameter.', + example: true, + default: false, + type: 'boolean', + }, + + screenshot_selector: { + description: 'CSS selector of element to screenshot. Overrides screenshot and screenshot_full_page parameters.', + example: '#hero-image', + type: 'string', + }, + + // Content extraction + extract_rules: { + description: 'JSON object defining extraction rules. 
Keys are output field names, values define selectors and extraction behavior. Returns extracted data as JSON.', + example: '{"title": {"selector": "h1", "type": "item"}}', + type: 'string', + }, + + js_scenario: { + description: 'JSON object defining a sequence of browser actions. Each instruction is an object with action name as key and parameter as value. Actions: wait (ms), wait_for (selector), click (selector), scroll_x/scroll_y (pixels), fill ([selector, value]), wait_browser (load|domcontentloaded|networkidle), evaluate (js code), wait_for_and_click (selector).', + example: '{"instructions": [{"click": "#load-more"}, {"wait": 1000}]}', + type: 'string', + }, + + // Response options + json_response: { + description: 'Return response as JSON with metadata including cookies, headers, XHR requests, and more. Without this, returns raw HTML/binary content.', + example: true, + default: false, + type: 'boolean', + }, + + return_page_source: { + description: 'Return the original page source HTML instead of the rendered DOM. Useful for debugging or when you need the unmodified HTML.', + example: false, + default: false, + type: 'boolean', + }, + + transparent_status_code: { + description: 'Return the actual HTTP status code from the target website instead of 200. Useful for detecting errors or redirects.', + example: true, + default: false, + type: 'boolean', + }, + + // Headers and cookies + forward_headers: { + description: 'Forward custom headers to the target website. Headers should be prefixed with "Spb-" or "spb-" (prefix is stripped before forwarding).', + example: true, + default: false, + type: 'boolean', + }, + + forward_headers_pure: { + description: 'Forward all non-prefixed headers directly to the target website without modification.', + example: true, + default: false, + type: 'boolean', + }, + + cookies: { + description: 'Cookies to send with the request. 
Format: "name1=value1;name2=value2" or JSON array of cookie objects.', + example: 'session_id=abc123;user=john', + type: 'string', + }, + + // Timeout + timeout: { + description: 'Maximum time in milliseconds to wait for the page to load. Includes all network requests and JavaScript execution.', + example: 30000, + default: 140000, + type: 'integer', + minimum: 1000, + maximum: 3600000, + }, + + // Proxy options + own_proxy: { + description: 'Use your own proxy server. Format: "http://user:pass@host:port" or "http://host:port".', + example: 'http://user:pass@proxy.example.com:8080', + type: 'string', + }, + + premium_proxy: { + description: 'Use premium residential proxies for better success rates on difficult targets. Also accepts: `stealth_proxy`, `premium` (ScraperAPI), `ultra_premium` (ScraperAPI).', + example: true, + default: false, + type: 'boolean', + aliases: ['stealth_proxy', 'premium', 'ultra_premium'], + }, + + stealth_proxy: { + description: 'Alias for premium_proxy. Use premium residential proxies for better success rates.', + example: true, + default: false, + type: 'boolean', + }, + + country_code: { + description: 'Two-letter ISO country code for geo-targeting. Proxy will use an IP from the specified country. Also accepts: `proxy_country` (ScrapingAnt).', + example: 'US', + type: 'string', + aliases: ['proxy_country'], + }, + + // Resource blocking + block_resources: { + description: 'Block resource types to speed up page loading. Set to true to block common resource types (images, fonts, stylesheets, media).', + example: true, + default: false, + type: 'boolean', + }, + + // Google-specific + custom_google: { + description: 'Enable optimizations for scraping Google Search results.', + example: true, + default: false, + type: 'boolean', + }, + + // ScrapingAnt-specific parameters + js_snippet: { + description: 'Base64-encoded JavaScript snippet to execute on the page. The script runs after page load but before content extraction. 
(ScrapingAnt compatible)', + example: 'Y29uc29sZS5sb2coIkhlbGxvIik=', + type: 'string', + }, + + proxy_type: { + description: 'Type of proxy to use. "datacenter" is faster but may be blocked by some sites. "residential" has better success rates. (ScrapingAnt compatible)', + example: 'residential', + default: 'datacenter', + type: 'string', + enum: ['datacenter', 'residential'], + }, + + block_resource: { + description: 'Comma-separated list of resource types to block (e.g., "image,stylesheet,font"). More granular than block_resources. (ScrapingAnt compatible)', + example: 'image,stylesheet,font,media', + type: 'string', + }, + + // ScraperAPI-specific parameters + binary_target: { + description: 'Fetch binary files (images, PDFs, etc.) instead of HTML. Returns the raw binary content. (ScraperAPI compatible)', + example: true, + default: false, + type: 'boolean', + }, + + keep_headers: { + description: 'Forward all request headers to the target website. Similar to forward_headers_pure but may include additional headers. 
(ScraperAPI compatible)', + example: true, + default: false, + type: 'boolean', + }, +}; + +/** + * Get all parameter names including aliases + */ +export function getAllParameterNames(): string[] { + const names = Object.keys(parameterMetadata); + for (const meta of Object.values(parameterMetadata)) { + if (meta.aliases) { + names.push(...meta.aliases); + } + } + return [...new Set(names)]; +} diff --git a/src/openapi/response-schemas.ts b/src/openapi/response-schemas.ts new file mode 100644 index 0000000..f17972f --- /dev/null +++ b/src/openapi/response-schemas.ts @@ -0,0 +1,288 @@ +/** + * OpenAPI response schemas derived from types.ts + */ + +import type { SchemaObject } from './types.js'; + +export const cookieSchema: SchemaObject = { + type: 'object', + description: 'Browser cookie', + properties: { + name: { type: 'string', description: 'Cookie name' }, + value: { type: 'string', description: 'Cookie value' }, + domain: { type: 'string', description: 'Cookie domain' }, + path: { type: 'string', description: 'Cookie path' }, + expires: { type: 'number', description: 'Expiration timestamp', nullable: true }, + httpOnly: { type: 'boolean', description: 'HTTP-only flag' }, + secure: { type: 'boolean', description: 'Secure flag' }, + sameSite: { + type: 'string', + enum: ['Strict', 'Lax', 'None'], + description: 'SameSite attribute', + }, + }, +}; + +export const xhrRequestDataSchema: SchemaObject = { + type: 'object', + description: 'Captured XHR/Fetch request data', + properties: { + url: { type: 'string', description: 'Request URL' }, + statusCode: { type: 'integer', description: 'HTTP status code' }, + method: { type: 'string', description: 'HTTP method (GET, POST, etc.)' }, + requestHeaders: { + type: 'object', + additionalProperties: { type: 'string' }, + description: 'Request headers sent', + }, + headers: { + type: 'object', + additionalProperties: { type: 'string' }, + description: 'Response headers received', + }, + body: { type: 'string', description: 
'Response body' }, + }, + required: ['url', 'statusCode', 'method', 'requestHeaders', 'headers', 'body'], +}; + +export const iframeDataSchema: SchemaObject = { + type: 'object', + description: 'Iframe content data', + properties: { + src: { type: 'string', description: 'Iframe source URL' }, + content: { type: 'string', description: 'Iframe HTML content' }, + }, + required: ['src', 'content'], +}; + +export const individualInstructionReportSchema: SchemaObject = { + type: 'object', + description: 'Report for a single JS scenario instruction', + properties: { + task: { + type: 'string', + enum: ['wait', 'wait_for', 'click', 'scroll_x', 'scroll_y', 'fill', 'wait_browser', 'evaluate'], + description: 'The action that was executed', + }, + params: { + oneOf: [ + { type: 'string' }, + { type: 'number' }, + { type: 'array', items: { type: 'string' } }, + ], + description: 'Parameters passed to the action', + }, + success: { type: 'boolean', description: 'Whether the action succeeded' }, + duration: { type: 'number', description: 'Execution time in milliseconds' }, + }, + required: ['task', 'params', 'success', 'duration'], +}; + +export const jsScenarioReportSchema: SchemaObject = { + type: 'object', + description: 'Report of JS scenario execution', + properties: { + tasks: { + type: 'array', + items: individualInstructionReportSchema, + description: 'Individual task reports', + }, + taskExecuted: { type: 'integer', description: 'Number of tasks executed' }, + taskSuccess: { type: 'integer', description: 'Number of successful tasks' }, + taskFailure: { type: 'integer', description: 'Number of failed tasks' }, + totalDuration: { type: 'number', description: 'Total execution time in milliseconds' }, + }, + required: ['tasks', 'taskExecuted', 'taskSuccess', 'taskFailure', 'totalDuration'], +}; + +export const verboseResultSchema: SchemaObject = { + type: 'object', + description: 'Full response with metadata (returned when json_response=true)', + properties: { + body: { + 
oneOf: [ + { type: 'string', description: 'HTML content or extracted data as string' }, + { type: 'object', description: 'Extracted data as JSON object' }, + ], + description: 'Page content or extracted data', + }, + cookies: { + type: 'array', + items: cookieSchema, + description: 'Cookies set by the page', + }, + evaluateResults: { + type: 'array', + items: { type: 'string' }, + description: 'Results from evaluate actions in js_scenario', + }, + jsScenarioReport: { + oneOf: [ + jsScenarioReportSchema, + { type: 'object', additionalProperties: false, description: 'Empty object if no scenario was executed' }, + ], + description: 'JS scenario execution report', + }, + headers: { + type: 'object', + additionalProperties: { + oneOf: [ + { type: 'string' }, + { type: 'array', items: { type: 'string' } }, + ], + }, + description: 'Response headers from the target page', + }, + type: { + type: 'string', + enum: ['html', 'json', 'file'], + description: 'Content type of the response body', + }, + screenshot: { + type: 'string', + nullable: true, + description: 'Base64-encoded PNG screenshot (if requested)', + }, + iframes: { + type: 'array', + items: iframeDataSchema, + description: 'Content of iframes on the page', + }, + xhr: { + type: 'array', + items: xhrRequestDataSchema, + description: 'Captured XHR/Fetch requests made by the page', + }, + initialStatusCode: { + type: 'integer', + nullable: true, + description: 'HTTP status code of the initial page request', + }, + resolvedUrl: { + type: 'string', + description: 'Final URL after any redirects', + }, + metadata: { + type: 'string', + description: 'Additional metadata (if available)', + }, + }, + required: ['body', 'cookies', 'evaluateResults', 'jsScenarioReport', 'headers', 'type', 'screenshot', 'iframes', 'xhr', 'initialStatusCode', 'resolvedUrl'], +}; + +export const errorResponseSchema: SchemaObject = { + type: 'object', + description: 'Error response', + properties: { + errorMessage: { + type: 'string', + 
description: 'Human-readable error message', + }, + }, + required: ['errorMessage'], +}; + +export const extractRuleSchema: SchemaObject = { + type: 'object', + description: 'Rule for extracting data from the page', + properties: { + selector: { + type: 'string', + description: 'CSS selector to target elements', + }, + type: { + type: 'string', + enum: ['list', 'item'], + description: 'Whether to extract a single item or a list of items', + }, + output: { + oneOf: [ + { type: 'string', description: 'Attribute name or special value (@text, @html)' }, + { + type: 'object', + additionalProperties: { $ref: '#/components/schemas/ExtractRule' }, + description: 'Nested extraction rules', + }, + ], + description: 'What to extract from matched elements', + }, + clean: { + type: 'boolean', + description: 'Whether to clean/trim the extracted text', + }, + }, + required: ['selector', 'type', 'output'], +}; + +export const extractRulesSchema: SchemaObject = { + type: 'object', + description: 'Extract rules object. Keys are output field names, values are extraction rules.', + additionalProperties: extractRuleSchema, + example: { + title: { + selector: 'h1', + type: 'item', + output: '@text', + }, + links: { + selector: 'a', + type: 'list', + output: '@href', + }, + }, +}; + +export const jsScenarioSchema: SchemaObject = { + type: 'object', + description: 'JavaScript scenario to execute on the page. Each instruction is an object with the action name as key and parameter as value.', + properties: { + instructions: { + type: 'array', + items: { + type: 'object', + description: 'Single instruction object where key is the action name and value is the parameter. 
Actions: wait (ms), wait_for (selector), click (selector), scroll_x (pixels), scroll_y (pixels), fill ([selector, value]), wait_browser (load|domcontentloaded|networkidle), evaluate (js code), wait_for_and_click (selector)', + minProperties: 1, + maxProperties: 1, + additionalProperties: { + oneOf: [ + { type: 'string' }, + { type: 'number' }, + { type: 'array', items: { type: 'string' } }, + ], + }, + }, + description: 'List of actions to perform in order', + }, + strict: { + type: 'boolean', + description: 'If true, stop execution on first failure. Default is true.', + default: true, + }, + }, + required: ['instructions'], + example: { + instructions: [ + { click: '#load-more' }, + { wait: 1000 }, + { scroll_y: 500 }, + { wait_for: '.lazy-content' }, + ], + strict: false, + }, +}; + +/** + * All schemas for components section + */ +export const componentSchemas = { + VerboseResult: verboseResultSchema, + ErrorResponse: errorResponseSchema, + Cookie: cookieSchema, + XHRRequestData: xhrRequestDataSchema, + IFrameData: iframeDataSchema, + JsScenarioReport: jsScenarioReportSchema, + IndividualInstructionReport: individualInstructionReportSchema, + ExtractRule: extractRuleSchema, + ExtractRules: extractRulesSchema, + JsScenario: jsScenarioSchema, +}; diff --git a/src/openapi/types.ts b/src/openapi/types.ts new file mode 100644 index 0000000..c0302da --- /dev/null +++ b/src/openapi/types.ts @@ -0,0 +1,83 @@ +/** + * TypeScript interfaces for OpenAPI schema objects + */ + +export interface SchemaObject { + type?: 'string' | 'number' | 'integer' | 'boolean' | 'array' | 'object'; + description?: string; + enum?: string[]; + items?: SchemaObject; + properties?: Record<string, SchemaObject>; + additionalProperties?: SchemaObject | boolean; + required?: string[]; + nullable?: boolean; + example?: unknown; + default?: unknown; + oneOf?: SchemaObject[]; + $ref?: string; + minimum?: number; + maximum?: number; + minProperties?: number; + maxProperties?: number; +} + +export interface ParameterObject 
{ + name: string; + in: 'query' | 'header' | 'path' | 'cookie'; + description?: string; + required?: boolean; + schema: SchemaObject; + example?: unknown; +} + +export interface ResponseObject { + description: string; + content?: Record<string, { schema: SchemaObject }>; +} + +export interface OperationObject { + summary?: string; + description?: string; + operationId?: string; + parameters?: ParameterObject[]; + responses: Record<string, ResponseObject>; + tags?: string[]; +} + +export interface PathItemObject { + get?: OperationObject; + post?: OperationObject; + put?: OperationObject; + delete?: OperationObject; + patch?: OperationObject; +} + +export interface OpenAPISpec { + openapi: string; + info: { + title: string; + description?: string; + version: string; + contact?: { + name?: string; + url?: string; + email?: string; + }; + license?: { + name: string; + url?: string; + }; + }; + servers?: Array<{ + url: string; + description?: string; + }>; + paths: Record<string, PathItemObject>; + components?: { + schemas?: Record<string, SchemaObject>; + }; + tags?: Array<{ + name: string; + description?: string; + }>; +}