Skip to content

Commit 12470a6

Browse files
authored
feat(ocr): added reducto and pulse for OCR (#2843)
* feat(ocr): added reducto and pulse for OCR * ack comments
1 parent b813bf7 commit 12470a6

File tree

18 files changed

+2212
-0
lines changed

18 files changed

+2212
-0
lines changed

apps/docs/components/icons.tsx

Lines changed: 346 additions & 0 deletions
Large diffs are not rendered by default.

apps/docs/components/ui/icon-mapping.ts

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -84,9 +84,11 @@ import {
8484
PolymarketIcon,
8585
PostgresIcon,
8686
PosthogIcon,
87+
PulseIcon,
8788
QdrantIcon,
8889
RDSIcon,
8990
RedditIcon,
91+
ReductoIcon,
9092
ResendIcon,
9193
S3Icon,
9294
SalesforceIcon,
@@ -208,9 +210,11 @@ export const blockTypeToIconMap: Record<string, IconComponent> = {
208210
polymarket: PolymarketIcon,
209211
postgresql: PostgresIcon,
210212
posthog: PosthogIcon,
213+
pulse: PulseIcon,
211214
qdrant: QdrantIcon,
212215
rds: RDSIcon,
213216
reddit: RedditIcon,
217+
reducto: ReductoIcon,
214218
resend: ResendIcon,
215219
s3: S3Icon,
216220
salesforce: SalesforceIcon,

apps/docs/content/docs/en/tools/meta.json

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -79,9 +79,11 @@
7979
"polymarket",
8080
"postgresql",
8181
"posthog",
82+
"pulse",
8283
"qdrant",
8384
"rds",
8485
"reddit",
86+
"reducto",
8587
"resend",
8688
"s3",
8789
"salesforce",
Lines changed: 72 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,72 @@
1+
---
2+
title: Pulse
3+
description: Extract text from documents using Pulse OCR
4+
---
5+
6+
import { BlockInfoCard } from "@/components/ui/block-info-card"
7+
8+
<BlockInfoCard
9+
type="pulse"
10+
color="#E0E0E0"
11+
/>
12+
13+
{/* MANUAL-CONTENT-START:intro */}
14+
The [Pulse](https://www.pulseapi.com/) tool enables seamless extraction of text and structured content from a wide variety of documents—including PDFs, images, and Office files—using state-of-the-art OCR (Optical Character Recognition) powered by Pulse. Designed for automated agentic workflows, Pulse Parser makes it easy to unlock valuable information trapped in unstructured documents and integrate the extracted content directly into your workflow.
15+
16+
With Pulse, you can:
17+
18+
- **Extract text from documents**: Quickly convert scanned PDFs, images, and Office documents to usable text, markdown, or JSON.
19+
- **Process documents by URL or upload**: Simply provide a file URL or use upload to extract text from local documents or remote resources.
20+
- **Flexible output formats**: Choose between markdown, plain text, or JSON representations of the extracted content for downstream processing.
21+
- **Selective page processing**: Specify a range of pages to process, reducing processing time and cost when you only need part of a document.
22+
- **Figure and table extraction**: Optionally extract figures and tables, with automatically generated captions and descriptions for added context.
23+
- **Get processing insights**: Receive detailed metadata on each job, including file type, page count, processing time, and more.
24+
- **Integration-ready responses**: Incorporate extracted content into research, workflow automation, or data analysis pipelines.
25+
26+
Ideal for automating tedious document review, enabling content summarization, research, and more, Pulse Parser brings real-world documents into the digital workflow era.
27+
28+
If you need accurate, scalable, and developer-friendly document parsing capabilities—across formats, languages, and layouts—Pulse empowers your agents to read the world.
29+
{/* MANUAL-CONTENT-END */}
30+
31+
32+
## Usage Instructions
33+
34+
Integrate Pulse into the workflow. Extract text from PDF documents, images, and Office files via URL or upload.
35+
36+
37+
38+
## Tools
39+
40+
### `pulse_parser`
41+
42+
Parse documents (PDF, images, Office docs) using Pulse OCR API
43+
44+
#### Input
45+
46+
| Parameter | Type | Required | Description |
47+
| --------- | ---- | -------- | ----------- |
48+
| `filePath` | string | Yes | URL to a document to be processed |
49+
| `fileUpload` | object | No | File upload data from file-upload component |
50+
| `pages` | string | No | Page range to process \(1-indexed, e.g., "1-2,5"\) |
51+
| `extractFigure` | boolean | No | Enable figure extraction from the document |
52+
| `figureDescription` | boolean | No | Generate descriptions/captions for extracted figures |
53+
| `returnHtml` | boolean | No | Include HTML in the response |
54+
| `chunking` | string | No | Chunking strategies \(comma-separated: semantic, header, page, recursive\) |
55+
| `chunkSize` | number | No | Maximum characters per chunk when chunking is enabled |
56+
| `apiKey` | string | Yes | Pulse API key |
57+
58+
#### Output
59+
60+
| Parameter | Type | Description |
61+
| --------- | ---- | ----------- |
62+
| `markdown` | string | Extracted content in markdown format |
63+
| `page_count` | number | Number of pages in the document |
64+
| `job_id` | string | Unique job identifier |
65+
| `bounding_boxes` | json | Bounding box layout information |
66+
| `extraction_url` | string | URL for extraction results \(for large documents\) |
67+
| `html` | string | HTML content if requested |
68+
| `structured_output` | json | Structured output if schema was provided |
69+
| `chunks` | json | Chunked content if chunking was enabled |
70+
| `figures` | json | Extracted figures if figure extraction was enabled |
71+
72+
Lines changed: 63 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,63 @@
1+
---
2+
title: Reducto
3+
description: Extract text from PDF documents
4+
---
5+
6+
import { BlockInfoCard } from "@/components/ui/block-info-card"
7+
8+
<BlockInfoCard
9+
type="reducto"
10+
color="#5c0c5c"
11+
/>
12+
13+
{/* MANUAL-CONTENT-START:intro */}
14+
The [Reducto](https://reducto.ai/) tool enables fast and accurate extraction of text and data from PDF documents via OCR (Optical Character Recognition). Reducto is designed for agent workflows, making it easy to process uploaded or linked PDFs and transform their contents into ready-to-use information.
15+
16+
With the Reducto tool, you can:
17+
18+
- **Extract text and tables from PDFs**: Quickly convert scanned or digital PDFs to text, markdown, or structured JSON.
19+
- **Parse PDFs from uploads or URLs**: Process documents either by uploading a PDF or specifying a direct URL.
20+
- **Customize output formatting**: Choose your preferred output format—markdown, plain text, or JSON—and specify table formats as markdown or HTML.
21+
- **Select specific pages**: Optionally extract content from particular pages to optimize processing and focus on what matters.
22+
- **Receive detailed processing metadata**: Alongside extracted content, get job details, processing times, source file info, page counts, and OCR usage stats for audit and automation.
23+
24+
Whether you’re automating workflow steps, extracting business-critical information, or unlocking archival documents for search and analysis, Reducto’s OCR parser gives you structured, actionable data from even the most complex PDFs.
25+
26+
Looking for reliable and scalable PDF parsing? Reducto is optimized for developer and agent use—providing accuracy, speed, and flexibility for modern document understanding.
27+
{/* MANUAL-CONTENT-END */}
28+
29+
30+
## Usage Instructions
31+
32+
Integrate Reducto Parse into the workflow. Extract text from uploaded PDF documents or from PDFs referenced by URL.
33+
34+
35+
36+
## Tools
37+
38+
### `reducto_parser`
39+
40+
Parse PDF documents using Reducto OCR API
41+
42+
#### Input
43+
44+
| Parameter | Type | Required | Description |
45+
| --------- | ---- | -------- | ----------- |
46+
| `filePath` | string | Yes | URL to a PDF document to be processed |
47+
| `fileUpload` | object | No | File upload data from file-upload component |
48+
| `pages` | array | No | Specific pages to process \(1-indexed page numbers\) |
49+
| `tableOutputFormat` | string | No | Table output format \(html or markdown\). Defaults to markdown. |
50+
| `apiKey` | string | Yes | Reducto API key \(REDUCTO_API_KEY\) |
51+
52+
#### Output
53+
54+
| Parameter | Type | Description |
55+
| --------- | ---- | ----------- |
56+
| `job_id` | string | Unique identifier for the processing job |
57+
| `duration` | number | Processing time in seconds |
58+
| `usage` | json | Resource consumption data |
59+
| `result` | json | Parsed document content with chunks and blocks |
60+
| `pdf_url` | string | Storage URL of converted PDF |
61+
| `studio_link` | string | Link to Reducto studio interface |
62+
63+
Lines changed: 169 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,169 @@
1+
import { createLogger } from '@sim/logger'
2+
import { type NextRequest, NextResponse } from 'next/server'
3+
import { z } from 'zod'
4+
import { checkHybridAuth } from '@/lib/auth/hybrid'
5+
import { generateRequestId } from '@/lib/core/utils/request'
6+
import { getBaseUrl } from '@/lib/core/utils/urls'
7+
import { StorageService } from '@/lib/uploads'
8+
import { extractStorageKey, inferContextFromKey } from '@/lib/uploads/utils/file-utils'
9+
import { verifyFileAccess } from '@/app/api/files/authorization'
10+
11+
// Next.js route segment config: evaluate this handler on every request.
export const dynamic = 'force-dynamic'

// Logger shared by everything in this route module.
const logger = createLogger('PulseParseAPI')

/**
 * Shape of the JSON body accepted by the Pulse parse endpoint.
 *
 * `apiKey` and `filePath` must be non-empty strings; every other field is an
 * optional extraction setting that the handler forwards to Pulse when present.
 */
const PulseParseSchema = z.object({
  // Mandatory inputs.
  apiKey: z.string().min(1, 'API key is required'),
  filePath: z.string().min(1, 'File path is required'),

  // Optional extraction settings.
  pages: z.optional(z.string()),
  extractFigure: z.optional(z.boolean()),
  figureDescription: z.optional(z.boolean()),
  returnHtml: z.optional(z.boolean()),
  chunking: z.optional(z.string()),
  chunkSize: z.optional(z.number()),
})
25+
26+
/**
 * Handles `POST` requests that ask Pulse to parse a document.
 *
 * Flow:
 * 1. Authenticate the caller with `checkHybridAuth` (no workflow id required).
 * 2. Parse the JSON body and validate it against `PulseParseSchema`.
 * 3. Resolve `filePath` to a URL that is sent to Pulse:
 *    - paths containing `/api/files/serve/` are access-checked and replaced by
 *      a presigned download URL;
 *    - other paths starting with `/` are prefixed with the app base URL;
 *    - anything else is forwarded unchanged.
 * 4. Send the options to `https://api.runpulse.com/extract` as form data and
 *    relay the JSON result as `{ success: true, output }`.
 *
 * Error responses (all shaped `{ success: false, error, ... }`):
 * - 401 when authentication fails or yields no user id
 * - 400 when the body is not valid JSON or fails schema validation
 * - 404 when the caller has no access to the referenced stored file
 * - 500 when presigned URL generation fails or an unexpected error occurs
 * - the upstream status code when Pulse responds with a non-2xx status
 *
 * @param request - Incoming Next.js request carrying the JSON payload.
 * @returns A JSON `NextResponse` describing success or failure.
 */
export async function POST(request: NextRequest) {
  const requestId = generateRequestId()

  try {
    const authResult = await checkHybridAuth(request, { requireWorkflowId: false })

    if (!authResult.success || !authResult.userId) {
      logger.warn(`[${requestId}] Unauthorized Pulse parse attempt`, {
        error: authResult.error || 'Missing userId',
      })
      return NextResponse.json(
        {
          success: false,
          error: authResult.error || 'Unauthorized',
        },
        { status: 401 }
      )
    }

    const userId = authResult.userId

    // A malformed body is a client error: answer 400 here instead of letting
    // the parser exception fall through to the generic 500 handler below.
    let body: unknown
    try {
      body = await request.json()
    } catch (parseError) {
      logger.warn(`[${requestId}] Malformed JSON body in Pulse parse request`, {
        error: parseError instanceof Error ? parseError.message : String(parseError),
      })
      return NextResponse.json(
        {
          success: false,
          error: 'Request body must be valid JSON',
        },
        { status: 400 }
      )
    }

    const validatedData = PulseParseSchema.parse(body)
    const { filePath } = validatedData

    // `filePath` is a required non-empty string after validation, so no
    // optional chaining is needed; compute the routing decision once.
    const isWorkspaceFile = filePath.includes('/api/files/serve/')

    logger.info(`[${requestId}] Pulse parse request`, {
      filePath,
      isWorkspaceFile,
      userId,
    })

    let fileUrl = filePath

    if (isWorkspaceFile) {
      try {
        const storageKey = extractStorageKey(filePath)
        const context = inferContextFromKey(storageKey)

        const hasAccess = await verifyFileAccess(storageKey, userId, undefined, context, false)

        if (!hasAccess) {
          logger.warn(`[${requestId}] Unauthorized presigned URL generation attempt`, {
            userId,
            key: storageKey,
            context,
          })
          // Reported as "not found" rather than "forbidden".
          return NextResponse.json(
            {
              success: false,
              error: 'File not found',
            },
            { status: 404 }
          )
        }

        // NOTE(review): third argument is presumably the URL lifetime in
        // seconds (5 minutes) — confirm against StorageService.
        fileUrl = await StorageService.generatePresignedDownloadUrl(storageKey, context, 5 * 60)
        logger.info(`[${requestId}] Generated presigned URL for ${context} file`)
      } catch (error) {
        logger.error(`[${requestId}] Failed to generate presigned URL:`, error)
        return NextResponse.json(
          {
            success: false,
            error: 'Failed to generate file access URL',
          },
          { status: 500 }
        )
      }
    } else if (filePath.startsWith('/')) {
      // App-relative path: make it absolute before handing it to Pulse.
      const baseUrl = getBaseUrl()
      fileUrl = `${baseUrl}${filePath}`
    }

    const formData = new FormData()
    formData.append('file_url', fileUrl)

    // String options are skipped when absent or empty.
    if (validatedData.pages) {
      formData.append('pages', validatedData.pages)
    }
    // Boolean/number options are forwarded whenever they were supplied, so an
    // explicit `false` or `0` still reaches Pulse.
    if (validatedData.extractFigure !== undefined) {
      formData.append('extract_figure', String(validatedData.extractFigure))
    }
    if (validatedData.figureDescription !== undefined) {
      formData.append('figure_description', String(validatedData.figureDescription))
    }
    if (validatedData.returnHtml !== undefined) {
      formData.append('return_html', String(validatedData.returnHtml))
    }
    if (validatedData.chunking) {
      formData.append('chunking', validatedData.chunking)
    }
    if (validatedData.chunkSize !== undefined) {
      formData.append('chunk_size', String(validatedData.chunkSize))
    }

    const pulseResponse = await fetch('https://api.runpulse.com/extract', {
      method: 'POST',
      headers: {
        'x-api-key': validatedData.apiKey,
      },
      body: formData,
    })

    if (!pulseResponse.ok) {
      const errorText = await pulseResponse.text()
      logger.error(`[${requestId}] Pulse API error:`, errorText)
      // `statusText` is empty for HTTP/2 responses; fall back to the numeric
      // status so the message never ends with nothing after the colon.
      const statusLabel = pulseResponse.statusText || String(pulseResponse.status)
      return NextResponse.json(
        {
          success: false,
          error: `Pulse API error: ${statusLabel}`,
        },
        { status: pulseResponse.status }
      )
    }

    const pulseData = await pulseResponse.json()

    logger.info(`[${requestId}] Pulse parse successful`)

    return NextResponse.json({
      success: true,
      output: pulseData,
    })
  } catch (error) {
    if (error instanceof z.ZodError) {
      logger.warn(`[${requestId}] Invalid request data`, { errors: error.errors })
      return NextResponse.json(
        {
          success: false,
          error: 'Invalid request data',
          details: error.errors,
        },
        { status: 400 }
      )
    }

    logger.error(`[${requestId}] Error in Pulse parse:`, error)

    return NextResponse.json(
      {
        success: false,
        error: error instanceof Error ? error.message : 'Internal server error',
      },
      { status: 500 }
    )
  }
}

0 commit comments

Comments
 (0)