diff --git a/app/api/parse-pdf/route.ts b/app/api/parse-pdf/route.ts index 1fdd947b0..db1d2d4b4 100644 --- a/app/api/parse-pdf/route.ts +++ b/app/api/parse-pdf/route.ts @@ -6,6 +6,10 @@ import type { ParsedPdfContent } from '@/lib/types/pdf'; import { createLogger } from '@/lib/logger'; import { apiError, apiSuccess } from '@/lib/server/api-response'; import { validateUrlForSSRF } from '@/lib/server/ssrf-guard'; + +export const runtime = 'nodejs'; +export const dynamic = 'force-dynamic'; + const log = createLogger('Parse PDF'); export async function POST(req: NextRequest) { @@ -27,6 +31,10 @@ export async function POST(req: NextRequest) { const providerId = formData.get('providerId') as PDFProviderId | null; const apiKey = formData.get('apiKey') as string | null; const baseUrl = formData.get('baseUrl') as string | null; + const cloudApiKey = formData.get('cloudApiKey') as string | null; + const cloudBaseUrl = formData.get('cloudBaseUrl') as string | null; + const localApiKey = formData.get('localApiKey') as string | null; + const localBaseUrl = formData.get('localBaseUrl') as string | null; if (!pdfFile) { return apiError('MISSING_REQUIRED_FIELD', 400, 'No PDF file provided'); @@ -37,7 +45,39 @@ export async function POST(req: NextRequest) { pdfFileName = pdfFile?.name; resolvedProviderId = effectiveProviderId; - const clientBaseUrl = baseUrl || undefined; + const resolveClientPdfConfig = () => { + const legacyBaseUrl = baseUrl?.trim() || ''; + const legacyApiKey = apiKey?.trim() || ''; + + if (effectiveProviderId !== 'mineru') { + return { + clientBaseUrl: legacyBaseUrl || undefined, + clientApiKey: legacyApiKey || undefined, + }; + } + + const cloudUrl = cloudBaseUrl?.trim() || ''; + const cloudKey = cloudApiKey?.trim() || ''; + const localUrl = localBaseUrl?.trim() || ''; + const localKey = localApiKey?.trim() || ''; + + // Prefer cloud when it is complete, then local, then legacy fallback. + if (cloudUrl && cloudKey) { + return { clientBaseUrl: cloudUrl, clientApiKey: cloudKey }; + } + if (localUrl) { + return { clientBaseUrl: localUrl, clientApiKey: localKey || undefined }; + } + if (cloudUrl) { + return { clientBaseUrl: cloudUrl, clientApiKey: cloudKey || undefined }; + } + return { + clientBaseUrl: legacyBaseUrl || undefined, + clientApiKey: legacyApiKey || undefined, + }; + }; + + const { clientBaseUrl, clientApiKey } = resolveClientPdfConfig(); if (clientBaseUrl && process.env.NODE_ENV === 'production') { const ssrfError = validateUrlForSSRF(clientBaseUrl); if (ssrfError) { @@ -48,7 +88,7 @@ export async function POST(req: NextRequest) { const config = { providerId: effectiveProviderId, apiKey: clientBaseUrl - ? apiKey || '' + ? clientApiKey || '' : resolvePDFApiKey(effectiveProviderId, apiKey || undefined), baseUrl: clientBaseUrl ? clientBaseUrl diff --git a/app/generation-preview/page.tsx b/app/generation-preview/page.tsx index 253db686c..03f881b20 100644 --- a/app/generation-preview/page.tsx +++ b/app/generation-preview/page.tsx @@ -32,6 +32,50 @@ import { StepVisualizer } from './components/visualizers'; const log = createLogger('GenerationPreview'); +function resolveEffectivePdfConfig( + providerId?: string, + config?: { + apiKey?: string; + baseUrl?: string; + cloudApiKey?: string; + cloudBaseUrl?: string; + localApiKey?: string; + localBaseUrl?: string; + }, +): { apiKey?: string; baseUrl?: string } { + if (!config) return {}; + + const legacyBaseUrl = config.baseUrl?.trim() || ''; + const legacyApiKey = config.apiKey?.trim() || ''; + + if (providerId !== 'mineru') { + return { + apiKey: legacyApiKey || undefined, + baseUrl: legacyBaseUrl || undefined, + }; + } + + const cloudBaseUrl = config.cloudBaseUrl?.trim() || ''; + const cloudApiKey = config.cloudApiKey?.trim() || ''; + const localBaseUrl = config.localBaseUrl?.trim() || ''; + const localApiKey = config.localApiKey?.trim() || ''; + + // Prefer cloud when complete, otherwise local, then legacy. + if (cloudBaseUrl && cloudApiKey) { + return { baseUrl: cloudBaseUrl, apiKey: cloudApiKey }; + } + if (localBaseUrl) { + return { baseUrl: localBaseUrl, apiKey: localApiKey || undefined }; + } + if (cloudBaseUrl) { + return { baseUrl: cloudBaseUrl, apiKey: cloudApiKey || undefined }; + } + return { + baseUrl: legacyBaseUrl || undefined, + apiKey: legacyApiKey || undefined, + }; +} + function GenerationPreviewContent() { const router = useRouter(); const { t } = useI18n(); @@ -183,11 +227,29 @@ function GenerationPreviewContent() { if (currentSession.pdfProviderId) { parseFormData.append('providerId', currentSession.pdfProviderId); } - if (currentSession.pdfProviderConfig?.apiKey?.trim()) { - parseFormData.append('apiKey', currentSession.pdfProviderConfig.apiKey); + + const resolvedPdfConfig = resolveEffectivePdfConfig( + currentSession.pdfProviderId, + currentSession.pdfProviderConfig, + ); + if (resolvedPdfConfig.apiKey) { + parseFormData.append('apiKey', resolvedPdfConfig.apiKey); + } + if (resolvedPdfConfig.baseUrl) { + parseFormData.append('baseUrl', resolvedPdfConfig.baseUrl); + } + + if (currentSession.pdfProviderConfig?.cloudApiKey?.trim()) { + parseFormData.append('cloudApiKey', currentSession.pdfProviderConfig.cloudApiKey); + } + if (currentSession.pdfProviderConfig?.cloudBaseUrl?.trim()) { + parseFormData.append('cloudBaseUrl', currentSession.pdfProviderConfig.cloudBaseUrl); + } + if (currentSession.pdfProviderConfig?.localApiKey?.trim()) { + parseFormData.append('localApiKey', currentSession.pdfProviderConfig.localApiKey); } - if (currentSession.pdfProviderConfig?.baseUrl?.trim()) { - parseFormData.append('baseUrl', currentSession.pdfProviderConfig.baseUrl); + if (currentSession.pdfProviderConfig?.localBaseUrl?.trim()) { + parseFormData.append('localBaseUrl', currentSession.pdfProviderConfig.localBaseUrl); } const parseResponse = await fetch('/api/parse-pdf', { diff --git a/app/generation-preview/types.ts b/app/generation-preview/types.ts index 408ae81fd..7a9991543 100644 --- a/app/generation-preview/types.ts +++ b/app/generation-preview/types.ts @@ -21,7 +21,14 @@ export interface GenerationSessionState { pdfStorageKey?: string; pdfFileName?: string; pdfProviderId?: string; - pdfProviderConfig?: { apiKey?: string; baseUrl?: string }; + pdfProviderConfig?: { + apiKey?: string; + baseUrl?: string; + cloudApiKey?: string; + cloudBaseUrl?: string; + localApiKey?: string; + localBaseUrl?: string; + }; // Web search context researchContext?: string; researchSources?: Array<{ title: string; url: string }>; diff --git a/app/page.tsx b/app/page.tsx index c0da47614..51cad06b9 100644 --- a/app/page.tsx +++ b/app/page.tsx @@ -270,7 +270,16 @@ function HomePage() { let pdfStorageKey: string | undefined; let pdfFileName: string | undefined; let pdfProviderId: string | undefined; - let pdfProviderConfig: { apiKey?: string; baseUrl?: string } | undefined; + let pdfProviderConfig: + | { + apiKey?: string; + baseUrl?: string; + cloudApiKey?: string; + cloudBaseUrl?: string; + localApiKey?: string; + localBaseUrl?: string; + } + | undefined; if (form.pdfFile) { pdfStorageKey = await storePdfBlob(form.pdfFile); @@ -283,6 +292,10 @@ function HomePage() { pdfProviderConfig = { apiKey: providerCfg.apiKey, baseUrl: providerCfg.baseUrl, + cloudApiKey: providerCfg.cloudApiKey, + cloudBaseUrl: providerCfg.cloudBaseUrl, + localApiKey: providerCfg.localApiKey, + localBaseUrl: providerCfg.localBaseUrl, }; } } diff --git a/components/settings/pdf-settings.tsx b/components/settings/pdf-settings.tsx index bfa43bdda..de9282195 100644 --- a/components/settings/pdf-settings.tsx +++ b/components/settings/pdf-settings.tsx @@ -34,8 +34,18 @@ interface PDFSettingsProps { export function PDFSettings({ selectedProviderId }: PDFSettingsProps) { const { t } = useI18n(); const [showApiKey, setShowApiKey] = useState(false); + const [showCloudApiKey, setShowCloudApiKey] = useState(false); + const [showLocalApiKey, setShowLocalApiKey] = useState(false); const [testStatus, setTestStatus] = useState<'idle' | 'testing' | 'success' | 'error'>('idle'); const [testMessage, setTestMessage] = useState(''); + const [cloudTestStatus, setCloudTestStatus] = useState<'idle' | 'testing' | 'success' | 'error'>( + 'idle', + ); + const [cloudTestMessage, setCloudTestMessage] = useState(''); + const [localTestStatus, setLocalTestStatus] = useState<'idle' | 'testing' | 'success' | 'error'>( + 'idle', + ); + const [localTestMessage, setLocalTestMessage] = useState(''); const pdfProvidersConfig = useSettingsStore((state) => state.pdfProvidersConfig); const setPDFProviderConfig = useSettingsStore((state) => state.setPDFProviderConfig); @@ -51,8 +61,14 @@ export function PDFSettings({ selectedProviderId }: PDFSettingsProps) { if (selectedProviderId !== prevSelectedProviderId) { setPrevSelectedProviderId(selectedProviderId); setShowApiKey(false); + setShowCloudApiKey(false); + setShowLocalApiKey(false); setTestStatus('idle'); setTestMessage(''); + setCloudTestStatus('idle'); + setCloudTestMessage(''); + setLocalTestStatus('idle'); + setLocalTestMessage(''); } const handleTestConnection = async () => { @@ -89,6 +105,74 @@ export function PDFSettings({ selectedProviderId }: PDFSettingsProps) { } }; + const handleCloudTest = async () => { + const baseUrl = providerConfig?.cloudBaseUrl; + if (!baseUrl) return; + + setCloudTestStatus('testing'); + setCloudTestMessage(''); + + try { + const response = await fetch('/api/verify-pdf-provider', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + providerId: selectedProviderId, + apiKey: providerConfig?.cloudApiKey || '', + baseUrl, + }), + }); + + const data = await response.json(); + + if (data.success) { + setCloudTestStatus('success'); + setCloudTestMessage(t('settings.connectionSuccess')); + } else { + setCloudTestStatus('error'); + setCloudTestMessage(`${t('settings.connectionFailed')}: ${data.error}`); + } + } catch (err) { + setCloudTestStatus('error'); + const message = err instanceof Error ? err.message : String(err); + setCloudTestMessage(`${t('settings.connectionFailed')}: ${message}`); + } + }; + + const handleLocalTest = async () => { + const baseUrl = providerConfig?.localBaseUrl; + if (!baseUrl) return; + + setLocalTestStatus('testing'); + setLocalTestMessage(''); + + try { + const response = await fetch('/api/verify-pdf-provider', { + method: 'POST', + headers: { 'Content-Type': 'application/json' }, + body: JSON.stringify({ + providerId: selectedProviderId, + apiKey: providerConfig?.localApiKey || '', + baseUrl, + }), + }); + + const data = await response.json(); + + if (data.success) { + setLocalTestStatus('success'); + setLocalTestMessage(t('settings.connectionSuccess')); + } else { + setLocalTestStatus('error'); + setLocalTestMessage(`${t('settings.connectionFailed')}: ${data.error}`); + } + } catch (err) { + setLocalTestStatus('error'); + const message = err instanceof Error ? err.message : String(err); + setLocalTestMessage(`${t('settings.connectionFailed')}: ${message}`); + } + }; + return (
{/* Server-configured notice */} @@ -98,8 +182,198 @@ export function PDFSettings({ selectedProviderId }: PDFSettingsProps) {
)} - {/* Base URL + API Key Configuration (for remote providers like MinerU) */} - {(needsRemoteConfig || isServerConfigured) && ( + {/* MinerU: Cloud + Local blocks */} + {needsRemoteConfig && ( + <> + {/* Cloud */} +
+

{t('settings.mineruCloud')}

+
+
+ +
+ + setPDFProviderConfig(selectedProviderId, { cloudBaseUrl: e.target.value }) + } + className="text-sm" + /> + +
+
+ +
+ +
+ + setPDFProviderConfig(selectedProviderId, { cloudApiKey: e.target.value }) + } + className="font-mono text-sm pr-10" + /> + +
+
+
+ + {cloudTestMessage && ( +
+
+ {cloudTestStatus === 'success' && } + {cloudTestStatus === 'error' && } + {cloudTestMessage} +
+
+ )} + + {providerConfig?.cloudBaseUrl && ( +

+ {t('settings.requestUrl')}: {providerConfig.cloudBaseUrl}/file_parse +

+ )} +
+ + {/* Local */} +
+

{t('settings.mineruLocal')}

+
+
+ +
+ + setPDFProviderConfig(selectedProviderId, { localBaseUrl: e.target.value }) + } + className="text-sm" + /> + +
+
+ +
+ +
+ + setPDFProviderConfig(selectedProviderId, { localApiKey: e.target.value }) + } + className="font-mono text-sm pr-10" + /> + +
+
+
+ + {localTestMessage && ( +
+
+ {localTestStatus === 'success' && } + {localTestStatus === 'error' && } + {localTestMessage} +
+
+ )} + + {providerConfig?.localBaseUrl && ( +

+ {t('settings.requestUrl')}: {providerConfig.localBaseUrl}/file_parse +

+ )} +
+ + )} + + {/* Base URL + API Key Configuration (for non-MinerU remote providers) */} + {!needsRemoteConfig && isServerConfigured && ( <>
diff --git a/lib/i18n/settings.ts b/lib/i18n/settings.ts index 356fea554..121f853e7 100644 --- a/lib/i18n/settings.ts +++ b/lib/i18n/settings.ts @@ -461,6 +461,8 @@ export const settingsZhCN = { pdfFeatures: '支持功能', pdfApiKey: 'API Key', pdfBaseUrl: 'Base URL', + mineruCloud: 'Cloud Base URL', + mineruLocal: 'Local Base URL', mineruDescription: 'MinerU 是一个商用 PDF 解析服务,支持高级功能如表格提取、公式识别和布局分析。', mineruApiKeyRequired: '使用前需要在 MinerU 官网申请 API Key。', @@ -1060,6 +1062,8 @@ export const settingsEnUS = { pdfFeatures: 'Supported Features', pdfApiKey: 'API Key', pdfBaseUrl: 'Base URL', + mineruCloud: 'Cloud Base URL', + mineruLocal: 'Local Base URL', mineruDescription: 'MinerU is a commercial PDF parsing service that supports advanced features such as table extraction, formula recognition, and layout analysis.', mineruApiKeyRequired: 'You need to apply for an API Key on the MinerU website before use.', diff --git a/lib/pdf/mineru-cloud.ts b/lib/pdf/mineru-cloud.ts new file mode 100644 index 000000000..3d1a34e0d --- /dev/null +++ b/lib/pdf/mineru-cloud.ts @@ -0,0 +1,339 @@ +/** + * MinerU Precision Parsing API (v4) + * https://mineru.net/api/v4 + * + * Flow: POST /file-urls/batch → PUT presigned URL → poll /extract-results/batch/{id} → download ZIP + */ + +import JSZip from 'jszip'; +import type { PDFParserConfig } from './types'; +import type { ParsedPdfContent } from '@/lib/types/pdf'; +import { createLogger } from '@/lib/logger'; +import { extractMinerUResult } from './mineru-parser'; + +const log = createLogger('MinerUCloudV4'); + +export const MINERU_CLOUD_V4_API_BASE = 'https://mineru.net/api/v4'; + +const TIMEOUTS = { + batch: 60_000, + upload: 180_000, + poll: 30_000, + zip: 180_000, +} as const; + +const POLL_INTERVAL_MS = 2500; +const POLL_MAX_MS = 15 * 60 * 1000; + +const MIME_MAP: Record = { + png: 'image/png', + jpg: 'image/jpeg', + jpeg: 'image/jpeg', + webp: 'image/webp', + gif: 'image/gif', +}; + +const sleep = (ms: number) => new Promise((r) => setTimeout(r, ms)); + +function extToMime(ext: string): string { + return MIME_MAP[ext.toLowerCase()] ?? 'application/octet-stream'; +} + +function urlPathForLog(url: string): string { + try { + const { hostname, pathname } = new URL(url); + return `${hostname}${pathname}`; + } catch { + return url; + } +} + +function isRetryable(err: unknown): boolean { + if (!(err instanceof Error)) return false; + const msg = err.message.toLowerCase(); + return ['fetch failed', 'econnreset', 'etimedout', 'timeout', 'aborted', 'enotfound'].some((s) => + msg.includes(s), + ); +} + +async function fetchWithRetry(fn: () => Promise, context: string, attempts = 4): Promise { + let lastErr: unknown; + + for (let i = 1; i <= attempts; i++) { + try { + return await fn(); + } catch (err) { + lastErr = err; + if (!isRetryable(err) || i === attempts) break; + log.warn(`[MinerU v4] ${context} — retry ${i}/${attempts}:`, err); + await sleep(400 * i); + } + } + + const msg = lastErr instanceof Error ? lastErr.message : String(lastErr); + throw new Error(`MinerU v4 ${context} failed: ${msg}`); +} + +// ── API response helpers ────────────────────────────────────────────────────── + +interface MinerUEnvelope { + code: number; + msg: string; + trace_id?: string; + data: T; +} + +function unwrapMinerUResponse(json: MinerUEnvelope, context: string): T { + if (json.code !== 0) { + throw new Error(`MinerU ${context}: ${json.msg || 'unknown error'} (code ${json.code})`); + } + return json.data; +} + +async function readMinerUJson(res: Response, context: string): Promise { + const text = await res.text(); + + let json: MinerUEnvelope; + try { + json = JSON.parse(text) as MinerUEnvelope; + } catch { + throw new Error(`MinerU ${context}: invalid JSON (HTTP ${res.status}): ${text.slice(0, 500)}`); + } + + if (!res.ok) { + throw new Error(`MinerU ${context}: HTTP ${res.status} — ${json.msg || text.slice(0, 300)}`); + } + + return unwrapMinerUResponse(json, context); +} + +// ── Filename sanitization ───────────────────────────────────────────────────── + +export function sanitizePdfFileNameForMinerU(name: string | undefined): string { + const fallback = 'document.pdf'; + const raw = (name ?? fallback).split(/[/\\]/).pop()?.trim() ?? fallback; + const trimmed = raw.slice(0, 240); + + if (!trimmed.toLowerCase().endsWith('.pdf')) return fallback; + if (trimmed.includes('..') || trimmed.includes('/') || trimmed.includes('\\')) return fallback; + + return trimmed || fallback; +} + +export function isMinerUCloudV4BaseUrl(baseUrl: string): boolean { + try { + const { hostname, pathname } = new URL(baseUrl.trim()); + return hostname.toLowerCase() === 'mineru.net' && pathname.replace(/\/+$/, '') === '/api/v4'; + } catch { + return false; + } +} + +// ── ZIP parsing ─────────────────────────────────────────────────────────────── + +type BatchExtractRow = { + file_name?: string; + state?: string; + full_zip_url?: string; + err_msg?: string; +}; + +async function parseMinerUZip(zipUrl: string): Promise { + log.info(`[MinerU v4] Downloading result ZIP: ${urlPathForLog(zipUrl)}`); + + const zipRes = await fetchWithRetry( + () => fetch(zipUrl, { signal: AbortSignal.timeout(TIMEOUTS.zip) }), + 'ZIP download', + ); + + if (!zipRes.ok) { + const text = await zipRes.text().catch(() => zipRes.statusText); + throw new Error(`MinerU ZIP download failed (${zipRes.status}): ${text.slice(0, 300)}`); + } + + const zipBuf = Buffer.from(await zipRes.arrayBuffer()); + let zip: Awaited>; + try { + zip = await JSZip.loadAsync(zipBuf); + } catch (e) { + throw new Error(`MinerU ZIP parse failed: ${e instanceof Error ? e.message : String(e)}`); + } + + const filePaths = Object.keys(zip.files).filter((p) => !zip.files[p].dir); + const fullMdPath = filePaths.find((p) => /(^|\/)full\.md$/i.test(p)); + const contentListPath = filePaths.find( + (p) => p.endsWith('_content_list.json') || /(^|\/)content_list\.json$/i.test(p), + ); + + if (!fullMdPath) throw new Error('MinerU ZIP: full.md not found'); + + const mdContent = await zip.file(fullMdPath)!.async('string'); + const dirPrefix = fullMdPath.includes('/') + ? fullMdPath.slice(0, fullMdPath.lastIndexOf('/') + 1) + : ''; + + let contentList: unknown; + if (contentListPath) { + const raw = await zip.file(contentListPath)!.async('string'); + try { + contentList = JSON.parse(raw); + } catch { + log.warn('[MinerU v4] content_list JSON parse failed, continuing with markdown only'); + } + } + + async function readImage(relPath: string): Promise { + const normalized = relPath.replace(/^\.?\//, ''); + for (const candidate of [dirPrefix + normalized, normalized]) { + const entry = zip.file(candidate); + if (!entry) continue; + const buf = await entry.async('nodebuffer'); + const ext = candidate.split('.').pop() ?? 'png'; + return `data:${extToMime(ext)};base64,${buf.toString('base64')}`; + } + log.warn(`[MinerU v4] Image not found in ZIP: ${relPath}`); + return null; + } + + const imageData: Record = {}; + if (Array.isArray(contentList)) { + for (const item of contentList as Array>) { + const imgPath = item.img_path; + if (typeof imgPath === 'string' && imgPath) { + const dataUrl = await readImage(imgPath); + if (dataUrl) imageData[imgPath] = dataUrl; + } + } + } + + return extractMinerUResult({ + md_content: mdContent, + images: imageData, + content_list: contentList, + }); +} + +// ── Main export ─────────────────────────────────────────────────────────────── + +/** + * Upload a PDF via MinerU cloud v4 presigned URLs, poll until done, return parsed content. + */ +export async function parseWithMinerUCloudV4( + config: PDFParserConfig, + pdfBuffer: Buffer, + uploadFileName: string, +): Promise { + const apiRoot = config.baseUrl!.trim().replace(/\/+$/, ''); + const token = config.apiKey!.trim(); + const modelVersion = config.mineruModelVersion === 'pipeline' ? 'pipeline' : 'vlm'; + + const authHeaders = { + Authorization: `Bearer ${token}`, + 'Content-Type': 'application/json', + Accept: 'application/json', + }; + + // Step 1: Create batch and get presigned upload URL + log.info(`[MinerU v4] Creating batch for "${uploadFileName}" (model: ${modelVersion})`); + + const batchData = await fetchWithRetry(async () => { + const res = await fetch(`${apiRoot}/file-urls/batch`, { + method: 'POST', + headers: authHeaders, + body: JSON.stringify({ + files: [{ name: uploadFileName }], + model_version: modelVersion, + enable_formula: true, + enable_table: true, + }), + signal: AbortSignal.timeout(TIMEOUTS.batch), + }); + return readMinerUJson<{ batch_id: string; file_urls?: string[]; files?: string[] }>( + res, + 'file-urls/batch', + ); + }, 'create batch'); + + const uploadUrls = batchData.file_urls ?? batchData.files; + if (!batchData.batch_id || !uploadUrls?.length) { + throw new Error('MinerU batch response missing batch_id or upload URLs'); + } + + // Step 2: Upload PDF to presigned URL + log.info( + `[MinerU v4] Uploading ${pdfBuffer.byteLength} bytes to ${urlPathForLog(uploadUrls[0])}`, + ); + + const putRes = await fetchWithRetry( + () => + fetch(uploadUrls[0], { + method: 'PUT', + body: new Uint8Array(pdfBuffer), + signal: AbortSignal.timeout(TIMEOUTS.upload), + redirect: 'manual', + // No Content-Type — presigned OSS URLs are sensitive to headers in the signature + }), + 'presigned upload', + 5, + ); + + if (!putRes.ok) { + const text = await putRes.text().catch(() => putRes.statusText); + throw new Error(`MinerU upload failed (${putRes.status}): ${text.slice(0, 400)}`); + } + + // Give the backend a moment to register the upload + await sleep(1500); + + // Step 3: Poll for completion + const deadline = Date.now() + POLL_MAX_MS; + let lastState = ''; + + while (Date.now() < deadline) { + const statusData = await fetchWithRetry( + async () => { + log.debug?.(`[MinerU v4] Polling batch ${batchData.batch_id}`); + const res = await fetch(`${apiRoot}/extract-results/batch/${batchData.batch_id}`, { + headers: { Authorization: `Bearer ${token}`, Accept: 'application/json' }, + signal: AbortSignal.timeout(TIMEOUTS.poll), + }); + return readMinerUJson<{ extract_result?: BatchExtractRow | BatchExtractRow[] }>( + res, + 'extract-results/batch', + ); + }, + 'poll batch', + 3, + ); + + const rows = statusData.extract_result; + const list: BatchExtractRow[] = Array.isArray(rows) ? rows : rows ? [rows] : []; + const row = + list.find((r) => r.file_name === uploadFileName) || + list.find((r) => r.file_name?.toLowerCase() === uploadFileName.toLowerCase()) || + list[0]; + + if (!row?.state) { + log.warn('[MinerU v4] Poll returned no result row yet'); + await sleep(POLL_INTERVAL_MS); + continue; + } + + if (row.state !== lastState) { + lastState = row.state; + log.info(`[MinerU v4] Batch ${batchData.batch_id} → ${row.state}`); + } + + if (row.state === 'failed') { + throw new Error(`MinerU parsing failed: ${row.err_msg || 'unknown error'}`); + } + + if (row.state === 'done' && row.full_zip_url) { + return parseMinerUZip(row.full_zip_url); + } + + await sleep(POLL_INTERVAL_MS); + } + + throw new Error(`MinerU timed out after ${POLL_MAX_MS / 1000}s (batch: ${batchData.batch_id})`); +} diff --git a/lib/pdf/mineru-parser.ts b/lib/pdf/mineru-parser.ts new file mode 100644 index 000000000..c52552863 --- /dev/null +++ b/lib/pdf/mineru-parser.ts @@ -0,0 +1,123 @@ +/** + * MinerU result parser + * Used by both self-hosted and cloud v4 paths. + */ + +import type { ParsedPdfContent } from '@/lib/types/pdf'; +import { createLogger } from '@/lib/logger'; + +const log = createLogger('MinerUResult'); + +type ImageMeta = { + pageIdx: number; + bbox: number[]; + caption?: string; +}; + +type ContentItem = Record; + +function parseContentList(raw: unknown): ContentItem[] | null { + if (Array.isArray(raw)) return raw as ContentItem[]; + + if (typeof raw === 'string') { + try { + const parsed = JSON.parse(raw); + return Array.isArray(parsed) ? parsed : null; + } catch { + log.warn('[MinerU] content_list is not valid JSON, skipping layout metadata'); + return null; + } + } + + return null; +} + +function normalizeImageData(raw: unknown): Record { + if (!raw || typeof raw !== 'object') return {}; + + return Object.fromEntries( + Object.entries(raw as Record).map(([key, value]) => [ + key, + value.startsWith('data:') ? value : `data:image/png;base64,${value}`, + ]), + ); +} + +function buildImageMetaLookup(contentList: ContentItem[]): Map { + const lookup = new Map(); + + for (const item of contentList) { + if (item.type !== 'image' || !item.img_path) continue; + + const meta: ImageMeta = { + pageIdx: (item.page_idx as number) ?? 0, + bbox: (item.bbox as number[]) || [0, 0, 1000, 1000], + caption: Array.isArray(item.image_caption) ? (item.image_caption[0] as string) : undefined, + }; + + const imgPath = item.img_path as string; + lookup.set(imgPath, meta); + + // Also index by basename so we can match `images/foo.png` → `foo.png` + const basename = imgPath.split('/').pop(); + if (basename && basename !== imgPath) { + lookup.set(basename, meta); + } + } + + return lookup; +} + +/** + * Normalize MinerU API / ZIP output into ParsedPdfContent. + * Used by both self-hosted and cloud v4 paths. + */ +export function extractMinerUResult(fileResult: Record): ParsedPdfContent { + const markdown = (fileResult.md_content as string) || ''; + const imageData = normalizeImageData(fileResult.images); + const contentList = parseContentList(fileResult.content_list); + + const pageCount = contentList + ? new Set(contentList.map((item) => item.page_idx).filter((v) => v != null)).size + : 0; + + const metaLookup = contentList ? buildImageMetaLookup(contentList) : new Map(); + + const imageMapping: Record = {}; + const pdfImages: Array<{ + id: string; + src: string; + pageNumber: number; + description?: string; + width?: number; + height?: number; + }> = []; + + for (const [index, [key, base64Url]] of Object.entries(imageData).entries()) { + const imageId = key.startsWith('img_') ? key : `img_${index + 1}`; + const meta = metaLookup.get(key) ?? metaLookup.get(`images/${key}`); + + imageMapping[imageId] = base64Url; + pdfImages.push({ + id: imageId, + src: base64Url, + pageNumber: meta ? meta.pageIdx + 1 : 0, + description: meta?.caption, + width: meta ? meta.bbox[2] - meta.bbox[0] : undefined, + height: meta ? meta.bbox[3] - meta.bbox[1] : undefined, + }); + } + + log.info(`[MinerU] Parsed: ${pdfImages.length} images, ${markdown.length} chars`); + + return { + text: markdown, + images: Object.values(imageMapping), + metadata: { + pageCount, + parser: 'mineru', + imageMapping, + pdfImages, + }, + }; +} diff --git a/lib/pdf/pdf-providers.ts b/lib/pdf/pdf-providers.ts index edfaea06e..41313e4df 100644 --- a/lib/pdf/pdf-providers.ts +++ b/lib/pdf/pdf-providers.ts @@ -143,6 +143,7 @@ import type { PDFParserConfig } from './types'; import type { ParsedPdfContent } from '@/lib/types/pdf'; import { PDF_PROVIDERS } from './constants'; import { createLogger } from '@/lib/logger'; +import { parseWithMinerUCloudV4, sanitizePdfFileNameForMinerU } from './mineru-cloud'; const log = createLogger('PDFProviders'); @@ -262,6 +263,21 @@ async function parseWithUnpdf(pdfBuffer: Buffer): Promise { }; } +// Return MinerU v4 API base URL ("https://mineru.net/api/v4") if baseUrl is a valid mineru.net (cloud) endpoint, else null +function getMinerUCloudApiBase(baseUrl: string): string | null { + try { + const url = new URL(baseUrl.trim()); + if (url.hostname.toLowerCase() !== 'mineru.net') return null; + const path = url.pathname.replace(/\/+$/, ''); + if (path === '' || path === '/' || path.startsWith('/api/v4')) { + return `${url.origin}/api/v4`; + } + return null; + } catch { + return null; + } +} + /** * Parse PDF using self-hosted MinerU service (mineru-api) * @@ -285,6 +301,24 @@ async function parseWithMinerU( ); } + // Route to cloud v4 if baseUrl points to mineru.net (mineru.net/api/v4) + // Otherwise, route to self-hosted MinerU server + const cloudApiBase = getMinerUCloudApiBase(config.baseUrl); + if (cloudApiBase) { + // MinerU cloud Precision API v4 has an upload size limit (200MB for single files). + const MAX_BYTES = 200 * 1024 * 1024; + if (pdfBuffer.byteLength > MAX_BYTES) { + const sizeMb = (pdfBuffer.byteLength / (1024 * 1024)).toFixed(1); + throw new Error(`MinerU cloud: file too large (${sizeMb}MB, max 200MB)`); + } + if (!config.apiKey?.trim()) { + throw new Error('MinerU cloud (mineru.net) requires an API token'); + } + const uploadName = sanitizePdfFileNameForMinerU(config.sourceFileName); + log.info('[MinerU] Using cloud v4 API:', cloudApiBase, 'file:', uploadName); + return parseWithMinerUCloudV4({ ...config, baseUrl: cloudApiBase }, pdfBuffer, uploadName); + } + log.info('[MinerU] Parsing PDF with MinerU server:', config.baseUrl); const fileName = 'document.pdf'; diff --git a/lib/pdf/types.ts b/lib/pdf/types.ts index 8173daedc..44c992f7d 100644 --- a/lib/pdf/types.ts +++ b/lib/pdf/types.ts @@ -15,6 +15,12 @@ export interface PDFProviderConfig { name: string; requiresApiKey: boolean; baseUrl?: string; + apiKey?: string; + cloudBaseUrl?: string; + cloudApiKey?: string; + localBaseUrl?: string; + localApiKey?: string; + isServerConfigured?: boolean; icon?: string; features: string[]; // ['text', 'images', 'tables', 'formulas', 'layout-analysis', etc.] } @@ -26,6 +32,6 @@ export interface PDFParserConfig { providerId: PDFProviderId; apiKey?: string; baseUrl?: string; + sourceFileName?: string; + mineruModelVersion?: 'pipeline' | 'vlm'; } - -// Note: ParsedPdfContent is imported from @/lib/types/pdf to avoid duplication diff --git a/lib/store/settings.ts b/lib/store/settings.ts index 4b088bbc6..19ca323d1 100644 --- a/lib/store/settings.ts +++ b/lib/store/settings.ts @@ -83,6 +83,10 @@ export interface SettingsState { enabled: boolean; isServerConfigured?: boolean; serverBaseUrl?: string; + cloudApiKey?: string; + cloudBaseUrl?: string; + localApiKey?: string; + localBaseUrl?: string; } >; @@ -211,7 +215,15 @@ export interface SettingsState { setPDFProvider: (providerId: PDFProviderId) => void; setPDFProviderConfig: ( providerId: PDFProviderId, - config: Partial<{ apiKey: string; baseUrl: string; enabled: boolean }>, + config: Partial<{ + apiKey: string; + baseUrl: string; + enabled: boolean; + cloudApiKey: string; + cloudBaseUrl: string; + localApiKey: string; + localBaseUrl: string; + }>, ) => void; // Image Generation actions @@ -307,8 +319,16 @@ const getDefaultPDFConfig = () => ({ pdfProviderId: 'unpdf' as PDFProviderId, pdfProvidersConfig: { unpdf: { apiKey: '', baseUrl: '', enabled: true }, - mineru: { apiKey: '', baseUrl: '', enabled: false }, - } as Record, + mineru: { + apiKey: '', + baseUrl: '', + enabled: false, + cloudApiKey: '', + cloudBaseUrl: '', + localApiKey: '', + localBaseUrl: '', + }, + } as SettingsState['pdfProvidersConfig'], }); // Initialize default Image config @@ -457,6 +477,26 @@ function ensureBuiltInVideoProviders(state: Partial): void { }); } +/** + * Ensure PDF provider config shape includes MinerU cloud/local fields. + * Preserves legacy baseUrl/apiKey and migrates them into cloud defaults. + */ +function ensurePDFProviderConfigShape(state: Partial): void { + if (!state.pdfProvidersConfig?.mineru) return; + + const mineruConfig = state.pdfProvidersConfig.mineru; + const legacyBaseUrl = mineruConfig.baseUrl || ''; + const legacyApiKey = mineruConfig.apiKey || ''; + + state.pdfProvidersConfig.mineru = { + ...mineruConfig, + cloudBaseUrl: mineruConfig.cloudBaseUrl ?? legacyBaseUrl, + cloudApiKey: mineruConfig.cloudApiKey ?? legacyApiKey, + localBaseUrl: mineruConfig.localBaseUrl ?? '', + localApiKey: mineruConfig.localApiKey ?? '', + }; +} + // Migrate from old localStorage format const migrateFromOldStorage = () => { if (typeof window === 'undefined') return null; @@ -1268,6 +1308,7 @@ export const useSettingsStore = create()( const defaultPDFConfig = getDefaultPDFConfig(); Object.assign(state, defaultPDFConfig); } + ensurePDFProviderConfigShape(state); // Add default Image config if missing if (!state.imageProvidersConfig) { @@ -1345,6 +1386,7 @@ export const useSettingsStore = create()( ensureBuiltInProviders(merged as Partial); ensureBuiltInImageProviders(merged as Partial); ensureBuiltInVideoProviders(merged as Partial); + ensurePDFProviderConfigShape(merged as Partial); ensureValidProviderSelections(merged as Partial); return merged as SettingsState; },