diff --git a/README.md b/README.md index 6da3ee1d9..6d4ef76f3 100644 --- a/README.md +++ b/README.md @@ -28,6 +28,7 @@ MarkItDown currently supports the conversion from: - ZIP files (iterates over contents) - Youtube URLs - EPubs +- UYAP UDF documents - ... and more! ## Why Markdown? diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py index f342a614b..b8b801a2a 100644 --- a/packages/markitdown/src/markitdown/_markitdown.py +++ b/packages/markitdown/src/markitdown/_markitdown.py @@ -39,6 +39,7 @@ EpubConverter, DocumentIntelligenceConverter, CsvConverter, + UdfConverter, ) from ._base_converter import DocumentConverter, DocumentConverterResult @@ -202,6 +203,7 @@ def enable_builtins(self, **kwargs) -> None: self.register_converter(OutlookMsgConverter()) self.register_converter(EpubConverter()) self.register_converter(CsvConverter()) + self.register_converter(UdfConverter()) # Register Document Intelligence converter at the top of the stack if endpoint is provided docintel_endpoint = kwargs.get("docintel_endpoint") diff --git a/packages/markitdown/src/markitdown/converters/__init__.py b/packages/markitdown/src/markitdown/converters/__init__.py index e4437a582..6011a8eb7 100644 --- a/packages/markitdown/src/markitdown/converters/__init__.py +++ b/packages/markitdown/src/markitdown/converters/__init__.py @@ -23,6 +23,7 @@ ) from ._epub_converter import EpubConverter from ._csv_converter import CsvConverter +from ._udf_converter import UdfConverter __all__ = [ "PlainTextConverter", @@ -45,4 +46,5 @@ "DocumentIntelligenceFileType", "EpubConverter", "CsvConverter", + "UdfConverter", ] diff --git a/packages/markitdown/src/markitdown/converters/_udf_converter.py b/packages/markitdown/src/markitdown/converters/_udf_converter.py new file mode 100644 index 000000000..8d474aab9 --- /dev/null +++ b/packages/markitdown/src/markitdown/converters/_udf_converter.py @@ -0,0 +1,449 @@ +from __future__ import annotations + +import zipfile +from dataclasses import dataclass +from typing import Any, BinaryIO + +from defusedxml import ElementTree as ET + +from .._base_converter import DocumentConverter, DocumentConverterResult +from .._stream_info import StreamInfo + +ACCEPTED_FILE_EXTENSIONS = [".udf"] +ZIP_MIME_TYPE_PREFIXES = [ + "application/zip", + "application/x-zip-compressed", + "application/octet-stream", +] +IMAGE_PLACEHOLDER = "[embedded image omitted]" + + +@dataclass(frozen=True) +class _TextStyle: + bold: bool = False + italic: bool = False + underline: bool = False + + +@dataclass(frozen=True) +class _TextRun: + text: str + style: _TextStyle + + +@dataclass(frozen=True) +class _ImageRun: + pass + + +@dataclass(frozen=True) +class _ListInfo: + ordered: bool + level: int + list_id: str | None + + +@dataclass(frozen=True) +class _Paragraph: + runs: list[_TextRun | _ImageRun] + list_info: _ListInfo | None + + +@dataclass(frozen=True) +class _TableCell: + blocks: list[_Block] + + +@dataclass(frozen=True) +class _TableRow: + cells: list[_TableCell] + + +@dataclass(frozen=True) +class _Table: + rows: list[_TableRow] + + +_Block = _Paragraph | _Table +_InlineRun = _TextRun | _ImageRun + + +class UdfConverter(DocumentConverter): + """Convert UYAP UDF documents into Markdown.""" + + def accepts( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> bool: + extension = (stream_info.extension or "").lower() + mimetype = (stream_info.mimetype or "").lower() + + if extension in ACCEPTED_FILE_EXTENSIONS: + return True + + if extension not in ("", ".zip"): + return False + + if mimetype and not any( + mimetype.startswith(prefix) for prefix in ZIP_MIME_TYPE_PREFIXES + ): + return False + + return self._looks_like_udf(file_stream) + + def convert( + self, + file_stream: BinaryIO, + stream_info: StreamInfo, + **kwargs: Any, + ) -> DocumentConverterResult: + content_xml = self._read_content_xml(file_stream) + root = ET.fromstring(content_xml) + if self._local_name(root.tag) != "template": + raise ValueError("Invalid UDF file: missing