Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -164,6 +164,10 @@ result = md.convert("test.pdf")
print(result.text_content)
```

Formula extraction is disabled by default for Azure Document Intelligence. To
enable it for OCR-capable files, pass `docintel_enable_formulas=True` when you
construct `MarkItDown`.

To use Large Language Models for image descriptions (currently only for pptx and image files), provide `llm_client` and `llm_model`:

```python
Expand Down
16 changes: 10 additions & 6 deletions packages/markitdown/src/markitdown/_markitdown.py
Original file line number Diff line number Diff line change
Expand Up @@ -221,6 +221,10 @@ def enable_builtins(self, **kwargs) -> None:
if docintel_version is not None:
docintel_args["api_version"] = docintel_version

docintel_enable_formulas = kwargs.get("docintel_enable_formulas")
if docintel_enable_formulas is not None:
docintel_args["include_formulas"] = docintel_enable_formulas

self.register_converter(
DocumentIntelligenceConverter(**docintel_args),
)
Expand Down Expand Up @@ -555,9 +559,9 @@ def _convert(
for converter_registration in sorted_registrations:
converter = converter_registration.converter
# Sanity check -- make sure the cur_pos is still the same
assert (
cur_pos == file_stream.tell()
), "File stream position should NOT change between guess iterations"
assert cur_pos == file_stream.tell(), (
"File stream position should NOT change between guess iterations"
)

_kwargs = {k: v for k, v in kwargs.items()}

Expand Down Expand Up @@ -596,9 +600,9 @@ def _convert(
pass

# accept() should not have changed the file stream position
assert (
cur_pos == file_stream.tell()
), f"{type(converter).__name__}.accept() should NOT change the file_stream position"
assert cur_pos == file_stream.tell(), (
f"{type(converter).__name__}.accept() should NOT change the file_stream position"
)

# Attempt the conversion
if _accepts:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -136,6 +136,7 @@ def __init__(
endpoint: str,
api_version: str = "2024-07-31-preview",
credential: AzureKeyCredential | TokenCredential | None = None,
include_formulas: bool = False,
file_types: List[DocumentIntelligenceFileType] = [
DocumentIntelligenceFileType.DOCX,
DocumentIntelligenceFileType.PPTX,
Expand All @@ -154,21 +155,21 @@ def __init__(
endpoint (str): The endpoint for the Document Intelligence service.
api_version (str): The API version to use. Defaults to "2024-07-31-preview".
credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication.
include_formulas (bool): Whether to enable formula extraction for OCR-capable file types.
file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported file types.
"""

super().__init__()
self._file_types = file_types
self._include_formulas = include_formulas

# Raise an error if the dependencies are not available.
# This is different than other converters since this one isn't even instantiated
# unless explicitly requested.
if _dependency_exc_info is not None:
raise MissingDependencyException(
"DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`"
) from _dependency_exc_info[
1
].with_traceback( # type: ignore[union-attr]
) from _dependency_exc_info[1].with_traceback( # type: ignore[union-attr]
_dependency_exc_info[2]
)

Expand Down Expand Up @@ -228,11 +229,15 @@ def _analysis_features(self, stream_info: StreamInfo) -> List[str]:
if mimetype.startswith(prefix):
return []

return [
DocumentAnalysisFeature.FORMULAS, # enable formula extraction
features = [
DocumentAnalysisFeature.OCR_HIGH_RESOLUTION, # enable high resolution OCR
DocumentAnalysisFeature.STYLE_FONT, # enable font style extraction
]
if self._include_formulas:
features.insert(
0, DocumentAnalysisFeature.FORMULAS
) # enable formula extraction
return features

def convert(
self,
Expand Down
64 changes: 63 additions & 1 deletion packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,9 +3,13 @@
import os
import re
import shutil
import pytest
from types import SimpleNamespace
from unittest.mock import MagicMock

import pytest

from markitdown import _markitdown
from markitdown.converters import _doc_intel_converter
from markitdown._uri_utils import parse_data_uri, file_uri_to_path

from markitdown import (
Expand Down Expand Up @@ -288,6 +292,64 @@ def test_input_as_strings() -> None:
assert "# Test" in result.text_content


def test_docintel_analysis_features_disable_formulas_by_default(
    monkeypatch: pytest.MonkeyPatch,
) -> None:
    """Verify ``_analysis_features`` lists FORMULAS only when the flag is set.

    The converter's ``DocumentAnalysisFeature`` reference is swapped for a
    namespace of plain strings so the returned lists can be compared directly.
    """
    stub_features = SimpleNamespace(
        FORMULAS="FORMULAS",
        OCR_HIGH_RESOLUTION="OCR_HIGH_RESOLUTION",
        STYLE_FONT="STYLE_FONT",
    )
    monkeypatch.setattr(
        _doc_intel_converter, "DocumentAnalysisFeature", stub_features
    )

    # Allocate the converter without running __init__; the flag it reads is
    # assigned by hand below.
    converter_cls = _doc_intel_converter.DocumentIntelligenceConverter
    converter = converter_cls.__new__(converter_cls)

    # (flag value, expected feature list) -- disabled case first, then enabled.
    cases = [
        (False, ["OCR_HIGH_RESOLUTION", "STYLE_FONT"]),
        (True, ["FORMULAS", "OCR_HIGH_RESOLUTION", "STYLE_FONT"]),
    ]
    for include_formulas, expected_features in cases:
        converter._include_formulas = include_formulas
        actual_features = converter._analysis_features(StreamInfo(extension=".pdf"))
        assert actual_features == expected_features


@pytest.mark.parametrize("enable_formulas", [False, True])
def test_markitdown_passes_docintel_formula_setting(
    monkeypatch: pytest.MonkeyPatch,
    enable_formulas: bool,
) -> None:
    """Verify ``docintel_enable_formulas`` reaches the converter as ``include_formulas``.

    A recording stand-in replaces ``DocumentIntelligenceConverter`` in the
    ``_markitdown`` module, and the keyword arguments it is constructed with
    are inspected after ``MarkItDown`` has been built.
    """
    endpoint = "https://example.cognitiveservices.azure.com/"
    seen_kwargs: dict[str, object] = {}

    class FakeDocumentIntelligenceConverter:
        """Stand-in converter that records its constructor keyword arguments."""

        def __init__(self, **kwargs: object) -> None:
            seen_kwargs.update(kwargs)

    monkeypatch.setattr(
        _markitdown,
        "DocumentIntelligenceConverter",
        FakeDocumentIntelligenceConverter,
    )

    MarkItDown(docintel_endpoint=endpoint, docintel_enable_formulas=enable_formulas)

    assert seen_kwargs["endpoint"] == endpoint
    # Identity check: the exact bool must be forwarded, not a truthy substitute.
    assert seen_kwargs["include_formulas"] is enable_formulas


def test_doc_rlink() -> None:
# Test for: CVE-2025-11849
markitdown = MarkItDown()
Expand Down