microsoft · lavish0000 · Mar 6, 2026
diff --git a/README.md b/README.md
@@ -164,6 +164,10 @@ result = md.convert("test.pdf")
 print(result.text_content)
 ```
 
+Formula extraction is disabled by default for Azure Document Intelligence. To
+enable it for OCR-capable files, pass `docintel_enable_formulas=True` when you
+construct `MarkItDown`.
+
 To use Large Language Models for image descriptions (currently only for pptx and image files), provide `llm_client` and `llm_model`:
 
 ```python

diff --git a/packages/markitdown/src/markitdown/_markitdown.py b/packages/markitdown/src/markitdown/_markitdown.py
@@ -221,6 +221,10 @@ def enable_builtins(self, **kwargs) -> None:
                 if docintel_version is not None:
                     docintel_args["api_version"] = docintel_version
 
+                docintel_enable_formulas = kwargs.get("docintel_enable_formulas")
+                if docintel_enable_formulas is not None:
+                    docintel_args["include_formulas"] = docintel_enable_formulas
+
                 self.register_converter(
                     DocumentIntelligenceConverter(**docintel_args),
                 )
@@ -555,9 +559,9 @@ def _convert(
             for converter_registration in sorted_registrations:
                 converter = converter_registration.converter
                 # Sanity check -- make sure the cur_pos is still the same
-                assert (
-                    cur_pos == file_stream.tell()
-                ), "File stream position should NOT change between guess iterations"
+                assert cur_pos == file_stream.tell(), (
+                    "File stream position should NOT change between guess iterations"
+                )
 
                 _kwargs = {k: v for k, v in kwargs.items()}
 
@@ -596,9 +600,9 @@ def _convert(
                     pass
 
                 # accept() should not have changed the file stream position
-                assert (
-                    cur_pos == file_stream.tell()
-                ), f"{type(converter).__name__}.accept() should NOT change the file_stream position"
+                assert cur_pos == file_stream.tell(), (
+                    f"{type(converter).__name__}.accept() should NOT change the file_stream position"
+                )
 
                 # Attempt the conversion
                 if _accepts:

diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py
@@ -136,6 +136,7 @@ def __init__(
         endpoint: str,
         api_version: str = "2024-07-31-preview",
         credential: AzureKeyCredential | TokenCredential | None = None,
+        include_formulas: bool = False,
         file_types: List[DocumentIntelligenceFileType] = [
             DocumentIntelligenceFileType.DOCX,
             DocumentIntelligenceFileType.PPTX,
@@ -154,21 +155,21 @@ def __init__(
             endpoint (str): The endpoint for the Document Intelligence service.
             api_version (str): The API version to use. Defaults to "2024-07-31-preview".
             credential (AzureKeyCredential | TokenCredential | None): The credential to use for authentication.
+            include_formulas (bool): Whether to enable formula extraction for OCR-capable file types.
             file_types (List[DocumentIntelligenceFileType]): The file types to accept. Defaults to all supported file types.
         """
 
         super().__init__()
         self._file_types = file_types
+        self._include_formulas = include_formulas
 
         # Raise an error if the dependencies are not available.
         # This is different than other converters since this one isn't even instantiated
         # unless explicitly requested.
         if _dependency_exc_info is not None:
             raise MissingDependencyException(
                 "DocumentIntelligenceConverter requires the optional dependency [az-doc-intel] (or [all]) to be installed. E.g., `pip install markitdown[az-doc-intel]`"
-            ) from _dependency_exc_info[
-                1
-            ].with_traceback(  # type: ignore[union-attr]
+            ) from _dependency_exc_info[1].with_traceback(  # type: ignore[union-attr]
                 _dependency_exc_info[2]
             )
 
@@ -228,11 +229,15 @@ def _analysis_features(self, stream_info: StreamInfo) -> List[str]:
             if mimetype.startswith(prefix):
                 return []
 
-        return [
-            DocumentAnalysisFeature.FORMULAS,  # enable formula extraction
+        features = [
             DocumentAnalysisFeature.OCR_HIGH_RESOLUTION,  # enable high resolution OCR
             DocumentAnalysisFeature.STYLE_FONT,  # enable font style extraction
         ]
+        if self._include_formulas:
+            features.insert(
+                0, DocumentAnalysisFeature.FORMULAS
+            )  # enable formula extraction
+        return features
 
     def convert(
         self,

diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py
@@ -3,9 +3,13 @@
 import os
 import re
 import shutil
-import pytest
+from types import SimpleNamespace
 from unittest.mock import MagicMock
 
+import pytest
+
+from markitdown import _markitdown
+from markitdown.converters import _doc_intel_converter
 from markitdown._uri_utils import parse_data_uri, file_uri_to_path
 
 from markitdown import (
@@ -288,6 +292,64 @@ def test_input_as_strings() -> None:
     assert "# Test" in result.text_content
 
 
+def test_docintel_analysis_features_disable_formulas_by_default(
+    monkeypatch: pytest.MonkeyPatch,
+) -> None:  # _analysis_features should list FORMULAS only when _include_formulas is true
+    monkeypatch.setattr(  # swap the feature enum for plain strings so results compare as str
+        _doc_intel_converter,
+        "DocumentAnalysisFeature",
+        SimpleNamespace(
+            FORMULAS="FORMULAS",
+            OCR_HIGH_RESOLUTION="OCR_HIGH_RESOLUTION",
+            STYLE_FONT="STYLE_FONT",
+        ),
+    )
+
+    converter = _doc_intel_converter.DocumentIntelligenceConverter.__new__(  # create the instance without running __init__
+        _doc_intel_converter.DocumentIntelligenceConverter
+    )
+    converter._include_formulas = False  # NOTE(review): set by hand, so the constructor default itself is not exercised here
+
+    assert converter._analysis_features(StreamInfo(extension=".pdf")) == [  # flag off: no FORMULAS entry
+        "OCR_HIGH_RESOLUTION",
+        "STYLE_FONT",
+    ]
+
+    converter._include_formulas = True  # flip the flag on the same instance
+
+    assert converter._analysis_features(StreamInfo(extension=".pdf")) == [  # flag on: FORMULAS comes first
+        "FORMULAS",
+        "OCR_HIGH_RESOLUTION",
+        "STYLE_FONT",
+    ]
+
+
+@pytest.mark.parametrize("enable_formulas", [False, True])  # run once per explicit flag value
+def test_markitdown_passes_docintel_formula_setting(
+    monkeypatch: pytest.MonkeyPatch,
+    enable_formulas: bool,
+) -> None:  # docintel_enable_formulas should reach the converter as include_formulas
+    captured_kwargs: dict[str, object] = {}  # filled in by the fake converter's __init__
+
+    class FakeDocumentIntelligenceConverter:  # stand-in that only records the kwargs it is constructed with
+        def __init__(self, **kwargs: object) -> None:
+            captured_kwargs.update(kwargs)
+
+    monkeypatch.setattr(  # replace the converter class referenced by the _markitdown module
+        _markitdown,
+        "DocumentIntelligenceConverter",
+        FakeDocumentIntelligenceConverter,
+    )
+
+    MarkItDown(  # construction is expected to instantiate the patched converter
+        docintel_endpoint="https://example.cognitiveservices.azure.com/",
+        docintel_enable_formulas=enable_formulas,
+    )
+
+    assert captured_kwargs["endpoint"] == "https://example.cognitiveservices.azure.com/"
+    assert captured_kwargs["include_formulas"] is enable_formulas  # identity check: the bool is forwarded unchanged
+
+
 def test_doc_rlink() -> None:
     # Test for: CVE-2025-11849
     markitdown = MarkItDown()