Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
]

ACCEPTED_FILE_EXTENSIONS = [".docx"]
DEFAULT_STYLE_MAP = "u => u"


class DocxConverter(HtmlConverter):
Expand Down Expand Up @@ -75,7 +76,12 @@ def convert(
_dependency_exc_info[2]
)

style_map = kwargs.get("style_map", None)
style_map = kwargs.get("style_map")
if style_map:
style_map = f"{DEFAULT_STYLE_MAP}\n{style_map}"
else:
style_map = DEFAULT_STYLE_MAP

pre_process_stream = pre_process_docx(file_stream)
return self._html_converter.convert_string(
mammoth.convert_to_html(pre_process_stream, style_map=style_map).value,
Expand Down
53 changes: 53 additions & 0 deletions packages/markitdown/tests/test_module_misc.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,8 +5,11 @@
import shutil
import pytest
from unittest.mock import MagicMock
from types import SimpleNamespace

from markitdown._uri_utils import parse_data_uri, file_uri_to_path
from markitdown.converters._docx_converter import DocxConverter, DEFAULT_STYLE_MAP
from markitdown.converters import _docx_converter

from markitdown import (
MarkItDown,
Expand Down Expand Up @@ -261,6 +264,56 @@ def test_docx_comments() -> None:
validate_strings(result, DOCX_COMMENT_TEST_STRINGS)


def test_docx_converter_preserves_underline_by_default(monkeypatch) -> None:
    """Check that convert() hands DEFAULT_STYLE_MAP to mammoth when no style_map is given.

    Pre-processing, mammoth's HTML conversion and the HTML converter's
    ``convert_string`` are all replaced with stubs, so the test only observes
    which ``style_map`` value reaches ``mammoth.convert_to_html`` and that the
    stubbed HTML is passed through to the result.
    """
    docx_converter = DocxConverter()
    expected_html = "<p><u>underlined</u></p>"
    seen = {}

    def recording_convert_to_html(stream, style_map=None):
        # Stand-in for mammoth.convert_to_html: remember the style map and
        # return an object exposing the canned HTML via ``.value``.
        seen["style_map"] = style_map
        return SimpleNamespace(value=expected_html)

    # Pre-processing becomes the identity function on the stream.
    monkeypatch.setattr(_docx_converter, "pre_process_docx", lambda stream: stream)
    monkeypatch.setattr(
        _docx_converter.mammoth, "convert_to_html", recording_convert_to_html
    )
    # The HTML converter echoes its input so the result can be compared directly.
    monkeypatch.setattr(
        docx_converter._html_converter,
        "convert_string",
        lambda html, **kwargs: html,
    )

    source_stream = io.BytesIO(b"docx")
    stream_info = StreamInfo(extension=".docx")
    result = docx_converter.convert(source_stream, stream_info)

    assert seen["style_map"] == DEFAULT_STYLE_MAP
    assert result == expected_html


def test_docx_converter_appends_custom_style_map(monkeypatch) -> None:
    """Check that a caller-supplied style_map is appended after DEFAULT_STYLE_MAP.

    Pre-processing, mammoth's HTML conversion and the HTML converter's
    ``convert_string`` are all replaced with stubs. The test asserts that the
    ``style_map`` reaching ``mammoth.convert_to_html`` is the default map, a
    newline, then the custom map, and that the stubbed HTML is returned.
    """
    docx_converter = DocxConverter()
    custom_style_map = "comment-reference => "
    expected_html = "<p>comment</p>"
    seen = {}

    def recording_convert_to_html(stream, style_map=None):
        # Stand-in for mammoth.convert_to_html: remember the style map and
        # return an object exposing the canned HTML via ``.value``.
        seen["style_map"] = style_map
        return SimpleNamespace(value=expected_html)

    # Pre-processing becomes the identity function on the stream.
    monkeypatch.setattr(_docx_converter, "pre_process_docx", lambda stream: stream)
    monkeypatch.setattr(
        _docx_converter.mammoth, "convert_to_html", recording_convert_to_html
    )
    # The HTML converter echoes its input so the result can be compared directly.
    monkeypatch.setattr(
        docx_converter._html_converter,
        "convert_string",
        lambda html, **kwargs: html,
    )

    source_stream = io.BytesIO(b"docx")
    stream_info = StreamInfo(extension=".docx")
    result = docx_converter.convert(
        source_stream,
        stream_info,
        style_map=custom_style_map,
    )

    assert seen["style_map"] == f"{DEFAULT_STYLE_MAP}\ncomment-reference => "
    assert result == expected_html


def test_docx_equations() -> None:
markitdown = MarkItDown()
docx_file = os.path.join(TEST_FILES_DIR, "equations.docx")
Expand Down