diff --git a/packages/markitdown/src/markitdown/converters/_docx_converter.py b/packages/markitdown/src/markitdown/converters/_docx_converter.py index 3975107b1..7b7fef9a6 100644 --- a/packages/markitdown/src/markitdown/converters/_docx_converter.py +++ b/packages/markitdown/src/markitdown/converters/_docx_converter.py @@ -26,6 +26,7 @@ ] ACCEPTED_FILE_EXTENSIONS = [".docx"] +DEFAULT_STYLE_MAP = "u => u" class DocxConverter(HtmlConverter): @@ -75,7 +76,12 @@ def convert( _dependency_exc_info[2] ) - style_map = kwargs.get("style_map", None) + style_map = kwargs.get("style_map") + if style_map: + style_map = f"{DEFAULT_STYLE_MAP}\n{style_map}" + else: + style_map = DEFAULT_STYLE_MAP + pre_process_stream = pre_process_docx(file_stream) return self._html_converter.convert_string( mammoth.convert_to_html(pre_process_stream, style_map=style_map).value, diff --git a/packages/markitdown/tests/test_module_misc.py b/packages/markitdown/tests/test_module_misc.py index 8e3acc23d..73451cc30 100644 --- a/packages/markitdown/tests/test_module_misc.py +++ b/packages/markitdown/tests/test_module_misc.py @@ -5,8 +5,11 @@ import shutil import pytest from unittest.mock import MagicMock +from types import SimpleNamespace from markitdown._uri_utils import parse_data_uri, file_uri_to_path +from markitdown.converters._docx_converter import DocxConverter, DEFAULT_STYLE_MAP +from markitdown.converters import _docx_converter from markitdown import ( MarkItDown, @@ -261,6 +264,56 @@ def test_docx_comments() -> None: validate_strings(result, DOCX_COMMENT_TEST_STRINGS) +def test_docx_converter_preserves_underline_by_default(monkeypatch) -> None: + converter = DocxConverter() + captured = {} + + monkeypatch.setattr(_docx_converter, "pre_process_docx", lambda stream: stream) + + def fake_convert_to_html(stream, style_map=None): + captured["style_map"] = style_map + return SimpleNamespace(value="
underlined
") + + monkeypatch.setattr(_docx_converter.mammoth, "convert_to_html", fake_convert_to_html) + monkeypatch.setattr( + converter._html_converter, + "convert_string", + lambda html, **kwargs: html, + ) + + result = converter.convert(io.BytesIO(b"docx"), StreamInfo(extension=".docx")) + + assert captured["style_map"] == DEFAULT_STYLE_MAP + assert result == "underlined
" + + +def test_docx_converter_appends_custom_style_map(monkeypatch) -> None: + converter = DocxConverter() + captured = {} + + monkeypatch.setattr(_docx_converter, "pre_process_docx", lambda stream: stream) + + def fake_convert_to_html(stream, style_map=None): + captured["style_map"] = style_map + return SimpleNamespace(value="comment
") + + monkeypatch.setattr(_docx_converter.mammoth, "convert_to_html", fake_convert_to_html) + monkeypatch.setattr( + converter._html_converter, + "convert_string", + lambda html, **kwargs: html, + ) + + result = converter.convert( + io.BytesIO(b"docx"), + StreamInfo(extension=".docx"), + style_map="comment-reference => ", + ) + + assert captured["style_map"] == f"{DEFAULT_STYLE_MAP}\ncomment-reference => " + assert result == "comment
" + + def test_docx_equations() -> None: markitdown = MarkItDown() docx_file = os.path.join(TEST_FILES_DIR, "equations.docx")