From 7be1b5021ed61ab25061c108abfdd6ea456f4e18 Mon Sep 17 00:00:00 2001 From: Contributor Date: Fri, 6 Mar 2026 04:17:17 +0000 Subject: [PATCH 1/2] docs: fix typos and grammar in base converter documentation - Fix 'based' -> 'based on' for proper grammar - Fix 'steam_info.url' -> 'stream_info.url' (typo) - Fix 'used to in cases' -> 'used in cases' (grammar) - Fix duplicate 'MUST be reset' text - Fix 'advances' -> 'advance' (subject-verb agreement) - Fix 'case' -> 'cases' (pluralization) - Fix 'e.g,' -> 'e.g.,' in the --charset CLI help text (__main__.py) --- packages/markitdown/src/markitdown/__main__.py | 2 +- packages/markitdown/src/markitdown/_base_converter.py | 10 +++++----- 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/packages/markitdown/src/markitdown/__main__.py b/packages/markitdown/src/markitdown/__main__.py index 6085ad6bb..0adf93d72 100644 --- a/packages/markitdown/src/markitdown/__main__.py +++ b/packages/markitdown/src/markitdown/__main__.py @@ -74,7 +74,7 @@ def main(): parser.add_argument( "-c", "--charset", - help="Provide a hint about the file's charset (e.g, UTF-8).", + help="Provide a hint about the file's charset (e.g., UTF-8).", ) parser.add_argument( diff --git a/packages/markitdown/src/markitdown/_base_converter.py b/packages/markitdown/src/markitdown/_base_converter.py index fa2b11145..4751ca9f1 100644 --- a/packages/markitdown/src/markitdown/_base_converter.py +++ b/packages/markitdown/src/markitdown/_base_converter.py @@ -50,18 +50,18 @@ def accepts( ) -> bool: """ Return a quick determination on if the converter should attempt converting the document. - This is primarily based `stream_info` (typically, `stream_info.mimetype`, `stream_info.extension`). - In cases where the data is retrieved via HTTP, the `steam_info.url` might also be referenced to + This is primarily based on `stream_info` (typically, `stream_info.mimetype`, `stream_info.extension`). 
+ In cases where the data is retrieved via HTTP, the `stream_info.url` might also be referenced to make a determination (e.g., special converters for Wikipedia, YouTube etc). - Finally, it is conceivable that the `stream_info.filename` might be used to in cases + Finally, it is conceivable that the `stream_info.filename` might be used in cases where the filename is well-known (e.g., `Dockerfile`, `Makefile`, etc) NOTE: The method signature is designed to match that of the convert() method. This provides some assurance that, if accepts() returns True, the convert() method will also be able to handle the document. IMPORTANT: In rare cases, (e.g., OutlookMsgConverter) we need to read more from the stream to make a final - determination. Read operations inevitably advances the position in file_stream. In these case, the position - MUST be reset it MUST be reset before returning. This is because the convert() method may be called immediately + determination. Read operations inevitably advance the position in file_stream. In these cases, the position + MUST be reset before returning. This is because the convert() method may be called immediately after accepts(), and will expect the file_stream to be at the original position. 
E.g., From b2273fb529eae8bc40feffe32d4181deff89b16c Mon Sep 17 00:00:00 2001 From: Contributor Date: Sat, 7 Mar 2026 04:17:37 +0000 Subject: [PATCH 2/2] feat(docintel): accept .doc extension and msword mime --- .../markitdown/converters/_doc_intel_converter.py | 2 ++ packages/markitdown/tests/test_docintel_html.py | 12 ++++++++++++ 2 files changed, 14 insertions(+) diff --git a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py index fd843f231..d244ecfc9 100644 --- a/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py +++ b/packages/markitdown/src/markitdown/converters/_doc_intel_converter.py @@ -76,6 +76,7 @@ def _get_mime_type_prefixes(types: List[DocumentIntelligenceFileType]) -> List[s prefixes.append( "application/vnd.openxmlformats-officedocument.wordprocessingml.document" ) + prefixes.append("application/msword") elif type_ == DocumentIntelligenceFileType.PPTX: prefixes.append( "application/vnd.openxmlformats-officedocument.presentationml" @@ -107,6 +108,7 @@ def _get_file_extensions(types: List[DocumentIntelligenceFileType]) -> List[str] for type_ in types: if type_ == DocumentIntelligenceFileType.DOCX: extensions.append(".docx") + extensions.append(".doc") elif type_ == DocumentIntelligenceFileType.PPTX: extensions.append(".pptx") elif type_ == DocumentIntelligenceFileType.XLSX: diff --git a/packages/markitdown/tests/test_docintel_html.py b/packages/markitdown/tests/test_docintel_html.py index d0b4caa3e..aaf54229c 100644 --- a/packages/markitdown/tests/test_docintel_html.py +++ b/packages/markitdown/tests/test_docintel_html.py @@ -24,3 +24,15 @@ def test_docintel_accepts_html_mimetype(): assert conv.accepts(io.BytesIO(b""), stream_info) stream_info = StreamInfo(mimetype="application/xhtml+xml", extension=None) assert conv.accepts(io.BytesIO(b""), stream_info) + + +def test_docintel_accepts_doc_extension_for_docx_type(): + conv = 
_make_converter([DocumentIntelligenceFileType.DOCX]) + stream_info = StreamInfo(mimetype=None, extension=".doc") + assert conv.accepts(io.BytesIO(b""), stream_info) + + +def test_docintel_accepts_msword_mimetype_for_docx_type(): + conv = _make_converter([DocumentIntelligenceFileType.DOCX]) + stream_info = StreamInfo(mimetype="application/msword", extension=None) + assert conv.accepts(io.BytesIO(b""), stream_info)