diff --git a/CHANGELOG.md b/CHANGELOG.md
index efa6d33a9c..1644a2ae32 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -1,4 +1,4 @@
-## 0.18.15-dev0
+## 0.18.15-dev1
 
 ### Enhancements
 
@@ -6,6 +6,8 @@
 
 ### Fixes
 
+* **Handle filenames without extensions in file type detection** When a file's name has no extension, the extension is now taken from the metadata file path (when it has one) instead of being reported as empty.
+
 ## 0.18.14
 
 ### Enhancements
diff --git a/test_unstructured/file_utils/test_filetype.py b/test_unstructured/file_utils/test_filetype.py
index b9ec83562b..6a0717b800 100644
--- a/test_unstructured/file_utils/test_filetype.py
+++ b/test_unstructured/file_utils/test_filetype.py
@@ -619,6 +619,8 @@ def and_it_derives_the_extension_from_metadata_file_path_when_file_object_has_no
             None,
             # -- case 2: file-like object has `.name` attribute but it's value is the empty string
             "",
+            # -- case 3: file-like object has name with no extension --
+            "q3_invoices",
         ],
     )
     def and_it_returns_the_empty_string_as_the_extension_when_there_are_no_file_name_sources(
@@ -631,6 +633,38 @@ def and_it_returns_the_empty_string_as_the_extension_when_there_are_no_file_name
 
         assert _FileTypeDetectionContext(file=file).extension == ""
 
+    @pytest.mark.parametrize(
+        "metadata_file_path",
+        [
+            # -- case 1: no metadata file-path is provided --
+            None,
+            # -- case 2: metadata file-path is the empty string --
+            "",
+            # -- case 3: metadata file-path has no extension --
+            "q3_invoices",
+        ],
+    )
+    def and_it_returns_the_empty_string_as_the_extension_when_there_are_no_file_name_nor_metadata(
+        self, metadata_file_path: str | None
+    ):
+        with open(example_doc_path("ideas-page.html"), "rb") as f:
+            file = io.BytesIO(f.read())
+            # -- a falsy `.name` makes the file object contribute no extension of its own --
+            file.name = None
+
+        ctx = _FileTypeDetectionContext(file=file, metadata_file_path=metadata_file_path)
+
+        assert ctx.extension == ""
+
+    def and_it_falls_back_to_the_metadata_file_path_when_the_file_name_has_no_extension(self):
+        with open(example_doc_path("ideas-page.html"), "rb") as f:
+            file = io.BytesIO(f.read())
+            file.name = "q3_invoices"
+
+        ctx = _FileTypeDetectionContext(file=file, metadata_file_path="a/b/q3_invoices.PDF")
+
+        assert ctx.extension == ".pdf"
+
     # -- .file_head ---------------------------------------------
 
     def it_grabs_the_first_8k_bytes_of_the_file_for_use_by_magic(self):
diff --git a/unstructured/__version__.py b/unstructured/__version__.py
index 9d8d327217..c82416a4b0 100644
--- a/unstructured/__version__.py
+++ b/unstructured/__version__.py
@@ -1 +1 @@
-__version__ = "0.18.15-dev0"  # pragma: no cover
+__version__ = "0.18.15-dev1"  # pragma: no cover
diff --git a/unstructured/file_utils/filetype.py b/unstructured/file_utils/filetype.py
index 1206b1e8b3..1f99ad7794 100644
--- a/unstructured/file_utils/filetype.py
+++ b/unstructured/file_utils/filetype.py
@@ -349,11 +349,14 @@ def extension(self) -> str:
         # -- get from file_path, or file when it has a name (path) --
         with self.open() as file:
             if hasattr(file, "name") and file.name:
-                return os.path.splitext(file.name)[1].lower()
+                # -- `os.path.splitext()` always returns a 2-tuple, so test the extension
+                # -- itself; a name without one falls through to the metadata file-path --
+                if ext := os.path.splitext(file.name)[1]:
+                    return ext.lower()
 
         # -- otherwise use metadata file-path when provided --
         if file_path := self._metadata_file_path:
             return os.path.splitext(file_path)[1].lower()
 
         # -- otherwise empty str means no extension, same as a path like "a/b/name-no-ext" --
         return ""