diff --git a/CHANGELOG.md b/CHANGELOG.md index 95cbf64bf7..8b3daa5327 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,3 +1,8 @@ +## 0.18.16-dev0 + +### Enhancements +- Speed up function `_assign_hash_ids` by 34% (codeflash). Sequence numbers are now counted per page number across the whole document instead of per consecutive run of same-page elements, so element IDs can change when page numbers are not contiguous. + ## 0.18.15 ### Enhancements diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json b/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json index cc6ecebd11..248b38a649 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json @@ -360,7 +360,7 @@ "eng" ], "page_number": 1, - "orig_elements": "eJztUsmO2zAM/RVB58ZLYidxjz3NaVCg6WkwMGiLsoVYCyQ6TRHk30u1GWBaoH8wN1Hk01vEl5vEBS066o2Sn4VEPNTVfoBKbZvq0FVNpSuNXdvVGnYwNPKTkBYJFBDw/E3mQ5/8Gkf8XQeM1qRkvEv9Y+jlJq1Xub3b1cfj/ZXfiDj6qPrFj0A+/kECzVlCOXuLZVydw1j+8PFcri5RXEdaI6q/C7yCDQtulB9TOYNTg/fnTR0KvrjK+52ZtFmQfoZMLyGExTAjqysvThU+oLvaRftogdLGa21GZOiaEymYW4XoR2Q7brJL8dbJISzgphUmTNmgRDfJbCvwTe9WO2D2VGd+witl7tNsknh6SBTKYxLOkxgjAqEApUyWBYuIZpopCZjAsFVBM4pEjGMNgp0nQhQ+iu/OECrxjRiexOnRME5ouHDba+EZGkWASAZTkUW/5fAMMXIKFzxlcazy3zXoukrtj1V7GLb7o1aNbrbNoBpox6FtVbf7WIP/r8H7T38f+lce+8K/fZb3118ujDSv" + "orig_elements": "eJztUsmO2zAM/RVB58bL2Nl67KmnQYGmp8HAoCXKFmItkOg0RZB/L9VmgGmB/kFvosint4gvN4kLOvQ0WC0/Com4b5vdCI1+6pv9sekb0xg8bo+tgQ7GXn4Q0iGBBgKev8lyGHJYk8JfdcTkbM42+Dw8hl5u0gVd2l3XHg73V34joQpJD0tQQCH9RgLNRUI9B4d1Wr3HVH8P6VyvPlNaFa0J9Z8FXsHFBTc6qFzP4PUYwnnTxoovrvJ+ZyZjF6QfsdBLiHGxzMjq6ovXVYjor24xITmgvAnGWIUMXUsiFXPrmIJCtuMnt1RvnRLCAn5aYcJcDEr0kyy2It8MfnUjFk9t4Se8UuE+zTaLzw+JQgfMwgcSKiEQCtDaFlmwiGSnmbKACSxbFTSjyMQ41iDYeSZEEZL45i2hFl+J4VmcHg3rhYELt4MRgaFJREhkMVdF9FsOz5ASp3DBUxHHKv9eg2402wa2DW47rXBU4/6wM3vTd7tDt+8B/q/Bv9fg/ae/D/0Lj33i3z7L++tP43k1Mw==" } }, { @@ -760,7 +760,7 @@ "languages": [ "eng" ], - "orig_elements": 
"eJxVUEtuwyAQvUrEurbjpFGcbCv1ElFkjWFsowCDBmgdRb57IW0WXcEw78e7PAQatOhir5U4b8S7OnXbA5yOshskSnnctcOhha4dcbcd9kq8bYTFCAoiZPxDlEsfKLHE5+yRrQ5Bkwv9H+jyEJZUWe/3bdet16zBKIlVb0hCJP5lQpxLhGYmiw0n55Cbb+Jbk1yInGRMjOr/gAtYb7BSJEMzg1MD0a1qfZ0fFrGu2WnUBuPdF3sB3hudHXO65supmjy6xZqR2EIMFY2jlpipqTRSZ2/lmSTm77jJmvq1KSXMCAq5H4liPl4GnrUFvheAATclmDCUBgS6SVxLnIhLLMgP8nfW0/wUe9E/n2pivf4An2aN4g==" + "orig_elements": "eJxVUNFuwyAM/JWJ5zVZ1C6L9jppP1FVkQMmQQWMDGypqvz7IFsf9gS2z3fnO98FWnTo02iUeH8S0Es1dKfj6bV/64cJ9HQaZD9JOb1MEjstnp+EwwQKEhT8XdTPGCmzxL0OyM7EaMjH8Q90vgtHqo6Px24YtkvhYJTEarQkIRH/bkJaqoV2IYctZ++R22/ia5t9TJxlyozqf4EruGDxoEjGdgGvJqLroQtNaaxi24qSNhbTLeB+XQjWFMXirv3yqqGAfnVWEztI8UBaG4llNddEmqKtApPEco6fnW0ekxrCgqCQR02UyvMQCGwc8K0CLPg5w4yxJiDQz+JS7SRcU0V+ULixmZed7LH+ubOJ7fIDJz+OQw==" } } -] \ No newline at end of file +] diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 99e9be3387..52ed62c9c0 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.15" # pragma: no cover +__version__ = "0.18.16-dev0" # pragma: no cover diff --git a/unstructured/partition/common/metadata.py b/unstructured/partition/common/metadata.py index 29704d97d0..08b0fe9597 100644 --- a/unstructured/partition/common/metadata.py +++ b/unstructured/partition/common/metadata.py @@ -5,7 +5,6 @@ import copy import datetime as dt import functools -import itertools import os from typing import Any, Callable, Iterator, Sequence @@ -252,15 +251,12 @@ def _assign_hash_ids(elements: list[Element]) -> list[Element]: or more fragments for parallel processing. 
""" # -- generate sequence number for each element on a page -- - page_numbers = [e.metadata.page_number for e in elements] - page_seq_numbers = [ - seq_on_page - for _, group in itertools.groupby(page_numbers) - for seq_on_page, _ in enumerate(group) - ] - - for element, seq_on_page_counter in zip(elements, page_seq_numbers): + page_seq_counts = {} + for element in elements: + page_number = element.metadata.page_number + seq_on_page_counter = page_seq_counts.get(page_number, 0) element.id_to_hash(seq_on_page_counter) + page_seq_counts[page_number] = seq_on_page_counter + 1 return elements