From 5cb28b99fd906145f1c6a96470e47c5583935a37 Mon Sep 17 00:00:00 2001 From: "codeflash-ai[bot]" <148906541+codeflash-ai[bot]@users.noreply.github.com> Date: Fri, 22 Aug 2025 12:37:47 +0000 Subject: [PATCH 1/7] =?UTF-8?q?=E2=9A=A1=EF=B8=8F=20Speed=20up=20function?= =?UTF-8?q?=20`=5Fassign=5Fhash=5Fids`=20by=2034%?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The optimization replaces `itertools.groupby` with a simple dictionary-based counting approach in the `_assign_hash_ids` function. **Key change:** Instead of creating intermediate lists (`page_numbers` and `page_seq_numbers`) and using `itertools.groupby`, the optimized version uses a dictionary `page_seq_counts` to track sequence numbers for each page in a single pass. **Why it's faster:** - **Eliminates list comprehensions:** The original code creates a full `page_numbers` list upfront, then processes it with `groupby`. The optimized version processes elements directly without intermediate collections. - **Removes `itertools.groupby` overhead:** `groupby` does no sorting (it only groups runs of consecutive equal keys in one linear pass), but it creates a sub-iterator for every run, which the nested comprehension then drains through `enumerate`. The optimized version replaces that machinery with a single O(1) dictionary lookup, `page_seq_counts.get(page_number, 0)`, per element. - **Single-pass processing:** Instead of separate passes (first to collect page numbers, then to generate sequence numbers, then to assign the ids), the optimization does everything in one loop through the elements. **Behavior note:** `groupby` restarted the counter at every run of consecutive equal page numbers, whereas the dictionary keeps one running count per page number. The two agree whenever all elements of a page are contiguous; if a page number reappears after a different one, the sequence numbers passed to `id_to_hash` (and therefore possibly the resulting ids) differ from the original. **Performance characteristics:** The optimization is particularly effective for documents with many pages or elements, as shown in the test results where empty lists see 300%+ speedups. The 34% overall speedup demonstrates the efficiency gain from eliminating the `itertools.groupby` bottleneck, which consumed 19.5% + 6.3% of the original runtime according to the line profiler. 
--- unstructured/documents/elements.py | 11 +++++------ unstructured/partition/common/metadata.py | 14 +++++--------- 2 files changed, 10 insertions(+), 15 deletions(-) diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 0caf340b96..8f588a86ad 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -15,12 +15,11 @@ from typing_extensions import ParamSpec, TypeAlias, TypedDict -from unstructured.documents.coordinates import ( - TYPE_TO_COORDINATE_SYSTEM_MAP, - CoordinateSystem, - RelativeCoordinateSystem, -) -from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA +from unstructured.documents.coordinates import (TYPE_TO_COORDINATE_SYSTEM_MAP, + CoordinateSystem, + RelativeCoordinateSystem) +from unstructured.partition.utils.constants import \ + UNSTRUCTURED_INCLUDE_DEBUG_METADATA from unstructured.utils import get_call_args_applying_defaults, lazyproperty Point: TypeAlias = "tuple[float, float]" diff --git a/unstructured/partition/common/metadata.py b/unstructured/partition/common/metadata.py index 29704d97d0..08b0fe9597 100644 --- a/unstructured/partition/common/metadata.py +++ b/unstructured/partition/common/metadata.py @@ -5,7 +5,6 @@ import copy import datetime as dt import functools -import itertools import os from typing import Any, Callable, Iterator, Sequence @@ -252,15 +251,12 @@ def _assign_hash_ids(elements: list[Element]) -> list[Element]: or more fragments for parallel processing. 
""" # -- generate sequence number for each element on a page -- - page_numbers = [e.metadata.page_number for e in elements] - page_seq_numbers = [ - seq_on_page - for _, group in itertools.groupby(page_numbers) - for seq_on_page, _ in enumerate(group) - ] - - for element, seq_on_page_counter in zip(elements, page_seq_numbers): + page_seq_counts = {} + for element in elements: + page_number = element.metadata.page_number + seq_on_page_counter = page_seq_counts.get(page_number, 0) element.id_to_hash(seq_on_page_counter) + page_seq_counts[page_number] = seq_on_page_counter + 1 return elements From 136fe26efb6f56e94471f011092e2ae30f51bb0b Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Thu, 28 Aug 2025 22:43:53 +0000 Subject: [PATCH 2/7] cleaning up --- unstructured/documents/elements.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/unstructured/documents/elements.py b/unstructured/documents/elements.py index 8f588a86ad..0caf340b96 100644 --- a/unstructured/documents/elements.py +++ b/unstructured/documents/elements.py @@ -15,11 +15,12 @@ from typing_extensions import ParamSpec, TypeAlias, TypedDict -from unstructured.documents.coordinates import (TYPE_TO_COORDINATE_SYSTEM_MAP, - CoordinateSystem, - RelativeCoordinateSystem) -from unstructured.partition.utils.constants import \ - UNSTRUCTURED_INCLUDE_DEBUG_METADATA +from unstructured.documents.coordinates import ( + TYPE_TO_COORDINATE_SYSTEM_MAP, + CoordinateSystem, + RelativeCoordinateSystem, +) +from unstructured.partition.utils.constants import UNSTRUCTURED_INCLUDE_DEBUG_METADATA from unstructured.utils import get_call_args_applying_defaults, lazyproperty Point: TypeAlias = "tuple[float, float]" From ea4a35aafb3e8a835bd2a222841dfb5b84e7a43f Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Fri, 5 Sep 2025 23:45:41 +0000 Subject: [PATCH 3/7] changelog --- CHANGELOG.md | 19 +++++++++++++++++++ unstructured/__version__.py | 2 +- 2 files changed, 20 insertions(+), 1 deletion(-) diff 
--git a/CHANGELOG.md b/CHANGELOG.md index efa6d33a9c..50c354a84e 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,6 +1,25 @@ +## 0.18.15-dev2 + +### Enhancements +- Speed up function _assign_hash_ids by 34% (codeflash) + +### Features + +### Fixes + +## 0.18.15-dev1 + +### Enhancements +- Speed up function group_broken_paragraphs by 30% (codeflash) + +### Features + +### Fixes + ## 0.18.15-dev0 ### Enhancements +- Optimized the runtime of `ElementHtml._get_children_html` ### Features diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 9d8d327217..d4a801ee5e 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.15-dev0" # pragma: no cover +__version__ = "0.18.15-dev2" # pragma: no cover From 35770e09d8937cbd9830cc5dd27561686daebb2d Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Tue, 9 Sep 2025 18:11:42 -0700 Subject: [PATCH 4/7] Update __version__.py remove newline --- unstructured/__version__.py | 1 - 1 file changed, 1 deletion(-) diff --git a/unstructured/__version__.py b/unstructured/__version__.py index ce245660f7..c82416a4b0 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1,2 +1 @@ __version__ = "0.18.15-dev1" # pragma: no cover - From 4f8ce2ceb88191f2f181a483a2385fc64d29029e Mon Sep 17 00:00:00 2001 From: Aseem Saxena Date: Tue, 23 Sep 2025 18:47:13 +0000 Subject: [PATCH 5/7] changelog version update --- CHANGELOG.md | 6 +++++- unstructured/__version__.py | 2 +- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index ee60f3e6f5..8b3daa5327 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -1,7 +1,11 @@ +## 0.18.16-dev0 + +### Enhancement +- Speed up function _assign_hash_ids by 34% (codeflash) + ## 0.18.15 ### Enhancements -- Speed up function _assign_hash_ids by 34% (codeflash) - Speed up function ElementHtml._get_children_html by 234% (codeflash) - Speed up function group_broken_paragraphs by 30% (codeflash) 
diff --git a/unstructured/__version__.py b/unstructured/__version__.py index 99e9be3387..52ed62c9c0 100644 --- a/unstructured/__version__.py +++ b/unstructured/__version__.py @@ -1 +1 @@ -__version__ = "0.18.15" # pragma: no cover +__version__ = "0.18.16-dev0" # pragma: no cover From 42175a31030028fb7079858ebe12ad4b8250cc08 Mon Sep 17 00:00:00 2001 From: qued <64741807+qued@users.noreply.github.com> Date: Thu, 25 Sep 2025 13:37:14 -0500 Subject: [PATCH 6/7] Update handbook-1p.docx.json --- .../local-single-file-basic-chunking/handbook-1p.docx.json | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json b/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json index cc6ecebd11..248b38a649 100644 --- a/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json +++ b/test_unstructured_ingest/expected-structured-output/local-single-file-basic-chunking/handbook-1p.docx.json @@ -360,7 +360,7 @@ "eng" ], "page_number": 1, - "orig_elements": "eJztUsmO2zAM/RVB58ZLYidxjz3NaVCg6WkwMGiLsoVYCyQ6TRHk30u1GWBaoH8wN1Hk01vEl5vEBS066o2Sn4VEPNTVfoBKbZvq0FVNpSuNXdvVGnYwNPKTkBYJFBDw/E3mQ5/8Gkf8XQeM1qRkvEv9Y+jlJq1Xub3b1cfj/ZXfiDj6qPrFj0A+/kECzVlCOXuLZVydw1j+8PFcri5RXEdaI6q/C7yCDQtulB9TOYNTg/fnTR0KvrjK+52ZtFmQfoZMLyGExTAjqysvThU+oLvaRftogdLGa21GZOiaEymYW4XoR2Q7brJL8dbJISzgphUmTNmgRDfJbCvwTe9WO2D2VGd+witl7tNsknh6SBTKYxLOkxgjAqEApUyWBYuIZpopCZjAsFVBM4pEjGMNgp0nQhQ+iu/OECrxjRiexOnRME5ouHDba+EZGkWASAZTkUW/5fAMMXIKFzxlcazy3zXoukrtj1V7GLb7o1aNbrbNoBpox6FtVbf7WIP/r8H7T38f+lce+8K/fZb3118ujDSv" + "orig_elements": 
"eJztUsmO2zAM/RVB58bL2Nl67KmnQYGmp8HAoCXKFmItkOg0RZB/L9VmgGmB/kFvosint4gvN4kLOvQ0WC0/Com4b5vdCI1+6pv9sekb0xg8bo+tgQ7GXn4Q0iGBBgKev8lyGHJYk8JfdcTkbM42+Dw8hl5u0gVd2l3XHg73V34joQpJD0tQQCH9RgLNRUI9B4d1Wr3HVH8P6VyvPlNaFa0J9Z8FXsHFBTc6qFzP4PUYwnnTxoovrvJ+ZyZjF6QfsdBLiHGxzMjq6ovXVYjor24xITmgvAnGWIUMXUsiFXPrmIJCtuMnt1RvnRLCAn5aYcJcDEr0kyy2It8MfnUjFk9t4Se8UuE+zTaLzw+JQgfMwgcSKiEQCtDaFlmwiGSnmbKACSxbFTSjyMQ41iDYeSZEEZL45i2hFl+J4VmcHg3rhYELt4MRgaFJREhkMVdF9FsOz5ASp3DBUxHHKv9eg2402wa2DW47rXBU4/6wM3vTd7tDt+8B/q/Bv9fg/ae/D/0Lj33i3z7L++tP43k1Mw==" } }, { @@ -760,7 +760,7 @@ "languages": [ "eng" ], - "orig_elements": "eJxVUEtuwyAQvUrEurbjpFGcbCv1ElFkjWFsowCDBmgdRb57IW0WXcEw78e7PAQatOhir5U4b8S7OnXbA5yOshskSnnctcOhha4dcbcd9kq8bYTFCAoiZPxDlEsfKLHE5+yRrQ5Bkwv9H+jyEJZUWe/3bdet16zBKIlVb0hCJP5lQpxLhGYmiw0n55Cbb+Jbk1yInGRMjOr/gAtYb7BSJEMzg1MD0a1qfZ0fFrGu2WnUBuPdF3sB3hudHXO65supmjy6xZqR2EIMFY2jlpipqTRSZ2/lmSTm77jJmvq1KSXMCAq5H4liPl4GnrUFvheAATclmDCUBgS6SVxLnIhLLMgP8nfW0/wUe9E/n2pivf4An2aN4g==" + "orig_elements": "eJxVUNFuwyAM/JWJ5zVZ1C6L9jppP1FVkQMmQQWMDGypqvz7IFsf9gS2z3fnO98FWnTo02iUeH8S0Es1dKfj6bV/64cJ9HQaZD9JOb1MEjstnp+EwwQKEhT8XdTPGCmzxL0OyM7EaMjH8Q90vgtHqo6Px24YtkvhYJTEarQkIRH/bkJaqoV2IYctZ++R22/ia5t9TJxlyozqf4EruGDxoEjGdgGvJqLroQtNaaxi24qSNhbTLeB+XQjWFMXirv3yqqGAfnVWEztI8UBaG4llNddEmqKtApPEco6fnW0ekxrCgqCQR02UyvMQCGwc8K0CLPg5w4yxJiDQz+JS7SRcU0V+ULixmZed7LH+ubOJ7fIDJz+OQw==" } } -] \ No newline at end of file +] From 03e48f314d81ea0b64cee64a6e620c10ef6d7da2 Mon Sep 17 00:00:00 2001 From: qued <64741807+qued@users.noreply.github.com> Date: Thu, 25 Sep 2025 14:24:07 -0500 Subject: [PATCH 7/7] Update handbook-1p.docx.json