Filimoa · espoirMur · Sep 11, 2025 · Sep 11, 2025 · Sep 12, 2025 · Sep 24, 2025
diff --git a/pyproject.toml b/pyproject.toml
@@ -19,6 +19,8 @@ dependencies = [
     "tiktoken >= 0.3",
     "openai >= 1.0.0",
     "numpy",
+    "pdfplumber==0.11.7", 
+    "pypdfium2==4.30.0"
 ]
 
 [project.urls]

diff --git a/src/openparse/doc_parser.py b/src/openparse/doc_parser.py
@@ -13,7 +13,8 @@
 
 from openparse.schemas import ImageElement
 
-IngestionPipelineType = TypeVar("IngestionPipelineType", bound=IngestionPipeline)
+IngestionPipelineType = TypeVar(
+    "IngestionPipelineType", bound=IngestionPipeline)
 
 
 class UnitableArgsDict(TypedDict, total=False):
@@ -34,15 +35,22 @@ class PyMuPDFArgsDict(TypedDict, total=False):
     table_output_format: Literal["markdown", "html"]
 
 
+class PDfPlumberArgsDict(TypedDict, total=False):  # kwargs for tables.PDfPlumberArgs
+    parsing_algorithm: Literal["pdfplumber"]
+    table_output_format: Literal["markdown"]  # the pdfplumber model accepts only "markdown"
+
+
 def _table_args_dict_to_model(
-    args_dict: Union[TableTransformersArgsDict, PyMuPDFArgsDict],
-) -> Union[tables.TableTransformersArgs, tables.PyMuPDFArgs]:
+    args_dict: Union[TableTransformersArgsDict, PyMuPDFArgsDict, PDfPlumberArgsDict],
+) -> Union[tables.TableTransformersArgs, tables.PyMuPDFArgs, tables.PDfPlumberArgs]:
     if args_dict["parsing_algorithm"] == "table-transformers":
         return tables.TableTransformersArgs(**args_dict)
     elif args_dict["parsing_algorithm"] == "pymupdf":
         return tables.PyMuPDFArgs(**args_dict)
     elif args_dict["parsing_algorithm"] == "unitable":
         return tables.UnitableArgs(**args_dict)
+    elif args_dict["parsing_algorithm"] == "pdfplumber":
+        return tables.PDfPlumberArgs(**args_dict)
     else:
         raise ValueError(
             f"Unsupported parsing_algorithm: {args_dict['parsing_algorithm']}"
@@ -63,7 +71,8 @@ class DocumentParser:
     def __init__(
         self,
         *,
-        processing_pipeline: Union[IngestionPipeline, NotGiven, None] = NOT_GIVEN,
+        processing_pipeline: Union[IngestionPipeline,
+                                   NotGiven, None] = NOT_GIVEN,
         table_args: Union[
             TableTransformersArgsDict, PyMuPDFArgsDict, NotGiven
         ] = NOT_GIVEN,
@@ -104,7 +113,8 @@ def parse(
         table_args_obj = None
         if self.table_args:
             table_args_obj = _table_args_dict_to_model(self.table_args)
-            table_elems = tables.ingest(doc, table_args_obj, verbose=self._verbose)
+            table_elems = tables.ingest(
+                doc, table_args_obj, verbose=self._verbose)
             table_nodes = self._elems_to_nodes(table_elems)
 
         nodes = text_nodes + table_nodes

diff --git a/src/openparse/processing/basic_transforms.py b/src/openparse/processing/basic_transforms.py
@@ -109,11 +109,13 @@ def process(self, nodes: List[Node]) -> List[Node]:
         for page, page_nodes in nodes_by_page.items():
             image_nodes = [e for e in page_nodes if e.variant == {"image"}]
             if image_nodes:
-                image_elements = get_elements_of_type(image_nodes, ImageElement)
+                image_elements = get_elements_of_type(
+                    image_nodes, ImageElement)
                 text_elements = get_elements_of_type(page_nodes, TextElement)
 
                 combined_image = self._combine_images_in_group(image_elements)
-                new_nodes.append(Node(elements=(combined_image, *text_elements)))
+                new_nodes.append(
+                    Node(elements=(combined_image, *text_elements)))
             else:
                 new_nodes.extend(page_nodes)
         return new_nodes
@@ -130,7 +132,8 @@ def process(self, nodes: List[Node]) -> List[Node]:
         for node in nodes:
             if node.variant == {"table"}:
                 for table_element in node.elements:
-                    tables_by_page[table_element.page].append(table_element.bbox)
+                    tables_by_page[table_element.page].append(
+                        table_element.bbox)
 
         updated_nodes = []
         for node in nodes:
@@ -148,7 +151,6 @@ def process(self, nodes: List[Node]) -> List[Node]:
                 )
                 if should_include:
                     new_elements.append(element)
-
             if new_elements and len(new_elements) != len(node.elements):
                 updated_nodes.append(Node(elements=tuple(new_elements)))
             elif len(new_elements) == len(node.elements):

diff --git a/src/openparse/tables/__init__.py b/src/openparse/tables/__init__.py
@@ -1,3 +1,4 @@
-from .parse import PyMuPDFArgs, TableTransformersArgs, UnitableArgs, ingest
+from .parse import PyMuPDFArgs, TableTransformersArgs, UnitableArgs, ingest, PDfPlumberArgs
 
-__all__ = ["ingest", "TableTransformersArgs", "PyMuPDFArgs", "UnitableArgs"]
+__all__ = ["ingest", "TableTransformersArgs",
+           "PyMuPDFArgs", "UnitableArgs", "PDfPlumberArgs"]
diff --git a/src/openparse/tables/parse.py b/src/openparse/tables/parse.py
@@ -1,17 +1,18 @@
-from typing import List, Literal, Union
+from typing import List, Literal, Union, Dict
 
 from pydantic import BaseModel, ConfigDict, Field
 
 from openparse.pdf import Pdf
 from openparse.schemas import Bbox, TableElement
-from openparse.tables.utils import adjust_bbox_with_padding, crop_img_with_padding
+from openparse.tables.utils import adjust_bbox_with_padding, crop_img_with_padding, pdf_plumber_table_data_to_markdown
 
 from . import pymupdf
 
 
 class ParsingArgs(BaseModel):
     parsing_algorithm: str
-    table_output_format: Literal["str", "markdown", "html"] = Field(default="html")
+    table_output_format: Literal["str",
+                                 "markdown", "html"] = Field(default="html")
 
 
 class TableTransformersArgs(BaseModel):
@@ -20,14 +21,16 @@ class TableTransformersArgs(BaseModel):
     )
     min_table_confidence: float = Field(default=0.75, ge=0.0, le=1.0)
     min_cell_confidence: float = Field(default=0.95, ge=0.0, le=1.0)
-    table_output_format: Literal["str", "markdown", "html"] = Field(default="html")
+    table_output_format: Literal["str",
+                                 "markdown", "html"] = Field(default="html")
 
     model_config = ConfigDict(extra="forbid")
 
 
 class PyMuPDFArgs(BaseModel):
     parsing_algorithm: Literal["pymupdf"] = Field(default="pymupdf")
-    table_output_format: Literal["str", "markdown", "html"] = Field(default="html")
+    table_output_format: Literal["str",
+                                 "markdown", "html"] = Field(default="html")
 
     model_config = ConfigDict(extra="forbid")
 
@@ -40,6 +43,13 @@ class UnitableArgs(BaseModel):
     model_config = ConfigDict(extra="forbid")
 
 
+class PDfPlumberArgs(BaseModel):  # options for the pdfplumber table backend
+    parsing_algorithm: Literal["pdfplumber"] = Field(default="pdfplumber")
+    table_output_format: Literal["markdown"] = Field(default="markdown")
+    table_parse_settings: Dict = Field(default_factory=dict)  # forwarded to page.find_tables
+    model_config = ConfigDict(extra="forbid")
+
+
 def _ingest_with_pymupdf(
     doc: Pdf,
     parsing_args: PyMuPDFArgs,
@@ -69,7 +79,8 @@ def _ingest_with_pymupdf(
                 print(f"Page {page_num} - Table {i + 1}:\n{text}\n")
 
             # Flip y-coordinates to match the top-left origin system
-            bbox = pymupdf.combine_header_and_table_bboxes(tab.bbox, tab.header.bbox)
+            bbox = pymupdf.combine_header_and_table_bboxes(
+                tab.bbox, tab.header.bbox)
             fy0 = page.rect.height - bbox[3]
             fy1 = page.rect.height - bbox[1]
 
@@ -108,7 +119,8 @@ def _ingest_with_table_transformers(
 
     pages_with_tables = {}
     for page_num, img in enumerate(pdf_as_imgs):
-        pages_with_tables[page_num] = find_table_bboxes(img, args.min_table_confidence)
+        pages_with_tables[page_num] = find_table_bboxes(
+            img, args.min_table_confidence)
 
     tables = []
     for page_num, table_bboxes in pages_with_tables.items():
@@ -177,7 +189,8 @@ def _ingest_with_unitable(
 
     pages_with_tables = {}
     for page_num, img in enumerate(pdf_as_imgs):
-        pages_with_tables[page_num] = find_table_bboxes(img, args.min_table_confidence)
+        pages_with_tables[page_num] = find_table_bboxes(
+            img, args.min_table_confidence)
 
     tables = []
     for page_num, table_bboxes in pages_with_tables.items():
@@ -190,7 +203,8 @@ def _ingest_with_unitable(
                 page_height=page.rect.height,
                 padding_pct=padding_pct,
             )
-            table_img = crop_img_with_padding(pdf_as_imgs[page_num], padded_bbox)
+            table_img = crop_img_with_padding(
+                pdf_as_imgs[page_num], padded_bbox)
 
             table_str = table_img_to_html(table_img)
 
@@ -216,9 +230,55 @@ def _ingest_with_unitable(
     return tables
 
 
+def _ingest_with_pdfplumber(
+    doc: Pdf,
+    args: PDfPlumberArgs,
+    verbose: bool = False,
+) -> List[TableElement]:
+    """Detect tables with pdfplumber and return them as markdown elements.
+
+    `args.table_parse_settings` is passed to `page.find_tables`. Raises
+    ImportError when `pdfplumber` is not installed.
+    """
+    try:
+        import pdfplumber
+    except ImportError as e:
+        raise ImportError(
+            "Table detection and extraction requires the `pdfplumber` library to be installed.",
+            e,
+        ) from e
+
+    found_tables: List[TableElement] = []
+    # Context manager closes the file handle that pdfplumber.open() acquires.
+    with pdfplumber.open(doc.file_path) as pdf_plumber_doc:
+        for page_number, page in enumerate(pdf_plumber_doc.pages):
+            for i, table in enumerate(page.find_tables(args.table_parse_settings)):
+                table_markdown = pdf_plumber_table_data_to_markdown(table.extract())
+                if verbose:
+                    print(f"Page {page_number} - Table {i + 1}:\n{table_markdown}\n")
+                # pdfplumber bboxes are (x0, top, x1, bottom) measured from the
+                # top of the page; flip y so the origin is the bottom-left corner.
+                # See https://github.com/jsvine/pdfplumber/issues/198
+                x0, top, x1, bottom = table.bbox
+                table_bbox = Bbox(
+                    page=page_number,
+                    x0=x0,
+                    y0=page.height - bottom,
+                    x1=x1,
+                    y1=page.height - top,
+                    page_width=page.width,
+                    page_height=page.height,
+                )
+                found_tables.append(
+                    TableElement(bbox=table_bbox, text=table_markdown)
+                )
+    return found_tables
+
+
 def ingest(
     doc: Pdf,
-    parsing_args: Union[TableTransformersArgs, PyMuPDFArgs, UnitableArgs, None] = None,
+    parsing_args: Union[TableTransformersArgs,
+                        PyMuPDFArgs, UnitableArgs, None] = None,
     verbose: bool = False,
 ) -> List[TableElement]:
     if isinstance(parsing_args, TableTransformersArgs):
@@ -227,5 +287,7 @@ def ingest(
         return _ingest_with_pymupdf(doc, parsing_args, verbose)
     elif isinstance(parsing_args, UnitableArgs):
         return _ingest_with_unitable(doc, parsing_args, verbose)
+    elif isinstance(parsing_args, PDfPlumberArgs):
+        return _ingest_with_pdfplumber(doc, parsing_args, verbose)
     else:
         raise ValueError("Unsupported parsing_algorithm.")
diff --git a/src/openparse/tables/utils.py b/src/openparse/tables/utils.py
@@ -62,7 +62,8 @@ def doc_to_imgs(doc) -> List[Image.Image]:
         for n in page_numbers:
             page = doc[n]
             pix = page.get_pixmap()
-            image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
+            image = Image.frombytes(
+                "RGB", [pix.width, pix.height], pix.samples)
             images.append(image)
 
     except ValueError as e:
@@ -189,9 +190,31 @@ def adjust_bbox_with_padding(
     # Adjust the bounding box coordinates with padding
     padded_x0 = max(x0 - padding_x, 0)  # Ensure x0 is not less than 0
     padded_y0 = max(y0 - padding_y, 0)  # Ensure y0 is not less than 0
-    padded_x1 = min(x1 + padding_x, page_width)  # Ensure x1 does not exceed page width
+    # Ensure x1 does not exceed page width
+    padded_x1 = min(x1 + padding_x, page_width)
     padded_y1 = min(
         y1 + padding_y, page_height
     )  # Ensure y1 does not exceed page height
 
     return padded_x0, padded_y0, padded_x1, padded_y1
+
+
+# pdf plumber utils
+
+def pdf_plumber_table_data_to_markdown(table_data: "List[List[str | None]]") -> str:
+    """Convert a 2D list of table cells into a markdown table string.
+
+    The first row is the header; `None` cells render as empty strings.
+    Pipes are escaped and newlines become spaces so a cell cannot break
+    its row. Returns "" when there is no header row.
+    """
+    if not table_data or not table_data[0]:
+        return ""
+
+    def render_row(row) -> str:
+        cells = ("" if c is None else str(c).replace("|", "\\|").replace("\n", " ") for c in row)
+        return "| " + " | ".join(cells) + " |\n"
+
+    header, *rows = table_data
+    separator = "| " + " | ".join("---" for _ in header) + " |\n"
+    return render_row(header) + separator + "".join(map(render_row, rows))
diff --git a/src/tests/tables/pymupdf/test_parse.py b/src/tests/tables/pymupdf/test_parse.py
@@ -11,7 +11,8 @@ def test_parse_output_to_markdown():
         "| 2022 | 100,000 | 50,000 |\n"
         "| 2021 | 90,000 | 45,000 |\n"
     )
-    assert output_to_markdown(headers, rows) == expected_output, "Standard case failed"
+    assert output_to_markdown(
+        headers, rows) == expected_output, "Standard case failed"
 
     # Case with missing values
     headers = ["Year", "Revenue", "Expenses"]
@@ -48,7 +49,8 @@ def test_output_to_html():
         "<tr><td>2021</td><td>90,000</td><td>45,000</td></tr>\n"
         "</table>"
     )
-    assert output_to_html(headers, rows) == expected_output, "Standard case failed"
+    assert output_to_html(
+        headers, rows) == expected_output, "Standard case failed"
 
     # Case with missing values
     headers = ["Year", "Revenue", "Expenses"]