class PDfPlumberArgsDict(TypedDict, total=False):
    """Keyword arguments accepted for the ``pdfplumber`` table parser.

    The keys mirror the fields of ``tables.PDfPlumberArgs``, which is built
    from this dict via ``tables.PDfPlumberArgs(**args_dict)`` and forbids
    extra keys. All keys are optional (``total=False``).
    """

    # Selects the pdfplumber backend in ``_table_args_dict_to_model``.
    parsing_algorithm: Literal["pdfplumber"]
    # The pdfplumber backend only renders Markdown; ``PDfPlumberArgs``
    # rejects any other value, so "html" must not be advertised here.
    table_output_format: Literal["markdown"]
    # Forwarded to pdfplumber's ``Page.find_tables`` as its table settings.
    table_parse_settings: dict
table_args: Union[ TableTransformersArgsDict, PyMuPDFArgsDict, NotGiven ] = NOT_GIVEN, @@ -104,7 +113,8 @@ def parse( table_args_obj = None if self.table_args: table_args_obj = _table_args_dict_to_model(self.table_args) - table_elems = tables.ingest(doc, table_args_obj, verbose=self._verbose) + table_elems = tables.ingest( + doc, table_args_obj, verbose=self._verbose) table_nodes = self._elems_to_nodes(table_elems) nodes = text_nodes + table_nodes diff --git a/src/openparse/processing/basic_transforms.py b/src/openparse/processing/basic_transforms.py index 5070c1d..88fbe82 100644 --- a/src/openparse/processing/basic_transforms.py +++ b/src/openparse/processing/basic_transforms.py @@ -109,11 +109,13 @@ def process(self, nodes: List[Node]) -> List[Node]: for page, page_nodes in nodes_by_page.items(): image_nodes = [e for e in page_nodes if e.variant == {"image"}] if image_nodes: - image_elements = get_elements_of_type(image_nodes, ImageElement) + image_elements = get_elements_of_type( + image_nodes, ImageElement) text_elements = get_elements_of_type(page_nodes, TextElement) combined_image = self._combine_images_in_group(image_elements) - new_nodes.append(Node(elements=(combined_image, *text_elements))) + new_nodes.append( + Node(elements=(combined_image, *text_elements))) else: new_nodes.extend(page_nodes) return new_nodes @@ -130,7 +132,8 @@ def process(self, nodes: List[Node]) -> List[Node]: for node in nodes: if node.variant == {"table"}: for table_element in node.elements: - tables_by_page[table_element.page].append(table_element.bbox) + tables_by_page[table_element.page].append( + table_element.bbox) updated_nodes = [] for node in nodes: @@ -148,7 +151,6 @@ def process(self, nodes: List[Node]) -> List[Node]: ) if should_include: new_elements.append(element) - if new_elements and len(new_elements) != len(node.elements): updated_nodes.append(Node(elements=tuple(new_elements))) elif len(new_elements) == len(node.elements): diff --git 
a/src/openparse/tables/__init__.py b/src/openparse/tables/__init__.py index 007f45e..bae36fb 100644 --- a/src/openparse/tables/__init__.py +++ b/src/openparse/tables/__init__.py @@ -1,3 +1,4 @@ -from .parse import PyMuPDFArgs, TableTransformersArgs, UnitableArgs, ingest +from .parse import PyMuPDFArgs, TableTransformersArgs, UnitableArgs, ingest, PDfPlumberArgs -__all__ = ["ingest", "TableTransformersArgs", "PyMuPDFArgs", "UnitableArgs"] +__all__ = ["ingest", "TableTransformersArgs", + "PyMuPDFArgs", "UnitableArgs", "PDfPlumberArgs"] diff --git a/src/openparse/tables/parse.py b/src/openparse/tables/parse.py index 54ffbbb..be08649 100644 --- a/src/openparse/tables/parse.py +++ b/src/openparse/tables/parse.py @@ -1,17 +1,18 @@ -from typing import List, Literal, Union +from typing import List, Literal, Union, Dict from pydantic import BaseModel, ConfigDict, Field from openparse.pdf import Pdf from openparse.schemas import Bbox, TableElement -from openparse.tables.utils import adjust_bbox_with_padding, crop_img_with_padding +from openparse.tables.utils import adjust_bbox_with_padding, crop_img_with_padding, pdf_plumber_table_data_to_markdown from . 
class PDfPlumberArgs(BaseModel):
    """Validated options for extracting tables with pdfplumber.

    Unknown fields are rejected (``extra="forbid"``).
    """

    # Discriminator used by ``ingest`` to pick the pdfplumber backend.
    parsing_algorithm: Literal["pdfplumber"] = Field(default="pdfplumber")
    # Only Markdown is produced by this backend. The default must be a member
    # of the Literal: pydantic does not validate defaults unless asked to, so
    # a default of "html" would silently yield a value the field itself
    # forbids.
    table_output_format: Literal["markdown"] = Field(default="markdown")
    # Passed through to pdfplumber's ``Page.find_tables``; an empty dict
    # means pdfplumber's own defaults.
    table_parse_settings: Dict = Field(default_factory=dict)
    model_config = ConfigDict(extra="forbid")
def _ingest_with_pdfplumber(
    doc: Pdf,
    args: PDfPlumberArgs,
    verbose: bool = False,
) -> List[TableElement]:
    """Detect and extract tables from ``doc`` using pdfplumber.

    Every table found on every page is converted to a Markdown string and
    wrapped in a ``TableElement`` whose bounding box has its y-axis flipped
    relative to the box reported by pdfplumber.

    Args:
        doc: The document to scan; its ``file_path`` is opened with pdfplumber.
        args: Parser options; ``table_parse_settings`` is forwarded to
            ``Page.find_tables``.
        verbose: When true, print each extracted table as it is found.

    Returns:
        The extracted tables, in page order.

    Raises:
        ImportError: If the ``pdfplumber`` package is not installed.
    """
    try:
        import pdfplumber
    except ImportError as e:
        raise ImportError(
            "Table detection and extraction requires the `pdfplumber` library to be installed.",
            e,
        ) from e

    found_tables: List[TableElement] = []
    # Open as a context manager so the underlying file handle is released
    # even if extraction raises part-way through the document.
    with pdfplumber.open(doc.file_path) as pdf_plumber_doc:
        for page_number, page in enumerate(pdf_plumber_doc.pages):
            for i, table in enumerate(page.find_tables(args.table_parse_settings)):
                table_markdown = pdf_plumber_table_data_to_markdown(table.extract())

                if verbose:
                    print(f"Page {page_number} - Table {i + 1}:\n{table_markdown}\n")

                # pdfplumber reports bboxes as (x0, top, x1, bottom) measured
                # from the top of the page, whereas the rest of the application
                # appears to expect pdfminer-style coordinates measured from
                # the bottom, so the y-axis is flipped here.
                # See https://github.com/jsvine/pdfplumber/issues/198
                x0, top, x1, bottom = table.bbox
                table_bbox = Bbox(
                    page=page_number,
                    x0=x0,
                    y0=page.height - bottom,
                    x1=x1,
                    y1=page.height - top,
                    page_width=page.width,
                    page_height=page.height,
                )
                found_tables.append(
                    TableElement(bbox=table_bbox, text=table_markdown)
                )
    return found_tables
def pdf_plumber_table_data_to_markdown(
    table_data: "List[List[str | None]]",
) -> str:
    """Convert a 2D list of table cells into a Markdown table string.

    The first row is used as the header. ``None`` cells become empty cells.
    Line breaks inside a cell are replaced by spaces and ``|`` is escaped so
    that cell content cannot break the table layout. Rows shorter than the
    widest row are padded with empty cells so every row has the same number
    of columns.

    Args:
        table_data: Rows of cells, header row first.

    Returns:
        The Markdown table, each row terminated by a newline, or an empty
        string when there is no data or the header row is empty.
    """
    # The annotation above is quoted on purpose: ``str | None`` would be
    # evaluated at definition time and fail on Python versions before 3.10.
    if not table_data or not table_data[0]:
        return ""

    column_count = max(len(row) for row in table_data)

    def _format_cell(cell) -> str:
        if cell is None:
            return ""
        text = str(cell).replace("\r\n", " ").replace("\n", " ").replace("\r", " ")
        return text.replace("|", "\\|")

    def _format_row(row) -> str:
        cells = [_format_cell(cell) for cell in row]
        cells.extend([""] * (column_count - len(cells)))
        return "| " + " | ".join(cells) + " |"

    header, *body_rows = table_data
    lines = [
        _format_row(header),
        "| " + " | ".join(["---"] * column_count) + " |",
    ]
    lines.extend(_format_row(row) for row in body_rows)
    return "\n".join(lines) + "\n"