Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,8 @@ dependencies = [
"tiktoken >= 0.3",
"openai >= 1.0.0",
"numpy",
"pdfplumber==0.11.7",
"pypdfium2==4.30.0"
]

[project.urls]
Expand Down
20 changes: 15 additions & 5 deletions src/openparse/doc_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,8 @@

from openparse.schemas import ImageElement

IngestionPipelineType = TypeVar("IngestionPipelineType", bound=IngestionPipeline)
IngestionPipelineType = TypeVar(
"IngestionPipelineType", bound=IngestionPipeline)


class UnitableArgsDict(TypedDict, total=False):
Expand All @@ -34,15 +35,22 @@ class PyMuPDFArgsDict(TypedDict, total=False):
table_output_format: Literal["markdown", "html"]


class PDfPlumberArgsDict(TypedDict, total=False):
    """Dictionary form of the options for the pdfplumber table parser.

    Every key is optional as far as the type checker is concerned
    (``total=False``). The dictionary is unpacked into
    ``tables.PDfPlumberArgs`` by ``_table_args_dict_to_model``, so the keys
    declared here mirror the fields of that model.
    """

    # Discriminator read by ``_table_args_dict_to_model`` to pick the model.
    parsing_algorithm: Literal["pdfplumber"]
    # NOTE(review): ``tables.PDfPlumberArgs`` only accepts "markdown" for this
    # field, so passing "html" is rejected when the model is built. The wider
    # literal is kept here to avoid narrowing the declared interface.
    table_output_format: Literal["markdown", "html"]
    # Settings handed to pdfplumber's ``Page.find_tables``. The model already
    # accepts this field; declaring it here lets typed callers pass it.
    table_parse_settings: dict


def _table_args_dict_to_model(
args_dict: Union[TableTransformersArgsDict, PyMuPDFArgsDict],
) -> Union[tables.TableTransformersArgs, tables.PyMuPDFArgs]:
args_dict: Union[TableTransformersArgsDict, PyMuPDFArgsDict, PDfPlumberArgsDict],
) -> Union[tables.TableTransformersArgs, tables.PyMuPDFArgs, tables.PDfPlumberArgs]:
if args_dict["parsing_algorithm"] == "table-transformers":
return tables.TableTransformersArgs(**args_dict)
elif args_dict["parsing_algorithm"] == "pymupdf":
return tables.PyMuPDFArgs(**args_dict)
elif args_dict["parsing_algorithm"] == "unitable":
return tables.UnitableArgs(**args_dict)
elif args_dict["parsing_algorithm"] == "pdfplumber":
return tables.PDfPlumberArgs(**args_dict)
else:
raise ValueError(
f"Unsupported parsing_algorithm: {args_dict['parsing_algorithm']}"
Expand All @@ -63,7 +71,8 @@ class DocumentParser:
def __init__(
self,
*,
processing_pipeline: Union[IngestionPipeline, NotGiven, None] = NOT_GIVEN,
processing_pipeline: Union[IngestionPipeline,
NotGiven, None] = NOT_GIVEN,
table_args: Union[
TableTransformersArgsDict, PyMuPDFArgsDict, NotGiven
] = NOT_GIVEN,
Expand Down Expand Up @@ -104,7 +113,8 @@ def parse(
table_args_obj = None
if self.table_args:
table_args_obj = _table_args_dict_to_model(self.table_args)
table_elems = tables.ingest(doc, table_args_obj, verbose=self._verbose)
table_elems = tables.ingest(
doc, table_args_obj, verbose=self._verbose)
table_nodes = self._elems_to_nodes(table_elems)

nodes = text_nodes + table_nodes
Expand Down
10 changes: 6 additions & 4 deletions src/openparse/processing/basic_transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -109,11 +109,13 @@ def process(self, nodes: List[Node]) -> List[Node]:
for page, page_nodes in nodes_by_page.items():
image_nodes = [e for e in page_nodes if e.variant == {"image"}]
if image_nodes:
image_elements = get_elements_of_type(image_nodes, ImageElement)
image_elements = get_elements_of_type(
image_nodes, ImageElement)
text_elements = get_elements_of_type(page_nodes, TextElement)

combined_image = self._combine_images_in_group(image_elements)
new_nodes.append(Node(elements=(combined_image, *text_elements)))
new_nodes.append(
Node(elements=(combined_image, *text_elements)))
else:
new_nodes.extend(page_nodes)
return new_nodes
Expand All @@ -130,7 +132,8 @@ def process(self, nodes: List[Node]) -> List[Node]:
for node in nodes:
if node.variant == {"table"}:
for table_element in node.elements:
tables_by_page[table_element.page].append(table_element.bbox)
tables_by_page[table_element.page].append(
table_element.bbox)

updated_nodes = []
for node in nodes:
Expand All @@ -148,7 +151,6 @@ def process(self, nodes: List[Node]) -> List[Node]:
)
if should_include:
new_elements.append(element)

if new_elements and len(new_elements) != len(node.elements):
updated_nodes.append(Node(elements=tuple(new_elements)))
elif len(new_elements) == len(node.elements):
Expand Down
5 changes: 3 additions & 2 deletions src/openparse/tables/__init__.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
from .parse import PyMuPDFArgs, TableTransformersArgs, UnitableArgs, ingest
from .parse import PyMuPDFArgs, TableTransformersArgs, UnitableArgs, ingest, PDfPlumberArgs

__all__ = ["ingest", "TableTransformersArgs", "PyMuPDFArgs", "UnitableArgs"]
__all__ = ["ingest", "TableTransformersArgs",
"PyMuPDFArgs", "UnitableArgs", "PDfPlumberArgs"]
82 changes: 72 additions & 10 deletions src/openparse/tables/parse.py
Original file line number Diff line number Diff line change
@@ -1,17 +1,18 @@
from typing import List, Literal, Union
from typing import List, Literal, Union, Dict

from pydantic import BaseModel, ConfigDict, Field

from openparse.pdf import Pdf
from openparse.schemas import Bbox, TableElement
from openparse.tables.utils import adjust_bbox_with_padding, crop_img_with_padding
from openparse.tables.utils import adjust_bbox_with_padding, crop_img_with_padding, pdf_plumber_table_data_to_markdown

from . import pymupdf


class ParsingArgs(BaseModel):
parsing_algorithm: str
table_output_format: Literal["str", "markdown", "html"] = Field(default="html")
table_output_format: Literal["str",
"markdown", "html"] = Field(default="html")


class TableTransformersArgs(BaseModel):
Expand All @@ -20,14 +21,16 @@ class TableTransformersArgs(BaseModel):
)
min_table_confidence: float = Field(default=0.75, ge=0.0, le=1.0)
min_cell_confidence: float = Field(default=0.95, ge=0.0, le=1.0)
table_output_format: Literal["str", "markdown", "html"] = Field(default="html")
table_output_format: Literal["str",
"markdown", "html"] = Field(default="html")

model_config = ConfigDict(extra="forbid")


class PyMuPDFArgs(BaseModel):
parsing_algorithm: Literal["pymupdf"] = Field(default="pymupdf")
table_output_format: Literal["str", "markdown", "html"] = Field(default="html")
table_output_format: Literal["str",
"markdown", "html"] = Field(default="html")

model_config = ConfigDict(extra="forbid")

Expand All @@ -40,6 +43,13 @@ class UnitableArgs(BaseModel):
model_config = ConfigDict(extra="forbid")


class PDfPlumberArgs(BaseModel):
    """Validated options for extracting tables with pdfplumber.

    Unknown keys are rejected (``extra="forbid"``), matching the other
    parser argument models in this module.
    """

    parsing_algorithm: Literal["pdfplumber"] = Field(default="pdfplumber")
    # Markdown is the only format this parser emits. The default must be a
    # value the annotation allows: pydantic does not validate defaults unless
    # asked to, so the previous default of "html" silently produced a model
    # whose field held a value its own type forbids.
    table_output_format: Literal["markdown"] = Field(default="markdown")
    # Passed as-is to pdfplumber's ``Page.find_tables``; empty means
    # "use pdfplumber's defaults".
    table_parse_settings: Dict = Field(default_factory=dict)

    model_config = ConfigDict(extra="forbid")


def _ingest_with_pymupdf(
doc: Pdf,
parsing_args: PyMuPDFArgs,
Expand Down Expand Up @@ -69,7 +79,8 @@ def _ingest_with_pymupdf(
print(f"Page {page_num} - Table {i + 1}:\n{text}\n")

# Flip y-coordinates to match the top-left origin system
bbox = pymupdf.combine_header_and_table_bboxes(tab.bbox, tab.header.bbox)
bbox = pymupdf.combine_header_and_table_bboxes(
tab.bbox, tab.header.bbox)
fy0 = page.rect.height - bbox[3]
fy1 = page.rect.height - bbox[1]

Expand Down Expand Up @@ -108,7 +119,8 @@ def _ingest_with_table_transformers(

pages_with_tables = {}
for page_num, img in enumerate(pdf_as_imgs):
pages_with_tables[page_num] = find_table_bboxes(img, args.min_table_confidence)
pages_with_tables[page_num] = find_table_bboxes(
img, args.min_table_confidence)

tables = []
for page_num, table_bboxes in pages_with_tables.items():
Expand Down Expand Up @@ -177,7 +189,8 @@ def _ingest_with_unitable(

pages_with_tables = {}
for page_num, img in enumerate(pdf_as_imgs):
pages_with_tables[page_num] = find_table_bboxes(img, args.min_table_confidence)
pages_with_tables[page_num] = find_table_bboxes(
img, args.min_table_confidence)

tables = []
for page_num, table_bboxes in pages_with_tables.items():
Expand All @@ -190,7 +203,8 @@ def _ingest_with_unitable(
page_height=page.rect.height,
padding_pct=padding_pct,
)
table_img = crop_img_with_padding(pdf_as_imgs[page_num], padded_bbox)
table_img = crop_img_with_padding(
pdf_as_imgs[page_num], padded_bbox)

table_str = table_img_to_html(table_img)

Expand All @@ -216,9 +230,55 @@ def _ingest_with_unitable(
return tables


def _ingest_with_pdfplumber(
    doc: Pdf,
    args: PDfPlumberArgs,
    verbose: bool = False,
) -> List[TableElement]:
    """Detect tables with pdfplumber and return them as Markdown elements.

    Args:
        doc: The document to scan; its ``file_path`` is opened with pdfplumber.
        args: Parser options. ``table_parse_settings`` is forwarded to
            ``Page.find_tables``.
        verbose: When true, print each extracted table as it is found.

    Returns:
        One ``TableElement`` per detected table, in page order.

    Raises:
        ImportError: If the ``pdfplumber`` package is not installed.
    """
    try:
        import pdfplumber
    except ImportError as e:
        raise ImportError(
            "Table detection and extraction requires the `pdfplumber` library to be installed.",
            e,
        ) from e

    found_tables: List[TableElement] = []
    table_settings = args.table_parse_settings

    # Open through the context manager so the underlying file handle is
    # released even when table extraction raises part-way through.
    with pdfplumber.open(doc.file_path) as pdf_plumber_doc:
        for page_number, page in enumerate(pdf_plumber_doc.pages):
            for i, table in enumerate(page.find_tables(table_settings)):
                table_markdown = pdf_plumber_table_data_to_markdown(
                    table.extract()
                )

                if verbose:
                    print(
                        f"Page {page_number} - Table {i + 1}:\n{table_markdown}\n"
                    )

                # pdfplumber reports bboxes as (x0, top, x1, bottom), measured
                # from the top of the page. The rest of the application appears
                # to use a bottom-left origin (pdfminer style), so flip the
                # vertical axis to keep the coordinates aligned.
                # See https://github.com/jsvine/pdfplumber/issues/198
                x0, top, x1, bottom = table.bbox
                found_tables.append(
                    TableElement(
                        bbox=Bbox(
                            page=page_number,
                            x0=x0,
                            y0=page.height - bottom,
                            x1=x1,
                            y1=page.height - top,
                            page_width=page.width,
                            page_height=page.height,
                        ),
                        text=table_markdown,
                    )
                )
    return found_tables


def ingest(
doc: Pdf,
parsing_args: Union[TableTransformersArgs, PyMuPDFArgs, UnitableArgs, None] = None,
parsing_args: Union[TableTransformersArgs,
PyMuPDFArgs, UnitableArgs, None] = None,
verbose: bool = False,
) -> List[TableElement]:
if isinstance(parsing_args, TableTransformersArgs):
Expand All @@ -227,5 +287,7 @@ def ingest(
return _ingest_with_pymupdf(doc, parsing_args, verbose)
elif isinstance(parsing_args, UnitableArgs):
return _ingest_with_unitable(doc, parsing_args, verbose)
elif isinstance(parsing_args, PDfPlumberArgs):
return _ingest_with_pdfplumber(doc, parsing_args, verbose)
else:
raise ValueError("Unsupported parsing_algorithm.")
27 changes: 25 additions & 2 deletions src/openparse/tables/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ def doc_to_imgs(doc) -> List[Image.Image]:
for n in page_numbers:
page = doc[n]
pix = page.get_pixmap()
image = Image.frombytes("RGB", [pix.width, pix.height], pix.samples)
image = Image.frombytes(
"RGB", [pix.width, pix.height], pix.samples)
images.append(image)

except ValueError as e:
Expand Down Expand Up @@ -189,9 +190,31 @@ def adjust_bbox_with_padding(
# Adjust the bounding box coordinates with padding
padded_x0 = max(x0 - padding_x, 0) # Ensure x0 is not less than 0
padded_y0 = max(y0 - padding_y, 0) # Ensure y0 is not less than 0
padded_x1 = min(x1 + padding_x, page_width) # Ensure x1 does not exceed page width
# Ensure x1 does not exceed page width
padded_x1 = min(x1 + padding_x, page_width)
padded_y1 = min(
y1 + padding_y, page_height
) # Ensure y1 does not exceed page height

return padded_x0, padded_y0, padded_x1, padded_y1


# pdf plumber utils

def _pdf_plumber_cell_to_markdown(cell) -> str:
    """Render one table cell as text that is safe inside a Markdown table row."""
    if cell is None:
        return ""
    # A raw pipe would be read as a column separator, and a raw line break
    # would end the table row, so neutralise both.
    escaped = str(cell).replace("|", "\\|")
    return "<br>".join(escaped.splitlines())


def pdf_plumber_table_data_to_markdown(table_data: "List[List[str | None]]") -> str:
    """Convert a 2D list of table data into a Markdown table string.

    The first row is used as the header. ``None`` cells become empty cells,
    pipes inside a cell are escaped, and line breaks inside a cell are
    rendered as ``<br>``. Rows shorter than the widest row are padded with
    empty cells so every line has the same number of columns.

    Args:
        table_data: Rows of cells, as returned by pdfplumber's
            ``Table.extract()``.

    Returns:
        The Markdown table, each line terminated by a newline, or an empty
        string when there is no data or the header row is empty.
    """
    # The annotation is quoted so `str | None` is never evaluated at import
    # time, which would fail on interpreters older than Python 3.10.
    if not table_data or not table_data[0]:
        return ""

    width = max(len(row) for row in table_data)

    def render(row) -> str:
        cells = [_pdf_plumber_cell_to_markdown(cell) for cell in row]
        cells.extend([""] * (width - len(cells)))
        return "| " + " | ".join(cells) + " |\n"

    header, *rows = table_data
    lines = [render(header), "| " + " | ".join(["---"] * width) + " |\n"]
    lines.extend(render(row) for row in rows)
    return "".join(lines)
6 changes: 4 additions & 2 deletions src/tests/tables/pymupdf/test_parse.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,8 @@ def test_parse_output_to_markdown():
"| 2022 | 100,000 | 50,000 |\n"
"| 2021 | 90,000 | 45,000 |\n"
)
assert output_to_markdown(headers, rows) == expected_output, "Standard case failed"
assert output_to_markdown(
headers, rows) == expected_output, "Standard case failed"

# Case with missing values
headers = ["Year", "Revenue", "Expenses"]
Expand Down Expand Up @@ -48,7 +49,8 @@ def test_output_to_html():
"<tr><td>2021</td><td>90,000</td><td>45,000</td></tr>\n"
"</table>"
)
assert output_to_html(headers, rows) == expected_output, "Standard case failed"
assert output_to_html(
headers, rows) == expected_output, "Standard case failed"

# Case with missing values
headers = ["Year", "Revenue", "Expenses"]
Expand Down