From f734bff98d15bf9784ec4f1b864e38c89681bb68 Mon Sep 17 00:00:00 2001 From: Alejo Arias Date: Tue, 25 Nov 2025 18:24:13 +0100 Subject: [PATCH] feat(pdf): add mime_type parameter to AmazonTextractPDFLoader and AmazonTextractPDFParser --- .../langchain_community/document_loaders/parsers/pdf.py | 5 +++++ libs/community/langchain_community/document_loaders/pdf.py | 4 +++- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py index 59952729..72b95548 100644 --- a/libs/community/langchain_community/document_loaders/parsers/pdf.py +++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py @@ -1542,6 +1542,7 @@ def __init__( client: Optional[Any] = None, *, linearization_config: Optional[TextLinearizationConfig] = None, + mime_type: Optional[str] = None, ) -> None: """Initializes the parser. @@ -1553,6 +1554,7 @@ def __init__( linearization_config: Config to be used for linearization of the output should be an instance of TextLinearizationConfig from the `textractor` pkg + mime_type: MIME type of the document to be parsed. 
""" try: @@ -1561,6 +1563,7 @@ def __init__( self.tc = tc self.textractor = textractor + self.mime_type = mime_type if textract_features is not None: self.textract_features = [ @@ -1617,6 +1620,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: input_document=str(blob.path), features=self.textract_features, boto3_textract_client=self.boto3_textract_client, + mime_type=self.mime_type, ) else: textract_response_json = self.tc.call_textract( @@ -1624,6 +1628,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]: features=self.textract_features, call_mode=self.tc.Textract_Call_Mode.FORCE_SYNC, boto3_textract_client=self.boto3_textract_client, + mime_type=self.mime_type, ) document = self.textractor.Document.open(textract_response_json) diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py index 6b51e481..470e7eeb 100644 --- a/libs/community/langchain_community/document_loaders/pdf.py +++ b/libs/community/langchain_community/document_loaders/pdf.py @@ -1079,6 +1079,7 @@ def __init__( headers: Optional[dict] = None, *, linearization_config: Optional["TextLinearizationConfig"] = None, + mime_type: Optional[str] = None, ) -> None: """Initialize the loader. @@ -1142,6 +1143,7 @@ def __init__( textract_features=features, client=client, linearization_config=linearization_config, + mime_type=mime_type, ) def load(self) -> list[Document]: @@ -1380,7 +1382,7 @@ def __init__( Hosted models are passed in format "/" Examples: "azure/gpt-4o-mini", "vertex_ai/gemini-1.5-flash-001" See more details in zerox documentation. - **zerox_kwargs: + **zerox_kwargs: Arguments specific to the zerox function. see datailed list of arguments here in zerox repository: https://github.com/getomni-ai/zerox/blob/main/py_zerox/pyzerox/core/zerox.py#L25