From f734bff98d15bf9784ec4f1b864e38c89681bb68 Mon Sep 17 00:00:00 2001
From: Alejo Arias <alejoar@gmail.com>
Date: Tue, 25 Nov 2025 18:24:13 +0100
Subject: [PATCH] feat(pdf): add mime_type parameter to AmazonTextractPDFLoader
 and AmazonTextractPDFParser

---
 .../langchain_community/document_loaders/parsers/pdf.py      | 5 +++++
 libs/community/langchain_community/document_loaders/pdf.py   | 4 +++-
 2 files changed, 8 insertions(+), 1 deletion(-)
diff --git a/libs/community/langchain_community/document_loaders/parsers/pdf.py b/libs/community/langchain_community/document_loaders/parsers/pdf.py
index 59952729..72b95548 100644
--- a/libs/community/langchain_community/document_loaders/parsers/pdf.py
+++ b/libs/community/langchain_community/document_loaders/parsers/pdf.py
@@ -1542,6 +1542,7 @@ def __init__(
         client: Optional[Any] = None,
         *,
         linearization_config: Optional[TextLinearizationConfig] = None,
+        mime_type: Optional[str] = None,
     ) -> None:
         """Initializes the parser.
 
@@ -1553,6 +1554,7 @@ def __init__(
             linearization_config: Config to be used for linearization of the output
                                   should be an instance of TextLinearizationConfig from
                                   the `textractor` pkg
+            mime_type: MIME type of the document to parse, e.g. "application/pdf".
         """
 
         try:
@@ -1561,6 +1563,7 @@ def __init__(
 
             self.tc = tc
             self.textractor = textractor
+            self.mime_type = mime_type
 
             if textract_features is not None:
                 self.textract_features = [
@@ -1617,6 +1620,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:
                 input_document=str(blob.path),
                 features=self.textract_features,
                 boto3_textract_client=self.boto3_textract_client,
+                mime_type=self.mime_type,
             )
         else:
             textract_response_json = self.tc.call_textract(
@@ -1624,6 +1628,7 @@ def lazy_parse(self, blob: Blob) -> Iterator[Document]:
                 features=self.textract_features,
                 call_mode=self.tc.Textract_Call_Mode.FORCE_SYNC,
                 boto3_textract_client=self.boto3_textract_client,
+                mime_type=self.mime_type,
             )
 
         document = self.textractor.Document.open(textract_response_json)
diff --git a/libs/community/langchain_community/document_loaders/pdf.py b/libs/community/langchain_community/document_loaders/pdf.py
index 6b51e481..470e7eeb 100644
--- a/libs/community/langchain_community/document_loaders/pdf.py
+++ b/libs/community/langchain_community/document_loaders/pdf.py
@@ -1079,6 +1079,7 @@ def __init__(
         headers: Optional[dict] = None,
         *,
         linearization_config: Optional["TextLinearizationConfig"] = None,
+        mime_type: Optional[str] = None,
     ) -> None:
         """Initialize the loader.
 
@@ -1142,6 +1143,7 @@ def __init__(
             textract_features=features,
             client=client,
             linearization_config=linearization_config,
+            mime_type=mime_type,
         )
 
     def load(self) -> list[Document]:
@@ -1380,7 +1382,7 @@ def __init__(
                 Hosted models are passed in format "<provider>/<model>"
                 Examples: "azure/gpt-4o-mini", "vertex_ai/gemini-1.5-flash-001"
                           See more details in zerox documentation.
-            **zerox_kwargs: 
+            **zerox_kwargs:
                 Arguments specific to the zerox function.
                 see datailed list of arguments here in zerox repository:
                 https://github.com/getomni-ai/zerox/blob/main/py_zerox/pyzerox/core/zerox.py#L25