From fa1cf65bd434c212aa66314fb733fe504b8f49de Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 26 Aug 2025 11:47:18 +0100 Subject: [PATCH 01/11] Add --server to override any baseurl --- Readme.md | 22 ++++++++++++++++++++-- example.py | 5 +++++ grobid_client/grobid_client.py | 31 ++++++++++++++++++++++++++++++- tests/test_grobid_client.py | 2 +- 4 files changed, 56 insertions(+), 4 deletions(-) diff --git a/Readme.md b/Readme.md index c6094ef..3a1be01 100644 --- a/Readme.md +++ b/Readme.md @@ -55,7 +55,7 @@ usage: grobid_client [-h] [--input INPUT] [--output OUTPUT] [--config CONFIG] [--n N] [--generateIDs] [--consolidate_header] [--consolidate_citations] [--include_raw_citations] [--include_raw_affiliations] [--force] [--teiCoordinates] - [--verbose] [--flavor FLAVOR] + [--verbose] [--flavor FLAVOR] [--server SERVER] service Client for GROBID services @@ -75,6 +75,7 @@ optional arguments: (optional) --config CONFIG path to the config file, default is ./config.json --n N concurrency for service usage + --server SERVER GROBID server URL (default: http://localhost:8070) --generateIDs generate random xml:id to textual XML elements of the result files --consolidate_header call GROBID with consolidation of the metadata @@ -132,6 +133,15 @@ The following command example will process all the PDF files present in the inpu > grobid_client --input ~/tmp/in2 --output ~/tmp/out --teiCoordinates --segmentSentences processFulltextDocument ``` +To use a different GROBID server (e.g., a hosted service), use the `--server` argument: + +```console +> grobid_client --server https://lfoppiano-grobid.hf.space --input ~/tmp/in2 --output ~/tmp/out processFulltextDocument +``` + +> [!NOTE] +> The `--server` argument will override the server URL specified in the config file. If both are provided, the CLI argument takes precedence. + The file `example.py` gives an example of usage as a library, from a another python script. ## Using the client in your python @@ -141,8 +151,13 @@ Import and call the client as follow: ```python from grobid_client.grobid_client import GrobidClient +# Using default localhost server client = GrobidClient(config_path="./config.json") client.process("processFulltextDocument", "/mnt/data/covid/pdfs", n=20) + +# Using a custom server +client = GrobidClient(grobid_server="https://lfoppiano-grobid.hf.space", config_path="./config.json") +client.process("processFulltextDocument", "/mnt/data/covid/pdfs", n=20) ``` See also `example.py`. @@ -150,7 +165,10 @@ See also `example.py`. ## Configuration of the client > [!TIP] -> from version 0.0.12 the `config.json` will be optional, by default the client will connect to the local server (`http://localhost:8070`). +> from version 0.0.12 the `config.json` will be optional, by default the client will connect to the local server (`http://localhost:8070`). + +> [!NOTE] +> When using the CLI, the `--server` argument will override the `grobid_server` value in the config file. This allows you to use a config file for most settings while easily switching servers via command line. There are a few parameters that can be set with the `config.json` file. diff --git a/example.py b/example.py index 1db1f81..0a91f9f 100644 --- a/example.py +++ b/example.py @@ -1,5 +1,10 @@ from grobid_client.grobid_client import GrobidClient if __name__ == "__main__": + # Example with default localhost server client = GrobidClient(config_path="./config.json") client.process("processFulltextDocument", "./resources/test_pdf", output="./resources/test_out/", consolidate_citations=True, tei_coordinates=True, force=True) + + # Example with custom server (uncomment to use) + # client = GrobidClient(grobid_server="https://lfoppiano-grobid.hf.space", config_path="./config.json") + # client.process("processFulltextDocument", "./resources/test_pdf", output="./resources/test_out/", consolidate_citations=True, tei_coordinates=True, force=True) diff --git a/grobid_client/grobid_client.py b/grobid_client/grobid_client.py index d530f74..f2b81af 100644 --- a/grobid_client/grobid_client.py +++ b/grobid_client/grobid_client.py @@ -81,9 +81,33 @@ def __init__( } } + # Store constructor parameters to ensure they take precedence over config file + # We'll use these to override config file values after loading + self._constructor_params = { + 'grobid_server': grobid_server, + 'batch_size': batch_size, + 'coordinates': coordinates, + 'sleep_time': sleep_time, + 'timeout': timeout + } + # Load config first (which may override logging settings) if config_path: self._load_config(config_path) + + # Ensure constructor parameters take precedence over config file values + # This ensures CLI arguments override config file values + # Only override if the parameter was explicitly provided (not the default) + if grobid_server != 'http://localhost:8070': + self.config['grobid_server'] = grobid_server + if batch_size != 1000: + self.config['batch_size'] = batch_size + if coordinates is not None: # coordinates has a complex default, so check if explicitly provided + self.config['coordinates'] = coordinates + if sleep_time != 5: + self.config['sleep_time'] = sleep_time + if timeout != 180: + self.config['timeout'] = timeout # Configure logging based on config self._configure_logging() @@ -718,6 +742,11 @@ def main(): default=None, help="Define the flavor to be used for the fulltext extraction", ) + parser.add_argument( + "--server", + default="http://localhost:8070", + help="GROBID server URL (default: http://localhost:8070)", + ) args = parser.parse_args() @@ -736,7 +765,7 @@ def main(): # Initialize GrobidClient which will configure logging based on config.json try: - client = GrobidClient(config_path=config_path) + client = GrobidClient(grobid_server=args.server, config_path=config_path) # Now use the client's logger for all subsequent logging logger = client.logger except ServerUnavailableException as e: diff --git a/tests/test_grobid_client.py b/tests/test_grobid_client.py index 265e6ba..61d0d0d 100644 --- a/tests/test_grobid_client.py +++ b/tests/test_grobid_client.py @@ -43,7 +43,7 @@ def test_init_default_values(self, mock_configure_logging, mock_test_server): assert client.config['grobid_server'] == 'http://localhost:8070' assert client.config['batch_size'] == 1000 assert client.config['sleep_time'] == 5 - assert client.config['timeout'] == 60 + assert client.config['timeout'] == 180 assert 'persName' in client.config['coordinates'] mock_configure_logging.assert_called_once() From c807b6c736591725060450c99f795ae84730d534 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 26 Aug 2025 12:03:58 +0100 Subject: [PATCH 02/11] Update tests --- tests/test_integration.py | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/tests/test_integration.py b/tests/test_integration.py index cdf4af1..dd698b0 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -96,9 +96,9 @@ def test_configuration_validation(self): check_server=False ) - # Config file values should override constructor values - assert client.config['grobid_server'] == self.test_server_url - assert client.config['batch_size'] == 10 + # Constructor values should override config file values (CLI precedence) + assert client.config['grobid_server'] == 'http://custom:9090' + assert client.config['batch_size'] == 500 def test_logging_configuration(self): """Test logging configuration from config file.""" From 28ec8396e01112cf499664e737cd757f712a2bdd Mon Sep 17 00:00:00 2001 From: Luca <15426+lfoppiano@users.noreply.github.com> Date: Tue, 26 Aug 2025 12:09:07 +0100 Subject: [PATCH 03/11] remove unused constructor params Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- grobid_client/grobid_client.py | 10 ---------- 1 file changed, 10 deletions(-) diff --git a/grobid_client/grobid_client.py b/grobid_client/grobid_client.py index f2b81af..ca0a413 100644 --- a/grobid_client/grobid_client.py +++ b/grobid_client/grobid_client.py @@ -81,16 +81,6 @@ def __init__( } } - # Store constructor parameters to ensure they take precedence over config file - # We'll use these to override config file values after loading - self._constructor_params = { - 'grobid_server': grobid_server, - 'batch_size': batch_size, - 'coordinates': coordinates, - 'sleep_time': sleep_time, - 'timeout': timeout - } - # Load config first (which may override logging settings) if config_path: self._load_config(config_path) From b67c3ea1ab498ffbb439ca5299903279eceb6627 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 26 Aug 2025 12:30:07 +0100 Subject: [PATCH 04/11] update default values override --- example.py | 8 +++-- grobid_client/grobid_client.py | 66 ++++++++++++++++++++++++---------- 2 files changed, 53 insertions(+), 21 deletions(-) diff --git a/example.py b/example.py index 0a91f9f..ff821c7 100644 --- a/example.py +++ b/example.py @@ -1,10 +1,14 @@ from grobid_client.grobid_client import GrobidClient if __name__ == "__main__": - # Example with default localhost server + # Example 1: Using config file values (no constructor parameters) client = GrobidClient(config_path="./config.json") client.process("processFulltextDocument", "./resources/test_pdf", output="./resources/test_out/", consolidate_citations=True, tei_coordinates=True, force=True) - # Example with custom server (uncomment to use) + # Example 2: Overriding config file with explicit server parameter # client = GrobidClient(grobid_server="https://lfoppiano-grobid.hf.space", config_path="./config.json") # client.process("processFulltextDocument", "./resources/test_pdf", output="./resources/test_out/", consolidate_citations=True, tei_coordinates=True, force=True) + + # Example 3: Using default values (no config file, no parameters) + # client = GrobidClient() + # client.process("processFulltextDocument", "./resources/test_pdf", output="./resources/test_out/", consolidate_citations=True, tei_coordinates=True, force=True) diff --git a/grobid_client/grobid_client.py b/grobid_client/grobid_client.py index ca0a413..0d7be48 100644 --- a/grobid_client/grobid_client.py +++ b/grobid_client/grobid_client.py @@ -40,14 +40,37 @@ class GrobidClient(ApiClient): def __init__( self, - grobid_server='http://localhost:8070', - batch_size=1000, + grobid_server=None, + batch_size=None, coordinates=None, - sleep_time=5, - timeout=180, + sleep_time=None, + timeout=None, config_path=None, check_server=True ): + # Set default coordinates if None provided + if coordinates is None: + coordinates = [ + "title", + "persName", + "affiliation", + "orgName", + "formula", + "figure", + "ref", + "biblStruct", + "head", + "p", + "s", + "note" + ] + + # Set default values for parameters that are None + default_grobid_server = grobid_server if grobid_server is not None else 'http://localhost:8070' + default_batch_size = batch_size if batch_size is not None else 1000 + default_sleep_time = sleep_time if sleep_time is not None else 5 + default_timeout = timeout if timeout is not None else 180 + # Set default coordinates if None provided if coordinates is None: coordinates = [ @@ -66,11 +89,11 @@ def __init__( ] self.config = { - 'grobid_server': grobid_server, - 'batch_size': batch_size, + 'grobid_server': default_grobid_server, + 'batch_size': default_batch_size, 'coordinates': coordinates, - 'sleep_time': sleep_time, - 'timeout': timeout, + 'sleep_time': default_sleep_time, + 'timeout': default_timeout, 'logging': { 'level': 'INFO', 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', @@ -85,18 +108,18 @@ def __init__( if config_path: self._load_config(config_path) - # Ensure constructor parameters take precedence over config file values - # This ensures CLI arguments override config file values - # Only override if the parameter was explicitly provided (not the default) - if grobid_server != 'http://localhost:8070': + # Only override config file values if constructor parameters were explicitly provided (not None) + # This ensures CLI arguments override config file values, but allows config files to be used + # when no constructor parameters are provided + if grobid_server is not None: self.config['grobid_server'] = grobid_server - if batch_size != 1000: + if batch_size is not None: self.config['batch_size'] = batch_size - if coordinates is not None: # coordinates has a complex default, so check if explicitly provided + if coordinates is not None: self.config['coordinates'] = coordinates - if sleep_time != 5: + if sleep_time is not None: self.config['sleep_time'] = sleep_time - if timeout != 180: + if timeout is not None: self.config['timeout'] = timeout # Configure logging based on config @@ -734,8 +757,8 @@ def main(): ) parser.add_argument( "--server", - default="http://localhost:8070", - help="GROBID server URL (default: http://localhost:8070)", + default=None, + help="GROBID server URL overide of the config file. If config not provided, default is http://localhost:8070", ) args = parser.parse_args() @@ -755,7 +778,12 @@ def main(): # Initialize GrobidClient which will configure logging based on config.json try: - client = GrobidClient(grobid_server=args.server, config_path=config_path) + # Only pass grobid_server if it was explicitly provided (not the default) + client_kwargs = {'config_path': config_path} + if args.server is not None: # Only override if user specified a different server + client_kwargs['grobid_server'] = args.server + + client = GrobidClient(**client_kwargs) # Now use the client's logger for all subsequent logging logger = client.logger except ServerUnavailableException as e: From 9a2a2066d550609c17cf9b2bdb1700f282d90845 Mon Sep 17 00:00:00 2001 From: Luca <15426+lfoppiano@users.noreply.github.com> Date: Tue, 26 Aug 2025 12:32:20 +0100 Subject: [PATCH 05/11] Update grobid_client/grobid_client.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- grobid_client/grobid_client.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/grobid_client/grobid_client.py b/grobid_client/grobid_client.py index 0d7be48..59e6868 100644 --- a/grobid_client/grobid_client.py +++ b/grobid_client/grobid_client.py @@ -758,7 +758,7 @@ def main(): parser.add_argument( "--server", default=None, - help="GROBID server URL overide of the config file. If config not provided, default is http://localhost:8070", + help="GROBID server URL override of the config file. If config not provided, default is http://localhost:8070", ) args = parser.parse_args() From 8f292254bcdc4466913b2deaad52d02d6ef045b9 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 26 Aug 2025 12:41:35 +0100 Subject: [PATCH 06/11] cleanup --- grobid_client/grobid_client.py | 236 ++++++++++++++++----------------- 1 file changed, 117 insertions(+), 119 deletions(-) diff --git a/grobid_client/grobid_client.py b/grobid_client/grobid_client.py index 59e6868..9c26a2a 100644 --- a/grobid_client/grobid_client.py +++ b/grobid_client/grobid_client.py @@ -37,6 +37,35 @@ def __init__(self, message="GROBID server is not available"): class GrobidClient(ApiClient): + # Default configuration values + DEFAULT_CONFIG = { + 'grobid_server': 'http://localhost:8070', + 'batch_size': 1000, + 'sleep_time': 5, + 'timeout': 180, + 'coordinates': [ + "title", + "persName", + "affiliation", + "orgName", + "formula", + "figure", + "ref", + "biblStruct", + "head", + "p", + "s", + "note" + ], + 'logging': { + 'level': 'INFO', + 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', + 'console': True, + 'file': None, # Disabled by default + 'max_file_size': '10MB', + 'backup_count': 3 + } + } def __init__( self, @@ -48,79 +77,31 @@ def __init__( config_path=None, check_server=True ): - # Set default coordinates if None provided - if coordinates is None: - coordinates = [ - "title", - "persName", - "affiliation", - "orgName", - "formula", - "figure", - "ref", - "biblStruct", - "head", - "p", - "s", - "note" - ] - - # Set default values for parameters that are None - default_grobid_server = grobid_server if grobid_server is not None else 'http://localhost:8070' - default_batch_size = batch_size if batch_size is not None else 1000 - default_sleep_time = sleep_time if sleep_time is not None else 5 - default_timeout = timeout if timeout is not None else 180 + # Initialize config with defaults + self.config = self.DEFAULT_CONFIG.copy() - # Set default coordinates if None provided - if coordinates is None: - coordinates = [ - "title", - "persName", - "affiliation", - "orgName", - "formula", - "figure", - "ref", - "biblStruct", - "head", - "p", - "s", - "note" - ] - - self.config = { - 'grobid_server': default_grobid_server, - 'batch_size': default_batch_size, + # Override with provided parameters (None values will use defaults) + self._set_config_params({ + 'grobid_server': grobid_server, + 'batch_size': batch_size, 'coordinates': coordinates, - 'sleep_time': default_sleep_time, - 'timeout': default_timeout, - 'logging': { - 'level': 'INFO', - 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', - 'console': True, - 'file': None, # Disabled by default - 'max_file_size': '10MB', - 'backup_count': 3 - } - } + 'sleep_time': sleep_time, + 'timeout': timeout + }) - # Load config first (which may override logging settings) + # Load config file (which may override current values) if config_path: self._load_config(config_path) - # Only override config file values if constructor parameters were explicitly provided (not None) - # This ensures CLI arguments override config file values, but allows config files to be used - # when no constructor parameters are provided - if grobid_server is not None: - self.config['grobid_server'] = grobid_server - if batch_size is not None: - self.config['batch_size'] = batch_size - if coordinates is not None: - self.config['coordinates'] = coordinates - if sleep_time is not None: - self.config['sleep_time'] = sleep_time - if timeout is not None: - self.config['timeout'] = timeout + # Constructor parameters take precedence over config file values + # This ensures CLI arguments override config file values + self._set_config_params({ + 'grobid_server': grobid_server, + 'batch_size': batch_size, + 'coordinates': coordinates, + 'sleep_time': sleep_time, + 'timeout': timeout + }) # Configure logging based on config self._configure_logging() @@ -128,6 +109,28 @@ def __init__( if check_server: self._test_server_connection() + def _set_config_params(self, params): + """Set configuration parameters, only if they are not None.""" + for key, value in params.items(): + if value is not None: + self.config[key] = value + + def _handle_server_busy_retry(self, file_path, retry_func, *args, **kwargs): + """Handle server busy (503) retry logic.""" + self.logger.warning(f"Server busy (503), retrying {file_path} after {self.config['sleep_time']} seconds") + time.sleep(self.config["sleep_time"]) + return retry_func(*args, **kwargs) + + def _handle_request_error(self, file_path, error, error_type="Request"): + """Handle request errors with consistent logging and return format.""" + self.logger.error(f"{error_type} failed for {file_path}: {str(error)}") + return (file_path, 500, f"{error_type} failed: {str(error)}") + + def _handle_unexpected_error(self, file_path, error): + """Handle unexpected errors with consistent logging and return format.""" + self.logger.error(f"Unexpected error processing {file_path}: {str(error)}") + return (file_path, 500, f"Unexpected error: {str(error)}") + def _configure_logging(self): """Configure logging based on the configuration settings.""" # Get logging config with defaults @@ -522,50 +525,50 @@ def process_pdf( self.logger.error(f"Failed to open PDF file {pdf_file}: {str(e)}") return (pdf_file, 500, f"Failed to open file: {str(e)}") - files = { - "input": ( - pdf_file, - pdf_handle, - "application/pdf", - {"Expires": "0"}, - ) - } - - the_url = self.get_server_url(service) + try: + files = { + "input": ( + pdf_file, + pdf_handle, + "application/pdf", + {"Expires": "0"}, + ) + } - # set the GROBID parameters - the_data = {} - if generateIDs: - the_data["generateIDs"] = "1" - if consolidate_header: - the_data["consolidateHeader"] = "1" - if consolidate_citations: - the_data["consolidateCitations"] = "1" - if include_raw_citations: - the_data["includeRawCitations"] = "1" - if include_raw_affiliations: - the_data["includeRawAffiliations"] = "1" - if tei_coordinates: - the_data["teiCoordinates"] = self.config["coordinates"] - if segment_sentences: - the_data["segmentSentences"] = "1" - if flavor: - the_data["flavor"] = flavor - if start and start > 0: - the_data["start"] = str(start) - if end and end > 0: - the_data["end"] = str(end) + the_url = self.get_server_url(service) + + # set the GROBID parameters + the_data = {} + if generateIDs: + the_data["generateIDs"] = "1" + if consolidate_header: + the_data["consolidateHeader"] = "1" + if consolidate_citations: + the_data["consolidateCitations"] = "1" + if include_raw_citations: + the_data["includeRawCitations"] = "1" + if include_raw_affiliations: + the_data["includeRawAffiliations"] = "1" + if tei_coordinates: + the_data["teiCoordinates"] = self.config["coordinates"] + if segment_sentences: + the_data["segmentSentences"] = "1" + if flavor: + the_data["flavor"] = flavor + if start and start > 0: + the_data["start"] = str(start) + if end and end > 0: + the_data["end"] = str(end) - try: res, status = self.post( url=the_url, files=files, data=the_data, headers={"Accept": "text/plain"}, timeout=self.config['timeout'] ) if status == 503: - self.logger.warning(f"Server busy (503), retrying {pdf_file} after {self.config['sleep_time']} seconds") - time.sleep(self.config["sleep_time"]) - return self.process_pdf( + return self._handle_server_busy_retry( + pdf_file, + self.process_pdf, service, pdf_file, generateIDs, @@ -579,21 +582,18 @@ def process_pdf( start, end ) + + return (pdf_file, status, res.text) + except requests.exceptions.ReadTimeout as e: self.logger.error(f"Request timeout for {pdf_file}: {str(e)}") - pdf_handle.close() return (pdf_file, 408, f"Request timeout: {str(e)}") except requests.exceptions.RequestException as e: - self.logger.error(f"Request failed for {pdf_file}: {str(e)}") - pdf_handle.close() - return (pdf_file, 500, f"Request failed: {str(e)}") + return self._handle_request_error(pdf_file, e) except Exception as e: - self.logger.error(f"Unexpected error processing {pdf_file}: {str(e)}") + return self._handle_unexpected_error(pdf_file, e) + finally: pdf_handle.close() - return (pdf_file, 500, f"Unexpected error: {str(e)}") - - pdf_handle.close() - return (pdf_file, status, res.text) def get_server_url(self, service): return self.config['grobid_server'] + "/api/" + service @@ -637,9 +637,9 @@ def process_txt( ) if status == 503: - self.logger.warning(f"Server busy (503), retrying {txt_file} after {self.config['sleep_time']} seconds") - time.sleep(self.config["sleep_time"]) - return self.process_txt( + return self._handle_server_busy_retry( + txt_file, + self.process_txt, service, txt_file, generateIDs, @@ -651,11 +651,9 @@ def process_txt( segment_sentences ) except requests.exceptions.RequestException as e: - self.logger.error(f"Request failed for {txt_file}: {str(e)}") - return (txt_file, 500, f"Request failed: {str(e)}") + return self._handle_request_error(txt_file, e) except Exception as e: - self.logger.error(f"Unexpected error processing {txt_file}: {str(e)}") - return (txt_file, 500, f"Unexpected error: {str(e)}") + return self._handle_unexpected_error(txt_file, e) return (txt_file, status, res.text) From 9a7cc750ef95157c11444b8b6096d45c0ce46cdf Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 26 Aug 2025 12:52:18 +0100 Subject: [PATCH 07/11] Remove duplicated code --- grobid_client/grobid_client.py | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/grobid_client/grobid_client.py b/grobid_client/grobid_client.py index 9c26a2a..c759871 100644 --- a/grobid_client/grobid_client.py +++ b/grobid_client/grobid_client.py @@ -24,6 +24,7 @@ import pathlib import logging from typing import Tuple +import copy from .client import ApiClient @@ -78,17 +79,8 @@ def __init__( check_server=True ): # Initialize config with defaults - self.config = self.DEFAULT_CONFIG.copy() - - # Override with provided parameters (None values will use defaults) - self._set_config_params({ - 'grobid_server': grobid_server, - 'batch_size': batch_size, - 'coordinates': coordinates, - 'sleep_time': sleep_time, - 'timeout': timeout - }) - + self.config = copy.deepcopy(self.DEFAULT_CONFIG) + # Load config file (which may override current values) if config_path: self._load_config(config_path) From 5f6af7c2a9ddf8ffa772f76096ac254a239e86c4 Mon Sep 17 00:00:00 2001 From: Luca <15426+lfoppiano@users.noreply.github.com> Date: Tue, 26 Aug 2025 13:20:51 +0100 Subject: [PATCH 08/11] Update grobid_client/grobid_client.py Co-authored-by: Copilot <175728472+Copilot@users.noreply.github.com> --- grobid_client/grobid_client.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/grobid_client/grobid_client.py b/grobid_client/grobid_client.py index c759871..de4723e 100644 --- a/grobid_client/grobid_client.py +++ b/grobid_client/grobid_client.py @@ -584,8 +584,8 @@ def process_pdf( return self._handle_request_error(pdf_file, e) except Exception as e: return self._handle_unexpected_error(pdf_file, e) - finally: - pdf_handle.close() + if pdf_handle is not None: + pdf_handle.close() def get_server_url(self, service): return self.config['grobid_server'] + "/api/" + service From 1b08be73c9361897fd5bdff0896a26649cc15957 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 26 Aug 2025 13:26:55 +0100 Subject: [PATCH 09/11] basically I'm rewrite it --- grobid_client/grobid_client.py | 31 ++++++++++++++----------------- 1 file changed, 14 insertions(+), 17 deletions(-) diff --git a/grobid_client/grobid_client.py b/grobid_client/grobid_client.py index de4723e..cfbd553 100644 --- a/grobid_client/grobid_client.py +++ b/grobid_client/grobid_client.py @@ -512,20 +512,16 @@ def process_pdf( end=-1 ): try: - pdf_handle = open(pdf_file, "rb") - except IOError as e: - self.logger.error(f"Failed to open PDF file {pdf_file}: {str(e)}") - return (pdf_file, 500, f"Failed to open file: {str(e)}") - - try: - files = { - "input": ( - pdf_file, - pdf_handle, - "application/pdf", - {"Expires": "0"}, - ) - } + with open(pdf_file, "rb") as fi: + + files = { + "input": ( + pdf_file, + fi, + "application/pdf", + {"Expires": "0"}, + ) + } the_url = self.get_server_url(service) @@ -576,7 +572,10 @@ def process_pdf( ) return (pdf_file, status, res.text) - + + except IOError as e: + self.logger.error(f"Failed to open PDF file {pdf_file}: {str(e)}") + return (pdf_file, 400, f"Failed to open file: {str(e)}") except requests.exceptions.ReadTimeout as e: self.logger.error(f"Request timeout for {pdf_file}: {str(e)}") return (pdf_file, 408, f"Request timeout: {str(e)}") @@ -584,8 +583,6 @@ def process_pdf( return self._handle_request_error(pdf_file, e) except Exception as e: return self._handle_unexpected_error(pdf_file, e) - if pdf_handle is not None: - pdf_handle.close() def get_server_url(self, service): return self.config['grobid_server'] + "/api/" + service From 88c143d60a7e9b58b3cf9393651f059f9431e5f7 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 26 Aug 2025 13:31:08 +0100 Subject: [PATCH 10/11] update tests --- tests/test_grobid_client.py | 2 +- tests/test_integration.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/test_grobid_client.py b/tests/test_grobid_client.py index 61d0d0d..678b211 100644 --- a/tests/test_grobid_client.py +++ b/tests/test_grobid_client.py @@ -315,7 +315,7 @@ def test_process_pdf_file_not_found(self, mock_file): segment_sentences=False ) - assert result[1] == 500 + assert result[1] == 400 assert 'Failed to open file' in result[2] @patch('builtins.open', new_callable=mock_open, read_data='Reference 1\nReference 2\n') diff --git a/tests/test_integration.py b/tests/test_integration.py index dd698b0..6fedbd1 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -199,7 +199,7 @@ def test_error_handling_and_recovery(self): False, False, False, False, False, False, False ) - assert result[1] == 500 + assert result[1] == 400 assert 'Failed to open file' in result[2] def test_different_file_types(self): From 69f55992a9b3242b7b6960e982cac1e639d2478e Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Tue, 26 Aug 2025 13:39:55 +0100 Subject: [PATCH 11/11] fix error handling properly --- grobid_client/grobid_client.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/grobid_client/grobid_client.py b/grobid_client/grobid_client.py index cfbd553..6524cd7 100644 --- a/grobid_client/grobid_client.py +++ b/grobid_client/grobid_client.py @@ -511,17 +511,18 @@ def process_pdf( start=-1, end=-1 ): + pdf_handle = None try: - with open(pdf_file, "rb") as fi: + pdf_handle = open(pdf_file, "rb") - files = { - "input": ( - pdf_file, - fi, - "application/pdf", - {"Expires": "0"}, - ) - } + files = { + "input": ( + pdf_file, + pdf_handle, + "application/pdf", + {"Expires": "0"}, + ) + } the_url = self.get_server_url(service) @@ -583,6 +584,9 @@ def process_pdf( return self._handle_request_error(pdf_file, e) except Exception as e: return self._handle_unexpected_error(pdf_file, e) + finally: + if pdf_handle: + pdf_handle.close() def get_server_url(self, service): return self.config['grobid_server'] + "/api/" + service