diff --git a/Readme.md b/Readme.md index c6094ef..3a1be01 100644 --- a/Readme.md +++ b/Readme.md @@ -55,7 +55,7 @@ usage: grobid_client [-h] [--input INPUT] [--output OUTPUT] [--config CONFIG] [--n N] [--generateIDs] [--consolidate_header] [--consolidate_citations] [--include_raw_citations] [--include_raw_affiliations] [--force] [--teiCoordinates] - [--verbose] [--flavor FLAVOR] + [--verbose] [--flavor FLAVOR] [--server SERVER] service Client for GROBID services @@ -75,6 +75,7 @@ optional arguments: (optional) --config CONFIG path to the config file, default is ./config.json --n N concurrency for service usage + --server SERVER GROBID server URL (default: http://localhost:8070) --generateIDs generate random xml:id to textual XML elements of the result files --consolidate_header call GROBID with consolidation of the metadata @@ -132,6 +133,15 @@ The following command example will process all the PDF files present in the inpu > grobid_client --input ~/tmp/in2 --output ~/tmp/out --teiCoordinates --segmentSentences processFulltextDocument ``` +To use a different GROBID server (e.g., a hosted service), use the `--server` argument: + +```console +> grobid_client --server https://lfoppiano-grobid.hf.space --input ~/tmp/in2 --output ~/tmp/out processFulltextDocument +``` + +> [!NOTE] +> The `--server` argument will override the server URL specified in the config file. If both are provided, the CLI argument takes precedence. + The file `example.py` gives an example of usage as a library, from a another python script. ## Using the client in your python @@ -141,8 +151,13 @@ Import and call the client as follow: ```python from grobid_client.grobid_client import GrobidClient +# Using default localhost server client = GrobidClient(config_path="./config.json") client.process("processFulltextDocument", "/mnt/data/covid/pdfs", n=20) + +# Using a custom server +client = GrobidClient(grobid_server="https://lfoppiano-grobid.hf.space", config_path="./config.json") +client.process("processFulltextDocument", "/mnt/data/covid/pdfs", n=20) ``` See also `example.py`. @@ -150,7 +165,10 @@ See also `example.py`. ## Configuration of the client > [!TIP] -> from version 0.0.12 the `config.json` will be optional, by default the client will connect to the local server (`http://localhost:8070`). +> from version 0.0.12 the `config.json` will be optional, by default the client will connect to the local server (`http://localhost:8070`). + +> [!NOTE] +> When using the CLI, the `--server` argument will override the `grobid_server` value in the config file. This allows you to use a config file for most settings while easily switching servers via command line. There are a few parameters that can be set with the `config.json` file. diff --git a/example.py b/example.py index 1db1f81..ff821c7 100644 --- a/example.py +++ b/example.py @@ -1,5 +1,14 @@ from grobid_client.grobid_client import GrobidClient if __name__ == "__main__": + # Example 1: Using config file values (no constructor parameters) client = GrobidClient(config_path="./config.json") client.process("processFulltextDocument", "./resources/test_pdf", output="./resources/test_out/", consolidate_citations=True, tei_coordinates=True, force=True) + + # Example 2: Overriding config file with explicit server parameter + # client = GrobidClient(grobid_server="https://lfoppiano-grobid.hf.space", config_path="./config.json") + # client.process("processFulltextDocument", "./resources/test_pdf", output="./resources/test_out/", consolidate_citations=True, tei_coordinates=True, force=True) + + # Example 3: Using default values (no config file, no parameters) + # client = GrobidClient() + # client.process("processFulltextDocument", "./resources/test_pdf", output="./resources/test_out/", consolidate_citations=True, tei_coordinates=True, force=True) diff --git a/grobid_client/grobid_client.py b/grobid_client/grobid_client.py index d530f74..6524cd7 100644 --- a/grobid_client/grobid_client.py +++ b/grobid_client/grobid_client.py @@ -24,6 +24,7 @@ import pathlib import logging from typing import Tuple +import copy from .client import ApiClient @@ -37,53 +38,62 @@ def __init__(self, message="GROBID server is not available"): class GrobidClient(ApiClient): + # Default configuration values + DEFAULT_CONFIG = { + 'grobid_server': 'http://localhost:8070', + 'batch_size': 1000, + 'sleep_time': 5, + 'timeout': 180, + 'coordinates': [ + "title", + "persName", + "affiliation", + "orgName", + "formula", + "figure", + "ref", + "biblStruct", + "head", + "p", + "s", + "note" + ], + 'logging': { + 'level': 'INFO', + 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', + 'console': True, + 'file': None, # Disabled by default + 'max_file_size': '10MB', + 'backup_count': 3 + } + } def __init__( self, - grobid_server='http://localhost:8070', - batch_size=1000, + grobid_server=None, + batch_size=None, coordinates=None, - sleep_time=5, - timeout=180, + sleep_time=None, + timeout=None, config_path=None, check_server=True ): - # Set default coordinates if None provided - if coordinates is None: - coordinates = [ - "title", - "persName", - "affiliation", - "orgName", - "formula", - "figure", - "ref", - "biblStruct", - "head", - "p", - "s", - "note" - ] - - self.config = { + # Initialize config with defaults + self.config = copy.deepcopy(self.DEFAULT_CONFIG) + + # Load config file (which may override current values) + if config_path: + self._load_config(config_path) + + # Constructor parameters take precedence over config file values + # This ensures CLI arguments override config file values + self._set_config_params({ 'grobid_server': grobid_server, 'batch_size': batch_size, 'coordinates': coordinates, 'sleep_time': sleep_time, - 'timeout': timeout, - 'logging': { - 'level': 'INFO', - 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', - 'console': True, - 'file': None, # Disabled by default - 'max_file_size': '10MB', - 'backup_count': 3 - } - } - - # Load config first (which may override logging settings) - if config_path: - self._load_config(config_path) + 'timeout': timeout + }) # Configure logging based on config self._configure_logging() @@ -91,6 +101,28 @@ def __init__( if check_server: self._test_server_connection() + def _set_config_params(self, params): + """Set configuration parameters, only if they are not None.""" + for key, value in params.items(): + if value is not None: + self.config[key] = value + + def _handle_server_busy_retry(self, file_path, retry_func, *args, **kwargs): + """Handle server busy (503) retry logic.""" + self.logger.warning(f"Server busy (503), retrying {file_path} after {self.config['sleep_time']} seconds") + time.sleep(self.config["sleep_time"]) + return retry_func(*args, **kwargs) + + def _handle_request_error(self, file_path, error, error_type="Request"): + """Handle request errors with consistent logging and return format.""" + self.logger.error(f"{error_type} failed for {file_path}: {str(error)}") + return (file_path, 500, f"{error_type} failed: {str(error)}") + + def _handle_unexpected_error(self, file_path, error): + """Handle unexpected errors with consistent logging and return format.""" + self.logger.error(f"Unexpected error processing {file_path}: {str(error)}") + return (file_path, 500, f"Unexpected error: {str(error)}") + def _configure_logging(self): """Configure logging based on the configuration settings.""" # Get logging config with defaults @@ -479,56 +511,53 @@ def process_pdf( start=-1, end=-1 ): + pdf_handle = None try: pdf_handle = open(pdf_file, "rb") - except IOError as e: - self.logger.error(f"Failed to open PDF file {pdf_file}: {str(e)}") - return (pdf_file, 500, f"Failed to open file: {str(e)}") - - files = { - "input": ( - pdf_file, - pdf_handle, - "application/pdf", - {"Expires": "0"}, - ) - } - - the_url = self.get_server_url(service) + + files = { + "input": ( + pdf_file, + pdf_handle, + "application/pdf", + {"Expires": "0"}, + ) + } - # set the GROBID parameters - the_data = {} - if generateIDs: - the_data["generateIDs"] = "1" - if consolidate_header: - the_data["consolidateHeader"] = "1" - if consolidate_citations: - the_data["consolidateCitations"] = "1" - if include_raw_citations: - the_data["includeRawCitations"] = "1" - if include_raw_affiliations: - the_data["includeRawAffiliations"] = "1" - if tei_coordinates: - the_data["teiCoordinates"] = self.config["coordinates"] - if segment_sentences: - the_data["segmentSentences"] = "1" - if flavor: - the_data["flavor"] = flavor - if start and start > 0: - the_data["start"] = str(start) - if end and end > 0: - the_data["end"] = str(end) + the_url = self.get_server_url(service) + + # set the GROBID parameters + the_data = {} + if generateIDs: + the_data["generateIDs"] = "1" + if consolidate_header: + the_data["consolidateHeader"] = "1" + if consolidate_citations: + the_data["consolidateCitations"] = "1" + if include_raw_citations: + the_data["includeRawCitations"] = "1" + if include_raw_affiliations: + the_data["includeRawAffiliations"] = "1" + if tei_coordinates: + the_data["teiCoordinates"] = self.config["coordinates"] + if segment_sentences: + the_data["segmentSentences"] = "1" + if flavor: + the_data["flavor"] = flavor + if start and start > 0: + the_data["start"] = str(start) + if end and end > 0: + the_data["end"] = str(end) - try: res, status = self.post( url=the_url, files=files, data=the_data, headers={"Accept": "text/plain"}, timeout=self.config['timeout'] ) if status == 503: - self.logger.warning(f"Server busy (503), retrying {pdf_file} after {self.config['sleep_time']} seconds") - time.sleep(self.config["sleep_time"]) - return self.process_pdf( + return self._handle_server_busy_retry( + pdf_file, + self.process_pdf, service, pdf_file, generateIDs, @@ -542,21 +571,22 @@ def process_pdf( start, end ) + + return (pdf_file, status, res.text) + + except IOError as e: + self.logger.error(f"Failed to open PDF file {pdf_file}: {str(e)}") + return (pdf_file, 400, f"Failed to open file: {str(e)}") except requests.exceptions.ReadTimeout as e: self.logger.error(f"Request timeout for {pdf_file}: {str(e)}") - pdf_handle.close() return (pdf_file, 408, f"Request timeout: {str(e)}") except requests.exceptions.RequestException as e: - self.logger.error(f"Request failed for {pdf_file}: {str(e)}") - pdf_handle.close() - return (pdf_file, 500, f"Request failed: {str(e)}") + return self._handle_request_error(pdf_file, e) except Exception as e: - self.logger.error(f"Unexpected error processing {pdf_file}: {str(e)}") - pdf_handle.close() - return (pdf_file, 500, f"Unexpected error: {str(e)}") - - pdf_handle.close() - return (pdf_file, status, res.text) + return self._handle_unexpected_error(pdf_file, e) + finally: + if pdf_handle: + pdf_handle.close() def get_server_url(self, service): return self.config['grobid_server'] + "/api/" + service @@ -600,9 +630,9 @@ def process_txt( ) if status == 503: - self.logger.warning(f"Server busy (503), retrying {txt_file} after {self.config['sleep_time']} seconds") - time.sleep(self.config["sleep_time"]) - return self.process_txt( + return self._handle_server_busy_retry( + txt_file, + self.process_txt, service, txt_file, generateIDs, @@ -614,11 +644,9 @@ def process_txt( segment_sentences ) except requests.exceptions.RequestException as e: - self.logger.error(f"Request failed for {txt_file}: {str(e)}") - return (txt_file, 500, f"Request failed: {str(e)}") + return self._handle_request_error(txt_file, e) except Exception as e: - self.logger.error(f"Unexpected error processing {txt_file}: {str(e)}") - return (txt_file, 500, f"Unexpected error: {str(e)}") + return self._handle_unexpected_error(txt_file, e) return (txt_file, status, res.text) @@ -718,6 +746,11 @@ def main(): default=None, help="Define the flavor to be used for the fulltext extraction", ) + parser.add_argument( + "--server", + default=None, + help="GROBID server URL override of the config file. If config not provided, default is http://localhost:8070", + ) args = parser.parse_args() @@ -736,7 +769,12 @@ def main(): # Initialize GrobidClient which will configure logging based on config.json try: - client = GrobidClient(config_path=config_path) + # Only pass grobid_server if it was explicitly provided (not the default) + client_kwargs = {'config_path': config_path} + if args.server is not None: # Only override if user specified a different server + client_kwargs['grobid_server'] = args.server + + client = GrobidClient(**client_kwargs) # Now use the client's logger for all subsequent logging logger = client.logger except ServerUnavailableException as e: diff --git a/tests/test_grobid_client.py b/tests/test_grobid_client.py index 265e6ba..678b211 100644 --- a/tests/test_grobid_client.py +++ b/tests/test_grobid_client.py @@ -43,7 +43,7 @@ def test_init_default_values(self, mock_configure_logging, mock_test_server): assert client.config['grobid_server'] == 'http://localhost:8070' assert client.config['batch_size'] == 1000 assert client.config['sleep_time'] == 5 - assert client.config['timeout'] == 60 + assert client.config['timeout'] == 180 assert 'persName' in client.config['coordinates'] mock_configure_logging.assert_called_once() @@ -315,7 +315,7 @@ def test_process_pdf_file_not_found(self, mock_file): segment_sentences=False ) - assert result[1] == 500 + assert result[1] == 400 assert 'Failed to open file' in result[2] @patch('builtins.open', new_callable=mock_open, read_data='Reference 1\nReference 2\n') diff --git a/tests/test_integration.py b/tests/test_integration.py index cdf4af1..6fedbd1 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -96,9 +96,9 @@ def test_configuration_validation(self): check_server=False ) - # Config file values should override constructor values - assert client.config['grobid_server'] == self.test_server_url - assert client.config['batch_size'] == 10 + # Constructor values should override config file values (CLI precedence) + assert client.config['grobid_server'] == 'http://custom:9090' + assert client.config['batch_size'] == 500 def test_logging_configuration(self): """Test logging configuration from config file.""" @@ -199,7 +199,7 @@ def test_error_handling_and_recovery(self): False, False, False, False, False, False, False ) - assert result[1] == 500 + assert result[1] == 400 assert 'Failed to open file' in result[2] def test_different_file_types(self):