Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
22 changes: 20 additions & 2 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -55,7 +55,7 @@ usage: grobid_client [-h] [--input INPUT] [--output OUTPUT] [--config CONFIG]
[--n N] [--generateIDs] [--consolidate_header]
[--consolidate_citations] [--include_raw_citations]
[--include_raw_affiliations] [--force] [--teiCoordinates]
[--verbose] [--flavor FLAVOR]
[--verbose] [--flavor FLAVOR] [--server SERVER]
service

Client for GROBID services
Expand All @@ -75,6 +75,7 @@ optional arguments:
(optional)
--config CONFIG path to the config file, default is ./config.json
--n N concurrency for service usage
--server SERVER GROBID server URL (default: http://localhost:8070)
--generateIDs generate random xml:id to textual XML elements of the
result files
--consolidate_header call GROBID with consolidation of the metadata
Expand Down Expand Up @@ -132,6 +133,15 @@ The following command example will process all the PDF files present in the inpu
> grobid_client --input ~/tmp/in2 --output ~/tmp/out --teiCoordinates --segmentSentences processFulltextDocument
```

To use a different GROBID server (e.g., a hosted service), use the `--server` argument:

```console
> grobid_client --server https://lfoppiano-grobid.hf.space --input ~/tmp/in2 --output ~/tmp/out processFulltextDocument
```

> [!NOTE]
> The `--server` argument will override the server URL specified in the config file. If both are provided, the CLI argument takes precedence.

The file `example.py` gives an example of using the client as a library from another Python script.

## Using the client in your python
Expand All @@ -141,16 +151,24 @@ Import and call the client as follow:
```python
from grobid_client.grobid_client import GrobidClient

# Using default localhost server
client = GrobidClient(config_path="./config.json")
client.process("processFulltextDocument", "/mnt/data/covid/pdfs", n=20)

# Using a custom server
client = GrobidClient(grobid_server="https://lfoppiano-grobid.hf.space", config_path="./config.json")
client.process("processFulltextDocument", "/mnt/data/covid/pdfs", n=20)
```

See also `example.py`.

## Configuration of the client

> [!TIP]
> from version 0.0.12 the `config.json` will be optional, by default the client will connect to the local server (`http://localhost:8070`).
> From version 0.0.12, the `config.json` file is optional; by default, the client connects to the local server (`http://localhost:8070`).

> [!NOTE]
> When using the CLI, the `--server` argument will override the `grobid_server` value in the config file. This allows you to use a config file for most settings while easily switching servers via command line.

There are a few parameters that can be set with the `config.json` file.

Expand Down
9 changes: 9 additions & 0 deletions example.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,14 @@
from grobid_client.grobid_client import GrobidClient

if __name__ == "__main__":
# Example 1: Using config file values (no constructor parameters)
client = GrobidClient(config_path="./config.json")
client.process("processFulltextDocument", "./resources/test_pdf", output="./resources/test_out/", consolidate_citations=True, tei_coordinates=True, force=True)

# Example 2: Overriding config file with explicit server parameter
# client = GrobidClient(grobid_server="https://lfoppiano-grobid.hf.space", config_path="./config.json")
# client.process("processFulltextDocument", "./resources/test_pdf", output="./resources/test_out/", consolidate_citations=True, tei_coordinates=True, force=True)

# Example 3: Using default values (no config file, no parameters)
# client = GrobidClient()
# client.process("processFulltextDocument", "./resources/test_pdf", output="./resources/test_out/", consolidate_citations=True, tei_coordinates=True, force=True)
226 changes: 132 additions & 94 deletions grobid_client/grobid_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@
import pathlib
import logging
from typing import Tuple
import copy

from .client import ApiClient

Expand All @@ -37,60 +38,91 @@ def __init__(self, message="GROBID server is not available"):


class GrobidClient(ApiClient):
# Default configuration values
DEFAULT_CONFIG = {
'grobid_server': 'http://localhost:8070',
'batch_size': 1000,
'sleep_time': 5,
'timeout': 180,
'coordinates': [
"title",
"persName",
"affiliation",
"orgName",
"formula",
"figure",
"ref",
"biblStruct",
"head",
"p",
"s",
"note"
],
'logging': {
'level': 'INFO',
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
'console': True,
'file': None, # Disabled by default
'max_file_size': '10MB',
'backup_count': 3
}
}

def __init__(
self,
grobid_server='http://localhost:8070',
batch_size=1000,
grobid_server=None,
batch_size=None,
coordinates=None,
sleep_time=5,
timeout=180,
sleep_time=None,
timeout=None,
config_path=None,
check_server=True
):
# Set default coordinates if None provided
if coordinates is None:
coordinates = [
"title",
"persName",
"affiliation",
"orgName",
"formula",
"figure",
"ref",
"biblStruct",
"head",
"p",
"s",
"note"
]

self.config = {
# Initialize config with defaults
self.config = copy.deepcopy(self.DEFAULT_CONFIG)

# Load config file (which may override current values)
if config_path:
self._load_config(config_path)

# Constructor parameters take precedence over config file values
Copy link

Copilot AI Aug 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The _set_config_params method is called twice with the same parameters (lines 84-90 and 98-104). This duplication means config file values will always be overwritten by constructor parameters, making the first call redundant. Consider removing the first call or restructuring the logic.

Copilot uses AI. Check for mistakes.
# This ensures CLI arguments override config file values
self._set_config_params({
'grobid_server': grobid_server,
'batch_size': batch_size,
'coordinates': coordinates,
'sleep_time': sleep_time,
'timeout': timeout,
'logging': {
'level': 'INFO',
'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s',
'console': True,
'file': None, # Disabled by default
'max_file_size': '10MB',
'backup_count': 3
}
}

# Load config first (which may override logging settings)
if config_path:
self._load_config(config_path)
'timeout': timeout
})

# Configure logging based on config
self._configure_logging()

if check_server:
self._test_server_connection()

def _set_config_params(self, params):
    """Overlay explicitly supplied settings onto ``self.config``.

    Entries of ``params`` whose value is ``None`` are treated as "not
    provided" and leave the corresponding configuration entry untouched;
    every other entry replaces (or adds) the key in ``self.config``.
    """
    provided = {
        name: setting
        for name, setting in params.items()
        if setting is not None
    }
    self.config.update(provided)

def _handle_server_busy_retry(self, file_path, retry_func, *args, **kwargs):
    """Wait out a busy (503) server, then re-issue the request.

    A warning naming ``file_path`` is logged, the call sleeps for the
    configured ``sleep_time`` seconds, and the value returned by
    ``retry_func(*args, **kwargs)`` is handed back to the caller.
    """
    self.logger.warning(f"Server busy (503), retrying {file_path} after {self.config['sleep_time']} seconds")
    pause = self.config["sleep_time"]
    time.sleep(pause)
    outcome = retry_func(*args, **kwargs)
    return outcome

def _handle_request_error(self, file_path, error, error_type="Request"):
    """Log a failed request and build the result tuple for it.

    The error is written to ``self.logger`` at error level. The return
    value is ``(file_path, 500, message)``, where ``message`` combines
    ``error_type`` with the string form of ``error``.
    """
    detail = str(error)
    self.logger.error(f"{error_type} failed for {file_path}: {detail}")
    message = f"{error_type} failed: {detail}"
    return (file_path, 500, message)

def _handle_unexpected_error(self, file_path, error):
    """Log an unexpected failure and build the result tuple for it.

    The error is written to ``self.logger`` at error level. The return
    value is ``(file_path, 500, message)``, where ``message`` carries the
    string form of ``error``.
    """
    detail = str(error)
    self.logger.error(f"Unexpected error processing {file_path}: {detail}")
    message = f"Unexpected error: {detail}"
    return (file_path, 500, message)

def _configure_logging(self):
"""Configure logging based on the configuration settings."""
# Get logging config with defaults
Expand Down Expand Up @@ -479,56 +511,53 @@ def process_pdf(
start=-1,
end=-1
):
pdf_handle = None
try:
pdf_handle = open(pdf_file, "rb")
except IOError as e:
self.logger.error(f"Failed to open PDF file {pdf_file}: {str(e)}")
return (pdf_file, 500, f"Failed to open file: {str(e)}")

files = {
"input": (
pdf_file,
pdf_handle,
"application/pdf",
{"Expires": "0"},
)
}

the_url = self.get_server_url(service)

files = {
"input": (
pdf_file,
pdf_handle,
"application/pdf",
{"Expires": "0"},
)
}

# set the GROBID parameters
the_data = {}
if generateIDs:
the_data["generateIDs"] = "1"
if consolidate_header:
the_data["consolidateHeader"] = "1"
if consolidate_citations:
the_data["consolidateCitations"] = "1"
if include_raw_citations:
the_data["includeRawCitations"] = "1"
if include_raw_affiliations:
the_data["includeRawAffiliations"] = "1"
if tei_coordinates:
the_data["teiCoordinates"] = self.config["coordinates"]
if segment_sentences:
the_data["segmentSentences"] = "1"
if flavor:
the_data["flavor"] = flavor
if start and start > 0:
the_data["start"] = str(start)
if end and end > 0:
the_data["end"] = str(end)
the_url = self.get_server_url(service)

# set the GROBID parameters
the_data = {}
if generateIDs:
the_data["generateIDs"] = "1"
if consolidate_header:
the_data["consolidateHeader"] = "1"
if consolidate_citations:
the_data["consolidateCitations"] = "1"
if include_raw_citations:
the_data["includeRawCitations"] = "1"
if include_raw_affiliations:
the_data["includeRawAffiliations"] = "1"
if tei_coordinates:
the_data["teiCoordinates"] = self.config["coordinates"]
if segment_sentences:
the_data["segmentSentences"] = "1"
if flavor:
the_data["flavor"] = flavor
if start and start > 0:
the_data["start"] = str(start)
if end and end > 0:
the_data["end"] = str(end)

try:
res, status = self.post(
url=the_url, files=files, data=the_data, headers={"Accept": "text/plain"},
timeout=self.config['timeout']
)

if status == 503:
self.logger.warning(f"Server busy (503), retrying {pdf_file} after {self.config['sleep_time']} seconds")
time.sleep(self.config["sleep_time"])
return self.process_pdf(
return self._handle_server_busy_retry(
pdf_file,
self.process_pdf,
service,
pdf_file,
generateIDs,
Expand All @@ -542,21 +571,22 @@ def process_pdf(
start,
end
)

return (pdf_file, status, res.text)

except IOError as e:
self.logger.error(f"Failed to open PDF file {pdf_file}: {str(e)}")
return (pdf_file, 400, f"Failed to open file: {str(e)}")
except requests.exceptions.ReadTimeout as e:
self.logger.error(f"Request timeout for {pdf_file}: {str(e)}")
pdf_handle.close()
return (pdf_file, 408, f"Request timeout: {str(e)}")
except requests.exceptions.RequestException as e:
self.logger.error(f"Request failed for {pdf_file}: {str(e)}")
pdf_handle.close()
return (pdf_file, 500, f"Request failed: {str(e)}")
return self._handle_request_error(pdf_file, e)
except Exception as e:
self.logger.error(f"Unexpected error processing {pdf_file}: {str(e)}")
pdf_handle.close()
return (pdf_file, 500, f"Unexpected error: {str(e)}")

pdf_handle.close()
return (pdf_file, status, res.text)
return self._handle_unexpected_error(pdf_file, e)
Copy link

Copilot AI Aug 26, 2025

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

This line is unreachable code. It appears after a return statement and will never execute. The pdf_handle.close() should be moved to a finally block or handled before the return statements.

Suggested change
return self._handle_unexpected_error(pdf_file, e)
return self._handle_unexpected_error(pdf_file, e)
finally:

Copilot uses AI. Check for mistakes.
finally:
if pdf_handle:
pdf_handle.close()

def get_server_url(self, service):
    """Return the full API endpoint URL for ``service``.

    The URL is the configured ``grobid_server`` followed by ``/api/`` and
    the service name. Trailing slashes on the configured server value are
    dropped first, so a base URL given as ``https://host/`` does not
    produce a ``//api/`` path.
    """
    base_url = self.config['grobid_server'].rstrip("/")
    return base_url + "/api/" + service
Expand Down Expand Up @@ -600,9 +630,9 @@ def process_txt(
)

if status == 503:
self.logger.warning(f"Server busy (503), retrying {txt_file} after {self.config['sleep_time']} seconds")
time.sleep(self.config["sleep_time"])
return self.process_txt(
return self._handle_server_busy_retry(
txt_file,
self.process_txt,
service,
txt_file,
generateIDs,
Expand All @@ -614,11 +644,9 @@ def process_txt(
segment_sentences
)
except requests.exceptions.RequestException as e:
self.logger.error(f"Request failed for {txt_file}: {str(e)}")
return (txt_file, 500, f"Request failed: {str(e)}")
return self._handle_request_error(txt_file, e)
except Exception as e:
self.logger.error(f"Unexpected error processing {txt_file}: {str(e)}")
return (txt_file, 500, f"Unexpected error: {str(e)}")
return self._handle_unexpected_error(txt_file, e)

return (txt_file, status, res.text)

Expand Down Expand Up @@ -718,6 +746,11 @@ def main():
default=None,
help="Define the flavor to be used for the fulltext extraction",
)
parser.add_argument(
"--server",
default=None,
help="GROBID server URL override of the config file. If config not provided, default is http://localhost:8070",
)

args = parser.parse_args()

Expand All @@ -736,7 +769,12 @@ def main():

# Initialize GrobidClient which will configure logging based on config.json
try:
client = GrobidClient(config_path=config_path)
# Only pass grobid_server if it was explicitly provided (not the default)
client_kwargs = {'config_path': config_path}
if args.server is not None: # Only override if user specified a different server
client_kwargs['grobid_server'] = args.server

client = GrobidClient(**client_kwargs)
# Now use the client's logger for all subsequent logging
logger = client.logger
except ServerUnavailableException as e:
Expand Down
Loading