From ed4ce8ed5f4b0c9a8ac58a6e40fb9c47219e0fe1 Mon Sep 17 00:00:00 2001 From: Luca Foppiano Date: Thu, 5 Mar 2026 15:19:12 +0100 Subject: [PATCH] refactor: standardize parameter naming from generateIDs to generate_ids and update documentation --- Readme.md | 20 ++++++++--------- grobid_client/grobid_client.py | 39 ++++++++++++++++++---------------- tests/test_grobid_client.py | 18 ++++++++-------- tests/test_integration.py | 4 ++-- 4 files changed, 42 insertions(+), 39 deletions(-) diff --git a/Readme.md b/Readme.md index 4a40f99..2005366 100644 --- a/Readme.md +++ b/Readme.md @@ -134,13 +134,13 @@ grobid_client [OPTIONS] SERVICE | Option | Description | |------------------------------|-------------------------------------------| -| `--generateIDs` | Generate random XML IDs | +| `--generate_ids` | Generate random XML IDs | | `--consolidate_header` | Consolidate header metadata | | `--consolidate_citations` | Consolidate bibliographic references | | `--include_raw_citations` | Include raw citation text | | `--include_raw_affiliations` | Include raw affiliation text | -| `--teiCoordinates` | Add PDF coordinates to XML | -| `--segmentSentences` | Segment sentences with coordinates | +| `--tei_coordinates` | Add PDF coordinates to XML | +| `--segment_sentences` | Segment sentences with coordinates | | `--flavor` | Processing flavor for fulltext extraction | | `--json` | Convert TEI output to JSON format | | `--markdown` | Convert TEI output to Markdown format | @@ -153,7 +153,7 @@ grobid_client [OPTIONS] SERVICE grobid_client --input ~/documents --output ~/results processFulltextDocument # High concurrency with coordinates -grobid_client --input ~/pdfs --output ~/tei --n 20 --teiCoordinates processFulltextDocument +grobid_client --input ~/pdfs --output ~/tei --n 20 --tei_coordinates processFulltextDocument # Process with JSON output grobid_client --input ~/pdfs --output ~/results --json processFulltextDocument @@ -165,7 +165,7 @@ grobid_client --input ~/pdfs --output ~/results --markdown processFulltextDocume grobid_client --server https://grobid.example.com --input ~/citations.txt processCitationList # Force reprocessing with sentence segmentation and JSON output -grobid_client --input ~/docs --force --segmentSentences --json processFulltextDocument +grobid_client --input ~/docs --force --segment_sentences --json processFulltextDocument ``` ### Python Library @@ -202,10 +202,10 @@ client.process( input_path="/path/to/pdfs", output_path="/path/to/output", n=10, - generateIDs=True, + generate_ids=True, consolidate_header=True, - teiCoordinates=True, - segmentSentences=True + tei_coordinates=True, + segment_sentences=True ) # Process with JSON output @@ -454,7 +454,7 @@ When using the `--json` flag, the client converts TEI XML output to a structured grobid_client --input pdfs/ --output results/ --json processFulltextDocument # JSON output with coordinates and sentence segmentation -grobid_client --input pdfs/ --output results/ --json --teiCoordinates --segmentSentences processFulltextDocument +grobid_client --input pdfs/ --output results/ --json --tei_coordinates --segment_sentences processFulltextDocument ``` ```python @@ -535,7 +535,7 @@ Competing interests statement... grobid_client --input pdfs/ --output results/ --markdown processFulltextDocument # Markdown output with coordinates and sentence segmentation -grobid_client --input pdfs/ --output results/ --markdown --teiCoordinates --segmentSentences processFulltextDocument +grobid_client --input pdfs/ --output results/ --markdown --tei_coordinates --segment_sentences processFulltextDocument ``` ```python diff --git a/grobid_client/grobid_client.py b/grobid_client/grobid_client.py index 1af79d1..f1ff591 100644 --- a/grobid_client/grobid_client.py +++ b/grobid_client/grobid_client.py @@ -20,6 +20,7 @@ import time import concurrent.futures import ntpath +import re import requests import pathlib import logging @@ -209,7 +210,6 @@ def _parse_file_size(self, size_str): size_str = str(size_str).upper().strip() # Extract number and unit - import re match = re.match(r'(\d+(?:\.\d+)?)\s*([KMGT]?B?)', size_str) if not match: return 10 * 1024 * 1024 # Default 10MB @@ -329,7 +329,7 @@ def process( input_path, output=None, n=10, - generateIDs=False, + generate_ids=False, consolidate_header=True, consolidate_citations=False, include_raw_citations=False, @@ -391,7 +391,7 @@ def process( input_path, output, n, - generateIDs, + generate_ids, consolidate_header, consolidate_citations, include_raw_citations, @@ -417,7 +417,7 @@ def process( input_path, output, n, - generateIDs, + generate_ids, consolidate_header, consolidate_citations, include_raw_citations, @@ -455,7 +455,7 @@ def process_batch( input_path, output, n, - generateIDs, + generate_ids, consolidate_header, consolidate_citations, include_raw_citations, @@ -542,7 +542,7 @@ def process_batch( selected_process, service, input_file, - generateIDs, + generate_ids, consolidate_header, consolidate_citations, include_raw_citations, @@ -639,7 +639,7 @@ def process_pdf( self, service, pdf_file, - generateIDs, + generate_ids, consolidate_header, consolidate_citations, include_raw_citations, @@ -667,7 +667,7 @@ def process_pdf( # set the GROBID parameters the_data = {} - if generateIDs: + if generate_ids: the_data["generateIDs"] = "1" if consolidate_header: the_data["consolidateHeader"] = "1" @@ -699,7 +699,7 @@ def process_pdf( self.process_pdf, service, pdf_file, - generateIDs, + generate_ids, consolidate_header, consolidate_citations, include_raw_citations, @@ -734,7 +734,7 @@ def process_txt( self, service, txt_file, - generateIDs, + generate_ids, consolidate_header, consolidate_citations, include_raw_citations, @@ -777,7 +777,7 @@ def process_txt( self.process_txt, service, txt_file, - generateIDs, + generate_ids, consolidate_header, consolidate_citations, include_raw_citations, @@ -840,7 +840,8 @@ def main(): help="concurrency for service usage" ) parser.add_argument( - "--generateIDs", + "--generate_ids", "--generateIDs", + dest="generate_ids", action="store_true", help="generate random xml:id to textual XML elements of the result files", ) @@ -870,12 +871,14 @@ def main(): help="force re-processing pdf input files when tei output files already exist", ) parser.add_argument( - "--teiCoordinates", + "--tei_coordinates", "--teiCoordinates", + dest="tei_coordinates", action="store_true", help="add the original PDF coordinates (bounding boxes) to the extracted elements", ) parser.add_argument( - "--segmentSentences", + "--segment_sentences", "--segmentSentences", + dest="segment_sentences", action="store_true", help="segment sentences in the text content of the document with additional elements", ) @@ -951,14 +954,14 @@ def main(): exit(1) service = args.service - generateIDs = args.generateIDs + generate_ids = args.generate_ids consolidate_header = args.consolidate_header consolidate_citations = args.consolidate_citations include_raw_citations = args.include_raw_citations include_raw_affiliations = args.include_raw_affiliations force = args.force - tei_coordinates = args.teiCoordinates - segment_sentences = args.segmentSentences + tei_coordinates = args.tei_coordinates + segment_sentences = args.segment_sentences verbose = args.verbose if service is None or service not in valid_services: @@ -973,7 +976,7 @@ def main(): input_path, output=output_path, n=n, - generateIDs=generateIDs, + generate_ids=generate_ids, consolidate_header=consolidate_header, consolidate_citations=consolidate_citations, include_raw_citations=include_raw_citations, diff --git a/tests/test_grobid_client.py b/tests/test_grobid_client.py index b6ab324..7dd6ec3 100644 --- a/tests/test_grobid_client.py +++ b/tests/test_grobid_client.py @@ -284,7 +284,7 @@ def test_process_pdf_success(self, mock_post, mock_file): result = client.process_pdf( 'processFulltextDocument', '/test/document.pdf', - generateIDs=True, + generate_ids=True, consolidate_header=True, consolidate_citations=False, include_raw_citations=False, @@ -308,7 +308,7 @@ def test_process_pdf_file_not_found(self, mock_file): result = client.process_pdf( 'processFulltextDocument', '/nonexistent/document.pdf', - generateIDs=False, + generate_ids=False, consolidate_header=False, consolidate_citations=False, include_raw_citations=False, @@ -335,7 +335,7 @@ def test_process_txt_success(self, mock_post, mock_file): result = client.process_txt( 'processCitationList', '/test/references.txt', - generateIDs=False, + generate_ids=False, consolidate_header=False, consolidate_citations=True, include_raw_citations=True, @@ -371,7 +371,7 @@ def test_process_pdf_server_busy_retry(self, mock_post): result = client.process_pdf( 'processFulltextDocument', '/test/document.pdf', - generateIDs=False, + generate_ids=False, consolidate_header=False, consolidate_citations=False, include_raw_citations=False, @@ -412,7 +412,7 @@ def test_process_batch(self, mock_isfile, mock_executor): '/test', '/output', n=2, - generateIDs=False, + generate_ids=False, consolidate_header=False, consolidate_citations=False, include_raw_citations=False, @@ -493,7 +493,7 @@ def test_process_batch_empty_input_files(self, mock_configure_logging, mock_test input_path='/test', output='/output', n=1, - generateIDs=False, + generate_ids=False, consolidate_header=False, consolidate_citations=False, include_raw_citations=False, @@ -535,7 +535,7 @@ def test_process_txt_unicode_error(self, mock_configure_logging, mock_test_serve result = client.process_txt( 'processCitationList', '/test/references.txt', - generateIDs=False, + generate_ids=False, consolidate_header=False, consolidate_citations=False, include_raw_citations=False, @@ -585,7 +585,7 @@ def test_process_pdf_timeout_error(self, mock_configure_logging, mock_test_serve result = client.process_pdf( 'processFulltextDocument', '/test/document.pdf', - generateIDs=False, + generate_ids=False, consolidate_header=False, consolidate_citations=False, include_raw_citations=False, @@ -600,7 +600,7 @@ def test_process_pdf_timeout_error(self, mock_configure_logging, mock_test_serve result = client.process_pdf( 'processFulltextDocument', '/test/document.pdf', - generateIDs=False, + generate_ids=False, consolidate_header=False, consolidate_citations=False, include_raw_citations=False, diff --git a/tests/test_integration.py b/tests/test_integration.py index 30675a9..1ec2dd5 100644 --- a/tests/test_integration.py +++ b/tests/test_integration.py @@ -171,7 +171,7 @@ def test_batch_processing(self): '/test', '/output', n=2, - generateIDs=False, + generate_ids=False, consolidate_header=False, consolidate_citations=False, include_raw_citations=False, @@ -332,7 +332,7 @@ def test_concurrent_processing_stress(self): '/test', '/output', n=5, # 5 concurrent threads - generateIDs=False, + generate_ids=False, consolidate_header=False, consolidate_citations=False, include_raw_citations=False,