From ed4ce8ed5f4b0c9a8ac58a6e40fb9c47219e0fe1 Mon Sep 17 00:00:00 2001
From: Luca Foppiano <luca@foppiano.org>
Date: Thu, 5 Mar 2026 15:19:12 +0100
Subject: [PATCH] refactor: standardize parameter naming from generateIDs to
 generate_ids, add snake_case CLI flags (camelCase kept as aliases), and
 update documentation

---
 Readme.md                      | 20 ++++++++---------
 grobid_client/grobid_client.py | 39 ++++++++++++++++++----------------
 tests/test_grobid_client.py    | 18 ++++++++--------
 tests/test_integration.py      |  4 ++--
 4 files changed, 42 insertions(+), 39 deletions(-)

diff --git a/Readme.md b/Readme.md
index 4a40f99..2005366 100644
--- a/Readme.md
+++ b/Readme.md
@@ -134,13 +134,13 @@ grobid_client [OPTIONS] SERVICE
 
 | Option                       | Description                               |
 |------------------------------|-------------------------------------------|
-| `--generateIDs`              | Generate random XML IDs                   |
+| `--generate_ids`             | Generate random XML IDs                   |
 | `--consolidate_header`       | Consolidate header metadata               |
 | `--consolidate_citations`    | Consolidate bibliographic references      |
 | `--include_raw_citations`    | Include raw citation text                 |
 | `--include_raw_affiliations` | Include raw affiliation text              |
-| `--teiCoordinates`           | Add PDF coordinates to XML                |
-| `--segmentSentences`         | Segment sentences with coordinates        |
+| `--tei_coordinates`          | Add PDF coordinates to XML                |
+| `--segment_sentences`        | Segment sentences with coordinates        |
 | `--flavor`                   | Processing flavor for fulltext extraction |
 | `--json`                     | Convert TEI output to JSON format         |
 | `--markdown`                 | Convert TEI output to Markdown format     |
@@ -153,7 +153,7 @@ grobid_client [OPTIONS] SERVICE
 grobid_client --input ~/documents --output ~/results processFulltextDocument
 
 # High concurrency with coordinates
-grobid_client --input ~/pdfs --output ~/tei --n 20 --teiCoordinates processFulltextDocument
+grobid_client --input ~/pdfs --output ~/tei --n 20 --tei_coordinates processFulltextDocument
 
 # Process with JSON output
 grobid_client --input ~/pdfs --output ~/results --json processFulltextDocument
@@ -165,7 +165,7 @@ grobid_client --input ~/pdfs --output ~/results --markdown processFulltextDocume
 grobid_client --server https://grobid.example.com --input ~/citations.txt processCitationList
 
 # Force reprocessing with sentence segmentation and JSON output
-grobid_client --input ~/docs --force --segmentSentences --json processFulltextDocument
+grobid_client --input ~/docs --force --segment_sentences --json processFulltextDocument
 ```
 
 ### Python Library
@@ -202,10 +202,10 @@ client.process(
     input_path="/path/to/pdfs",
     output_path="/path/to/output",
     n=10,
-    generateIDs=True,
+    generate_ids=True,
     consolidate_header=True,
-    teiCoordinates=True,
-    segmentSentences=True
+    tei_coordinates=True,
+    segment_sentences=True
 )
 
 # Process with JSON output
@@ -454,7 +454,7 @@ When using the `--json` flag, the client converts TEI XML output to a structured
 grobid_client --input pdfs/ --output results/ --json processFulltextDocument
 
 # JSON output with coordinates and sentence segmentation
-grobid_client --input pdfs/ --output results/ --json --teiCoordinates --segmentSentences processFulltextDocument
+grobid_client --input pdfs/ --output results/ --json --tei_coordinates --segment_sentences processFulltextDocument
 ```
 
 ```python
@@ -535,7 +535,7 @@ Competing interests statement...
 grobid_client --input pdfs/ --output results/ --markdown processFulltextDocument
 
 # Markdown output with coordinates and sentence segmentation
-grobid_client --input pdfs/ --output results/ --markdown --teiCoordinates --segmentSentences processFulltextDocument
+grobid_client --input pdfs/ --output results/ --markdown --tei_coordinates --segment_sentences processFulltextDocument
 ```
 
 ```python
diff --git a/grobid_client/grobid_client.py b/grobid_client/grobid_client.py
index 1af79d1..f1ff591 100644
--- a/grobid_client/grobid_client.py
+++ b/grobid_client/grobid_client.py
@@ -20,6 +20,7 @@
 import time
 import concurrent.futures
 import ntpath
+import re
 import requests
 import pathlib
 import logging
@@ -209,7 +210,6 @@ def _parse_file_size(self, size_str):
         size_str = str(size_str).upper().strip()
 
         # Extract number and unit
-        import re
         match = re.match(r'(\d+(?:\.\d+)?)\s*([KMGT]?B?)', size_str)
         if not match:
             return 10 * 1024 * 1024  # Default 10MB
@@ -329,7 +329,7 @@ def process(
             input_path,
             output=None,
             n=10,
-            generateIDs=False,
+            generate_ids=False,
             consolidate_header=True,
             consolidate_citations=False,
             include_raw_citations=False,
@@ -391,7 +391,7 @@ def process(
                     input_path,
                     output,
                     n,
-                    generateIDs,
+                    generate_ids,
                     consolidate_header,
                     consolidate_citations,
                     include_raw_citations,
@@ -417,7 +417,7 @@ def process(
                 input_path,
                 output,
                 n,
-                generateIDs,
+                generate_ids,
                 consolidate_header,
                 consolidate_citations,
                 include_raw_citations,
@@ -455,7 +455,7 @@ def process_batch(
             input_path,
             output,
             n,
-            generateIDs,
+            generate_ids,
             consolidate_header,
             consolidate_citations,
             include_raw_citations,
@@ -542,7 +542,7 @@ def process_batch(
                     selected_process,
                     service,
                     input_file,
-                    generateIDs,
+                    generate_ids,
                     consolidate_header,
                     consolidate_citations,
                     include_raw_citations,
@@ -639,7 +639,7 @@ def process_pdf(
             self,
             service,
             pdf_file,
-            generateIDs,
+            generate_ids,
             consolidate_header,
             consolidate_citations,
             include_raw_citations,
@@ -667,7 +667,7 @@ def process_pdf(
 
             # set the GROBID parameters
             the_data = {}
-            if generateIDs:
+            if generate_ids:
                 the_data["generateIDs"] = "1"
             if consolidate_header:
                 the_data["consolidateHeader"] = "1"
@@ -699,7 +699,7 @@ def process_pdf(
                     self.process_pdf,
                     service,
                     pdf_file,
-                    generateIDs,
+                    generate_ids,
                     consolidate_header,
                     consolidate_citations,
                     include_raw_citations,
@@ -734,7 +734,7 @@ def process_txt(
             self,
             service,
             txt_file,
-            generateIDs,
+            generate_ids,
             consolidate_header,
             consolidate_citations,
             include_raw_citations,
@@ -777,7 +777,7 @@ def process_txt(
                     self.process_txt,
                     service,
                     txt_file,
-                    generateIDs,
+                    generate_ids,
                     consolidate_header,
                     consolidate_citations,
                     include_raw_citations,
@@ -840,7 +840,8 @@ def main():
         help="concurrency for service usage"
     )
     parser.add_argument(
-        "--generateIDs",
+        "--generate_ids", "--generateIDs",
+        dest="generate_ids",
         action="store_true",
         help="generate random xml:id to textual XML elements of the result files",
     )
@@ -870,12 +871,14 @@ def main():
         help="force re-processing pdf input files when tei output files already exist",
     )
     parser.add_argument(
-        "--teiCoordinates",
+        "--tei_coordinates", "--teiCoordinates",
+        dest="tei_coordinates",
         action="store_true",
         help="add the original PDF coordinates (bounding boxes) to the extracted elements",
     )
     parser.add_argument(
-        "--segmentSentences",
+        "--segment_sentences", "--segmentSentences",
+        dest="segment_sentences",
         action="store_true",
         help="segment sentences in the text content of the document with additional <s> elements",
     )
@@ -951,14 +954,14 @@ def main():
             exit(1)
 
     service = args.service
-    generateIDs = args.generateIDs
+    generate_ids = args.generate_ids
     consolidate_header = args.consolidate_header
     consolidate_citations = args.consolidate_citations
     include_raw_citations = args.include_raw_citations
     include_raw_affiliations = args.include_raw_affiliations
     force = args.force
-    tei_coordinates = args.teiCoordinates
-    segment_sentences = args.segmentSentences
+    tei_coordinates = args.tei_coordinates
+    segment_sentences = args.segment_sentences
     verbose = args.verbose
 
     if service is None or service not in valid_services:
@@ -973,7 +976,7 @@ def main():
             input_path,
             output=output_path,
             n=n,
-            generateIDs=generateIDs,
+            generate_ids=generate_ids,
             consolidate_header=consolidate_header,
             consolidate_citations=consolidate_citations,
             include_raw_citations=include_raw_citations,
diff --git a/tests/test_grobid_client.py b/tests/test_grobid_client.py
index b6ab324..7dd6ec3 100644
--- a/tests/test_grobid_client.py
+++ b/tests/test_grobid_client.py
@@ -284,7 +284,7 @@ def test_process_pdf_success(self, mock_post, mock_file):
                 result = client.process_pdf(
                     'processFulltextDocument',
                     '/test/document.pdf',
-                    generateIDs=True,
+                    generate_ids=True,
                     consolidate_header=True,
                     consolidate_citations=False,
                     include_raw_citations=False,
@@ -308,7 +308,7 @@ def test_process_pdf_file_not_found(self, mock_file):
                 result = client.process_pdf(
                     'processFulltextDocument',
                     '/nonexistent/document.pdf',
-                    generateIDs=False,
+                    generate_ids=False,
                     consolidate_header=False,
                     consolidate_citations=False,
                     include_raw_citations=False,
@@ -335,7 +335,7 @@ def test_process_txt_success(self, mock_post, mock_file):
                 result = client.process_txt(
                     'processCitationList',
                     '/test/references.txt',
-                    generateIDs=False,
+                    generate_ids=False,
                     consolidate_header=False,
                     consolidate_citations=True,
                     include_raw_citations=True,
@@ -371,7 +371,7 @@ def test_process_pdf_server_busy_retry(self, mock_post):
                         result = client.process_pdf(
                             'processFulltextDocument',
                             '/test/document.pdf',
-                            generateIDs=False,
+                            generate_ids=False,
                             consolidate_header=False,
                             consolidate_citations=False,
                             include_raw_citations=False,
@@ -412,7 +412,7 @@ def test_process_batch(self, mock_isfile, mock_executor):
                                 '/test',
                                 '/output',
                                 n=2,
-                                generateIDs=False,
+                                generate_ids=False,
                                 consolidate_header=False,
                                 consolidate_citations=False,
                                 include_raw_citations=False,
@@ -493,7 +493,7 @@ def test_process_batch_empty_input_files(self, mock_configure_logging, mock_test
             input_path='/test',
             output='/output',
             n=1,
-            generateIDs=False,
+            generate_ids=False,
             consolidate_header=False,
             consolidate_citations=False,
             include_raw_citations=False,
@@ -535,7 +535,7 @@ def test_process_txt_unicode_error(self, mock_configure_logging, mock_test_serve
             result = client.process_txt(
                 'processCitationList',
                 '/test/references.txt',
-                generateIDs=False,
+                generate_ids=False,
                 consolidate_header=False,
                 consolidate_citations=False,
                 include_raw_citations=False,
@@ -585,7 +585,7 @@ def test_process_pdf_timeout_error(self, mock_configure_logging, mock_test_serve
                 result = client.process_pdf(
                     'processFulltextDocument',
                     '/test/document.pdf',
-                    generateIDs=False,
+                    generate_ids=False,
                     consolidate_header=False,
                     consolidate_citations=False,
                     include_raw_citations=False,
@@ -600,7 +600,7 @@ def test_process_pdf_timeout_error(self, mock_configure_logging, mock_test_serve
                     result = client.process_pdf(
                         'processFulltextDocument',
                         '/test/document.pdf',
-                        generateIDs=False,
+                        generate_ids=False,
                         consolidate_header=False,
                         consolidate_citations=False,
                         include_raw_citations=False,
diff --git a/tests/test_integration.py b/tests/test_integration.py
index 30675a9..1ec2dd5 100644
--- a/tests/test_integration.py
+++ b/tests/test_integration.py
@@ -171,7 +171,7 @@ def test_batch_processing(self):
                                         '/test',
                                         '/output',
                                         n=2,
-                                        generateIDs=False,
+                                        generate_ids=False,
                                         consolidate_header=False,
                                         consolidate_citations=False,
                                         include_raw_citations=False,
@@ -332,7 +332,7 @@ def test_concurrent_processing_stress(self):
                                         '/test',
                                         '/output',
                                         n=5,  # 5 concurrent threads
-                                        generateIDs=False,
+                                        generate_ids=False,
                                         consolidate_header=False,
                                         consolidate_citations=False,
                                         include_raw_citations=False,