Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
20 changes: 10 additions & 10 deletions Readme.md
Original file line number Diff line number Diff line change
Expand Up @@ -134,13 +134,13 @@ grobid_client [OPTIONS] SERVICE

| Option | Description |
|------------------------------|-------------------------------------------|
| `--generateIDs` | Generate random XML IDs |
| `--generate_ids` | Generate random XML IDs |
| `--consolidate_header` | Consolidate header metadata |
| `--consolidate_citations` | Consolidate bibliographic references |
| `--include_raw_citations` | Include raw citation text |
| `--include_raw_affiliations` | Include raw affiliation text |
| `--teiCoordinates` | Add PDF coordinates to XML |
| `--segmentSentences` | Segment sentences with coordinates |
| `--tei_coordinates` | Add PDF coordinates to XML |
| `--segment_sentences` | Segment sentences with coordinates |
| `--flavor` | Processing flavor for fulltext extraction |
| `--json` | Convert TEI output to JSON format |
| `--markdown` | Convert TEI output to Markdown format |
Expand All @@ -153,7 +153,7 @@ grobid_client [OPTIONS] SERVICE
grobid_client --input ~/documents --output ~/results processFulltextDocument

# High concurrency with coordinates
grobid_client --input ~/pdfs --output ~/tei --n 20 --teiCoordinates processFulltextDocument
grobid_client --input ~/pdfs --output ~/tei --n 20 --tei_coordinates processFulltextDocument

# Process with JSON output
grobid_client --input ~/pdfs --output ~/results --json processFulltextDocument
Expand All @@ -165,7 +165,7 @@ grobid_client --input ~/pdfs --output ~/results --markdown processFulltextDocume
grobid_client --server https://grobid.example.com --input ~/citations.txt processCitationList

# Force reprocessing with sentence segmentation and JSON output
grobid_client --input ~/docs --force --segmentSentences --json processFulltextDocument
grobid_client --input ~/docs --force --segment_sentences --json processFulltextDocument
```

### Python Library
Expand Down Expand Up @@ -202,10 +202,10 @@ client.process(
input_path="/path/to/pdfs",
output="/path/to/output",
n=10,
generateIDs=True,
generate_ids=True,
consolidate_header=True,
teiCoordinates=True,
segmentSentences=True
tei_coordinates=True,
segment_sentences=True
)

# Process with JSON output
Expand Down Expand Up @@ -454,7 +454,7 @@ When using the `--json` flag, the client converts TEI XML output to a structured
grobid_client --input pdfs/ --output results/ --json processFulltextDocument

# JSON output with coordinates and sentence segmentation
grobid_client --input pdfs/ --output results/ --json --teiCoordinates --segmentSentences processFulltextDocument
grobid_client --input pdfs/ --output results/ --json --tei_coordinates --segment_sentences processFulltextDocument
```

```python
Expand Down Expand Up @@ -535,7 +535,7 @@ Competing interests statement...
grobid_client --input pdfs/ --output results/ --markdown processFulltextDocument

# Markdown output with coordinates and sentence segmentation
grobid_client --input pdfs/ --output results/ --markdown --teiCoordinates --segmentSentences processFulltextDocument
grobid_client --input pdfs/ --output results/ --markdown --tei_coordinates --segment_sentences processFulltextDocument
```

```python
Expand Down
39 changes: 21 additions & 18 deletions grobid_client/grobid_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -20,6 +20,7 @@
import time
import concurrent.futures
import ntpath
import re
import requests
import pathlib
import logging
Expand Down Expand Up @@ -209,7 +210,6 @@ def _parse_file_size(self, size_str):
size_str = str(size_str).upper().strip()

# Extract number and unit
import re
match = re.match(r'(\d+(?:\.\d+)?)\s*([KMGT]?B?)', size_str)
if not match:
return 10 * 1024 * 1024 # Default 10MB
Expand Down Expand Up @@ -329,7 +329,7 @@ def process(
input_path,
output=None,
n=10,
generateIDs=False,
generate_ids=False,
consolidate_header=True,
consolidate_citations=False,
include_raw_citations=False,
Expand Down Expand Up @@ -391,7 +391,7 @@ def process(
input_path,
output,
n,
generateIDs,
generate_ids,
consolidate_header,
consolidate_citations,
include_raw_citations,
Expand All @@ -417,7 +417,7 @@ def process(
input_path,
output,
n,
generateIDs,
generate_ids,
consolidate_header,
consolidate_citations,
include_raw_citations,
Expand Down Expand Up @@ -455,7 +455,7 @@ def process_batch(
input_path,
output,
n,
generateIDs,
generate_ids,
consolidate_header,
consolidate_citations,
include_raw_citations,
Expand Down Expand Up @@ -542,7 +542,7 @@ def process_batch(
selected_process,
service,
input_file,
generateIDs,
generate_ids,
consolidate_header,
consolidate_citations,
include_raw_citations,
Expand Down Expand Up @@ -639,7 +639,7 @@ def process_pdf(
self,
service,
pdf_file,
generateIDs,
generate_ids,
consolidate_header,
consolidate_citations,
include_raw_citations,
Expand Down Expand Up @@ -667,7 +667,7 @@ def process_pdf(

# set the GROBID parameters
the_data = {}
if generateIDs:
if generate_ids:
the_data["generateIDs"] = "1"
if consolidate_header:
the_data["consolidateHeader"] = "1"
Expand Down Expand Up @@ -699,7 +699,7 @@ def process_pdf(
self.process_pdf,
service,
pdf_file,
generateIDs,
generate_ids,
consolidate_header,
consolidate_citations,
include_raw_citations,
Expand Down Expand Up @@ -734,7 +734,7 @@ def process_txt(
self,
service,
txt_file,
generateIDs,
generate_ids,
consolidate_header,
consolidate_citations,
include_raw_citations,
Expand Down Expand Up @@ -777,7 +777,7 @@ def process_txt(
self.process_txt,
service,
txt_file,
generateIDs,
generate_ids,
consolidate_header,
consolidate_citations,
include_raw_citations,
Expand Down Expand Up @@ -840,7 +840,8 @@ def main():
help="concurrency for service usage"
)
parser.add_argument(
"--generateIDs",
"--generate_ids", "--generateIDs",
dest="generate_ids",
action="store_true",
help="generate random xml:id to textual XML elements of the result files",
)
Expand Down Expand Up @@ -870,12 +871,14 @@ def main():
help="force re-processing pdf input files when tei output files already exist",
)
parser.add_argument(
"--teiCoordinates",
"--tei_coordinates", "--teiCoordinates",
dest="tei_coordinates",
action="store_true",
help="add the original PDF coordinates (bounding boxes) to the extracted elements",
)
parser.add_argument(
"--segmentSentences",
"--segment_sentences", "--segmentSentences",
dest="segment_sentences",
action="store_true",
help="segment sentences in the text content of the document with additional <s> elements",
)
Expand Down Expand Up @@ -951,14 +954,14 @@ def main():
exit(1)

service = args.service
generateIDs = args.generateIDs
generate_ids = args.generate_ids
consolidate_header = args.consolidate_header
consolidate_citations = args.consolidate_citations
include_raw_citations = args.include_raw_citations
include_raw_affiliations = args.include_raw_affiliations
force = args.force
tei_coordinates = args.teiCoordinates
segment_sentences = args.segmentSentences
tei_coordinates = args.tei_coordinates
segment_sentences = args.segment_sentences
verbose = args.verbose

if service is None or service not in valid_services:
Expand All @@ -973,7 +976,7 @@ def main():
input_path,
output=output_path,
n=n,
generateIDs=generateIDs,
generate_ids=generate_ids,
consolidate_header=consolidate_header,
consolidate_citations=consolidate_citations,
include_raw_citations=include_raw_citations,
Expand Down
18 changes: 9 additions & 9 deletions tests/test_grobid_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def test_process_pdf_success(self, mock_post, mock_file):
result = client.process_pdf(
'processFulltextDocument',
'/test/document.pdf',
generateIDs=True,
generate_ids=True,
consolidate_header=True,
consolidate_citations=False,
include_raw_citations=False,
Expand All @@ -308,7 +308,7 @@ def test_process_pdf_file_not_found(self, mock_file):
result = client.process_pdf(
'processFulltextDocument',
'/nonexistent/document.pdf',
generateIDs=False,
generate_ids=False,
consolidate_header=False,
consolidate_citations=False,
include_raw_citations=False,
Expand All @@ -335,7 +335,7 @@ def test_process_txt_success(self, mock_post, mock_file):
result = client.process_txt(
'processCitationList',
'/test/references.txt',
generateIDs=False,
generate_ids=False,
consolidate_header=False,
consolidate_citations=True,
include_raw_citations=True,
Expand Down Expand Up @@ -371,7 +371,7 @@ def test_process_pdf_server_busy_retry(self, mock_post):
result = client.process_pdf(
'processFulltextDocument',
'/test/document.pdf',
generateIDs=False,
generate_ids=False,
consolidate_header=False,
consolidate_citations=False,
include_raw_citations=False,
Expand Down Expand Up @@ -412,7 +412,7 @@ def test_process_batch(self, mock_isfile, mock_executor):
'/test',
'/output',
n=2,
generateIDs=False,
generate_ids=False,
consolidate_header=False,
consolidate_citations=False,
include_raw_citations=False,
Expand Down Expand Up @@ -493,7 +493,7 @@ def test_process_batch_empty_input_files(self, mock_configure_logging, mock_test
input_path='/test',
output='/output',
n=1,
generateIDs=False,
generate_ids=False,
consolidate_header=False,
consolidate_citations=False,
include_raw_citations=False,
Expand Down Expand Up @@ -535,7 +535,7 @@ def test_process_txt_unicode_error(self, mock_configure_logging, mock_test_serve
result = client.process_txt(
'processCitationList',
'/test/references.txt',
generateIDs=False,
generate_ids=False,
consolidate_header=False,
consolidate_citations=False,
include_raw_citations=False,
Expand Down Expand Up @@ -585,7 +585,7 @@ def test_process_pdf_timeout_error(self, mock_configure_logging, mock_test_serve
result = client.process_pdf(
'processFulltextDocument',
'/test/document.pdf',
generateIDs=False,
generate_ids=False,
consolidate_header=False,
consolidate_citations=False,
include_raw_citations=False,
Expand All @@ -600,7 +600,7 @@ def test_process_pdf_timeout_error(self, mock_configure_logging, mock_test_serve
result = client.process_pdf(
'processFulltextDocument',
'/test/document.pdf',
generateIDs=False,
generate_ids=False,
consolidate_header=False,
consolidate_citations=False,
include_raw_citations=False,
Expand Down
4 changes: 2 additions & 2 deletions tests/test_integration.py
Original file line number Diff line number Diff line change
Expand Up @@ -171,7 +171,7 @@ def test_batch_processing(self):
'/test',
'/output',
n=2,
generateIDs=False,
generate_ids=False,
consolidate_header=False,
consolidate_citations=False,
include_raw_citations=False,
Expand Down Expand Up @@ -332,7 +332,7 @@ def test_concurrent_processing_stress(self):
'/test',
'/output',
n=5, # 5 concurrent threads
generateIDs=False,
generate_ids=False,
consolidate_header=False,
consolidate_citations=False,
include_raw_citations=False,
Expand Down