-
Notifications
You must be signed in to change notification settings - Fork 82
Add --server to override any baseurl #90
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
Merged
Merged
Changes from all commits
Commits
Show all changes
11 commits
Select commit
Hold shift + click to select a range
fa1cf65
Add --server to override any baseurl
lfoppiano c807b6c
Update tests
lfoppiano 28ec839
remove unused constructor params
lfoppiano b67c3ea
update default values override
lfoppiano 9a2a206
Update grobid_client/grobid_client.py
lfoppiano 8f29225
cleanup
lfoppiano 9a7cc75
Remove duplicated code
lfoppiano 5f6af7c
Update grobid_client/grobid_client.py
lfoppiano 1b08be7
basically I'm rewrite it
lfoppiano 88c143d
update tests
lfoppiano 69f5599
fix error handling properly
lfoppiano File filter
Filter by extension
Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
There are no files selected for viewing
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -1,5 +1,14 @@ | ||
| from grobid_client.grobid_client import GrobidClient | ||
|
|
||
| if __name__ == "__main__": | ||
| # Example 1: Using config file values (no constructor parameters) | ||
| client = GrobidClient(config_path="./config.json") | ||
| client.process("processFulltextDocument", "./resources/test_pdf", output="./resources/test_out/", consolidate_citations=True, tei_coordinates=True, force=True) | ||
|
|
||
| # Example 2: Overriding config file with explicit server parameter | ||
| # client = GrobidClient(grobid_server="https://lfoppiano-grobid.hf.space", config_path="./config.json") | ||
| # client.process("processFulltextDocument", "./resources/test_pdf", output="./resources/test_out/", consolidate_citations=True, tei_coordinates=True, force=True) | ||
|
|
||
| # Example 3: Using default values (no config file, no parameters) | ||
| # client = GrobidClient() | ||
| # client.process("processFulltextDocument", "./resources/test_pdf", output="./resources/test_out/", consolidate_citations=True, tei_coordinates=True, force=True) |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Original file line number | Diff line number | Diff line change | ||||||
|---|---|---|---|---|---|---|---|---|
|
|
@@ -24,6 +24,7 @@ | |||||||
| import pathlib | ||||||||
| import logging | ||||||||
| from typing import Tuple | ||||||||
| import copy | ||||||||
|
|
||||||||
| from .client import ApiClient | ||||||||
|
|
||||||||
|
|
@@ -37,60 +38,91 @@ def __init__(self, message="GROBID server is not available"): | |||||||
|
|
||||||||
|
|
||||||||
| class GrobidClient(ApiClient): | ||||||||
| # Default configuration values | ||||||||
| DEFAULT_CONFIG = { | ||||||||
| 'grobid_server': 'http://localhost:8070', | ||||||||
| 'batch_size': 1000, | ||||||||
| 'sleep_time': 5, | ||||||||
| 'timeout': 180, | ||||||||
| 'coordinates': [ | ||||||||
| "title", | ||||||||
| "persName", | ||||||||
| "affiliation", | ||||||||
| "orgName", | ||||||||
| "formula", | ||||||||
| "figure", | ||||||||
| "ref", | ||||||||
| "biblStruct", | ||||||||
| "head", | ||||||||
| "p", | ||||||||
| "s", | ||||||||
| "note" | ||||||||
| ], | ||||||||
| 'logging': { | ||||||||
| 'level': 'INFO', | ||||||||
| 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', | ||||||||
| 'console': True, | ||||||||
| 'file': None, # Disabled by default | ||||||||
| 'max_file_size': '10MB', | ||||||||
| 'backup_count': 3 | ||||||||
| } | ||||||||
| } | ||||||||
|
|
||||||||
| def __init__( | ||||||||
| self, | ||||||||
| grobid_server='http://localhost:8070', | ||||||||
| batch_size=1000, | ||||||||
| grobid_server=None, | ||||||||
| batch_size=None, | ||||||||
| coordinates=None, | ||||||||
| sleep_time=5, | ||||||||
| timeout=180, | ||||||||
| sleep_time=None, | ||||||||
| timeout=None, | ||||||||
| config_path=None, | ||||||||
| check_server=True | ||||||||
| ): | ||||||||
| # Set default coordinates if None provided | ||||||||
| if coordinates is None: | ||||||||
| coordinates = [ | ||||||||
| "title", | ||||||||
| "persName", | ||||||||
| "affiliation", | ||||||||
| "orgName", | ||||||||
| "formula", | ||||||||
| "figure", | ||||||||
| "ref", | ||||||||
| "biblStruct", | ||||||||
| "head", | ||||||||
| "p", | ||||||||
| "s", | ||||||||
| "note" | ||||||||
| ] | ||||||||
|
|
||||||||
| self.config = { | ||||||||
| # Initialize config with defaults | ||||||||
| self.config = copy.deepcopy(self.DEFAULT_CONFIG) | ||||||||
|
|
||||||||
| # Load config file (which may override current values) | ||||||||
| if config_path: | ||||||||
| self._load_config(config_path) | ||||||||
|
|
||||||||
| # Constructor parameters take precedence over config file values | ||||||||
| # This ensures CLI arguments override config file values | ||||||||
| self._set_config_params({ | ||||||||
| 'grobid_server': grobid_server, | ||||||||
| 'batch_size': batch_size, | ||||||||
| 'coordinates': coordinates, | ||||||||
| 'sleep_time': sleep_time, | ||||||||
| 'timeout': timeout, | ||||||||
| 'logging': { | ||||||||
| 'level': 'INFO', | ||||||||
| 'format': '%(asctime)s - %(name)s - %(levelname)s - %(message)s', | ||||||||
| 'console': True, | ||||||||
| 'file': None, # Disabled by default | ||||||||
| 'max_file_size': '10MB', | ||||||||
| 'backup_count': 3 | ||||||||
| } | ||||||||
| } | ||||||||
|
|
||||||||
| # Load config first (which may override logging settings) | ||||||||
| if config_path: | ||||||||
| self._load_config(config_path) | ||||||||
| 'timeout': timeout | ||||||||
| }) | ||||||||
|
|
||||||||
| # Configure logging based on config | ||||||||
| self._configure_logging() | ||||||||
|
|
||||||||
| if check_server: | ||||||||
| self._test_server_connection() | ||||||||
|
|
||||||||
| def _set_config_params(self, params): | ||||||||
| """Set configuration parameters, only if they are not None.""" | ||||||||
| for key, value in params.items(): | ||||||||
| if value is not None: | ||||||||
| self.config[key] = value | ||||||||
|
|
||||||||
| def _handle_server_busy_retry(self, file_path, retry_func, *args, **kwargs): | ||||||||
| """Handle server busy (503) retry logic.""" | ||||||||
| self.logger.warning(f"Server busy (503), retrying {file_path} after {self.config['sleep_time']} seconds") | ||||||||
| time.sleep(self.config["sleep_time"]) | ||||||||
| return retry_func(*args, **kwargs) | ||||||||
|
|
||||||||
| def _handle_request_error(self, file_path, error, error_type="Request"): | ||||||||
| """Handle request errors with consistent logging and return format.""" | ||||||||
| self.logger.error(f"{error_type} failed for {file_path}: {str(error)}") | ||||||||
| return (file_path, 500, f"{error_type} failed: {str(error)}") | ||||||||
|
|
||||||||
| def _handle_unexpected_error(self, file_path, error): | ||||||||
| """Handle unexpected errors with consistent logging and return format.""" | ||||||||
| self.logger.error(f"Unexpected error processing {file_path}: {str(error)}") | ||||||||
| return (file_path, 500, f"Unexpected error: {str(error)}") | ||||||||
|
|
||||||||
| def _configure_logging(self): | ||||||||
| """Configure logging based on the configuration settings.""" | ||||||||
| # Get logging config with defaults | ||||||||
|
|
@@ -479,56 +511,53 @@ def process_pdf( | |||||||
| start=-1, | ||||||||
| end=-1 | ||||||||
| ): | ||||||||
| pdf_handle = None | ||||||||
lfoppiano marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||
| try: | ||||||||
| pdf_handle = open(pdf_file, "rb") | ||||||||
| except IOError as e: | ||||||||
| self.logger.error(f"Failed to open PDF file {pdf_file}: {str(e)}") | ||||||||
| return (pdf_file, 500, f"Failed to open file: {str(e)}") | ||||||||
|
|
||||||||
| files = { | ||||||||
| "input": ( | ||||||||
| pdf_file, | ||||||||
| pdf_handle, | ||||||||
| "application/pdf", | ||||||||
| {"Expires": "0"}, | ||||||||
| ) | ||||||||
| } | ||||||||
|
|
||||||||
| the_url = self.get_server_url(service) | ||||||||
|
|
||||||||
| files = { | ||||||||
| "input": ( | ||||||||
| pdf_file, | ||||||||
| pdf_handle, | ||||||||
| "application/pdf", | ||||||||
| {"Expires": "0"}, | ||||||||
| ) | ||||||||
| } | ||||||||
|
|
||||||||
| # set the GROBID parameters | ||||||||
| the_data = {} | ||||||||
| if generateIDs: | ||||||||
| the_data["generateIDs"] = "1" | ||||||||
| if consolidate_header: | ||||||||
| the_data["consolidateHeader"] = "1" | ||||||||
| if consolidate_citations: | ||||||||
| the_data["consolidateCitations"] = "1" | ||||||||
| if include_raw_citations: | ||||||||
| the_data["includeRawCitations"] = "1" | ||||||||
| if include_raw_affiliations: | ||||||||
| the_data["includeRawAffiliations"] = "1" | ||||||||
| if tei_coordinates: | ||||||||
| the_data["teiCoordinates"] = self.config["coordinates"] | ||||||||
| if segment_sentences: | ||||||||
| the_data["segmentSentences"] = "1" | ||||||||
| if flavor: | ||||||||
| the_data["flavor"] = flavor | ||||||||
| if start and start > 0: | ||||||||
| the_data["start"] = str(start) | ||||||||
| if end and end > 0: | ||||||||
| the_data["end"] = str(end) | ||||||||
| the_url = self.get_server_url(service) | ||||||||
|
|
||||||||
| # set the GROBID parameters | ||||||||
| the_data = {} | ||||||||
| if generateIDs: | ||||||||
| the_data["generateIDs"] = "1" | ||||||||
| if consolidate_header: | ||||||||
| the_data["consolidateHeader"] = "1" | ||||||||
| if consolidate_citations: | ||||||||
| the_data["consolidateCitations"] = "1" | ||||||||
| if include_raw_citations: | ||||||||
| the_data["includeRawCitations"] = "1" | ||||||||
| if include_raw_affiliations: | ||||||||
| the_data["includeRawAffiliations"] = "1" | ||||||||
| if tei_coordinates: | ||||||||
| the_data["teiCoordinates"] = self.config["coordinates"] | ||||||||
| if segment_sentences: | ||||||||
| the_data["segmentSentences"] = "1" | ||||||||
| if flavor: | ||||||||
| the_data["flavor"] = flavor | ||||||||
| if start and start > 0: | ||||||||
| the_data["start"] = str(start) | ||||||||
| if end and end > 0: | ||||||||
| the_data["end"] = str(end) | ||||||||
|
|
||||||||
| try: | ||||||||
| res, status = self.post( | ||||||||
| url=the_url, files=files, data=the_data, headers={"Accept": "text/plain"}, | ||||||||
| timeout=self.config['timeout'] | ||||||||
| ) | ||||||||
|
|
||||||||
| if status == 503: | ||||||||
| self.logger.warning(f"Server busy (503), retrying {pdf_file} after {self.config['sleep_time']} seconds") | ||||||||
| time.sleep(self.config["sleep_time"]) | ||||||||
| return self.process_pdf( | ||||||||
| return self._handle_server_busy_retry( | ||||||||
| pdf_file, | ||||||||
| self.process_pdf, | ||||||||
| service, | ||||||||
| pdf_file, | ||||||||
| generateIDs, | ||||||||
|
|
@@ -542,21 +571,22 @@ def process_pdf( | |||||||
| start, | ||||||||
| end | ||||||||
| ) | ||||||||
|
|
||||||||
| return (pdf_file, status, res.text) | ||||||||
|
|
||||||||
| except IOError as e: | ||||||||
| self.logger.error(f"Failed to open PDF file {pdf_file}: {str(e)}") | ||||||||
| return (pdf_file, 400, f"Failed to open file: {str(e)}") | ||||||||
lfoppiano marked this conversation as resolved.
Show resolved
Hide resolved
|
||||||||
| except requests.exceptions.ReadTimeout as e: | ||||||||
| self.logger.error(f"Request timeout for {pdf_file}: {str(e)}") | ||||||||
| pdf_handle.close() | ||||||||
| return (pdf_file, 408, f"Request timeout: {str(e)}") | ||||||||
| except requests.exceptions.RequestException as e: | ||||||||
| self.logger.error(f"Request failed for {pdf_file}: {str(e)}") | ||||||||
| pdf_handle.close() | ||||||||
| return (pdf_file, 500, f"Request failed: {str(e)}") | ||||||||
| return self._handle_request_error(pdf_file, e) | ||||||||
| except Exception as e: | ||||||||
| self.logger.error(f"Unexpected error processing {pdf_file}: {str(e)}") | ||||||||
| pdf_handle.close() | ||||||||
| return (pdf_file, 500, f"Unexpected error: {str(e)}") | ||||||||
|
|
||||||||
| pdf_handle.close() | ||||||||
| return (pdf_file, status, res.text) | ||||||||
| return self._handle_unexpected_error(pdf_file, e) | ||||||||
|
||||||||
| return self._handle_unexpected_error(pdf_file, e) | |
| return self._handle_unexpected_error(pdf_file, e) | |
| finally: |
Oops, something went wrong.
Add this suggestion to a batch that can be applied as a single commit.
This suggestion is invalid because no changes were made to the code.
Suggestions cannot be applied while the pull request is closed.
Suggestions cannot be applied while viewing a subset of changes.
Only one suggestion per line can be applied in a batch.
Add this suggestion to a batch that can be applied as a single commit.
Applying suggestions on deleted lines is not supported.
You must change the existing code in this line in order to create a valid suggestion.
Outdated suggestions cannot be applied.
This suggestion has been applied or marked resolved.
Suggestions cannot be applied from pending reviews.
Suggestions cannot be applied on multi-line comments.
Suggestions cannot be applied while the pull request is queued to merge.
Suggestion cannot be applied right now. Please check back later.
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
The _set_config_params method is called twice with the same parameters (lines 84-90 and 98-104). This duplication means config file values will always be overwritten by constructor parameters, making the first call redundant. Consider removing the first call or restructuring the logic.