Skip to content

Commit 04fec4e

Browse files
committed
feat: Add runtime and performance statistics for batch and overall processing.
1 parent 29f3b54 commit 04fec4e

File tree

1 file changed

+20
-0
lines changed

1 file changed

+20
-0
lines changed

grobid_client/grobid_client.py

Lines changed: 20 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -342,6 +342,7 @@ def process(
342342
json_output=False,
343343
markdown_output=False
344344
):
345+
start_time = time.time()
345346
batch_size_pdf = self.config["batch_size"]
346347

347348
# First pass: count all eligible files
@@ -433,11 +434,19 @@ def process(
433434
errors_files_count += batch_errors
434435
skipped_files_count += batch_skipped
435436

437+
runtime = time.time() - start_time
438+
docs_per_second = processed_files_count / runtime if runtime > 0 else 0
439+
seconds_per_doc = runtime / processed_files_count if processed_files_count > 0 else 0
440+
436441
# Log final statistics - always visible
437442
print(f"Processing completed: {processed_files_count} out of {total_files} files processed")
438443
print(f"Errors: {errors_files_count} out of {total_files} files processed")
439444
if skipped_files_count > 0:
440445
print(f"Skipped: {skipped_files_count} out of {total_files} files (already existed, use --force to reprocess)")
446+
447+
print(f"⏱️ Total runtime: {runtime:.2f} seconds")
448+
print(f"🚀 Speed: {docs_per_second:.2f} documents/second")
449+
print(f" Throughput: {seconds_per_doc:.2f} seconds/document")
441450

442451
def process_batch(
443452
self,
@@ -459,6 +468,7 @@ def process_batch(
459468
json_output=False,
460469
markdown_output=False
461470
):
471+
batch_start_time = time.time()
462472
if verbose:
463473
self.logger.info(f"{len(input_files)} files to process in current batch")
464474

@@ -613,6 +623,16 @@ def process_batch(
613623
except OSError as e:
614624
self.logger.error(f"Failed to write TEI XML file {filename}: {str(e)}")
615625

626+
# Calculate batch statistics
627+
batch_runtime = time.time() - batch_start_time
628+
batch_docs_per_second = processed_count / batch_runtime if batch_runtime > 0 else 0
629+
batch_seconds_per_docs = batch_runtime / processed_count if processed_count > 0 else 0
630+
631+
if verbose:
632+
self.logger.info(f"⏱️ Runtime: {batch_runtime:.2f} seconds")
633+
self.logger.info(f"🚀 Speed: {batch_docs_per_second:.2f} documents/second")
634+
self.logger.info(f" Throughput: {batch_seconds_per_docs:.2f} seconds/document")
635+
616636
return processed_count, error_count, skipped_count
617637

618638
def process_pdf(

0 commit comments

Comments
 (0)