def get_source_filenames(source_file_path: str, file_extension: str, date_cutoff: time.struct_time) -> list:
    """
    Return matching files grouped by the first-level subdirectory of `source_file_path`.

    The directory tree is walked recursively. A file is kept when its basename
    matches `file_extension` and its modified date is on or after `date_cutoff`.
    Files that live directly in `source_file_path` form one group; files found
    anywhere below a first-level subdirectory are grouped under that subdirectory.

    :param source_file_path: the path of the directory to search for files
    :param file_extension: the file name pattern to match (fnmatch style)
    :param date_cutoff: the modified date cutoff, only files modified on or after this date are included
    :return: list of lists of file paths; every inner list is sorted, the group of
        files directly under `source_file_path` (if any) comes first, followed by one
        group per first-level subdirectory in sorted name order; empty list if nothing matched
    """
    # The empty string can never be a directory name, so it is a collision-free
    # key for top-level files (a literal marker such as "__ROOT__" could clash
    # with a real subdirectory). It also sorts ahead of every other key, which
    # keeps the top-level group first in the result.
    top_level_key = ""
    groups = defaultdict(list)

    for root, _dirs, files in os.walk(source_file_path):
        selected = []
        for basename in files:
            if not fnmatch.fnmatch(basename, file_extension):
                continue
            filename = os.path.join(root, basename)
            if get_date_modified_struct_time(filename) >= date_cutoff:
                selected.append(filename)
        if not selected:
            continue

        # The group key depends only on the directory, so derive it once per
        # directory rather than once per file.
        rel_dir = os.path.relpath(root, source_file_path)
        if rel_dir in (os.curdir, ""):
            key = top_level_key
        else:
            key = rel_dir.split(os.sep, 1)[0]
        groups[key].extend(selected)

    # Sorting keys and members makes the output independent of os.walk order.
    return [sorted(groups[key]) for key in sorted(groups)]
The delay time is batch size divided by input value in seconds.') + resolve.add_argument('-sp', + '--skip_processed_directories', + dest='skip_processed', + action='store', + default=None, + help='Skip directories that have been previously processed') + stats = subparsers.add_parser('STATS', help='Print out statistics of the reference source file') stats.add_argument('-b', @@ -316,6 +348,7 @@ def reprocess_references(reprocess_type: str, score_cutoff: float = 0, match_bib help='Return all resolved bibcode') args = parser.parse_args() + #import pdb;pdb.set_trace() if args.action == 'DIAGNOSTICS': if args.parse_filename: @@ -345,8 +378,34 @@ def reprocess_references(reprocess_type: str, score_cutoff: float = 0, match_bib else: date_cutoff = get_date('1972') source_filenames = get_source_filenames(args.path, args.extension, date_cutoff.timetuple()) + if args.time_delay: + delay_rate = args.time_delay + else: + delay_rate = 1000. if len(source_filenames) > 0: - process_files(source_filenames) + for subdir in source_filenames: + subdir_name = subdir[0].split('/') + subdir_name = "/".join(subdir_name[:-1]) + delay_time = float(len(subdir))/float(delay_rate) + if args.skip_processed: + skip_file = args.skip_processed + try: + with open(skip_file,'r') as file: + skip_files = file.read().splitlines() + print(f'Skipping {len(skip_files)} subdirectories') + except: + skip_files = [] + print('No files to skip') + if subdir_name not in skip_files: + process_files(subdir) + processed_log.info(f"{subdir_name}") + logger.info(f"Processed subdirectoy: {subdir_name}") + print(f"Processed subdirectoy: {subdir_name}") + logger.info(f"Pause for {delay_time} seconds to process") + print(f"Pause for {delay_time} seconds to process") + time.sleep(delay_time) + else: + print(f'Skipping {subdir_name}') elif args.confidence: date_cutoff = get_date() - timedelta(days=int(args.days)) if args.days else None reprocess_references(ReprocessQueryType.score, score_cutoff=float(args.confidence), 
date_cutoff=date_cutoff) @@ -391,4 +450,4 @@ def reprocess_references(reprocess_type: str, score_cutoff: float = 0, match_bib # if args.all: # else: - sys.exit(0) \ No newline at end of file + sys.exit(0) From 0fcb42eb9825b21fb9e6e09c41cd4d295a9a9682 Mon Sep 17 00:00:00 2001 From: thomasallen Date: Tue, 21 Oct 2025 10:29:32 -0700 Subject: [PATCH 2/3] type hints --- run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run.py b/run.py index 598d3d0..0c7eabb 100644 --- a/run.py +++ b/run.py @@ -40,7 +40,7 @@ def run_diagnostics(bibcodes: list, source_filenames: list) -> None: return -def get_source_filenames(source_file_path, file_extension, date_cutoff): +def get_source_filenames(source_file_path: str, file_extension: str, date_cutoff: time.struct_time) -> list): """ Return a list of lists of matching files, grouped by the first-level subdirectory under `source_file_path`. If files live directly in From ae3bac8c917073cd830374d9d40c53fcd7a194ec Mon Sep 17 00:00:00 2001 From: thomasallen Date: Tue, 21 Oct 2025 10:30:51 -0700 Subject: [PATCH 3/3] fix typo --- run.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/run.py b/run.py index 0c7eabb..6d97120 100644 --- a/run.py +++ b/run.py @@ -40,7 +40,7 @@ def run_diagnostics(bibcodes: list, source_filenames: list) -> None: return -def get_source_filenames(source_file_path: str, file_extension: str, date_cutoff: time.struct_time) -> list): +def get_source_filenames(source_file_path: str, file_extension: str, date_cutoff: time.struct_time) -> list: """ Return a list of lists of matching files, grouped by the first-level subdirectory under `source_file_path`. If files live directly in