73 changes: 66 additions & 7 deletions run.py
100755 → 100644
@@ -1,5 +1,6 @@
import sys
import os, fnmatch
from collections import defaultdict

from adsputils import setup_logging, load_config, get_date
from datetime import timedelta
@@ -16,6 +17,7 @@

app = tasks.app
logger = setup_logging('run.py')
processed_log = setup_logging('processed_subdirectories.py')


def run_diagnostics(bibcodes: list, source_filenames: list) -> None:
@@ -40,21 +42,38 @@ def run_diagnostics(bibcodes: list, source_filenames: list) -> None:

def get_source_filenames(source_file_path: str, file_extension: str, date_cutoff: time.struct_time) -> list:
"""
retrieves a list of files from the given directory with the specified file extension and modified date after the cutoff
Return a list of lists of matching files, grouped by the first-level
subdirectory under `source_file_path`. If files live directly in
`source_file_path`, they are grouped together as one inner list.

:param source_file_path: the path of the directory to search for files
:param file_extension: the file extension pattern to match
:param date_cutoff: the modified date cutoff; only files modified on or after this date are included
:return: list of files in the directory with modified date after the cutoff, if any
:return: list of lists of files in the directory with modified date after the cutoff, if any
"""
list_files = []
groups = defaultdict(list)
ROOT = "__ROOT__"

for root, dirs, files in os.walk(source_file_path):
for basename in files:
if fnmatch.fnmatch(basename, file_extension):
filename = os.path.join(root, basename)
if get_date_modified_struct_time(filename) >= date_cutoff:
list_files.append(filename)
return list_files
rel_dir = os.path.relpath(root, source_file_path)
key = ROOT if rel_dir in (".", "") else rel_dir.split(os.sep, 1)[0]
groups[key].append(filename)

if not groups:
return []

# Build a stable list-of-lists: root group first (if present), then subdirs sorted
result = []
if ROOT in groups:
result.append(sorted(groups[ROOT]))
for key in sorted(k for k in groups.keys() if k != ROOT):
result.append(sorted(groups[key]))
return result
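# A minimal sketch of the grouped return shape (hypothetical paths; assumes
# every file passes the date cutoff):
#
#   source_file_path/a.raw         -> root group, returned first
#   source_file_path/sub1/b.raw    -> grouped under "sub1"
#   source_file_path/sub1/x/c.raw  -> also "sub1" (only the first level counts)
#   source_file_path/sub2/d.raw    -> grouped under "sub2"
#
#   get_source_filenames(source_file_path, "*.raw", date_cutoff)
#   -> [['.../a.raw'], ['.../sub1/b.raw', '.../sub1/x/c.raw'], ['.../sub2/d.raw']]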



def queue_references(references: list, source_filename: str, source_bibcode: str, parsername: str) -> None:
@@ -276,6 +295,19 @@ def reprocess_references(reprocess_type: str, score_cutoff: float = 0, match_bib
dest='fail',
action='store_true',
help='Reprocess records that failed to get resolved')
resolve.add_argument('-t',
'--time_delay',
dest='time_delay',
action='store',
default=10.,
help='Add a time delay between processing subdirectories for large batches. The delay in seconds is the batch size divided by this value.')
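# A worked example of the pacing rule above: with --time_delay 10 (the argparse
# default), a subdirectory batch of 500 files pauses 500 / 10 = 50 seconds.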
resolve.add_argument('-sp',
'--skip_processed_directories',
dest='skip_processed',
action='store',
default=None,
help='Path to a file listing subdirectories that were already processed and should be skipped')


stats = subparsers.add_parser('STATS', help='Print out statistics of the reference source file')
stats.add_argument('-b',
@@ -316,6 +348,7 @@ def reprocess_references(reprocess_type: str, score_cutoff: float = 0, match_bib
help='Return all resolved bibcode')

args = parser.parse_args()

if args.action == 'DIAGNOSTICS':
if args.parse_filename:
@@ -345,8 +378,34 @@ def reprocess_references(reprocess_type: str, score_cutoff: float = 0, match_bib
else:
date_cutoff = get_date('1972')
source_filenames = get_source_filenames(args.path, args.extension, date_cutoff.timetuple())
if args.time_delay:
delay_rate = args.time_delay
else:
delay_rate = 1000.
Review comment (Member): Make 1000. a value set in the config (like DEFAULT_DELAY)
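A minimal sketch of that suggestion, assuming config = load_config() is available in run.py (DEFAULT_DELAY is a hypothetical config key):

# config.py -- hypothetical entry
DEFAULT_DELAY = 1000.

# run.py -- fall back to the configured value instead of the hard-coded literal
delay_rate = args.time_delay if args.time_delay else config.get('DEFAULT_DELAY', 1000.)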

if len(source_filenames) > 0:
process_files(source_filenames)
for subdir in source_filenames:
subdir_name = os.path.dirname(subdir[0])
delay_time = float(len(subdir))/float(delay_rate)
skip_files = []
if args.skip_processed:
skip_file = args.skip_processed
try:
with open(skip_file, 'r') as file:
skip_files = file.read().splitlines()
print(f'Skipping {len(skip_files)} subdirectories')
except (IOError, OSError):
print('No files to skip')
if subdir_name not in skip_files:
process_files(subdir)
processed_log.info(f"{subdir_name}")
logger.info(f"Processed subdirectoy: {subdir_name}")
print(f"Processed subdirectoy: {subdir_name}")
logger.info(f"Pause for {delay_time} seconds to process")
print(f"Pause for {delay_time} seconds to process")
time.sleep(delay_time)
else:
print(f'Skipping {subdir_name}')
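# For reference, the skip file read above is plain text, one previously
# processed subdirectory path per line, e.g. (hypothetical paths):
#   /data/refs/sub1
#   /data/refs/sub2
# The names match what processed_log records, so a prior run's output can be
# turned into a skip list for --skip_processed (stripping any prefixes the
# logger adds to each line).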
elif args.confidence:
date_cutoff = get_date() - timedelta(days=int(args.days)) if args.days else None
reprocess_references(ReprocessQueryType.score, score_cutoff=float(args.confidence), date_cutoff=date_cutoff)
@@ -391,4 +450,4 @@ def reprocess_references(reprocess_type: str, score_cutoff: float = 0, match_bib
# if args.all:
# else:

sys.exit(0)