From cbc75575772751da68f52888ef0862e23c03bf13 Mon Sep 17 00:00:00 2001 From: golnazads <28757512+golnazads@users.noreply.github.com> Date: Fri, 22 Mar 2024 09:03:24 -0400 Subject: [PATCH 1/9] fix for issue 3, for when the file is empty --- adsrefpipe/refparsers/toREFs.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/adsrefpipe/refparsers/toREFs.py b/adsrefpipe/refparsers/toREFs.py index f9c15f6..1e3745b 100644 --- a/adsrefpipe/refparsers/toREFs.py +++ b/adsrefpipe/refparsers/toREFs.py @@ -431,7 +431,7 @@ def get_references(self, filename, encoding="utf8"): except Exception as error: logger.error("Unable to open file %s. Exception %s." % (filename, error)) return [] - if buffer is None: + if not buffer: return [] return self.get_reference_blob(buffer, self.detect_ref_format(buffer)) From 2183cb5ee5f4751f047235c55d278ebbba7c6932 Mon Sep 17 00:00:00 2001 From: golnazads <28757512+golnazads@users.noreply.github.com> Date: Fri, 22 Mar 2024 09:04:29 -0400 Subject: [PATCH 2/9] add log --- adsrefpipe/refparsers/toREFs.py | 1 + 1 file changed, 1 insertion(+) diff --git a/adsrefpipe/refparsers/toREFs.py b/adsrefpipe/refparsers/toREFs.py index 1e3745b..e6aabb5 100644 --- a/adsrefpipe/refparsers/toREFs.py +++ b/adsrefpipe/refparsers/toREFs.py @@ -432,6 +432,7 @@ def get_references(self, filename, encoding="utf8"): logger.error("Unable to open file %s. Exception %s." % (filename, error)) return [] if not buffer: + logger.error("File %s is empty." 
% filename) return [] return self.get_reference_blob(buffer, self.detect_ref_format(buffer)) From 6b55231d5b7f0fcf60f2f3ff6fe01e724c441b13 Mon Sep 17 00:00:00 2001 From: golnazads <28757512+golnazads@users.noreply.github.com> Date: Wed, 5 Mar 2025 16:32:18 -0500 Subject: [PATCH 3/9] implementing issue 14 - batch submit 1 --- README.md | 2 +- adsrefpipe/app.py | 635 +++++++++++------- adsrefpipe/models.py | 153 +++-- adsrefpipe/refparsers/unicode.py | 159 +++-- adsrefpipe/tasks.py | 18 +- .../dbdata.py} | 0 .../{test_db_query.py => test_app.py} | 405 ++++++++++- .../tests/unittests/test_ref_parsers.py | 515 ++++++-------- .../tests/unittests/test_ref_parsers_xml.py | 261 +++++++ adsrefpipe/tests/unittests/test_tasks.py | 66 +- adsrefpipe/tests/unittests/test_utils.py | 146 ++++ adsrefpipe/utils.py | 111 +-- ...c_move_mapping_of_paicz_and_cokon_from_.py | 44 ++ pytest.ini | 2 +- run.py | 93 ++- uml/adsrefpipe_plantuml.txt | 105 +++ uml/database_plantuml.txt | 59 ++ 17 files changed, 1989 insertions(+), 785 deletions(-) mode change 100644 => 100755 README.md mode change 100644 => 100755 adsrefpipe/app.py mode change 100644 => 100755 adsrefpipe/models.py mode change 100644 => 100755 adsrefpipe/refparsers/unicode.py mode change 100644 => 100755 adsrefpipe/tasks.py rename adsrefpipe/tests/unittests/{data_test_db_query.py => stubdata/dbdata.py} (100%) rename adsrefpipe/tests/unittests/{test_db_query.py => test_app.py} (52%) mode change 100644 => 100755 adsrefpipe/tests/unittests/test_ref_parsers.py create mode 100644 adsrefpipe/tests/unittests/test_ref_parsers_xml.py mode change 100644 => 100755 adsrefpipe/tests/unittests/test_tasks.py create mode 100644 adsrefpipe/tests/unittests/test_utils.py mode change 100644 => 100755 adsrefpipe/utils.py create mode 100755 alembic/versions/378ac509c8dc_move_mapping_of_paicz_and_cokon_from_.py mode change 100644 => 100755 pytest.ini mode change 100644 => 100755 run.py create mode 100644 uml/adsrefpipe_plantuml.txt create mode 100755 
uml/database_plantuml.txt diff --git a/README.md b/README.md old mode 100644 new mode 100755 index 8aebdb0..d677f80 --- a/README.md +++ b/README.md @@ -54,7 +54,7 @@ This pipeline is to process source reference files, if xml to parse them first a python run.py RESOLVE -s ``` - 2. Specify a directory, and file extension, to recursively search all sub directories for this type of reference file, and queue them all for processing, use the command + 2. Specify a directory, and file extension (i.e. -e *.raw), to recursively search all sub directories for this type of reference file, and queue them all for processing, use the command ``` python run.py RESOLVE -p -e ``` diff --git a/adsrefpipe/app.py b/adsrefpipe/app.py old mode 100644 new mode 100755 index 4e5d2a5..1a587cb --- a/adsrefpipe/app.py +++ b/adsrefpipe/app.py @@ -8,6 +8,7 @@ from builtins import str from adsputils import ADSCelery from datetime import datetime, timedelta +from typing import List, Dict from adsrefpipe.models import Action, Parser, ReferenceSource, ProcessedHistory, ResolvedReference, CompareClassic from adsrefpipe.utils import get_date_created, get_date_modified, get_date_now, get_resolved_filename, \ @@ -17,28 +18,37 @@ from sqlalchemy import and_, literal from sqlalchemy.sql import exists from sqlalchemy.sql.expression import case, func +from sqlalchemy import desc from texttable import Texttable class ADSReferencePipelineCelery(ADSCelery): + """ + celery-based pipeline for processing and resolving references + handles reference parsing, resolution, and database management + """ + + # matches an identifier starting with 'H', followed by a number (history_id), then 'I', followed by another number (item_num) RE_PARSE_ID = re.compile(r'^H(?P\d+)+I(?P\d+)$') + # captures a double file extension at the end of a string, such as 'test.aas.raw' RE_MATCH_EXT = re.compile(r'.*(\..*?\.[a-z]+)$') default_parsers = {} - def __init__(self, app_name, *args, **kwargs): + def __init__(self, app_name: str, 
*args: tuple, **kwargs: Dict): """ + initialize the ADS reference pipeline celery application - :param app_name: - :param args: - :param kwargs: + :param app_name: name of the application + :param args: additional positional arguments + :param kwargs: additional keyword arguments """ ADSCelery.__init__(self, app_name, *args, **kwargs) - def init_default_parsers(self): + def init_default_parsers(self) -> None: """ - read into memory parser info from the lookup table + load parser information from the database into memory :return: """ @@ -62,13 +72,14 @@ def init_default_parsers(self): } self.default_parsers[to_dict['extension_pattern']] = to_dict - def match_parser(self, rows, journal, volume): + def match_parser(self, rows: List, journal: str, volume: str) -> Dict: """ + match a parser based on journal and volume information - :param rows: - :param journal: - :param volume: - :return: + :param rows: List of parser records + :param journal: journal name + :param volume: volume number or identifier + :return: matching parser record as a dictionary """ for row in rows: for match in row.get_matches(): @@ -89,11 +100,12 @@ def match_parser(self, rows, journal, volume): return row.toJSON() return {} - def get_parser(self, source_filename): + def get_parser(self, source_filename: str) -> Dict: """ + retrieve a parser based on the source filename - :param source_filename: - :return: + :param source_filename: filename of the source reference + :return: parser details as a dictionary """ if not self.default_parsers: self.init_default_parsers() @@ -130,12 +142,12 @@ def get_parser(self, source_filename): self.logger.error("Unrecognizable source file %s."%source_filename) return {} - def get_reference_service_endpoint(self, parsername): + def get_reference_service_endpoint(self, parsername: str) -> str: """ - given parsername find the endpoint that shall be called for the reference to get resolved + retrieve the reference service endpoint for a given parser - :param 
parsername: - :return: + :param parsername: name of the parser + :return: service endpoint URL """ with self.session_scope() as session: rows = session.query(Parser).filter(Parser.name == parsername).all() @@ -145,14 +157,14 @@ def get_reference_service_endpoint(self, parsername): self.logger.error("No unique record found in table `Parser` matching name %s." % parsername) return '' - def query_reference_source_tbl(self, bibcode_list=None, source_filename_list=None, parsername=None): + def query_reference_source_tbl(self, bibcode_list: List = None, source_filename_list: List = None, parsername: str = None) -> List: """ - Queries reference table and returns results. + query the reference source table - :param bibcode_list: - :param source_filename_list: - :param parsername: - :return: + :param bibcode_list: List of bibcodes to filter + :param source_filename_list: List of source filenames to filter + :param parsername: parser name to filter + :return: List of reference source records """ with self.session_scope() as session: if bibcode_list and source_filename_list: @@ -191,13 +203,13 @@ def query_reference_source_tbl(self, bibcode_list=None, source_filename_list=Non results.append(row.toJSON()) return results - def query_processed_history_tbl(self, bibcode_list=None, source_filename_list=None): + def query_processed_history_tbl(self, bibcode_list: List = None, source_filename_list: List = None) -> List: """ - Queries history table and returns results. 
+ query the processed history table - :param bibcode_list: - :param source_filename_list: - :return: + :param bibcode_list: List of bibcodes to filter + :param source_filename_list: List of source filenames to filter + :return: List of processed history records """ with self.session_scope() as session: if bibcode_list and source_filename_list: @@ -262,13 +274,14 @@ def query_processed_history_tbl(self, bibcode_list=None, source_filename_list=No }) return results - def query_resolved_reference_tbl(self, history_id_list=None): + def query_resolved_reference_tbl(self, history_id_list: List = None) -> List: """ - Queries resolved table and returns results. + query the resolved reference table - :param history_id_list: - :return: + :param history_id_list: List of history IDs to filter + :return: List of resolved reference records """ + results = [] with self.session_scope() as session: if history_id_list: rows = session.query(func.count(ResolvedReference.item_num).label('num_references'), @@ -278,27 +291,28 @@ def query_resolved_reference_tbl(self, history_id_list=None): .group_by(ResolvedReference.history_id).all() self.logger.info("Fetched records for history ids = %s." % (','.join(str(h) for h in history_id_list))) - if len(rows) == 0: - if history_id_list: + if len(rows) == 0: self.logger.error("No records found for history ids = %s." 
% (','.join(str(h) for h in history_id_list))) - else: - self.logger.error("No records found in table `ResolvedReference`.") + return results + + for row in rows: + results.append({ + 'last_run_num_references': row.num_references, + 'last_run_num_resolved_references': row.num_resolved_references, + 'history_id': row.history_id, + }) + else: + self.logger.error("No history_id provided, returning no records.") - results = [] - for row in rows: - results.append({ - 'last_run_num_references': row.num_references, - 'last_run_num_resolved_references': row.num_resolved_references, - 'history_id': row.history_id, - }) - return results + return results - def diagnostic_query(self, bibcode_list=None, source_filename_list=None): + def diagnostic_query(self, bibcode_list: List = None, source_filename_list: List = None) -> List: """ + perform a diagnostic query to retrieve combined reference records - :param bibcode_list - :param source_filename_list - :return: list of json records or None + :param bibcode_list: List of bibcodes to filter + :param source_filename_list: List of source filenames to filter + :return: List of combined records from multiple tables """ results = [] @@ -314,7 +328,7 @@ def diagnostic_query(self, bibcode_list=None, source_filename_list=None): history_bibcodes = [item['bibcode'] for item in processed_history] # find unique bibcodes bibcodes = sorted(list(set(reference_bibcodes) | set(history_bibcodes))) - # go through the list and combine records from all three sources + # go through the List and combine records from all three sources for bibcode in bibcodes: result = {} reference_record = next(item for item in reference_source if item['bibcode'] == bibcode) @@ -333,13 +347,13 @@ def diagnostic_query(self, bibcode_list=None, source_filename_list=None): return results - def insert_reference_source_record(self, session, reference): + def insert_reference_source_record(self, session: object, reference: ReferenceSource) -> tuple: """ - check to see if the 
record already exists in the db first, if not, then add it in + insert a new record into the reference source table if it does not exist - :param session: - :param reference: - :return: + :param session: database session + :param reference: reference source record + :return: tuple containing bibcode and source filename """ found = session.query(exists().where(and_(ReferenceSource.bibcode == reference.bibcode, ReferenceSource.source_filename == reference.source_filename))).scalar() @@ -351,63 +365,66 @@ def insert_reference_source_record(self, session, reference): self.logger.debug("Added a `Reference` record successfully.") return reference.bibcode, reference.source_filename - def insert_history_record(self, session, history): + def insert_history_record(self, session: object, history: ProcessedHistory) -> int: """ + insert a new record into the processed history table - :param session: - :param history: - :return: + :param session: database session + :param history: processed history record + :return: history record ID """ session.add(history) session.flush() self.logger.debug("Added a `ProcessedHistory` record successfully.") return history.id - def insert_resolved_referencce_records(self, session, resolved_list): + def insert_resolved_reference_records(self, session: object, resolved_list: List[ResolvedReference]) -> bool: """ + insert resolved reference records into the database - :param session: - :param resolved_list: - :return: + :param session: database session + :param resolved_list: List of resolved reference records + :return: True if successful """ session.bulk_save_objects(resolved_list) session.flush() self.logger.debug("Added `ResolvedReference` records successfully.") return True - def update_resolved_reference_records(self, session, resolved_list): + def update_resolved_reference_records(self, session: object, resolved_list: List[ResolvedReference]) -> bool: """ + update resolved reference records in the database - :param session: - :param 
resolved_list: - :return: + :param session: database session + :param resolved_list: List of resolved reference records + :return: True if successful """ session.bulk_update_mappings(ResolvedReference, [r.toJSON() for r in resolved_list]) session.flush() self.logger.debug("Added `ResolvedReference` records successfully.") return True - def insert_compare_records(self, session, compared_list): + def insert_compare_records(self, session: object, compared_list: List[CompareClassic]) -> bool: """ + insert records into the compare classic table - :param session: - :param compared_list: - :return: + :param session: database session + :param compared_list: List of comparison records + :return: True if successful """ session.bulk_save_objects(compared_list) session.flush() self.logger.debug("Added `CompareClassic` records successfully.") return True - def populate_resolved_reference_records_pre_resolved(self, references, history_id, item_nums=None): + def populate_resolved_reference_records_pre_resolved(self, references: List, history_id: int, item_nums: List = None) -> tuple: """ - insert resolved records before sending them to service to be matched - if we have xml references, then insert populate the xml table as well + insert resolved reference records before sending them to a service - :param references: - :param history_id: - :param item_nums: - :return: + :param references: List of references + :param history_id: history record ID + :param item_nums: optional List of item numbers + :return: tuple containing resolved records and updated references """ if not item_nums: item_nums = list(range(1, len(references)+1)) @@ -425,18 +442,18 @@ def populate_resolved_reference_records_pre_resolved(self, references, history_i if 'item_num' in ref: del ref['item_num'] return resolved_records, references - def populate_tables_pre_resolved_initial_status(self, source_bibcode, source_filename, parsername, references): + def populate_tables_pre_resolved_initial_status(self, 
source_bibcode: str, source_filename: str, parsername: str, references: List) -> List: """ - this is called when the references are being processed for the first time, from the file + populate database tables for references being processed for the first time - :param source_bibcode: - :param source_filename: - :param parsername: - :param references: - :return: + :param source_bibcode: source bibcode + :param source_filename: source filename + :param parsername: parser name + :param references: List of references + :return: List of processed references """ - try: - with self.session_scope() as session: + with self.session_scope() as session: + try: reference_record = ReferenceSource(bibcode=source_bibcode, source_filename=source_filename, resolved_filename=get_resolved_filename(source_filename), @@ -451,27 +468,27 @@ def populate_tables_pre_resolved_initial_status(self, source_bibcode, source_fil total_ref=len(references)) history_id = self.insert_history_record(session, history_record) resolved_records, references = self.populate_resolved_reference_records_pre_resolved(references, history_id) - self.insert_resolved_referencce_records(session, resolved_records) + self.insert_resolved_reference_records(session, resolved_records) session.commit() self.logger.info("Source file %s for bibcode %s with %d references, processed successfully." % (source_filename, source_bibcode, len(references))) return references - except SQLAlchemyError as e: - session.rollback() - self.logger.info("Source file %s information failed to get added to database. Error: %s" % (source_filename, str(e.__dict__['orig']))) - return [] + except SQLAlchemyError as e: + session.rollback() + self.logger.error("Source file %s information failed to get added to database. 
Error: %s" % (source_filename, str(e))) + return [] - def populate_tables_pre_resolved_retry_status(self, source_bibcode, source_filename, source_modified, retry_records): + def populate_tables_pre_resolved_retry_status(self, source_bibcode: str, source_filename: str, source_modified: str, retry_records: List[Dict]) -> List[Dict]: """ - this is called when the references are being reprocessed, usually cherry picked from the records in the database + this is called when the references are being reprocessed, usually cherry-picked from the records in the database - :param source_bibcode: - :param source_filename: - :param source_modified: - :param retry_records: - :return: + :param source_bibcode: source bibcode + :param source_filename: source filename + :param source_modified: last modified date of the source file + :param retry_records: List of references to be reprocessed + :return: List of processed references """ - try: - with self.session_scope() as session: + with self.session_scope() as session: + try: history_record = ProcessedHistory(bibcode=source_bibcode, source_filename=source_filename, source_modified=source_modified, @@ -480,113 +497,117 @@ def populate_tables_pre_resolved_retry_status(self, source_bibcode, source_filen total_ref=len(retry_records)) history_id = self.insert_history_record(session, history_record) resolved_records, references = self.populate_resolved_reference_records_pre_resolved(retry_records, history_id) - if resolved_records: - self.insert_resolved_referencce_records(session, resolved_records) - session.commit() - self.logger.info("Source file %s for bibcode %s with %d references, for reprocessing added successfully." % (source_filename, source_bibcode, len(references))) - return references - except SQLAlchemyError as e: - session.rollback() - self.logger.info("Source file %s information for reprocessing failed to get added to database." 
% (source_filename, str(e.__dict__['orig']))) - return [] + self.insert_resolved_reference_records(session, resolved_records) + session.commit() + self.logger.info("Source file %s for bibcode %s with %d references, for reprocessing added successfully." % (source_filename, source_bibcode, len(references))) + return references + except SQLAlchemyError as e: + session.rollback() + self.logger.error("Source file %s information for reprocessing failed to get added to database. Error: %s" % (source_filename, str(e))) + return [] - def populate_tables_post_resolved(self, resolved_reference, source_bibcode, classic_resolved_filename): + def populate_tables_post_resolved(self, resolved_reference: List, source_bibcode: str, classic_resolved_filename: str) -> bool: """ - this is called after references has been resolved + update tables after references have been resolved - :param resolved_reference: - :param source_bibcode: - :param classic_resolved_filename: - :return: + :param resolved_reference: List of resolved references + :param source_bibcode: source bibcode + :param classic_resolved_filename: filename of classic resolved references + :return: True if successful """ - try: - # if the filename for classic resolver output is supplied, read the resolved information - # make sure that the length matches resolved, classic does some breaking a reference into two - # and hence messes up the order if we want to compare one-to-one, if that is the case, just - # ignore the result - resolved_classic = None - if classic_resolved_filename: - resolved_classic = compare_classic_and_service(resolved_reference, source_bibcode, classic_resolved_filename) - - with self.session_scope() as session: - resolved_records = [] - compare_records = [] - for i, ref in enumerate(resolved_reference): - match = self.RE_PARSE_ID.match(ref['id']) - history_id = int(match.group('history_id')) - item_num = int(match.group('item_num')) - # TODO change refstring to refraw for reference_raw - 
resolved_record = ResolvedReference(history_id=history_id, - item_num=item_num, - reference_str=ref.get('refstring', None), - bibcode=ref.get('bibcode', None), - score=ref.get('score', None), - reference_raw=ref.get('refstring', None)) - resolved_records.append(resolved_record) + with self.session_scope() as session: + try: + # if the filename for classic resolver output is supplied, read the resolved information + # make sure that the length matches resolved, classic does some breaking a reference into two + # and hence messes up the order if we want to compare one-to-one, if that is the case, just + # ignore the result + resolved_classic = None + if classic_resolved_filename: + resolved_classic = compare_classic_and_service(resolved_reference, source_bibcode, classic_resolved_filename) + + resolved_records = [] + compare_records = [] + for i, ref in enumerate(resolved_reference): + match = self.RE_PARSE_ID.match(ref['id']) + history_id = int(match.group('history_id')) + item_num = int(match.group('item_num')) + # TODO change refstring to refraw for reference_raw + resolved_record = ResolvedReference(history_id=history_id, + item_num=item_num, + reference_str=ref.get('refstring', None), + bibcode=ref.get('bibcode', None), + score=ref.get('score', None), + reference_raw=ref.get('refstring', None)) + resolved_records.append(resolved_record) + if resolved_classic: + compare_record = CompareClassic(history_id=history_id, + item_num=item_num, + bibcode=resolved_classic[i][1], + score=int(resolved_classic[i][2]), + state=resolved_classic[i][3]) + compare_records.append(compare_record) if resolved_classic: - compare_record = CompareClassic(history_id=history_id, - item_num=item_num, - bibcode=resolved_classic[i][1], - score=int(resolved_classic[i][2]), - state=resolved_classic[i][3]) - compare_records.append(compare_record) - if resolved_classic: - self.update_resolved_reference_records(session, resolved_records) - self.insert_compare_records(session, compare_records) - 
else: - self.update_resolved_reference_records(session, resolved_records) - session.commit() - self.logger.info("Updated %d resolved reference records successfully." % len(resolved_reference)) - return True - except SQLAlchemyError as e: - session.rollback() - self.logger.info("Failed to update %d resolved reference records successfully. Error %s" % (len(resolved_reference), str(e))) - return False + self.update_resolved_reference_records(session, resolved_records) + self.insert_compare_records(session, compare_records) + else: + self.update_resolved_reference_records(session, resolved_records) + session.commit() + self.logger.info("Updated %d resolved reference records successfully." % len(resolved_reference)) + return True + except SQLAlchemyError as e: + session.rollback() + self.logger.error("Failed to update %d resolved reference records successfully. Error %s" % (len(resolved_reference), str(e))) + return False - def get_count_reference_source_records(self, session): + def get_count_reference_source_records(self, session: object) -> int: """ + get the count of records in the reference source table - :param session: - :return: + :param session: database session + :return: number of records """ rows = session.query(ReferenceSource).count() self.logger.debug("Currently there are %d records in `ReferenceSource` table."%rows) return rows - def get_count_processed_history_records(self, session): + def get_count_processed_history_records(self, session: object) -> int: """ + get the count of records in the processed history table - :param session: - :return: + :param session: database session + :return: number of records """ rows = session.query(ProcessedHistory).count() self.logger.debug("Currently there are %d records in `ProcessedHistory` table."%rows) return rows - def get_count_resolved_reference_records(self, session): + def get_count_resolved_reference_records(self, session: object) -> int: """ + get the count of records in the resolved reference table - 
:param session: - :return: + :param session: database session + :return: number of records """ rows = session.query(ResolvedReference).count() self.logger.debug("Currently there are %d records in `ResolvedReference` table."%rows) return rows - def get_count_compare_classic_records(self, session): + def get_count_compare_classic_records(self, session: object) -> int: """ + get the count of records in the compare classic table - :param session: - :return: + :param session: database session + :return: number of records """ rows = session.query(CompareClassic).count() self.logger.debug("Currently there are %d records in `CompareClassic` table."%rows) return rows - def get_count_records(self): + def get_count_records(self) -> List: """ + get the count of records in all tables - :return: + :return: List of dictionaries with table names and record counts """ with self.session_scope() as session: results = [ @@ -614,14 +635,14 @@ def get_count_records(self): ] return results - def get_service_classic_compare_tags(self, session, source_bibcode, source_filename): + def get_service_classic_compare_tags(self, session: object, source_bibcode: str, source_filename: str) -> object: """ - makes a grid of classic and service compared tags and returns the query + generates a comparison grid for classic and service resolved references - :param session: - :param source_bibcode: - :param source_filename: - :return: + :param session: database session + :param source_bibcode: source bibcode + :param source_filename: source filename + :return: subquery object containing comparison results """ # given reference source (bibcodes and filenames), have query that would contain # all resolved records ids, and if we have reprocessed records, it contains one @@ -652,16 +673,17 @@ def get_service_classic_compare_tags(self, session, source_bibcode, source_filen .group_by(CompareClassic.history_id, CompareClassic.item_num) \ .subquery() - def get_service_classic_compare_stats_grid(self, 
source_bibcode, source_filename): + def get_service_classic_compare_stats_grid(self, source_bibcode: str, source_filename: str) -> tuple: """ + retrieve comparison statistics between service and classic resolved references - :param source_bibcode: - :param source_filename: - :return: + :param source_bibcode: source bibcode + :param source_filename: source filename + :return: tuple containing a text-based grid, total references, and resolved references """ with self.session_scope() as session: compare_grid = self.get_service_classic_compare_tags(session, source_bibcode, source_filename) - results = session.query(ResolvedReference.reference_str.label('refstr'), + rows = session.query(ResolvedReference.reference_str.label('refstr'), ResolvedReference.bibcode.label('service_bibcode'), CompareClassic.bibcode.label('classic_bibcode'), ResolvedReference.score.label('service_conf'), CompareClassic.score.label('classic_score'), compare_grid.c.MATCH.label('match'), compare_grid.c.MISS.label('miss'), @@ -673,66 +695,67 @@ def get_service_classic_compare_stats_grid(self, source_bibcode, source_filename ResolvedReference.item_num == compare_grid.c.item_num)) \ .order_by(ResolvedReference.history_id, ResolvedReference.item_num) \ .all() - if results: + if rows: # Texttable functionality is here https://pypi.org/project/texttable/ table = Texttable() table.set_cols_width([60,19,19,15,15,5,5,5,5,5]) table.set_cols_dtype(['t']*10) table.set_cols_align(['l']+['c']*9) - table.header(results[0]._asdict().keys()) + table.header(rows[0]._asdict().keys()) num_resolved = 0 - for result in results: + for row in rows: # count how many was resolved on the side of service - if not result[1].startswith('.'): + if not row[1].startswith('.'): num_resolved += 1 - row = [] - for item in result: + result = [] + for item in row: if not item: item = '' - row.append(item) - table.add_row(row) - return table.draw(), len(results), num_resolved + result.append(item) + table.add_row(result) + return 
table.draw(), len(rows), num_resolved return 'Unable to fetch data for reference source file `%s` from database!'%source_filename, -1, -1 - def get_reprocess_records(self, type, score_cutoff, match_bibcode, date_cutoff): + def filter_reprocess_query(self, query: object, type: int, score_cutoff: float, match_bibcode: str, date_cutoff: int) -> object: """ + apply one of the four selected filters, also apply date if requested - :param type: - :param score_cutoff: - :param match_bibcode: - :param date_cutoff: - :return: + :param query: SQLAlchemy query object + :param type: type of filter to apply + :param score_cutoff: score threshold for filtering + :param match_bibcode: bibcode pattern for filtering + :param date_cutoff: number of days to filter by recent records + :return: filtered query object """ - def apply_filter(query, type, score_cutoff, match_bibcode, date_cutoff): - """ - apply one of the four selected filter, also apply date if requested - - :param query: - :param type: - :param score_cutoff: - :param match_bibcode: - :param date_cutoff: - :return: - """ - if type == ReprocessQueryType.score: - query = query.filter(ResolvedReference.score <= "%.2f" % score_cutoff) - elif type == ReprocessQueryType.bibstem and len(match_bibcode): - query = query.filter(ResolvedReference.bibcode.like('____%s__________' % match_bibcode)) - elif type == ReprocessQueryType.year and len(match_bibcode): - query = query.filter(ResolvedReference.bibcode.like('%s_______________' % match_bibcode)) - elif type == ReprocessQueryType.failed: - query = query.filter(and_(ResolvedReference.bibcode == '0000', ResolvedReference.score == -1)) - if date_cutoff: - since = datetime.now() - timedelta(days=int(date_cutoff)) - query = query.filter(ProcessedHistory.date >= since) - return query - - rows = [] + if type == ReprocessQueryType.score: + query = query.filter(ResolvedReference.score <= "%.2f" % score_cutoff) + elif type == ReprocessQueryType.bibstem and len(match_bibcode): + query = 
query.filter(ResolvedReference.bibcode.like('____%s__________' % match_bibcode)) + elif type == ReprocessQueryType.year and len(match_bibcode): + query = query.filter(ResolvedReference.bibcode.like('%s_______________' % match_bibcode)) + elif type == ReprocessQueryType.failed: + query = query.filter(and_(ResolvedReference.bibcode == '0000', ResolvedReference.score == -1)) + if date_cutoff: + since = datetime.now() - timedelta(days=int(date_cutoff)) + query = query.filter(ProcessedHistory.date >= since) + return query + + def get_reprocess_records(self, type: int, score_cutoff: float, match_bibcode: str, date_cutoff: int) -> List: + """ + retrieve references that need reprocessing based on filters + + :param type: type of reprocessing filter + :param score_cutoff: score threshold + :param match_bibcode: bibcode filter + :param date_cutoff: date filter in days + :return: List of references for reprocessing + """ + results = [] with self.session_scope() as session: # have a query containing unique reference source ids (bibcodes and filenames), # that have been filtered on one of four possible options and also date if requested reference_source_ids = session.query(ProcessedHistory.bibcode, ProcessedHistory.source_filename) \ .filter(ProcessedHistory.id == ResolvedReference.history_id) - reference_source_ids = apply_filter(reference_source_ids, type, score_cutoff, match_bibcode, date_cutoff) + reference_source_ids = self.filter_reprocess_query(reference_source_ids, type, score_cutoff, match_bibcode, date_cutoff) reference_source_ids = reference_source_ids.distinct().all() bibcodes = [ids[0] for ids in reference_source_ids] filenames = [ids[1] for ids in reference_source_ids] @@ -744,11 +767,11 @@ def apply_filter(query, type, score_cutoff, match_bibcode, date_cutoff): .filter(and_(ProcessedHistory.id == ResolvedReference.history_id), ProcessedHistory.bibcode.in_(bibcodes), ProcessedHistory.source_filename.in_(filenames)) - resolved_reference_ids = 
apply_filter(resolved_reference_ids, type, score_cutoff, match_bibcode, date_cutoff) + resolved_reference_ids = self.filter_reprocess_query(resolved_reference_ids, type, score_cutoff, match_bibcode, date_cutoff) resolved_reference_ids = resolved_reference_ids.distinct().subquery() - results = session.query(resolved_reference_ids.c.history_id.label('history_id'), + rows = session.query(resolved_reference_ids.c.history_id.label('history_id'), resolved_reference_ids.c.item_num.label('item_num'), ResolvedReference.reference_str.label('refstr'), ResolvedReference.reference_raw.label('refraw'), @@ -764,30 +787,154 @@ def apply_filter(query, type, score_cutoff, match_bibcode, date_cutoff): .order_by(ResolvedReference.history_id, ResolvedReference.item_num) \ .all() - if results: - results = [r._asdict() for r in results] - row = {} + if rows: + rows = [r._asdict() for r in rows] + result = {} history_id = -1 - for result in results: - if result['history_id'] != history_id: - if row: - rows.append(row) - row = {} - history_id = result['history_id'] + for row in rows: + if row['history_id'] != history_id: + if result: + results.append(result) + result = {} + history_id = row['history_id'] for key in ['source_bibcode', 'source_filename', 'source_modified', 'parser_name']: - row[key] = result[key] - row['references'] = [] + result[key] = row[key] + result['references'] = [] reference = {} for key in ['item_num', 'refstr', 'refraw']: - reference[key] = result[key] - row['references'].append(reference) + reference[key] = row[key] + result['references'].append(reference) else: reference = {} for key in ['item_num', 'refstr', 'refraw']: - reference[key] = result[key] - row['references'].append(reference) + reference[key] = row[key] + result['references'].append(reference) # last batch, if any - if row: - rows.append(row) - return rows + if result: + results.append(result) + return results + + def get_resolved_references_all(self, source_bibcode: str) -> List[tuple]: + """ + 
retrieve all resolved references with the highest score per resolved bibcode + + :param source_bibcode: source bibcode for which resolved references should be queried + :return: List of tuples containing resolved references with metadata + """ + result = [] + with self.session_scope() as session: + # build the query to select the highest-scored resolved references per resolved bibcode + # also return name of the parser, order number of parsed reference, date it was parsed, + # and the confidence score + highest_scored_resolved_reference = session.query( + ReferenceSource.bibcode.label('source_bibcode'), + ProcessedHistory.date.label('date'), + ResolvedReference.item_num.label('id'), + ResolvedReference.bibcode.label('resolved_bibcode'), + ResolvedReference.score.label('score'), + ReferenceSource.parser_name.label('parser_name'), + func.row_number().over( + partition_by=[ReferenceSource.bibcode, ReferenceSource.parser_name, ResolvedReference.bibcode], + order_by=desc(ResolvedReference.score) + ).label('ranking_by_score') + ).join(ProcessedHistory, ProcessedHistory.id == ResolvedReference.history_id) \ + .join(ReferenceSource, ProcessedHistory.bibcode == ReferenceSource.bibcode) \ + .filter(and_(ReferenceSource.bibcode == source_bibcode, + ResolvedReference.score != 0)) \ + .subquery() + + # query database now + rows = session.query( + highest_scored_resolved_reference.c.source_bibcode, + highest_scored_resolved_reference.c.date, + highest_scored_resolved_reference.c.id, + highest_scored_resolved_reference.c.resolved_bibcode, + highest_scored_resolved_reference.c.score, + highest_scored_resolved_reference.c.parser_name) \ + .filter(highest_scored_resolved_reference.c.ranking_by_score == 1) \ + .order_by(highest_scored_resolved_reference.c.resolved_bibcode) \ + .all() + + if len(rows) > 0: + for row in rows: + result.append((row.source_bibcode, + row.date.strftime("%Y-%m-%d %H:%M:%S"), + row.id, + row.resolved_bibcode, + float(row.score), + row.parser_name)) + else: + 
self.logger.error(f'Unable to fetch resolved references for source bibcode `{source_bibcode}`.') + + return result + + def get_resolved_references(self, source_bibcode: str) -> List[Dict]: + """ + retrieve resolved references with the highest parser priority for each unique combination of source_bibcode, parser_name, and resolved_bibcode + + :param source_bibcode: source bibcode for which resolved references should be queried + :return: List of dictionaries containing the highest-priority resolved references + """ + result = [] + with self.session_scope() as session: + + # Build the query to rank parsers by priority (based on the parser_name) and then by score + highest_priority_resolved_reference = session.query( + ReferenceSource.bibcode.label('source_bibcode'), + ProcessedHistory.date.label('date'), + ResolvedReference.item_num.label('id'), + ResolvedReference.bibcode.label('resolved_bibcode'), + ResolvedReference.score.label('score'), + ReferenceSource.parser_name.label('parser_name'), + case( + [ + (ReferenceSource.parser_name.in_(['arXiv', 'CrossRef']), 1), + (ReferenceSource.parser_name == 'Arthur', 3) + ], + else_=2 + ).label('parser_priority'), + func.row_number().over( + partition_by=[ReferenceSource.bibcode, ResolvedReference.bibcode], + order_by=[desc(case( + [ + (ReferenceSource.parser_name.in_(['arXiv', 'CrossRef']), 1), + (ReferenceSource.parser_name == 'Arthur', 3) + ], + else_=2 + )), desc(ResolvedReference.score)] + ).label('ranking_by_priority') + ).join(ProcessedHistory, ProcessedHistory.id == ResolvedReference.history_id) \ + .join(ReferenceSource, ProcessedHistory.bibcode == ReferenceSource.bibcode) \ + .filter(and_(ReferenceSource.bibcode == source_bibcode, + ResolvedReference.score != 0)) \ + .subquery() + + # Query the ranked resolved references, ensuring we get the highest-ranked ones (ranking_by_priority == 1) + rows = session.query( + highest_priority_resolved_reference.c.source_bibcode, + highest_priority_resolved_reference.c.date, + 
highest_priority_resolved_reference.c.id, + highest_priority_resolved_reference.c.resolved_bibcode, + highest_priority_resolved_reference.c.score, + highest_priority_resolved_reference.c.parser_name, + highest_priority_resolved_reference.c.parser_priority)\ + .filter(highest_priority_resolved_reference.c.ranking_by_priority == 1) \ + .order_by(highest_priority_resolved_reference.c.resolved_bibcode) \ + .all() + + # Process the results + if rows: + for row in rows: + result.append({ + 'source_bibcode': row.source_bibcode, + 'date': row.date.strftime("%Y-%m-%d %H:%M:%S"), + 'id': row.id, + 'resolved_bibcode': row.resolved_bibcode, + 'score': float(row.score), + 'parser_name': row.parser_name, + 'parser_priority': row.parser_priority + }) + else: + self.logger.error(f'Unable to fetch resolved references for source bibcode `{source_bibcode}`.') + return result diff --git a/adsrefpipe/models.py b/adsrefpipe/models.py old mode 100644 new mode 100755 index e1edc3d..5e2c725 --- a/adsrefpipe/models.py +++ b/adsrefpipe/models.py @@ -20,17 +20,19 @@ class Action(Base): __tablename__ = 'action' status = Column(String, primary_key=True) - def get_status_new(self): + def get_status_new(self) -> str: """ + returns the initial status - :return: + :return: string indicating the initial status """ return 'initial' - def get_status_retry(self): + def get_status_retry(self) -> str: """ + returns the retry status - :return: + :return: string indicating the retry status """ return 'retry' @@ -49,50 +51,57 @@ class Parser(Base): reference_service_endpoint = Column(String) matches = Column(JSONB, default=dict) - def __init__(self, name, extension_pattern, reference_service_endpoint, matches=[]): + def __init__(self, name: str, extension_pattern: str, reference_service_endpoint: str, matches: list = []): """ + initializes a parser object - :param name: - :param extension_pattern: - :param reference_service_endpoint: - :param matches: + :param name: name of the parser + :param 
extension_pattern: reference file extension pattern used by the parser + :param reference_service_endpoint: endpoint for the reference service + :param matches: list of matches for the parser-reference file mapping """ self.name = name self.extension_pattern = extension_pattern self.reference_service_endpoint = reference_service_endpoint self.matches = matches - def get_name(self): + def get_name(self) -> str: """ + returns the name of the parser - :return: + :return: string indicating the name of the parser """ return self.name - def get_extension_pattern(self): + def get_extension_pattern(self) -> str: """ + returns the extension pattern of the reference files processed by the parser - :return: + :return: string indicating the file extension pattern """ return self.extension_pattern - def get_endpoint(self): + def get_endpoint(self) -> str: """ + returns the reference service endpoint to resolve references - :return: + :return: string indicating the reference service endpoint """ return self.reference_service_endpoint - def get_matches(self): + def get_matches(self) -> list: """ + returns the list of mappings for the parser - :return: + :return: list of matches """ return self.matches - def toJSON(self): + def toJSON(self) -> dict: """ - :return: values formatted as python dict + converts the parser object to a JSON dictionary + + :return: dictionary containing the parser details """ return { 'name': self.name, @@ -103,28 +112,37 @@ def toJSON(self): class ReferenceSource(Base): + """ + This class represents the source of a reference in the database, + each entry links a source file with its resolved version and + the parser used to process the reference. + It serves as the initial record for the reference processing pipeline. 
+ """ __tablename__ = 'reference_source' bibcode = Column(String, primary_key=True) source_filename = Column(String, primary_key=True) resolved_filename = Column(String) parser_name = Column(String, ForeignKey('parser.name')) - def __init__(self, bibcode, source_filename, resolved_filename, parser_name): + def __init__(self, bibcode: str, source_filename: str, resolved_filename: str, parser_name: str): """ + initializes a reference source object - :param bibcode: - :param source_filename: - :param resolved_filename: - :param parser_name: + :param bibcode: unique bibcode for the reference source + :param source_filename: name of the reference file + :param resolved_filename: name of the resolved file for future use + :param parser_name: name of the parser used """ self.bibcode = bibcode self.source_filename = source_filename self.resolved_filename = resolved_filename self.parser_name = parser_name - def toJSON(self): + def toJSON(self) -> dict: """ - :return: values formatted as python dict, if no values found returns empty structure, not None + converts the reference source object to a JSON dictionary + + :return: dictionary containing reference source details """ return { 'bibcode': self.bibcode, @@ -135,6 +153,10 @@ def toJSON(self): class ProcessedHistory(Base): + """ + This class tracks the processing history of a resolved reference, recording details about the processing status, + reference file timestamp, and the total number of references parsed. 
+ """ __tablename__ = 'processed_history' __table_args__ = (ForeignKeyConstraint( ['bibcode', 'source_filename'], ['reference_source.bibcode', 'reference_source.source_filename']),) id = Column(Integer, primary_key=True) @@ -145,15 +167,16 @@ class ProcessedHistory(Base): date = Column(DateTime, default=func.now()) total_ref = Column(Integer) - def __init__(self, bibcode, source_filename, source_modified, status, date, total_ref): + def __init__(self, bibcode: str, source_filename: str, source_modified: DateTime, status: str, date: DateTime, total_ref: int): """ + initializes a processed history object - :param bibcode: - :param source_filename: - :param source_modified: - :param status: - :param date: - :param total_ref: + :param bibcode: bibcode for the reference source + :param source_filename: name of the source reference file + :param source_modified: timestamp of the reference file at the time it was read + :param status: first time processing, or reprocessing this list of references + :param date: date of processing + :param total_ref: total number of references parsed """ self.bibcode = bibcode self.source_filename = source_filename @@ -162,9 +185,11 @@ def __init__(self, bibcode, source_filename, source_modified, status, date, tota self.date = date self.total_ref = total_ref - def toJSON(self): + def toJSON(self) -> dict: """ - :return: values formatted as python dict, if no values found returns empty structure, not None + converts the processed history object to a JSON dictionary + + :return: dictionary containing processed history details """ return { 'bibcode': self.bibcode, @@ -177,6 +202,10 @@ def toJSON(self): class ResolvedReference(Base): + """ + This class stores information about references that have been resolved, including the reference string, score, + and its associated history entry. 
+ """ __tablename__ = 'resolved_reference' history_id = Column(Integer, ForeignKey('processed_history.id'), primary_key=True) item_num = Column(Integer, primary_key=True) @@ -185,14 +214,16 @@ class ResolvedReference(Base): score = Column(Numeric) reference_raw = Column(String) - def __init__(self, history_id, item_num, reference_str, bibcode, score, reference_raw): + def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode: str, score: float, reference_raw: str): """ + initializes a resolved reference object - :param history_id: - :param item_num - :param reference_str: - :param bibcode: - :param score: + :param history_id: ID of the related processed history entry + :param item_num: order of the reference within the source + :param reference_str: reference string + :param bibcode: resolved bibcode + :param score: confidence score of the resolved reference + :param reference_raw: raw reference string """ self.history_id = history_id self.item_num = item_num @@ -201,26 +232,19 @@ def __init__(self, history_id, item_num, reference_str, bibcode, score, referenc self.score = score self.reference_raw = reference_raw - def toJSON(self): + def toJSON(self) -> dict: """ - :return: values formatted as python dict, if no values found returns empty structure, not None + converts the resolved reference object to a JSON dictionary + + :return: dictionary containing resolved reference details """ - if self.reference_raw: - return { - 'history_id': self.history_id, - 'reference_str': self.reference_str, - 'bibcode': self.bibcode, - 'score': self.score, - 'item_num': self.item_num, - 'reference_raw': self.reference_raw - } - # do not include reference_raw if it is None return { 'history_id': self.history_id, 'reference_str': self.reference_str, 'bibcode': self.bibcode, 'score': self.score, 'item_num': self.item_num, + **({'reference_raw': self.reference_raw} if self.reference_raw else {}) } @@ -228,8 +252,8 @@ class CompareClassic(Base): """ This table is 
for comparing classic resolver with service reference, keeps track of service reference that matched classic reference - bibcode and score here is for classic - + bibcode and score here is for classic, should be a temporary class + only used during development/testing and verification """ __tablename__ = 'compare_classic' history_id = Column(Integer, ForeignKey('processed_history.id'), primary_key=True) @@ -238,14 +262,15 @@ class CompareClassic(Base): score = Column(Numeric) state = Column(String) - def __init__(self, history_id, item_num, bibcode, score, state): + def __init__(self, history_id: int, item_num: int, bibcode: str, score: Numeric, state: str): """ + initializes a compare classic object - :param history_id: - :param item_num: - :param bibcode: - :param classic_score: - :param state: + :param history_id: ID of the related processed history entry + :param item_num: order of the reference within the source + :param bibcode: resolved bibcode + :param score: confidence score of the resolved reference + :param state: comparison state (e.g., matched, unmatched, etc.) 
""" self.history_id = history_id self.item_num = item_num @@ -253,9 +278,11 @@ def __init__(self, history_id, item_num, bibcode, score, state): self.score = score self.state = state - def toJSON(self): + def toJSON(self) -> dict: """ - :return: values formatted as python dict, if no values found returns empty structure, not None + converts the compare classic object to a JSON dictionary + + :return: dictionary containing compare classic details """ return { 'history_id': self.history_id, diff --git a/adsrefpipe/refparsers/unicode.py b/adsrefpipe/refparsers/unicode.py old mode 100644 new mode 100755 index 32c3579..77a50a6 --- a/adsrefpipe/refparsers/unicode.py +++ b/adsrefpipe/refparsers/unicode.py @@ -29,12 +29,12 @@ XML_PREDEFINED_ENTITIES = ('quot', 'amp', 'apos', 'lt', 'gt') - -def tostr(value): +def tostr(value: str) -> str: """ - to use in for python 3 replacing python 2 str - :param value: - :return: + convert a value to a UTF-8 encoded string + + :param value: input value to be encoded + :return: UTF-8 encoded string """ try: encoded = value.encode('utf-8').decode('utf-8') @@ -43,17 +43,20 @@ def tostr(value): return encoded -def tounicode(value): +def tounicode(value: str) -> str: """ + convert a value to a Unicode string with HTML entities unescaped - :param value: - :return: + :param value: input string + :return: Unicode string with HTML entities unescaped """ return html.unescape(value) class UnicodeHandlerError(Exception): - """ Error in the UnicodeHandler. 
""" + """ + error raised when an issue occurs in UnicodeHandler + """ pass @@ -69,14 +72,18 @@ class UnicodeHandler(UserDict): 4/ Latex representation 5/ Type (optional) can be P=ponctuation, S=space, L=lowercase-letter, U=uppercase-letter + this class provides methods to convert Unicode characters to ASCII and named entities + Some day we may want to scrap this approach in favor of using the python namedentities module (although that will lack the TeX representation) """ + # matches named HTML entities (e.g., &amp; -> matches "amp") re_entity = re.compile(r'&([a-zA-Z0-9]{2,}?);') + # matches numeric character references in decimal format (e.g., &#65; -> matches "65", which represents 'A') re_numentity = re.compile(r'&#(?P<number>\d+);') + # matches numeric character references in hexadecimal format (e.g., &#x41; -> matches "41", which represents 'A') re_hexnumentity = re.compile(r'&#x(?P<hexnum>[0-9a-fA-F]+);') - - # re_unicode = re.compile(u'([\u0080-\uffff])') + # matches Unicode escape sequences (e.g., \u00E9 -> matches "00E9", which represents 'é') re_unicode = re.compile(r'\\u(?P<number>[0-9a-fA-F]{4})') # accents with a slash in front. To be converted to entities @@ -104,7 +111,9 @@ class UnicodeHandler(UserDict): '#x00af': 'macron', } + # matches incorrectly formatted entities that should be combined with the preceding character (e.g., "a´" -> matches "a´") re_missent = re.compile(r'([a-zA-Z])&(%s);' % '|'.join(missent.keys())) + # matches incorrectly formatted entities that appear after a space or semicolon (e.g., " ;´" -> matches ";´") re_missent_space = re.compile(r'([\s\;])&(%s);' % '|'.join(missent.keys())) # some entities not in entities table. 
Maybe not acurate: aproximation @@ -122,18 +131,23 @@ class UnicodeHandler(UserDict): '#x03d2': 'Upsilon', '#x00fd': 'yacute' } - re_morenum = re.compile(r'&(%s);' % '|'.join(morenum.keys())) + # matches additional numeric entities that need conversion (e.g., "&x030a;" -> matches "x030a") + re_morenum = re.compile(r'&(%s);' % '|'.join(morenum.keys())) + # matches placeholder "__amp__" used to represent an ampersand in text re_replace_amp = re.compile(r'__amp__') - + # matches right single quotation marks in named entity form (e.g., "&rsquo;" or "&rsquor;") re_rsquo = re.compile(r'&rsquor?;') + # matches backslashes in text re_backslash = re.compile(r'\\') + # matches lowercase or uppercase 'l' followed by a forward slash (e.g., "l/") re_lower_upper_ls = re.compile(r'([Ll])/') - def __init__(self, data_filename=None): + def __init__(self, data_filename: str = None): """ - - :param data_filename: + initialize UnicodeHandler by loading Unicode data from a file + + :param data_filename: path to the Unicode data file """ self.data_filename = data_filename or os.path.dirname(__file__) + '/data_files/unicode.dat' self.unicode = [None, ] * 65536 @@ -159,11 +173,12 @@ def __init__(self, data_filename=None): except ValueError: pass - def ent2asc(self, text): + def ent2asc(self, text: str) -> str: """ - - :param text: - :return: + convert named entities in a string to ASCII equivalents + + :param text: input text containing named entities + :return: text with entities replaced by ASCII equivalents """ text = self.re_replace_amp.sub('&', text) result = self.re_entity.sub(self.__sub_asc_entity, text) @@ -171,32 +186,35 @@ def ent2asc(self, text): result = self.re_hexnumentity.sub(self.__sub_hexnumasc_entity, result) return result - def u2asc(self, text): + def u2asc(self, text: str) -> str: """ + convert Unicode characters to their ASCII representations - :param text: - :return: + :param text: input Unicode text + :return: ASCII equivalent of the input text """ result = 
re.sub(r'\-unknown\-entity\-(.)([^\-]+)\-', r'\g<1>', text) result = ''.join([self.__toascii(char) for char in result]) return result - def u2ent(self, text): + def u2ent(self, text: str) -> str: """ + convert Unicode characters to their named entity representations - :param text: - :return: + :param text: input Unicode text + :return: text with Unicode characters replaced by named entities """ result = re.sub(r'\-unknown\-entity\-([^\-]+)\-', r'&\g<1>;', text) result = ''.join([self.__toentity(char) for char in result]) result = self.re_unicode.sub(self.__sub_hexnum_toent, result) return result - def __sub_numasc_entity(self, match): + def __sub_numasc_entity(self, match: re.Match) -> str: """ - - :param match: - :return: + convert numeric entities to ASCII equivalents + + :param match: regex match object containing numeric entity + :return: ASCII representation of the numeric entity """ entno = int(match.group('number')) @@ -211,11 +229,12 @@ def __sub_numasc_entity(self, match): except OverflowError: raise UnicodeHandlerError('Unknown numeric entity: %s' % match.group(0)) - def __sub_hexnumasc_entity(self, match): + def __sub_hexnumasc_entity(self, match: re.Match) -> str: """ - - :param match: - :return: + convert hexadecimal numeric entities to ASCII equivalents + + :param match: regex match object containing hexadecimal numeric entity + :return: ASCII representation of the hexadecimal entity """ entno = int(match.group('hexnum'), 16) try: @@ -226,11 +245,12 @@ def __sub_hexnumasc_entity(self, match): except IndexError: raise UnicodeHandlerError('Unknown hexadecimal entity: %s' % match.group(0)) - def __sub_hexnum_toent(self, match): + def __sub_hexnum_toent(self, match: re.Match) -> str: """ + convert hexadecimal numeric entities to named entities - :param match: - :return: + :param match: regex match object containing hexadecimal numeric entity + :return: named entity representation of the hexadecimal entity """ try: entno = int(match.group('number'), 16) @@ 
-242,11 +262,12 @@ def __sub_hexnum_toent(self, match): else: raise UnicodeHandlerError('Unknown hexadecimal entity: %s' % entno) - def __sub_asc_entity(self, match): + def __sub_asc_entity(self, match: re.Match) -> str: """ - - :param match: - :return: + convert named entities to ASCII equivalents + + :param match: regex match object containing a named entity + :return: ASCII representation of the named entity """ ent = match.group(1) if ent in self.keys(): @@ -256,11 +277,12 @@ def __sub_asc_entity(self, match): logger.error(UnicodeHandlerError('Unknown named entity: %s, replacing by WHITE SQUARE' % match.group(0))) return self.unicode[9633].ascii - def __toascii(self, char): + def __toascii(self, char: str) -> str: """ + convert a Unicode character to its ASCII equivalent - :param char: - :return: + :param char: Unicode character + :return: ASCII representation of the character """ ascii_value = ord(char) @@ -273,11 +295,12 @@ def __toascii(self, char): logger.error(UnicodeHandlerError('Unknown character code: %d, replacing by WHITE SQUARE' % ascii_value)) return self.unicode[9633].ascii - def __toentity(self, char): + def __toentity(self, char: str) -> str: """ + convert a Unicode character to its named entity representation - :param char: - :return: + :param char: Unicode character + :return: named entity representation of the character """ ascii_value = ord(char) @@ -292,8 +315,10 @@ def __toentity(self, char): # Return a numeric entity. return '&#%d;' % ascii_value - def cleanall(self, str, cleanslash=0): + def cleanall(self, str: str, cleanslash: int = 0) -> str: """ + clean and normalize text by handling accents, entities, and numeric codes + Deals with things like: 1./ accents with a slashes and converts them to entities. Example: \', \`,\^ @@ -310,9 +335,9 @@ def cleanall(self, str, cleanslash=0): slashes. Gets rid of all of them. Also converts 'l/a' to 'ła'. Maybe cases in which this is substituting too much? 
- :param str: - :param cleanslash: - :return: + :param str: input text + :param cleanslash: flag to remove slashes and process special characters + :return: cleaned text """ retstr = self.re_accent.sub(self.__sub_accent,str) retstr = self.re_missent.sub(self.__sub_missent,retstr) @@ -325,19 +350,21 @@ def cleanall(self, str, cleanslash=0): retstr = self.re_lower_upper_ls.sub('&\g<1>strok;',retstr) return retstr - def __sub_accent(self, match): + def __sub_accent(self, match: re.Match) -> str: """ + convert accented characters to named entities - :param match: - :return: + :param match: regex match object containing an accented character + :return: named entity representation of the accented character """ return "&%s%s;" % (match.group(1), self.accents[match.group(2)]) - def __sub_missent(self, match): + def __sub_missent(self, match: re.Match) -> str: """ + convert incorrectly formatted entities to proper named entities - :param match: - :return: + :param match: regex match object containing a malformed entity + :return: corrected named entity representation """ ent = "%s%s" % (match.group(1), self.missent[match.group(2)]) if ent in self.keys(): @@ -345,21 +372,27 @@ def __sub_missent(self, match): else: return "%s&%s;" % (match.group(1), self.missent[match.group(2)]) - def __sub_morenum(self, match): + def __sub_morenum(self, match: re.Match) -> str: """ + convert additional numeric entities to named entities - :param match: - :return: + :param match: regex match object containing a numeric entity + :return: named entity representation of the numeric entity """ return "&%s;" % (self.morenum[match.group(1)]) class UnicodeChar: - def __init__(self, fields): + """ + represents a Unicode character with its entity, ASCII, and LaTeX representations + """ + + def __init__(self, fields: list): """ - - :param fields: + initialize a UnicodeChar instance + + :param fields: list containing Unicode code, entity, ASCII, and LaTeX representations """ self.code = 
int(fields[0].strip()) self.entity = fields[1].strip() diff --git a/adsrefpipe/tasks.py b/adsrefpipe/tasks.py old mode 100644 new mode 100755 index febc90f..32775c5 --- a/adsrefpipe/tasks.py +++ b/adsrefpipe/tasks.py @@ -28,14 +28,15 @@ class FailedRequest(Exception): @app.task(queue='task_process_reference', max_retries=config['MAX_QUEUE_RETRIES']) -def task_process_reference(reference_task): +def task_process_reference(reference_task: dict) -> bool: """ + process a reference task by resolving references and updating the database - :param reference_task: - :return: + :param reference_task: dictionary containing reference details and service url + :return: True if processing is successful, False otherwise """ try: - resolved = utils.get_resolved_references(reference_task['reference'], reference_task['resolver_service_url']) + resolved = utils.post_request_resolved_reference(reference_task['reference'], reference_task['resolver_service_url']) # if failed to connect to reference service, raise a exception to requeue, for max_retries times if not resolved: raise FailedRequest @@ -48,8 +49,13 @@ def task_process_reference(reference_task): return False return True + except KeyError: return False -if __name__ == '__main__': - app.start() \ No newline at end of file +# dont know how to unittest this part +# this (app.start()) the only line that is not unittested +# and since i want all modules to be 100% covered, +# making this line not be considered part of coverage +if __name__ == '__main__': # pragma: no cover + app.start() diff --git a/adsrefpipe/tests/unittests/data_test_db_query.py b/adsrefpipe/tests/unittests/stubdata/dbdata.py similarity index 100% rename from adsrefpipe/tests/unittests/data_test_db_query.py rename to adsrefpipe/tests/unittests/stubdata/dbdata.py diff --git a/adsrefpipe/tests/unittests/test_db_query.py b/adsrefpipe/tests/unittests/test_app.py similarity index 52% rename from adsrefpipe/tests/unittests/test_db_query.py rename to 
adsrefpipe/tests/unittests/test_app.py index d953e8c..c21cff3 100644 --- a/adsrefpipe/tests/unittests/test_db_query.py +++ b/adsrefpipe/tests/unittests/test_app.py @@ -4,7 +4,13 @@ sys.path.insert(0, project_home) import unittest -import datetime +from unittest.mock import patch, MagicMock, Mock +from datetime import datetime, timedelta +from collections import namedtuple + +from sqlalchemy.exc import SQLAlchemyError +from sqlalchemy.sql import and_, func, case, column, table, literal +from sqlalchemy.dialects import postgresql from adsrefpipe import app from adsrefpipe.models import Base, Action, Parser, ReferenceSource, ProcessedHistory, ResolvedReference, CompareClassic @@ -22,7 +28,7 @@ from adsrefpipe.refparsers.AGUxml import AGUtoREFs, AGUreference from adsrefpipe.refparsers.arXivTXT import ARXIVtoREFs from adsrefpipe.refparsers.handler import verify -from adsrefpipe.tests.unittests.data_test_db_query import actions_records, parsers_records +from adsrefpipe.tests.unittests.stubdata.dbdata import actions_records, parsers_records class TestDatabase(unittest.TestCase): @@ -149,7 +155,7 @@ def add_stub_data(self): score=classic[1], state=classic[2]) compare_records.append(compare_record) - success = self.app.insert_resolved_referencce_records(session, resolved_records) + success = self.app.insert_resolved_reference_records(session, resolved_records) self.assertTrue(success == True) success = self.app.insert_compare_records(session, compare_records) self.assertTrue(success == True) @@ -306,7 +312,7 @@ def test_reprocess_references(self): result_expected_year = [ {'source_bibcode': '0002arXiv.........Z', 'source_filename': os.path.join(self.arXiv_stubdata_dir,'00002.raw'), - 'source_modified': datetime.datetime(2020, 4, 3, 18, 8, 42), + 'source_modified': datetime(2020, 4, 3, 18, 8, 42), 'parser_name': 'arXiv', 'references': [{'item_num': 2, 'refstr': 'Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 
2019, A&A, 625, A136 ', @@ -315,7 +321,7 @@ def test_reprocess_references(self): result_expected_bibstem = [ {'source_bibcode': '0002arXiv.........Z', 'source_filename': os.path.join(self.arXiv_stubdata_dir,'00002.raw'), - 'source_modified': datetime.datetime(2020, 4, 3, 18, 8, 42), + 'source_modified': datetime(2020, 4, 3, 18, 8, 42), 'parser_name': 'arXiv', 'references': [{'item_num': 2, 'refstr': 'Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 2019, A&A, 625, A136 ', @@ -323,7 +329,7 @@ def test_reprocess_references(self): }, {'source_bibcode': '0003arXiv.........Z', 'source_filename': os.path.join(self.arXiv_stubdata_dir,'00003.raw'), - 'source_modified': datetime.datetime(2020, 4, 3, 18, 8, 32), + 'source_modified': datetime(2020, 4, 3, 18, 8, 32), 'parser_name': 'arXiv', 'references': [{'item_num': 2, 'refstr': 'Ackermann, M., Albert, A., Atwood, W. B., et al. 2016, A&A, 586, A71 ', @@ -351,10 +357,385 @@ def test_reprocess_references(self): ] self.assertTrue(self.app.get_count_records() == current_num_records) + def test_get_parser(self): + """ test get_parser """ + + # test cases where journal and extension alone determine the parser + self.assertEqual(self.app.get_parser('OTHER/2007AIPC..948..357M/2007AIPC..948..357M.raw')['name'], 'ADStxt') + self.assertEqual(self.app.get_parser('OTHER/Astro2020/2019arXiv190309325N.raw')['name'], 'arXiv') + + # test case where volume information is needed to identify the correct parser + result = self.app.get_parser('PASJ/0052/iss0.raw') + self.assertIsInstance(result, dict) + self.assertEqual(result.get('name'), 'PASJhtml') + self.assertEqual(result.get('matches'), [{'journal': 'PASJ', 'volume_end': 53, 'volume_begin': 51}]) + def test_match_parser(self): - """ test match_parser returning correct parser name for the same journal and extension """ - self.assertTrue(self.app.get_parser('OTHER/2007AIPC..948..357M/2007AIPC..948..357M.raw')['name'] == 'ADStxt') - 
self.assertTrue(self.app.get_parser('OTHER/Astro2020/2019arXiv190309325N.raw')['name'] == 'arXiv') + """ test match_parser when the filepath has been wrong and no matches were found""" + self.assertEqual(self.app.match_parser(rows=[], journal='unknown', volume='2'), {}) + + def test_query_reference_source_tbl(self): + """ test query_reference_source_tbl when parsername is given """ + + # test when parsername is valid + result = self.app.query_reference_source_tbl(parsername="arXiv") + self.assertEqual(len(result), 3) + self.assertEqual(result[0]['parser_name'], "arXiv") + self.assertEqual(result[1]['bibcode'], "0002arXiv.........Z") + self.assertEqual(result[2]['source_filename'].split('/')[-1], "00003.raw") + + # test when parsername is invalid and should log an error + with patch.object(self.app.logger, 'error') as mock_error: + result = self.app.query_reference_source_tbl(parsername="invalid") + self.assertEqual(len(result), 0) + mock_error.assert_called_with("No records found for parser = invalid.") + + def test_query_resolved_reference_tbl_no_records(self): + """ test query_resolved_reference_tbl() when no records exist """ + + # when history_id_list is not empty + with patch.object(self.app.logger, 'error') as mock_error: + result = self.app.query_resolved_reference_tbl(history_id_list=[9999]) + self.assertEqual(result, []) + mock_error.assert_called_with("No records found for history ids = 9999.") + + # when history_id_list is empty + with patch.object(self.app.logger, 'error') as mock_error: + result = self.app.query_resolved_reference_tbl(history_id_list=[]) + self.assertEqual(result, []) + mock_error.assert_called_with("No history_id provided, returning no records.") + + def test_populate_tables_pre_resolved_initial_status_exception(self): + """ test populate_tables_pre_resolved_initial_status method when there is an exception """ + with patch.object(self.app, "session_scope") as mock_session_scope: + mock_session = 
mock_session_scope.return_value.__enter__.return_value + mock_session.commit.side_effect = SQLAlchemyError("Mocked SQLAlchemyError") + + with patch.object(self.app.logger, 'error') as mock_error: + results = self.app.populate_tables_pre_resolved_initial_status('0001arXiv.........Z', + os.path.join(self.arXiv_stubdata_dir,'00001.raw'), + 'arXiv', + references=[]) + self.assertEqual(results, []) + mock_session.rollback.assert_called_once() + mock_error.assert_called() + + def test_populate_tables_pre_resolved_retry_status_exception(self): + """ test populate_tables_pre_resolved_retry_status method when there is an exception """ + with patch.object(self.app, "session_scope") as mock_session_scope: + mock_session = mock_session_scope.return_value.__enter__.return_value + mock_session.commit.side_effect = SQLAlchemyError("Mocked SQLAlchemyError") + + with patch.object(self.app.logger, 'error') as mock_error: + results = self.app.populate_tables_pre_resolved_retry_status('0001arXiv.........Z', + os.path.join(self.arXiv_stubdata_dir,'00001.raw'), + source_modified='', + retry_records=[]) + self.assertEqual(results, []) + mock_session.rollback.assert_called_once() + mock_error.assert_called() + + def test_populate_tables_post_resolved_exception(self): + """ test populate_tables_post_resolved method when there is an exception """ + with patch.object(self.app, "session_scope") as mock_session_scope: + mock_session = mock_session_scope.return_value.__enter__.return_value + mock_session.commit.side_effect = SQLAlchemyError("Mocked SQLAlchemyError") + + with patch.object(self.app.logger, 'error') as mock_error: + result = self.app.populate_tables_post_resolved(resolved_reference=[], + source_bibcode='0001arXiv.........Z', + classic_resolved_filename=os.path.join(self.arXiv_stubdata_dir,'00001.raw.results')) + self.assertEqual(result, False) + mock_session.rollback.assert_called_once() + mock_error.assert_called() + + def test_populate_tables_post_resolved_with_classic(self): + 
""" test populate_tables_post_resolved when resolved_classic is available """ + + resolved_reference = [ + {'id': 'H1I1', 'refstring': 'Reference 1', 'bibcode': '2023A&A...657A...1X', 'score': 1.0}, + {'id': 'H1I2', 'refstring': 'Reference 2', 'bibcode': '2023A&A...657A...2X', 'score': 0.8} + ] + source_bibcode = "2023A&A...657A...1X" + classic_resolved_filename = "classic_results.txt" + classic_resolved_reference = [ + (1, "2023A&A...657A...1X", "1", "MATCH"), + (2, "2023A&A...657A...2X", "1", "MATCH") + ] + + with patch.object(self.app, "session_scope"), \ + patch("adsrefpipe.app.compare_classic_and_service", return_value=classic_resolved_reference), \ + patch.object(self.app, "update_resolved_reference_records") as mock_update, \ + patch.object(self.app, "insert_compare_records") as mock_insert, \ + patch.object(self.app.logger, "info") as mock_logger: + + result = self.app.populate_tables_post_resolved(resolved_reference, source_bibcode, classic_resolved_filename) + + self.assertTrue(result) + mock_update.assert_called_once() + mock_insert.assert_called_once() + mock_logger.assert_called_with("Updated 2 resolved reference records successfully.") + + @patch("adsrefpipe.app.ProcessedHistory") + @patch("adsrefpipe.app.ResolvedReference") + @patch("adsrefpipe.app.CompareClassic") + def test_get_service_classic_compare_tags(self, mock_compare, mock_resolved, mock_processed): + """ test get_service_classic_compare_tags """ + + mock_session = MagicMock() + + # mock resolved_reference_ids to behave like a real subquery + resolved_reference_ids_mock = table("resolved_reference_ids", column("history_id"), column("item_num")) + mock_session.query().filter().distinct().subquery.return_value = resolved_reference_ids_mock + + # explicitly define mock_compare.state as a SQLAlchemy column + mock_compare.state = column("state") + + # mock the session query behavior for final select query involving CompareClassic + mock_final_query = mock_session.query.return_value + 
mock_final_query.select_from.return_value.outerjoin.return_value.group_by.return_value.subquery.return_value = "mock_final_subquery" + + # test case 1: Only source_bibcode is provided + result1 = self.app.get_service_classic_compare_tags(mock_session, source_bibcode="2023A&A...657A...1X", source_filename="") + self.assertEqual(result1, "mock_final_subquery") + + expected_filter_bibcode = and_(mock_processed.id == mock_resolved.history_id, literal('"2023A&A...657A...1X').op('~')(mock_processed.bibcode)) + found_bibcode_filter = any(call.args and expected_filter_bibcode.compare(call.args[0]) for call in mock_session.query().filter.call_args_list) + self.assertTrue(found_bibcode_filter) + + # test case 2: Only source_filename are provided + result2 = self.app.get_service_classic_compare_tags(mock_session, source_bibcode="", source_filename="some_source_file.txt") + self.assertEqual(result2, "mock_final_subquery") + + expected_filter_filename = and_(mock_processed.id == mock_resolved.history_id, literal('2023A&A...657A...1X').op('~')(mock_processed.source_filename)) + found_filename_filter = any(call.args and expected_filter_filename.compare(call.args[0]) for call in mock_session.query().filter.call_args_list) + self.assertTrue(found_filename_filter) + + def test_get_service_classic_compare_stats_grid_error(self): + """ test get_service_classic_compare_stats_grid when error """ + + with patch.object(self.app, "session_scope") as mock_session_scope: + mock_session = mock_session_scope.return_value.__enter__.return_value + + # create a mock for compare_grid + mock_compare_grid = Mock() + mock_compare_grid.c.MATCH = Mock(label=Mock(return_value="MATCH")) + mock_compare_grid.c.MISS = Mock(label=Mock(return_value="MISS")) + mock_compare_grid.c.NEW = Mock(label=Mock(return_value="NEW")) + mock_compare_grid.c.NEWU = Mock(label=Mock(return_value="NEWU")) + mock_compare_grid.c.DIFF = Mock(label=Mock(return_value="DIFF")) + + # mock `get_service_classic_compare_tags()` to return 
the mocked compare_grid + with patch.object(self.app, "get_service_classic_compare_tags", return_value=mock_compare_grid): + # mock `session.query(...).all()` to return an empty list + mock_session.query.return_value.filter.return_value.order_by.return_value.all.return_value = [] + + result = self.app.get_service_classic_compare_stats_grid(source_bibcode='0001arXiv.........Z', + source_filename=os.path.join(self.arXiv_stubdata_dir,'00001.raw')) + + self.assertEqual(result, ('Unable to fetch data for reference source file `%s` from database!'%os.path.join(self.arXiv_stubdata_dir,'00001.raw'), -1, -1)) + + @patch("adsrefpipe.app.datetime") + def test_filter_reprocess_query(self, mock_datetime): + """Test all cases of filter_reprocess_query""" + + mock_query = Mock() + + # set a fixed datetime for consistent testing + mock_datetime.now.return_value = datetime(2025, 1, 1) + + # test case: ReprocessQueryType.score + self.app.filter_reprocess_query(mock_query, ReprocessQueryType.score, 0.8, "", 0) + mock_query.filter.assert_called() + called_args, _ = mock_query.filter.call_args + compiled_query = called_args[0].compile(dialect=postgresql.dialect()) + self.assertTrue(str(called_args[0]), 'resolved_reference.score <= :score_1') + self.assertTrue(compiled_query.params['score_1'], 0.8) + + # test case: ReprocessQueryType.bibstem with match_bibcode + mock_query.reset_mock() + self.app.filter_reprocess_query(mock_query, ReprocessQueryType.bibstem, 0.8, "1234", 0) + mock_query.filter.assert_called() + called_args, _ = mock_query.filter.call_args + compiled_query = called_args[0].compile(dialect=postgresql.dialect()) + self.assertTrue(str(called_args[0]), 'resolved_reference.bibcode LIKE :bibcode_1') + self.assertTrue(compiled_query.params['bibcode_1'], '____1234__________') + + # test case: ReprocessQueryType.year with match_bibcode + mock_query.reset_mock() + self.app.filter_reprocess_query(mock_query, ReprocessQueryType.year, 0.8, "2023", 0) + 
mock_query.filter.assert_called() + called_args, _ = mock_query.filter.call_args + compiled_query = called_args[0].compile(dialect=postgresql.dialect()) + self.assertTrue(str(called_args[0]), 'resolved_reference.bibcode LIKE :bibcode_1') + self.assertTrue(compiled_query.params['bibcode_1'], '2023_______________') + + # test case: ReprocessQueryType.failed + mock_query.reset_mock() + self.app.filter_reprocess_query(mock_query, ReprocessQueryType.failed, 0.8, "", 0) + mock_query.filter.assert_called() + called_args, _ = mock_query.filter.call_args + compiled_query = called_args[0].compile(dialect=postgresql.dialect()) + self.assertTrue(str(called_args[0]), 'resolved_reference.bibcode = :bibcode_1 AND resolved_reference.score = :score_1') + self.assertTrue(compiled_query.params['bibcode_1'], '0000') + self.assertTrue(compiled_query.params['score_1'], -1) + + # Test case: date_cutoff is applied + mock_query.reset_mock() + self.app.filter_reprocess_query(mock_query, ReprocessQueryType.score, 0.8, "", 10) + expected_since = datetime(2025, 1, 1) - timedelta(days=10) + mock_query.filter.assert_called() + called_args, _ = mock_query.filter.call_args + compiled_query = called_args[0].compile(dialect=postgresql.dialect()) + print(str(called_args[0])) + print(compiled_query.params) + self.assertTrue(str(called_args[0]), 'resolved_reference.score <= :score_1') + self.assertTrue(compiled_query.params.get('score_1'), 0.8) + + def test_get_reprocess_records(self): + """ test get_reprocess_records method """ + + with patch.object(self.app, "session_scope") as mock_session_scope: + mock_session = mock_session_scope.return_value.__enter__.return_value + + # define a mock SQLAlchemy row with _asdict() method + MockRow = namedtuple("MockRow", + ["history_id", "item_num", "refstr", "refraw", "source_bibcode", "source_filename", + "source_modified", "parser_name"]) + + # mock query results with same history_id to trigger the else block + 
mock_session.query.return_value.filter.return_value.order_by.return_value.all.return_value = [ + MockRow(history_id=1, item_num=1, refstr="Reference 1", refraw="Raw 1", source_bibcode="2023A&A...657A...1X", + source_filename="some_source_file.txt", source_modified="D1", parser_name="arXiv"), + MockRow(history_id=1, item_num=2, refstr="Reference 2", refraw="Raw 2", source_bibcode="2023A&A...657A...1X", + source_filename="some_source_file.txt", source_modified="D1", parser_name="arXiv"), + ] + + results = self.app.get_reprocess_records(type=0, score_cutoff=0.8, match_bibcode="", date_cutoff=0) + + self.assertEqual(len(results), 1) + self.assertEqual(len(results[0]['references']), 2) + self.assertEqual(results[0]['references'][1]['refstr'], 'Reference 2') + + def test_get_resolved_references_all(self): + """ test get_resolved_references_all method """ + + with patch.object(self.app, "session_scope") as mock_session_scope: + mock_session = mock_session_scope.return_value.__enter__.return_value + + # define a mock SQLAlchemy row with _asdict() method + MockRow = namedtuple("MockRow", ["source_bibcode", "date", "id", "resolved_bibcode", "score", "parser_name"]) + + # mock query results with highest scores + mock_session.query.return_value.filter.return_value.order_by.return_value.all.return_value = [ + MockRow(source_bibcode="2023A&A...657A...1X", date=datetime(2025, 1, 1), id=1, resolved_bibcode="0001arXiv.........Z", score=0.95, parser_name="arXiv"), + MockRow(source_bibcode="2023A&A...657A...1X", date=datetime(2025, 1, 2), id=2, resolved_bibcode="0002arXiv.........Z", score=0.85, parser_name="arXiv"), + ] + + results = self.app.get_resolved_references_all("2023A&A...657A...1X") + + assert len(results) == 2 + assert results[0] == ("2023A&A...657A...1X", "2025-01-01 00:00:00", 1, "0001arXiv.........Z", 0.95, "arXiv") + assert results[1] == ("2023A&A...657A...1X", "2025-01-02 00:00:00", 2, "0002arXiv.........Z", 0.85, "arXiv") + + # test case when no results are found + 
with patch.object(self.app.logger, "error") as mock_error: + mock_session.query.return_value.filter.return_value.order_by.return_value.all.return_value = [] + results = self.app.get_resolved_references_all("2023A&A...657A...1X") + assert results == [] + mock_error.assert_called_with("Unable to fetch resolved references for source bibcode `2023A&A...657A...1X`.") + + def test_get_resolved_references(self): + """ test get_resolved_references method """ + + with patch.object(self.app, "session_scope") as mock_session_scope: + mock_session = mock_session_scope.return_value.__enter__.return_value + + # Define a mock SQLAlchemy row with namedtuple + MockRow = namedtuple("MockRow", ["source_bibcode", "date", "id", "resolved_bibcode", "score", "parser_name", "parser_priority"]) + + # Mock query results with highest-ranked records + mock_session.query.return_value.filter.return_value.order_by.return_value.all.return_value = [ + MockRow(source_bibcode="2023A&A...657A...1X", date=datetime(2025, 1, 1), id=1, resolved_bibcode="0001arXiv.........Z", score=0.95, parser_name="arXiv", parser_priority=1), + MockRow(source_bibcode="2023A&A...657A...1X", date=datetime(2025, 1, 2), id=2, resolved_bibcode="0002arXiv.........Z", score=0.85, parser_name="arXiv", parser_priority=1), + ] + + results = self.app.get_resolved_references("2023A&A...657A...1X") + + assert len(results) == 2 + assert results[0] == { + "source_bibcode": "2023A&A...657A...1X", + "date": "2025-01-01 00:00:00", + "id": 1, + "resolved_bibcode": "0001arXiv.........Z", + "score": 0.95, + "parser_name": "arXiv", + "parser_priority": 1 + } + assert results[1] == { + "source_bibcode": "2023A&A...657A...1X", + "date": "2025-01-02 00:00:00", + "id": 2, + "resolved_bibcode": "0002arXiv.........Z", + "score": 0.85, + "parser_name": "arXiv", + "parser_priority": 1 + } + + # Test case when no results are found + with patch.object(self.app.logger, "error") as mock_error: + 
mock_session.query.return_value.filter.return_value.order_by.return_value.all.return_value = [] + results = self.app.get_resolved_references("2023A&A...657A...1X") + assert results == [] + mock_error.assert_called_with("Unable to fetch resolved references for source bibcode `2023A&A...657A...1X`.") + + def test_parser_model_get_name(self): + """ test get_name method of Parser class in model module """ + parser = Parser(name="TestParser", extension_pattern=".xml", reference_service_endpoint="xml", matches=[]) + self.assertEqual(parser.get_name(), "TestParser") + + def test_parser_model_get_extension_pattern(self): + """ test get_extension_pattern method of Parser class in model module """ + parser = Parser(name="TestParser", extension_pattern=".xml", reference_service_endpoint="xml", matches=[]) + self.assertEqual(parser.get_extension_pattern(), ".xml") + + def test_processed_history_toJSON(self): + """ test toJSON method of ProcessedHistory class in model module """ + history = ProcessedHistory( + bibcode="2023A&A...657A...1X", + source_filename="some_source_file.txt", + source_modified="2025-03-05T12:00:00", + status="processed", + date="2025-03-05T12:30:00", + total_ref=10 + ) + expected_json = { + "bibcode": "2023A&A...657A...1X", + "source_filename": "some_source_file.txt", + "source_modified": "2025-03-05T12:00:00", + "status": "processed", + "date": "2025-03-05T12:30:00", + "total_ref": 10 + } + self.assertEqual(history.toJSON(), expected_json) + + def test_compare_classic_toJSON(self): + """Test toJSON method of CompareClassic class""" + compare = CompareClassic( + history_id=1, + item_num=2, + bibcode="0001arXiv.........Z", + score=1, + state="MATCH") + expected_json = { + "history_id": 1, + "item_num": 2, + "bibcode": "0001arXiv.........Z", + "score": 1, + "state": "MATCH" + } + self.assertEqual(compare.toJSON(), expected_json) + class TestDatabaseNoStubdata(unittest.TestCase): @@ -461,6 +842,12 @@ def test_populate_tables(self): 
classic_resolved_filename=os.path.join(arXiv_stubdata_dir, '00001.raw.result')) self.assertTrue(status == True) + def test_get_parser_error(self): + """ test get_parser when it errors for unrecognized source filename """ + with patch.object(self.app.logger, 'error') as mock_error: + self.assertEqual(self.app.get_parser("invalid/file/path/"), {}) + mock_error.assert_called_with("Unrecognizable source file invalid/file/path/.") + if __name__ == '__main__': unittest.main() diff --git a/adsrefpipe/tests/unittests/test_ref_parsers.py b/adsrefpipe/tests/unittests/test_ref_parsers.py old mode 100644 new mode 100755 index e73a53f..2c910cc --- a/adsrefpipe/tests/unittests/test_ref_parsers.py +++ b/adsrefpipe/tests/unittests/test_ref_parsers.py @@ -4,46 +4,16 @@ sys.path.insert(0, project_home) import unittest -import mock +from unittest.mock import Mock, patch, mock_open, MagicMock import json +import re from adsrefpipe.tests.unittests.stubdata import parsed_references -from adsrefpipe.refparsers.AASxml import AAStoREFs -from adsrefpipe.refparsers.AGUxml import AGUtoREFs, AGUreference -from adsrefpipe.refparsers.APSxml import APStoREFs -from adsrefpipe.refparsers.AnAxml import AnAtoREFs -from adsrefpipe.refparsers.AIPxml import AIPtoREFs -from adsrefpipe.refparsers.BlackwellXML import BLACKWELLtoREFs -from adsrefpipe.refparsers.CrossRefXML import CrossRefToREFs -from adsrefpipe.refparsers.CUPxml import CUPtoREFs -from adsrefpipe.refparsers.EDPxml import EDPtoREFs -from adsrefpipe.refparsers.EGUxml import EGUtoREFs -from adsrefpipe.refparsers.ElsevierXML import ELSEVIERtoREFs -from adsrefpipe.refparsers.IcarusXML import ICARUStoREFs -from adsrefpipe.refparsers.IOPFTxml import IOPFTtoREFs -from adsrefpipe.refparsers.IOPxml import IOPtoREFs -from adsrefpipe.refparsers.IPAPxml import IPAPtoREFs -from adsrefpipe.refparsers.JATSxml import JATStoREFs -from adsrefpipe.refparsers.JSTAGExml import JSTAGEtoREFs -from adsrefpipe.refparsers.LivingReviewsXML import LivingReviewsToREFs 
-from adsrefpipe.refparsers.MDPIxml import MDPItoREFs -from adsrefpipe.refparsers.NLM3xml import NLMtoREFs -from adsrefpipe.refparsers.NatureXML import NATUREtoREFs -from adsrefpipe.refparsers.ONCPxml import ONCPtoREFs -from adsrefpipe.refparsers.OUPxml import OUPtoREFs -from adsrefpipe.refparsers.PASAxml import PASAtoREFs -from adsrefpipe.refparsers.RSCxml import RSCtoREFs -from adsrefpipe.refparsers.SpringerXML import SPRINGERtoREFs -from adsrefpipe.refparsers.SPIExml import SPIEtoREFs -from adsrefpipe.refparsers.UCPxml import UCPtoREFs -from adsrefpipe.refparsers.VERSITAxml import VERSITAtoREFs -from adsrefpipe.refparsers.WileyXML import WILEYtoREFs - from adsrefpipe.refparsers.arXivTXT import ARXIVtoREFs from adsrefpipe.refparsers.reference import Reference, ReferenceError, XMLreference from adsrefpipe.refparsers.handler import verify -from adsrefpipe.utils import get_bibcode, verify_bibcode, get_resolved_references +from adsrefpipe.refparsers.unicode import tostr, UnicodeHandler, UnicodeHandlerError class TestReferenceParsers(unittest.TestCase): @@ -54,222 +24,12 @@ def setUp(self): def tearDown(self): unittest.TestCase.tearDown(self) - def test_aasxml_parser(self): - """ test parser for anaxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.aas.raw') - references = AAStoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_aas) - - def test_aguxml_parser(self): - """ test parser for aguxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.agu.xml') - references = AGUtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_agu) - - def test_aipxml_parser(self): - """ test parser for aipxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.aip.xml') - references = AIPtoREFs(filename=reference_source, 
buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_aip) - - def test_anaxml_parser(self): - """ test parser for anaxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.ana.xml') - references = AnAtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_ana) - - def test_apsxml_parser(self): - """ test parser for apsxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.aps.xml') - references = APStoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_aps) - - def test_blackwellxml_parser(self): - """ test parser for blackwellxml """ - testing = [ - ('/stubdata/test.blackwell.xml', parsed_references.parsed_blackwell), - ('/stubdata/test.mnras.xml', parsed_references.parsed_mnras), - ] - for (file, expected) in testing: - reference_source = os.path.abspath(os.path.dirname(__file__) + file) - references = BLACKWELLtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, expected) - - def test_crossrefxml_parser(self): - """ test parser for crossrefxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.xref.xml') - references = CrossRefToREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_crossref) - - def test_cupxml_parser(self): - """ test parser for cupxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.cup.xml') - references = CUPtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_cup) - - def test_edpxml_parser(self): - """ test parser for edpxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.edp.xml') - references = 
EDPtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_edp) - - def test_eguxml_parser(self): - """ test parser for eguxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.egu.xml') - references = EGUtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_egu) - - def test_elsevierxml_parser(self): - """ test parser for elsevierxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.elsevier.xml') - references = ELSEVIERtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_elsevier) - - def test_icarusxml_parser(self): - """ test parser for icarusxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.icarus.raw') - references = ICARUStoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_icarus) - - def test_iopftxml_parser(self): - """ test parser for iopftxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.iopft.xml') - references = IOPFTtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_iopft) - - def test_iopxml_parser(self): - """ test parser for iopxml """ - testing = [ - ('/stubdata/test.iop.xml', parsed_references.parsed_iop), - ('/stubdata/test.edporiop.xml', parsed_references.parsed_edporiop), - ] - for (file, expected) in testing: - reference_source = os.path.abspath(os.path.dirname(__file__) + file) - references = IOPtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, expected) - - def test_ipapxml_parser(self): - """ test parser for ipapxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + 
'/stubdata/test.ipap.xml') - references = IPAPtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_ipap) - - def test_jatsxml_parser(self): - """ test parser for jatsxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.jats.xml') - references = JATStoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_jats) - - def test_jstxml_parser(self): - """ test parser for jstxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.jst.xml') - references = JSTAGEtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_jst) - - def test_livingreviewsxml_parser(self): - """ test parser for livingreviewsxml """ - xml_testing = [ - ('/stubdata/lrr-2014-6.living.xml', parsed_references.parsed_livingreviews_llr), - ('/stubdata/lrsp-2007-2.living.xml', parsed_references.parsed_livingreviews_lrsp) - ] - for (filename, expected_results) in xml_testing: - reference_source = os.path.abspath(os.path.dirname(__file__) + filename) - references = LivingReviewsToREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, expected_results) - - def test_mdpixml_parser(self): - """ test parser for mdpixml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.mdpi.xml') - references = MDPItoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_mdpi) - - def test_nlm3xml_parser(self): - """ test parser for nlm3xml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.nlm3.xml') - references = NLMtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_nlm3) - - def 
test_naturexml_parser(self): - """ test parser for naturexml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.nature.xml') - references = NATUREtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_nature) - - def test_oncpxml_parser(self): - """ test parser for oncpxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.meta.xml') - references = ONCPtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_oncp) - - def test_oupxml_parser(self): - """ test parser for oupxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.oup.xml') - references = OUPtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_oup) - - def test_pasaxml_parser(self): - """ test parser for pasaxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.pasa.xml') - references = PASAtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_pasa) - - def test_rscxml_parser(self): - """ test parser for rscxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.rsc.xml') - references = RSCtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_rsc) - - def test_spiexml_parser(self): - """ test parser for spiexml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.spie.xml') - references = SPIEtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_spie) - - def test_springerxml_parser(self): - """ test parser for springerxml """ - reference_source = 
os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.springer.xml') - references = SPRINGERtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_springer) - - def test_ucpxml_parser(self): - """ test parser for ucpxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.ucp.xml') - references = UCPtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_ucp) - - def test_versitaxml_parser(self): - """ test parser for wileyxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.versita.xml') - references = VERSITAtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_versita) - - def test_wileyxml_parser(self): - """ test parser for wileyxml """ - reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.wiley2.xml') - references = WILEYtoREFs(filename=reference_source, buffer=None).process_and_dispatch() - self.assertEqual(references, parsed_references.parsed_wiley) - def test_reference_init(self): """ test Reference class init """ with self.assertRaises(Exception) as context: Reference({'authors': "Pipeline, R", 'jrlstr': "For Testing", 'year': 2020}).parse() self.assertEqual('Parse method not defined.', str(context.exception)) - def test_reference_pages(self): - """ test calling parse pages method of reference class""" - reference = AGUreference('') - self.assertEqual(reference.parse_pages(None), ('', None)) - self.assertEqual(reference.parse_pages('L23'), ('23', 'L')) - self.assertEqual(reference.parse_pages('T2', ignore='RSTU'), ('2', None)) - self.assertEqual(reference.parse_pages('T2', letters='RSTU'), ('2', 'T')) - self.assertEqual(reference.parse_pages('23S'), ('23', 'S')) - self.assertEqual(reference.parse_pages('S23'), ('23', 'S')) - - def 
test_reference_url(self): - """ test calling url decode method of XMLreference""" - reference = AGUreference('') - self.assertEqual(reference.url_decode('%AF'), '¯') - def test_html_parser(self): """ test parsers for html references """ html_testing = [ @@ -289,8 +49,8 @@ def test_html_parser(self): "response":{"numFound":1,"start":0,"docs":[{ "bibcode":""}]} } for (parser, filename, expected_results, bibcode) in html_testing: - with mock.patch('requests.get') as get_mock: - get_mock.return_value = mock_response = mock.Mock() + with patch('requests.get') as get_mock: + get_mock.return_value = mock_response = Mock() mock_response.status_code = 200 annrev_response['response']['docs'][0]['bibcode'] = bibcode mock_response.text = json.dumps(annrev_response) @@ -347,68 +107,209 @@ def test_arxivtxt_parser(self): references = ARXIVtoREFs(filename=reference_source, buffer=None).process_and_dispatch() self.assertEqual(references, parsed_references.parsed_arxiv) - def test_get_bibcode(self): - """ some reference files provide doi, and bibcode needs to be infered from doi """ - return_value = { - u'responseHeader': {u'status': 0, u'QTime': 13}, - u'response': { - u'start': 0, - u'numFound': 1, - u'docs': [{u'bibcode': u'2023arXiv230317899C'}] - } - } - with mock.patch('requests.get') as get_mock: - get_mock.return_value = mock_response = mock.Mock() - mock_response.status_code = 200 - mock_response.text = json.dumps(return_value) - bibcode = get_bibcode(doi='10.48550/arXiv.2303.17899') - self.assertEqual(bibcode, '2023arXiv230317899C') - - def test_get_bibcode_error(self): - """ some reference files provide doi, and bibcode needs to be infered from doi when solr returns error""" - with mock.patch('requests.get') as get_mock: - get_mock.return_value = mock_response = mock.Mock() - mock_response.status_code = 502 - bibcode = get_bibcode(doi='10.48550/arXiv.2303.17899') - self.assertEqual(bibcode, None) - - def test_verify_bibcode(self): - """ test calling solr to verify a 
bibcode """ - return_value = { - u'responseHeader': {u'status': 0, u'QTime': 13}, - u'response': { - u'start': 0, - u'numFound': 1, - u'docs': [{u'bibcode': u'2023arXiv230317899C'}] - } - } - with mock.patch('requests.get') as get_mock: - get_mock.return_value = mock_response = mock.Mock() - mock_response.status_code = 200 - mock_response.text = json.dumps(return_value) - bibcode = verify_bibcode(bibcode='2023arXiv230317899C') - self.assertEqual(bibcode, '2023arXiv230317899C') - - def test_verify_bibcode_error(self): - """ test calling solr to verify a bibcode when error is returned """ - with mock.patch('requests.get') as get_mock: - get_mock.return_value = mock_response = mock.Mock() - mock_response.status_code = 502 - bibcode = verify_bibcode(bibcode='2023arXiv230317899C') - self.assertEqual(bibcode, None) - - def test_get_resolved_references_error(self): - """ test calling get_resolved_references with wrong end point """ - references = [{'item_num': 2, - 'refstr': 'Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 2019, A&A, 625, A136 ', - 'refraw': 'Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 
2019, A&A, 625, A136 '}] - self.assertEqual(get_resolved_references(references, 'wrong_url'), None) - - with mock.patch('requests.post') as get_mock: - get_mock.return_value = mock_response = mock.Mock() - mock_response.status_code = 502 - self.assertEqual(get_resolved_references(references, 'xml'), None) + ##### unicode module's unittests ##### + + def test_unicode_tostr_exception(self): + """ test unicode's tostr when ValueError is raised """ + mock_value = Mock(spec=str) + mock_value.encode.side_effect = ValueError("Encoding error") + self.assertEqual(tostr(mock_value), "") + + def test_unicode_handler_init_exception(self): + """ test UnicodeHandler's init when ValueError is raised """ + + # Invalid code (not an int) + mock_data = 'invalid_entry "entity" "ascii" "latex"\n' + with patch("builtins.open", mock_open(read_data=mock_data)): + handler = UnicodeHandler("dummy_path") + self.assertNotIn("entity", handler) + + def test_unicode_handler_sub_numasc_entity_exception(self): + """ test UnicodeHandler's __sub_numasc_entity when IndexError and OverflowError are raised """ + + # mock file reading to prevent FileNotFoundError + with patch("builtins.open", mock_open(read_data="")): + handler = UnicodeHandler("dummy_path") + + handler.unicode = MagicMock() + handler.unicode.__getitem__.side_effect = IndexError # Simulate IndexError + + # test IndexError handling (falls back to unicodedata.normalize) + match = re.match(r'&#(?P\d+);', "󴈿") + if match: + with patch("unicodedata.normalize", return_value="normalized_value"): + self.assertEqual(handler._UnicodeHandler__sub_numasc_entity(match), "normalized_value") + + # test OverflowError handling (raises UnicodeHandlerError) + match = re.match(r'&#(?P\d+);', "�") + if match: + with patch("unicodedata.normalize", side_effect=OverflowError): + with self.assertRaises(UnicodeHandlerError) as context: + handler._UnicodeHandler__sub_numasc_entity(match) + self.assertEqual(str(context.exception), "Unknown numeric entity: �") + + 
def test_unicode_handler_sub_hexnumasc_entity(self): + """ test UnicodeHandler's __sub_hexnumasc_entity method """ + + # mock file reading to prevent FileNotFoundError + with patch("builtins.open", mock_open(read_data="")): + handler = UnicodeHandler("dummy_path") + + # ensure no entry exists for entno so that the elif branch executes + handler.unicode = [None] * 65536 + + # hex for £ (163) to trigger the elif branch + match = re.match(r'&#x(?P[0-9A-Fa-f]+);', "£") + if match: + # mock u2asc to return a known value + with patch.object(handler, "u2asc", return_value="converted_ascii") as mock_u2asc: + self.assertEqual(handler._UnicodeHandler__sub_hexnumasc_entity(match), "converted_ascii") + # ensure u2asc is called with the correct character + mock_u2asc.assert_called_once_with("£") + + # mock unicode lookup to raise IndexError + handler.unicode = MagicMock() + handler.unicode.__getitem__.side_effect = IndexError + + # large invalid hex value to trigger the IndexError exception + match = re.match(r'&#x(?P[0-9A-Fa-f]+);', "򙦙") + if match: + # check that the correct exception is raised + with self.assertRaises(UnicodeHandlerError) as context: + handler._UnicodeHandler__sub_hexnumasc_entity(match) + # ensure the exception message is correct + self.assertEqual(str(context.exception), "Unknown hexadecimal entity: 򙦙") + + def test_unicode_handler_sub_hexnum_toent(self): + """ test UnicodeHandler's __sub_hexnum_toent method """ + + # mock file reading to prevent FileNotFoundError + with patch("builtins.open", mock_open(read_data="")): + handler = UnicodeHandler("dummy_path") + + # test ValueError exception, should return escaped unicode representation + match = re.match(r'&#x(?P[G-Z]+);', "&#xGHI;") + if match: + self.assertEqual(handler._UnicodeHandler__sub_hexnum_toent(match), r"\uGHI") + # ensure unicode list is large enough and contains a valid entity + handler.unicode = [None] * 70000 + handler.unicode[163] = MagicMock(entity="pound") + + # test valid conversion to 
named entity + match = re.match(r'&#x(?P[0-9A-Fa-f]+);', "£") + if match: + self.assertEqual(handler._UnicodeHandler__sub_hexnum_toent(match), "£") + + # test UnicodeHandlerError for unknown entity by ensuring index is in range but has no entity + handler.unicode = MagicMock() + handler.unicode.__getitem__.return_value = None + match = re.match(r'&#x(?P[0-9A-Fa-f]+);', "򙦙") + if match: + with self.assertRaises(UnicodeHandlerError) as context: + handler._UnicodeHandler__sub_hexnum_toent(match) + # ensure the exception message is correct + self.assertEqual(str(context.exception), "Unknown hexadecimal entity: 629145") + + def test_unicode_handler_toentity(self): + """ test UnicodeHandler's __toentity method """ + + # mock file reading to prevent FileNotFoundError + with patch("builtins.open", mock_open(read_data="")): + handler = UnicodeHandler("dummy_path") + + # ensure unicode list is large enough + handler.unicode = [None] * 70000 + + # mock a named entity for character £ (ascii_value 163) + handler.unicode[163] = Mock(entity="pound") + + # test named entity conversion + self.assertEqual(handler._UnicodeHandler__toentity("£"), "£") + + # test numeric entity conversion when no named entity exists + # Ʃ (mathematical summation, ascii_value 425) + self.assertEqual(handler._UnicodeHandler__toentity("Ʃ"), "Ʃ") + + def test_unicode_handler_cleanall(self): + """ test UnicodeHandler's cleanall method """ + + with patch("builtins.open", mock_open(read_data="")): + handler = UnicodeHandler("dummy_path") + + # mock regex substitutions for unrelated operations + handler.re_accent = Mock(sub=lambda func, text: text) + handler.re_missent = Mock(sub=lambda func, text: text) + handler.re_morenum = Mock(sub=lambda func, text: text) + handler.re_rsquo = Mock(sub=lambda repl, text: text) + + # mock re_backslash and re_lower_upper_ls substitutions to test cleanslash is true + handler.re_backslash = Mock(sub=lambda repl, text: text.replace("\\", "")) + handler.re_lower_upper_ls = 
Mock(sub=lambda repl, text: text.replace("l/a", "ła")) + input_text = "l/a and back\\slash" + expected_output = "ła and backslash" + self.assertEqual(handler.cleanall(input_text, cleanslash=1), expected_output) + + def test_unicode_handler_sub_accent(self): + """ test UnicodeHandler's __sub_accent method """ + + with patch("builtins.open", mock_open(read_data="")): + handler = UnicodeHandler("dummy_path") + + # correct mapping: accent symbols -> entity suffixes + handler.accents = {"`": "grave", "'": "acute", "^": "circ"} + # create a mock match object + match = Mock() + match.group.side_effect = lambda x: "e" if x == 1 else "`" + + self.assertEqual(handler._UnicodeHandler__sub_accent(match), "è") + + def test_unicode_handler_sub_missent(self): + """ test UnicodeHandler's __sub_missent method """ + + with patch("builtins.open", mock_open(read_data="")): + handler = UnicodeHandler("dummy_path") + + # mock missent dictionary with correct mapping + handler.missent = {"b4": "acute", "caron": "scaron"} + # mock keys() method to simulate known and unknown entities + handler.keys = Mock(side_effect=lambda: {"sacute", "egrave"}) + # create a mock match object for an entity that exists in keys() + match_existing = Mock() + match_existing.group.side_effect = lambda x: "s" if x == 1 else "b4" + + # test correction when entity exists + self.assertEqual(handler._UnicodeHandler__sub_missent(match_existing), "ś") + + # create a mock match object for an entity that does not exist in keys() + match_non_existing = Mock() + match_non_existing.group.side_effect = lambda x: "e" if x == 1 else "caron" + + # test correction when entity does not exist + self.assertEqual(handler._UnicodeHandler__sub_missent(match_non_existing), "eš") + + def test_unicode_handler_sub_morenum(self): + """ test UnicodeHandler's __sub_morenum method """ + + with patch("builtins.open", mock_open(read_data="")): + handler = UnicodeHandler("dummy_path") + + # mock morenum dictionary with a valid numeric entity 
mapping + handler.morenum = {"34": "quot", "169": "copy"} + # create a mock match object for a valid numeric entity + match_valid = Mock() + match_valid.group.side_effect = lambda x: "34" if x == 1 else None + + # test valid numeric entity conversion + self.assertEqual(handler._UnicodeHandler__sub_morenum(match_valid), """) + + # create a mock match object for an unknown numeric entity + match_invalid = Mock() + match_invalid.group.side_effect = lambda x: "9999" if x == 1 else None + + # test KeyError handling (should raise KeyError) + with self.assertRaises(KeyError): + handler._UnicodeHandler__sub_morenum(match_invalid) if __name__ == '__main__': - unittest.main() \ No newline at end of file + unittest.main() diff --git a/adsrefpipe/tests/unittests/test_ref_parsers_xml.py b/adsrefpipe/tests/unittests/test_ref_parsers_xml.py new file mode 100644 index 0000000..1df1fc3 --- /dev/null +++ b/adsrefpipe/tests/unittests/test_ref_parsers_xml.py @@ -0,0 +1,261 @@ +import sys, os +project_home = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')) +if project_home not in sys.path: + sys.path.insert(0, project_home) + +import unittest + +from adsrefpipe.tests.unittests.stubdata import parsed_references +from adsrefpipe.refparsers.AASxml import AAStoREFs +from adsrefpipe.refparsers.AGUxml import AGUtoREFs, AGUreference +from adsrefpipe.refparsers.APSxml import APStoREFs +from adsrefpipe.refparsers.AnAxml import AnAtoREFs +from adsrefpipe.refparsers.AIPxml import AIPtoREFs +from adsrefpipe.refparsers.BlackwellXML import BLACKWELLtoREFs +from adsrefpipe.refparsers.CrossRefXML import CrossRefToREFs +from adsrefpipe.refparsers.CUPxml import CUPtoREFs +from adsrefpipe.refparsers.EDPxml import EDPtoREFs +from adsrefpipe.refparsers.EGUxml import EGUtoREFs +from adsrefpipe.refparsers.ElsevierXML import ELSEVIERtoREFs +from adsrefpipe.refparsers.IcarusXML import ICARUStoREFs +from adsrefpipe.refparsers.IOPFTxml import IOPFTtoREFs +from adsrefpipe.refparsers.IOPxml 
import IOPtoREFs +from adsrefpipe.refparsers.IPAPxml import IPAPtoREFs +from adsrefpipe.refparsers.JATSxml import JATStoREFs +from adsrefpipe.refparsers.JSTAGExml import JSTAGEtoREFs +from adsrefpipe.refparsers.LivingReviewsXML import LivingReviewsToREFs +from adsrefpipe.refparsers.MDPIxml import MDPItoREFs +from adsrefpipe.refparsers.NLM3xml import NLMtoREFs +from adsrefpipe.refparsers.NatureXML import NATUREtoREFs +from adsrefpipe.refparsers.ONCPxml import ONCPtoREFs +from adsrefpipe.refparsers.OUPxml import OUPtoREFs +from adsrefpipe.refparsers.PASAxml import PASAtoREFs +from adsrefpipe.refparsers.RSCxml import RSCtoREFs +from adsrefpipe.refparsers.SpringerXML import SPRINGERtoREFs +from adsrefpipe.refparsers.SPIExml import SPIEtoREFs +from adsrefpipe.refparsers.UCPxml import UCPtoREFs +from adsrefpipe.refparsers.VERSITAxml import VERSITAtoREFs +from adsrefpipe.refparsers.WileyXML import WILEYtoREFs + +class TestReferenceParsersXML(unittest.TestCase): + + def setUp(self): + unittest.TestCase.setUp(self) + + def tearDown(self): + unittest.TestCase.tearDown(self) + + def test_aasxml_parser(self): + """ test parser for anaxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.aas.raw') + references = AAStoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_aas) + + def test_aguxml_parser(self): + """ test parser for aguxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.agu.xml') + references = AGUtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_agu) + + def test_aipxml_parser(self): + """ test parser for aipxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.aip.xml') + references = AIPtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, 
parsed_references.parsed_aip) + + def test_anaxml_parser(self): + """ test parser for anaxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.ana.xml') + references = AnAtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_ana) + + def test_apsxml_parser(self): + """ test parser for apsxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.aps.xml') + references = APStoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_aps) + + def test_blackwellxml_parser(self): + """ test parser for blackwellxml """ + testing = [ + ('/stubdata/test.blackwell.xml', parsed_references.parsed_blackwell), + ('/stubdata/test.mnras.xml', parsed_references.parsed_mnras), + ] + for (file, expected) in testing: + reference_source = os.path.abspath(os.path.dirname(__file__) + file) + references = BLACKWELLtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, expected) + + def test_crossrefxml_parser(self): + """ test parser for crossrefxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.xref.xml') + references = CrossRefToREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_crossref) + + def test_cupxml_parser(self): + """ test parser for cupxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.cup.xml') + references = CUPtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_cup) + + def test_edpxml_parser(self): + """ test parser for edpxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.edp.xml') + references = EDPtoREFs(filename=reference_source, buffer=None).process_and_dispatch() 
+ self.assertEqual(references, parsed_references.parsed_edp) + + def test_eguxml_parser(self): + """ test parser for eguxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.egu.xml') + references = EGUtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_egu) + + def test_elsevierxml_parser(self): + """ test parser for elsevierxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.elsevier.xml') + references = ELSEVIERtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_elsevier) + + def test_icarusxml_parser(self): + """ test parser for icarusxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.icarus.raw') + references = ICARUStoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_icarus) + + def test_iopftxml_parser(self): + """ test parser for iopftxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.iopft.xml') + references = IOPFTtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_iopft) + + def test_iopxml_parser(self): + """ test parser for iopxml """ + testing = [ + ('/stubdata/test.iop.xml', parsed_references.parsed_iop), + ('/stubdata/test.edporiop.xml', parsed_references.parsed_edporiop), + ] + for (file, expected) in testing: + reference_source = os.path.abspath(os.path.dirname(__file__) + file) + references = IOPtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, expected) + + def test_ipapxml_parser(self): + """ test parser for ipapxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.ipap.xml') + references = 
IPAPtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_ipap) + + def test_jatsxml_parser(self): + """ test parser for jatsxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.jats.xml') + references = JATStoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_jats) + + def test_jstxml_parser(self): + """ test parser for jstxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.jst.xml') + references = JSTAGEtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_jst) + + def test_livingreviewsxml_parser(self): + """ test parser for livingreviewsxml """ + xml_testing = [ + ('/stubdata/lrr-2014-6.living.xml', parsed_references.parsed_livingreviews_llr), + ('/stubdata/lrsp-2007-2.living.xml', parsed_references.parsed_livingreviews_lrsp) + ] + for (filename, expected_results) in xml_testing: + reference_source = os.path.abspath(os.path.dirname(__file__) + filename) + references = LivingReviewsToREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, expected_results) + + def test_mdpixml_parser(self): + """ test parser for mdpixml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.mdpi.xml') + references = MDPItoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_mdpi) + + def test_nlm3xml_parser(self): + """ test parser for nlm3xml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.nlm3.xml') + references = NLMtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_nlm3) + + def test_naturexml_parser(self): + """ test parser for naturexml 
""" + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.nature.xml') + references = NATUREtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_nature) + + def test_oncpxml_parser(self): + """ test parser for oncpxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.meta.xml') + references = ONCPtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_oncp) + + def test_oupxml_parser(self): + """ test parser for oupxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.oup.xml') + references = OUPtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_oup) + + def test_pasaxml_parser(self): + """ test parser for pasaxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.pasa.xml') + references = PASAtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_pasa) + + def test_rscxml_parser(self): + """ test parser for rscxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.rsc.xml') + references = RSCtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_rsc) + + def test_spiexml_parser(self): + """ test parser for spiexml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.spie.xml') + references = SPIEtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_spie) + + def test_springerxml_parser(self): + """ test parser for springerxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.springer.xml') + references = 
SPRINGERtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_springer) + + def test_ucpxml_parser(self): + """ test parser for ucpxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.ucp.xml') + references = UCPtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_ucp) + + def test_versitaxml_parser(self): + """ test parser for wileyxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.versita.xml') + references = VERSITAtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_versita) + + def test_wileyxml_parser(self): + """ test parser for wileyxml """ + reference_source = os.path.abspath(os.path.dirname(__file__) + '/stubdata/test.wiley2.xml') + references = WILEYtoREFs(filename=reference_source, buffer=None).process_and_dispatch() + self.assertEqual(references, parsed_references.parsed_wiley) + + def test_reference_pages(self): + """ test calling parse pages method of reference class""" + reference = AGUreference('') + self.assertEqual(reference.parse_pages(None), ('', None)) + self.assertEqual(reference.parse_pages('L23'), ('23', 'L')) + self.assertEqual(reference.parse_pages('T2', ignore='RSTU'), ('2', None)) + self.assertEqual(reference.parse_pages('T2', letters='RSTU'), ('2', 'T')) + self.assertEqual(reference.parse_pages('23S'), ('23', 'S')) + self.assertEqual(reference.parse_pages('S23'), ('23', 'S')) + + def test_reference_url(self): + """ test calling url decode method of XMLreference""" + reference = AGUreference('') + self.assertEqual(reference.url_decode('%AF'), '¯') + + + +if __name__ == '__main__': + unittest.main() \ No newline at end of file diff --git a/adsrefpipe/tests/unittests/test_tasks.py b/adsrefpipe/tests/unittests/test_tasks.py old mode 100644 new mode 
100755 index b93680e..ad84160 --- a/adsrefpipe/tests/unittests/test_tasks.py +++ b/adsrefpipe/tests/unittests/test_tasks.py @@ -5,16 +5,16 @@ import datetime import unittest -import mock +from unittest.mock import Mock, patch import json from adsrefpipe import app, tasks, utils from adsrefpipe.models import Base, Action, Parser, ReferenceSource, ProcessedHistory, ResolvedReference, CompareClassic from adsrefpipe.refparsers.handler import verify -from adsrefpipe.tests.unittests.data_test_db_query import actions_records, parsers_records +from adsrefpipe.tests.unittests.stubdata.dbdata import actions_records, parsers_records -class TestWorkers(unittest.TestCase): +class TestTasks(unittest.TestCase): postgresql_url_dict = { 'port': 5432, @@ -105,7 +105,7 @@ def add_stub_data(self): score=service[2], reference_raw=service[0]) resolved_records.append(resolved_record) - success = self.app.insert_resolved_referencce_records(session, resolved_records) + success = self.app.insert_resolved_reference_records(session, resolved_records) self.assertTrue(success == True) session.commit() @@ -127,8 +127,8 @@ def test_process_references(self): } ] - with mock.patch('requests.post') as mock_resolved_references: - mock_resolved_references.return_value = mock_response = mock.Mock() + with patch('requests.post') as mock_resolved_references: + mock_resolved_references.return_value = mock_response = Mock() mock_response.status_code = 200 mock_response.content = json.dumps({"resolved": resolved_reference}) filename = os.path.join(self.arXiv_stubdata_dir,'00001.raw') @@ -175,8 +175,8 @@ def test_reprocess_subset_references(self): "id": "H1I1" } ] - with mock.patch('requests.post') as mock_resolved_references: - mock_resolved_references.return_value = mock_response = mock.Mock() + with patch('requests.post') as mock_resolved_references: + mock_resolved_references.return_value = mock_response = Mock() mock_response.status_code = 200 mock_response.content = json.dumps({"resolved": 
resolved_reference}) parser_dict = self.app.get_parser(reprocess_record[0]['source_filename']) @@ -206,6 +206,56 @@ def test_reprocess_subset_references(self): {'name': 'CompareClassic', 'description': 'comparison of new and classic processed run', 'count': 0}] self.assertTrue(self.app.get_count_records() == expected_count) + def test_task_process_reference_error(self): + """ test task_process_reference when utils method returns False """ + + reference_task = { + 'reference': [{'item_num': 2, + 'refstr': 'Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 2019, A&A, 625, A136 ', + 'id': '2'}], + 'source_bibcode': '2023TEST..........S', + 'source_filename': 'some_source.txt', + 'resolver_service_url': 'text' + } + + # mock post_request_resolved_reference to return false to trigger FailedRequest + with patch("adsrefpipe.tasks.utils.post_request_resolved_reference", return_value=False): + with self.assertRaises(tasks.FailedRequest): + tasks.task_process_reference(reference_task) + + def test_task_process_reference_exception(self): + """ test task_process_reference when KeyError is raised """ + + reference_task = { + 'reference': [{'item_num': 2, + 'refstr': 'Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 2019, A&A, 625, A136 ', + 'id': '2'}], + 'source_bibcode': '2023TEST..........S', + 'source_filename': 'some_source.txt', + 'resolver_service_url': 'text' + } + + # mock post_request_resolved_reference to raise KeyError + with patch("adsrefpipe.tasks.utils.post_request_resolved_reference", side_effect=KeyError): + self.assertFalse(tasks.task_process_reference(reference_task)) + + def test_task_process_reference_success(self): + """ test task_process_reference successfully returns True """ + + reference_task = { + 'reference': [{'item_num': 2, + 'refstr': 'Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 
2019, A&A, 625, A136 ', + 'id': '2'}], + 'source_bibcode': '2023TEST..........S', + 'source_filename': 'some_source.txt', + 'resolver_service_url': 'text' + } + + # Mock post_request_resolved_reference to return a valid resolved reference + with patch("adsrefpipe.tasks.utils.post_request_resolved_reference", return_value=["resolved_ref"]), \ + patch("adsrefpipe.tasks.app.populate_tables_post_resolved", return_value=True): + self.assertTrue(tasks.task_process_reference(reference_task)) + if __name__ == '__main__': unittest.main() diff --git a/adsrefpipe/tests/unittests/test_utils.py b/adsrefpipe/tests/unittests/test_utils.py new file mode 100644 index 0000000..291b419 --- /dev/null +++ b/adsrefpipe/tests/unittests/test_utils.py @@ -0,0 +1,146 @@ +import sys, os +project_home = os.path.abspath(os.path.join(os.path.dirname(__file__), '../../../')) +if project_home not in sys.path: + sys.path.insert(0, project_home) + +import unittest +from unittest.mock import MagicMock, patch +import json +import time +import requests + +from adsrefpipe.utils import get_bibcode, verify_bibcode, post_request_resolved_reference, \ + get_date_created, get_date_modified_struct_time + + +class TestUtils(unittest.TestCase): + + def setUp(self): + unittest.TestCase.setUp(self) + + def tearDown(self): + unittest.TestCase.tearDown(self) + + def test_get_bibcode(self): + """ some reference files provide doi, and bibcode needs to be infered from doi """ + return_value = { + u'responseHeader': {u'status': 0, u'QTime': 13}, + u'response': { + u'start': 0, + u'numFound': 1, + u'docs': [{u'bibcode': u'2023arXiv230317899C'}] + } + } + with patch('requests.get') as get_mock: + get_mock.return_value = mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.text = json.dumps(return_value) + bibcode = get_bibcode(doi='10.48550/arXiv.2303.17899') + self.assertEqual(bibcode, '2023arXiv230317899C') + + def test_get_bibcode_error(self): + """ some reference files provide doi, and 
bibcode needs to be infered from doi when solr returns error""" + with patch('requests.get') as get_mock: + get_mock.return_value = mock_response = MagicMock() + mock_response.status_code = 502 + bibcode = get_bibcode(doi='10.48550/arXiv.2303.17899') + self.assertEqual(bibcode, None) + + def test_get_bibcode_exception(self): + """ test get_bibcode when a request exception occurs """ + with patch('requests.get') as get_mock: + get_mock.side_effect = requests.exceptions.RequestException("Connection error") + self.assertEqual(get_bibcode(doi='10.48550/arXiv.2303.17899'), None) + + def test_verify_bibcode(self): + """ test calling solr to verify a bibcode """ + return_value = { + u'responseHeader': {u'status': 0, u'QTime': 13}, + u'response': { + u'start': 0, + u'numFound': 1, + u'docs': [{u'bibcode': u'2023arXiv230317899C'}] + } + } + with patch('requests.get') as get_mock: + get_mock.return_value = mock_response = MagicMock() + mock_response.status_code = 200 + mock_response.text = json.dumps(return_value) + bibcode = verify_bibcode(bibcode='2023arXiv230317899C') + self.assertEqual(bibcode, '2023arXiv230317899C') + + def test_verify_bibcode_error(self): + """ test calling solr to verify a bibcode when error is returned """ + with patch('requests.get') as get_mock: + get_mock.return_value = mock_response = MagicMock() + mock_response.status_code = 502 + bibcode = verify_bibcode(bibcode='2023arXiv230317899C') + self.assertEqual(bibcode, '') + + def test_verify_bibcode_exception(self): + """ test verify_bibcode when a request exception occurs """ + with patch('requests.get') as get_mock: + get_mock.side_effect = requests.exceptions.RequestException("Connection error") + self.assertEqual(verify_bibcode(bibcode='2023arXiv230317899C'), "") + + def test_get_resolved_references_error(self): + """ test calling post_request_resolved_reference with wrong end point """ + references = [{'item_num': 2, + 'refstr': 'Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 
2019, A&A, 625, A136 ', + 'refraw': 'Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 2019, A&A, 625, A136 '}] + self.assertEqual(post_request_resolved_reference(references, 'wrong_url'), None) + + with patch('requests.post') as get_mock: + get_mock.return_value = mock_response = MagicMock() + mock_response.status_code = 502 + self.assertEqual(post_request_resolved_reference(references, 'xml'), None) + + def test_post_request_resolved_reference_exception(self): + """ test post_request_resolved_reference when a request exception occurs """ + references = [{'item_num': 2, + 'refstr': 'Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 2019, A&A, 625, A136 ', + 'id': '2'}] + + with patch('requests.post') as get_mock: + get_mock.side_effect = requests.exceptions.RequestException("Connection error") + self.assertEqual(post_request_resolved_reference(references[0], 'text'), None) + + @patch("adsrefpipe.utils.path.getctime") + @patch("adsrefpipe.utils.time.localtime") + def test_get_date_created(self, mock_localtime, mock_getctime): + """ test get_date_created method """ + + # mock file creation time (epoch timestamp) corresponding to 2023-01-01 00:00:00 UTC + mock_getctime.return_value = 1672531200 + + # create struct_time object that matches what the function expects + time_tuple = time.struct_time((2023, 1, 1, 0, 0, 0, 6, 1, 0)) + mock_localtime.return_value = time_tuple + + # patch DATE_FORMAT to match + with patch("adsrefpipe.utils.DATE_FORMAT", "%04d/%02d/%02d %02d:%02d:%02d"): + self.assertEqual(get_date_created("dummy_file.txt"), "2023/01/01 00:00:00") + + @patch("adsrefpipe.utils.path.getmtime") + @patch("adsrefpipe.utils.time.localtime") + def test_get_date_modified_struct_time(self, mock_localtime, mock_getmtime): + """ test get_date_modified_struct_time method """ + + # mock file modification time (epoch timestamp) corresponding to 2023-01-01 00:00:00 UTC + mock_getmtime.return_value = 1672531200 + + # create a struct_time object + expected_time = 
time.struct_time((2023, 1, 1, 0, 0, 0, 6, 1, 0)) + mock_localtime.return_value = expected_time + + result = get_date_modified_struct_time("test_file.txt") + + # verify the mocks were called with correct arguments + mock_getmtime.assert_called_once_with("test_file.txt") + mock_localtime.assert_called_once_with(1672531200) + + self.assertEqual(result, expected_time) + + +if __name__ == '__main__': + unittest.main() diff --git a/adsrefpipe/utils.py b/adsrefpipe/utils.py old mode 100644 new mode 100755 index b9c5a0f..37095c7 --- a/adsrefpipe/utils.py +++ b/adsrefpipe/utils.py @@ -3,7 +3,6 @@ from builtins import str from os import path import time -import re import json import requests @@ -21,55 +20,57 @@ class ReprocessQueryType: score, bibstem, year, failed = range(4) -def get_date_created(filename): +def get_date_created(filename: str) -> str: """ + get the creation date of the file - :param filename: - :return: file's created date in the format YYYY/MM/DD HH:MM:SS + :param filename: the name of the file + :return: file's creation date in the format YYYY/MM/DD HH:MM:SS """ return DATE_FORMAT%(time.localtime(path.getctime(filename))[:-3]) - -def get_date_modified(filename): +def get_date_modified(filename: str) -> str: """ + get the last modified date of the file - :param filename: + :param filename: the name of the file :return: file's modified date in the format YYYY/MM/DD HH:MM:SS """ return DATE_FORMAT%(time.localtime(path.getmtime(filename))[:-3]) - -def get_date_modified_struct_time(filename): +def get_date_modified_struct_time(filename: str) -> time.struct_time: """ + get the last modified date of the file in time.struct_time format - :param filename: - :return: file's modified date in the time.struct_time format + :param filename: the name of the file + :return: file's modified date in time.struct_time format """ return time.localtime(path.getmtime(filename)) -def get_date_now(): +def get_date_now() -> str: """ + get the current date and time - :return: 
current time in the format YYYY/MM/DD HH:MM:SS + :return: current date and time in the format YYYY/MM/DD HH:MM:SS """ return DATE_FORMAT%(time.localtime(time.time())[:-3]) -def get_resolved_filename(source_filename): +def get_resolved_filename(source_filename: str) -> str: """ + get the resolved filename for storing service results - :param source_filename: - :return: resolved file name to save the result of service resolved references + :param source_filename: reference filename + :return: resolved filename for storing results """ return source_filename.replace('sources','retrieve') + '.result' - -def get_resolved_references(reference, service_url): +def post_request_resolved_reference(reference: dict, service_url: str) -> list: """ - send a request to reference service + send a request to reference service to resolve reference(s) - :param reference: dict containing one reference info - :param service_url - :return: + :param reference: dictionary containing reference info + :param service_url: url of the reference service + :return: resolved reference from the service """ if service_url.endswith('text'): payload = {'reference': [reference['refstr']], 'id': [reference['id']]} @@ -96,14 +97,15 @@ def get_resolved_references(reference, service_url): logger.error('Unable to connect to the service: %s'%str(e)) return None -# this function shall be removed from final product, no need to unittest -def read_classic_resolved_file(source_bibcode, filename): # pragma: no cover +def read_classic_resolved_file(source_bibcode: str, filename: str) -> list: # pragma: no cover """ - read classic resolved file + read references from a classic resolved file - :param source_bibcode: - :param filename: - :return: + note that this function shall be removed from final product, no need to unittest + + :param source_bibcode: bibcode to match + :param filename: classic resolved file name + :return: list of resolved references or None """ try: resolved = [] @@ -122,16 +124,18 @@ def 
read_classic_resolved_file(source_bibcode, filename): # pragma: no cover return resolved except: logger.error('Unable to read references from classic resolved file %s.' % (filename)) - return None + return [] -# this function shall be removed from final product, no need to unittest -def get_compare_state(service_bibcode, classic_bibcode, classic_score): # pragma: no cover +def get_compare_state(service_bibcode: str, classic_bibcode: str, classic_score: str) -> str: # pragma: no cover """ - compare service and classic resolved bibcodes and return descriptive state - :param service_bibcode: - :param classic_bibcode: - :param classic_score: - :return: + compare bibcodes and return descriptive comparison state + + note that this function shall be removed from final product, no need to unittest + + :param service_bibcode: bibcode of resolved reference from the service side (new) + :param classic_bibcode: bibcode of resolved reference from the classic side + :param classic_score: confidence score from the classic file + :return: comparison state """ not_found = '.' 
* 19 @@ -152,15 +156,16 @@ def get_compare_state(service_bibcode, classic_bibcode, classic_score): # pragm return 'NEW' return 'NONE' -# this function shall be removed from final product, no need to unittest -def compare_classic_and_service(service, source_bibcode, classic_filename): # pragma: no cover +def compare_classic_and_service(service: list, source_bibcode: str, classic_filename: str) -> list: # pragma: no cover """ - compare the result of service and classic resolved references + compare the results from service and classic resolved references + + note that this function shall be removed from final product, no need to unittest - :param service: resolved references from service, in dict structure - :param source_bibcode: - :param classic: resolved references from classic, string format - :return: + :param service: resolved references from reference service + :param source_bibcode: bibcode to match + :param classic_filename: classic filename containing the resolved references + :return: list of comparison results """ classic = read_classic_resolved_file(source_bibcode, classic_filename) if not classic: @@ -192,12 +197,12 @@ def compare_classic_and_service(service, source_bibcode, classic_filename): # p break return compare -def get_bibcode(doi): +def get_bibcode(doi: str) -> str: """ - send a request to solr service to get the bibcode from doi + send a request to solr service to get the bibcode from a DOI - :param doi - :return: + :param doi: the DOI to look up + :return: bibcode corresponding to the DOI """ headers = {'Authorization': 'Bearer ' + config['REFERENCE_PIPELINE_ADSWS_API_TOKEN']} try: @@ -214,12 +219,12 @@ def get_bibcode(doi): logger.error('Unable to connect to the solr service: %s'%str(e)) return None -def verify_bibcode(bibcode): +def verify_bibcode(bibcode: str) -> str: """ - send a request to solr service to verify the bibcode is correct + send a request to solr service to verify if bibcode is correct - :param doi - :return: + :param 
bibcode: bibcode to verify + :return: verified bibcode or None """ headers = {'Authorization': 'Bearer ' + config['REFERENCE_PIPELINE_ADSWS_API_TOKEN']} try: @@ -231,7 +236,7 @@ def verify_bibcode(bibcode): bibcode = docs[0].get('bibcode') return bibcode logger.error('Attempt at verify bibcode %s failed with status code %s.' % (bibcode, r.status_code)) - return None + return '' except requests.exceptions.RequestException as e: logger.error('Unable to connect to the solr service: %s'%str(e)) - return None + return '' diff --git a/alembic/versions/378ac509c8dc_move_mapping_of_paicz_and_cokon_from_.py b/alembic/versions/378ac509c8dc_move_mapping_of_paicz_and_cokon_from_.py new file mode 100755 index 0000000..ab00891 --- /dev/null +++ b/alembic/versions/378ac509c8dc_move_mapping_of_paicz_and_cokon_from_.py @@ -0,0 +1,44 @@ +"""Move mapping of PAICz and CoKon from ADStex to ADStxt + +Revision ID: 378ac509c8dc +Revises: 55d2bf274509 +Create Date: 2024-10-11 10:44:50.306251 + +""" +from alembic import op +import sqlalchemy as sa + +from sqlalchemy.dialects import postgresql + +# revision identifiers, used by Alembic. 
+revision = '378ac509c8dc' +down_revision = '55d2bf274509' +branch_labels = None +depends_on = None + +def move_mapping(journal, current_parser, future_parser): + # Move the journal from current_parser to future_parser + op.execute(f""" + UPDATE parser + SET matches = ( + SELECT jsonb_agg(elem) + FROM jsonb_array_elements(matches) elem + WHERE elem != '{{"journal": "{journal}", "all_volume": true}}' + ) + WHERE name = '{current_parser}'; + """) + + op.execute(f""" + UPDATE parser + SET matches = matches || '{{"journal": "{journal}", "all_volume": true}}' + WHERE name = '{future_parser}'; + """) + +def upgrade(): + move_mapping('PAICz', 'ADStex', 'ADStxt') + move_mapping('CoKon', 'ADStex', 'ADStxt') + + +def downgrade(): + move_mapping('PAICz', 'ADStxt', 'ADStex') + move_mapping('CoKon', 'ADStxt', 'ADStex') diff --git a/pytest.ini b/pytest.ini old mode 100644 new mode 100755 index 85711df..a9154b5 --- a/pytest.ini +++ b/pytest.ini @@ -1,3 +1,3 @@ [pytest] addopts = --cov=adsrefpipe --cov-report=term-missing -testpaths = adsrefpipe/tests/ \ No newline at end of file +testpaths = adsrefpipe/tests diff --git a/run.py b/run.py old mode 100644 new mode 100755 index be90758..12bc533 --- a/run.py +++ b/run.py @@ -3,6 +3,7 @@ from adsputils import setup_logging, load_config, get_date from datetime import timedelta +import time import argparse @@ -17,12 +18,13 @@ logger = setup_logging('run.py') -def run_diagnostics(bibcodes, source_filenames): +def run_diagnostics(bibcodes: list, source_filenames: list) -> None: """ - Show information about what we have in our storage. 
+ show diagnostic information based on the provided bibcodes and source filenames - :param: bibcodes - list of bibcodes - :param: source_filenames - list of source filenames + :param bibcodes: list of bibcodes to retrieve diagnostic data for + :param source_filenames: list of source filenames to retrieve diagnostic data for + :return: None """ max_entries_diagnostics = config['MAX_ENTRIES_DIAGNOSTICS'] # make sure we only send max number of entires per bibcode/source_file to be queried @@ -36,10 +38,13 @@ def run_diagnostics(bibcodes, source_filenames): return -def get_source_filenames(source_file_path, file_extension, date_cutoff): +def get_source_filenames(source_file_path: str, file_extension: str, date_cutoff: time.struct_time) -> list: """ - :param source_file_path: - :param date_cutoff: if modified date is after this date + retrieves a list of files from the given directory with the specified file extension and modified date after the cutoff + + :param source_file_path: the path of the directory to search for files + :param file_extension: the file extension pattern to match + :param date_cutoff: the modified date cutoff, files modified after this date will be included only :return: list of files in the directory with modified date after the cutoff, if any """ list_files = [] @@ -52,14 +57,15 @@ def get_source_filenames(source_file_path, file_extension, date_cutoff): return list_files -def queue_references(references, source_filename, source_bibcode, parsername): +def queue_references(references: list, source_filename: str, source_bibcode: str, parsername: str) -> None: """ + queues references for processing by preparing a task and sending it to the queue - :param reference: - :param source_filename: - :param source_bibcode: - :param parsername: - :return: + :param references: a list of reference objects to be queued for processing + :param source_filename: the name of the source file from which references are being queued + :param source_bibcode: the bibcode 
associated with the source of the references + :param parsername: the name of the parser used to extract the references + :return: None """ resolver_service_url = config['REFERENCE_PIPELINE_SERVICE_URL'] + app.get_reference_service_endpoint(parsername) for reference in references: @@ -67,16 +73,20 @@ def queue_references(references, source_filename, source_bibcode, parsername): 'source_bibcode': source_bibcode, 'source_filename': source_filename, 'resolver_service_url': resolver_service_url} - tasks.task_process_reference.delay(reference_task) + # tasks.task_process_reference.delay(reference_task) + print('---here') + tasks.task_process_reference(reference_task) -def process_files(filenames): +def process_files(filenames: list) -> None: """ - two ways to queue references: one is to read source files, the other is to query database - this is to read the source reference file and queue each reference for processing + processes the given list of filenames by reading source reference files and sending each reference for processing + + note that there are two ways to queue references: one is to read source files, the other is to query database + this function handles the former - :param files: - :return: + :param filenames: list of filenames to be processed + :return: None """ for filename in filenames: # from filename get the parser info @@ -128,15 +138,18 @@ def process_files(filenames): logger.error("Unable to process %s. Skipped!" 
% toREFs.filename) -def reprocess_references(reprocess_type, score_cutoff=0, match_bibcode='', date_cutoff=None): +def reprocess_references(reprocess_type: str, score_cutoff: float = 0, match_bibcode: str = '', date_cutoff: time.struct_time = None) -> None: """ + reprocesses references by querying the database and sending each reference for processing + two ways to queue references: one is to read source files, the other is to query database - this is to query the db and queue each reference for processing + this function handles the latter - :param reprocess_type: - :param param: - :param date_cutoff: - :return: + :param reprocess_type: the type of query to be performed to get references (e.g., by score, bibstem, year, etc.) + :param score_cutoff: confidence score below which references will be reprocessed (default is 0) + :param match_bibcode: bibcode wildcard to match for reprocessing (optional) + :param date_cutoff: only references after this date will be considered (optional) + :return: None """ records = app.get_reprocess_records(reprocess_type, score_cutoff, match_bibcode, date_cutoff) for record in records: @@ -289,15 +302,28 @@ def reprocess_references(reprocess_type, score_cutoff=0, match_bibcode='', date_ action='store_true', help='Print out the count of records in the four main tables') + query = subparsers.add_parser('QUERY', help='Print out statistics of the reference source file') + query.add_argument('-b', + '--bibcode', + dest='bibcode', + action='store', + default=None, + help='Query database by source bibcode, return resolved bibcodes') + query.add_argument('-a', + '--all', + dest='all', + action='store_true', + help='Return all resolved bibcode') + args = parser.parse_args() if args.action == 'DIAGNOSTICS': if args.parse_filename: name = app.get_parser(args.parse_filename) if name: - print('Source file `%s` shall be parsed using `%s` parser.' % (args.parse_filename, name)) + logger.info('Source file `%s` shall be parsed using `%s` parser.' 
% (args.parse_filename, name)) else: - print('No parser yet to parse source file `%s`.' % args.parse_filename) + logger.error('No parser yet to parse source file `%s`.' % args.parse_filename) # either pass in the list of bibcodes, or list of filenames to query db on # if neither bibcode nor filenames are supplied, number of records for the tables are displayed else: @@ -308,9 +334,9 @@ def reprocess_references(reprocess_type, score_cutoff=0, match_bibcode='', date_ process_files(args.source_filenames) elif args.path or args.extension: if not args.extension: - print('Both path and extension are required params. Provide extention by -e .') + logger.error('Both path and extension are required params. Provide extention by -e .') elif not args.path: - print('Both path and extension are required params. Provide path by -p .') + logger.error('Both path and extension are required params. Provide path by -p .') else: # if days has been specified, read it and only consider files with date from today-days, # otherwise we are going with everything @@ -334,9 +360,9 @@ def reprocess_references(reprocess_type, score_cutoff=0, match_bibcode='', date_ date_cutoff = get_date() - timedelta(days=int(args.days)) if args.days else None reprocess_references(ReprocessQueryType.failed, date_cutoff=date_cutoff) - # TODO: do we need more command for querying db + # keeping prints for stats commands, since the user possibly wants to see the replies, instead of seeing them in logs elif args.action == 'STATS': if args.bibcode or args.source_filename: table, num_references, num_resolved = app.get_service_classic_compare_stats_grid(args.bibcode, args.source_filename) @@ -358,4 +384,11 @@ def reprocess_references(reprocess_type, score_cutoff=0, match_bibcode='', date_ print('Currently there are %d records in `%s` table, which holds %s.'%(result['count'], result['name'], result['description'])) print('\n') + elif args.action == 'QUERY': + results = app.get_resolved_references('0000PThPS...0.....U') 
+ for r in results: + print(r) + # if args.all: + # else: + sys.exit(0) \ No newline at end of file diff --git a/uml/adsrefpipe_plantuml.txt b/uml/adsrefpipe_plantuml.txt new file mode 100644 index 0000000..2d3f7f2 --- /dev/null +++ b/uml/adsrefpipe_plantuml.txt @@ -0,0 +1,105 @@ +@startuml + +package "adsrefpipe" { + + class "ADSReferencePipelineCelery" { + +init_default_parsers() + +match_parser() + +get_parser() + +get_reference_service_endpoint() + +query_reference_source_tbl() + +query_processed_history_tbl() + +query_resolved_reference_tbl() + +diagnostic_query() + +insert_reference_source_record() + +insert_history_record() + +insert_resolved_reference_records() + +update_resolved_reference_records() + +insert_compare_records() + +populate_resolved_reference_records_pre_resolved() + +populate_tables_pre_resolved_initial_status() + +populate_tables_pre_resolved_retry_status() + +populate_tables_post_resolved() + } + + class "Action" { + -status: String + +get_status_new(): str + +get_status_retry(): str + } + + class "Parser" { + -name: String + -extension_pattern: String + -reference_service_endpoint: String + -matches: JSONB + +get_name(): str + +get_extension_pattern(): str + +get_endpoint(): str + +get_matches(): list + +toJSON(): dict + } + + class "ReferenceSource" { + -bibcode: String + -source_filename: String + -resolved_filename: String + -parser_name: String + +toJSON(): dict + } + + class "ProcessedHistory" { + -id: Integer + -bibcode: String + -source_filename: String + -source_modified: DateTime + -status: String + -date: DateTime + -total_ref: Integer + +toJSON(): dict + } + + class "ResolvedReference" { + -history_id: Integer + -item_num: Integer + -reference_str: String + -bibcode: String + -score: Numeric + -reference_raw: String + +toJSON(): dict + } + + class "CompareClassic" { + -history_id: Integer + -item_num: Integer + -bibcode: String + -score: Numeric + -state: String + +toJSON(): dict + } + + class "tasks" { + 
+task_process_reference(reference_task: dict): bool + } + + class "utils" { + +get_date_created(filename: str): str + +get_date_modified(filename: str): str + +get_date_now(): str + +get_resolved_filename(source_filename: str): str + +post_request_resolved_reference(reference: dict, service_url: str): list + +compare_classic_and_service(service: list, source_bibcode: str, classic_filename: str): list + } + + ADSReferencePipelineCelery --> Parser + ADSReferencePipelineCelery --> ReferenceSource + ADSReferencePipelineCelery --> ProcessedHistory + ADSReferencePipelineCelery --> ResolvedReference + ADSReferencePipelineCelery --> CompareClassic + ADSReferencePipelineCelery --> utils + tasks --> utils + tasks --> ADSReferencePipelineCelery + +} + +@enduml diff --git a/uml/database_plantuml.txt b/uml/database_plantuml.txt new file mode 100755 index 0000000..2bfdb23 --- /dev/null +++ b/uml/database_plantuml.txt @@ -0,0 +1,59 @@ +@startuml + +entity "Action" { + *status: VARCHAR <> +} + +entity "Parser" { + *name: VARCHAR <> + -- + extension_pattern: VARCHAR + reference_service_endpoint: VARCHAR + matches: JSONB +} + +entity "ReferenceSource" { + *bibcode: VARCHAR <> + *source_filename: VARCHAR <> + -- + resolved_filename: VARCHAR + parser_name: VARCHAR +} + +entity "ProcessedHistory" { + *id: INTEGER <> + *bibcode: VARCHAR + *source_filename: VARCHAR + -- + source_modified: DATETIME + status: VARCHAR + date: DATETIME + total_ref: INTEGER +} + +entity "ResolvedReference" { + *history_id: INTEGER <> + *item_num: INTEGER <> + *reference_str: VARCHAR <> + -- + bibcode: VARCHAR + score: NUMERIC + reference_raw: VARCHAR +} + +entity "CompareClassic" { + *history_id: INTEGER <> + *item_num: INTEGER <> + -- + bibcode: VARCHAR + score: NUMERIC + state: VARCHAR +} + +ReferenceSource --> Parser : parser_name +ProcessedHistory --> ReferenceSource : "(bibcode, source_filename)" +ResolvedReference --> ProcessedHistory : history_id +CompareClassic --> ProcessedHistory : history_id 
+ProcessedHistory --> Action : status + +@enduml \ No newline at end of file From 7e4bcf70de536a7fa213f7796180df6c93ef69ae Mon Sep 17 00:00:00 2001 From: golnazads <28757512+golnazads@users.noreply.github.com> Date: Wed, 5 Mar 2025 16:39:04 -0500 Subject: [PATCH 4/9] downgrade pip to <24.1 for Celery 4.4.2 compatibility --- .github/workflows/python_actions.yml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/.github/workflows/python_actions.yml b/.github/workflows/python_actions.yml index d8357a0..43f4bf6 100644 --- a/.github/workflows/python_actions.yml +++ b/.github/workflows/python_actions.yml @@ -45,7 +45,7 @@ jobs: - name: Install dependencies run: | - python -m pip install --upgrade setuptools pip + python -m pip install --upgrade setuptools "pip<24.1" pip install -r requirements.txt pip install -r dev-requirements.txt - name: Test with pytest From e88836f0c9a2656b114d608c5f9be009fefa347f Mon Sep 17 00:00:00 2001 From: golnazads <28757512+golnazads@users.noreply.github.com> Date: Fri, 14 Mar 2025 10:01:53 -0400 Subject: [PATCH 5/9] added function/parameter descriptions and parameter/return type to the two base modules for reference parsers, also test coverage is now 100 percent for both --- adsrefpipe/refparsers/reference.py | 441 ++++----- adsrefpipe/refparsers/toREFs.py | 867 ++++++++++-------- .../tests/unittests/test_ref_parsers_base.py | 0 3 files changed, 723 insertions(+), 585 deletions(-) create mode 100644 adsrefpipe/tests/unittests/test_ref_parsers_base.py diff --git a/adsrefpipe/refparsers/reference.py b/adsrefpipe/refparsers/reference.py index 0a17d27..bb28cd1 100644 --- a/adsrefpipe/refparsers/reference.py +++ b/adsrefpipe/refparsers/reference.py @@ -7,12 +7,15 @@ except ImportError: from collections import UserDict +from typing import List, Dict, Tuple, Any + from adsrefpipe.refparsers.xmlFile import XmlString from adsrefpipe.refparsers.unicode import UnicodeHandler unicode_handler = UnicodeHandler() -from adsputils import 
load_config +from adsputils import setup_logging, load_config +logger = setup_logging('refparsers') config = {} config.update(load_config()) @@ -21,7 +24,7 @@ class ReferenceError(Exception): """ is raised by Reference and XMLreference """ - + pass class Reference(UserDict): """ @@ -30,6 +33,8 @@ class Reference(UserDict): provide a value to the resolver attribute in some way and that you should probably override the parse method. """ + + # dictionary for storing various reference attributes with initial values as None ref_dict = { 'authors': None, 'jrlstr': None, @@ -48,6 +53,7 @@ class Reference(UserDict): 'series': None, } + # list of tuples mapping internal field names to their corresponding reference field names field_mappings = [ ("authors", "authors"), ("journal", "jrlstr"), @@ -65,46 +71,60 @@ class Reference(UserDict): ("bibcode", "bibcode") ] + # to match and remove spaces re_remove_space = re.compile(r'\s') + # to match and remove multiple spaces re_remove_extra_spaces = re.compile(r'\s+') + # to match a sequence of digits re_match_digit = re.compile(r'(\d+)') - re_match_roman_numerals = re.compile(r'(^(?=[MDCLXVI])M*D?C{0,4}L?X{0,4}V?I{0,4}$)') + # to match roman numerals + re_match_roman_numerals = re.compile(r'^(M{0,4}(?:CM|CD|D?C{0,3})?(?:XC|XL|L?X{0,3})?(?:IX|IV|V?I{0,3})?)$') # to match non-digit characters re_match_non_digit = re.compile(r'\D+') + # to match hex-encoded characters (URL encoded) re_hex_decode = re.compile(r'%[A-Fa-f0-9]{2}') + # to remove XML tags re_remove_xml_tag = re.compile(r'<.*?>') - arxiv_category = ['acc-phys', 'adap-org', 'alg-geom', 'ao-sci', 'astro-ph', 'atom-ph', 'bayes-an', 'chao-dyn', 'chem-ph', - 'cmp-lg', 'comp-gas', 'cond-mat', 'cs', 'dg-ga', 'funct-an', 'gr-qc', 'hep-ex', 'hep-lat', 'hep-ph', - 'hep-th', 'math', 'math-ph', 'mtrl-th', 'nlin', 'nucl-ex', 'nucl-th', 'patt-sol', 'physics', 'plasm-ph', - 'q-alg', 'q-bio', 'quant-ph', 'solv-int', 'supr-con'] - re_arxiv_old_pattern = re.compile( - r'\b(?:arXiv\W*)?(' + 
"|".join(arxiv_category) + r')(\.[A-Z]{2})?/(\d{7})(:?v\d+)?\b', re.IGNORECASE) - re_arxiv_new_pattern = re.compile(r'\b(?:(?:arXiv\s*\W?\s*)|(?:(?:' + "|".join( - arxiv_category) + r')\s*[:/]?\s*)|(?:http://.*?/abs/)|(?:))(\d{4})\.(\d{4,5})(?:v\d+)?\b', re.IGNORECASE) - + # list of arxiv categories + arxiv_category = ['acc-phys', 'adap-org', 'alg-geom', 'ao-sci', 'astro-ph', 'atom-ph', 'bayes-an', 'chao-dyn', + 'chem-ph', 'cmp-lg', 'comp-gas', 'cond-mat', 'cs', 'dg-ga', 'funct-an', 'gr-qc', 'hep-ex', + 'hep-lat', 'hep-ph', 'hep-th', 'math', 'math-ph', 'mtrl-th', 'nlin', 'nucl-ex', 'nucl-th', + 'patt-sol', 'physics', 'plasm-ph', 'q-alg', 'q-bio', 'quant-ph', 'solv-int', 'supr-con'] + + # to match old arxiv ID format + re_arxiv_old_pattern = re.compile(r'\b(?:arXiv\W*)?(' + "|".join(arxiv_category) + r')(\.[A-Z]{2})?/(\d{7})(:?v\d+)?\b', re.IGNORECASE) + # to match new arxiv ID format + re_arxiv_new_pattern = re.compile(r'\b(?:(?:arXiv\s*\W?\s*)|(?:(?:' + "|".join(arxiv_category) + r')\s*[:/]?\s*)|(?:http://.*?/abs/)|(?:))(\d{4})\.(\d{4,5})(?:v\d+)?\b', re.IGNORECASE) + # to match DOI in the reference string re_doi = re.compile(r'\bdoi:\s*(10\.[\d\.]{2,9}/\S+\w)', re.IGNORECASE) + # to match DOI in XML format re_doi_xml = re.compile(r'(10\.[\d\.]{2,9}/\S+)', re.IGNORECASE) + # to match DOI URL format re_doi_url = re.compile(r'//(?:dx\.)?doi\.org/(10\.[\d\.]{2,9}/[^<\s\."]*)', re.IGNORECASE) - # this is so we can catch cases such as the following: - # Pinter, T. et al (2013), PIMO, La Palma, Spain, 213-217, 10.5281/zenodo.53085 + # to match DOI in reference string in the form "Pinter, T. 
et al (2013), PIMO, La Palma, Spain, 213-217, 10.5281/zenodo.53085" re_doi_prm = re.compile(r'\b(10.[\d\.]{2,9}/\S+\w)', re.IGNORECASE) - + # to match a 4-digit year re_year = re.compile(r'\b[12][09]\d\d[a-z]?\b') + # to match year in parentheses re_year_parentheses = re.compile(r'\(.*([12][09]\d\d).*\)') + # dictionary for roman numerals and their integer values romans_numeral = { 'M': 1000, 'CM': 900, 'D': 500, 'CD': 400, 'C': 100, 'XC': 90, 'L': 50, 'XL': 40, 'X': 10, 'IX': 9, 'V': 5, 'IV': 4, 'I': 1, } + + # list of roman numeral keys sorted in descending order of their integer value romans_numeral_keys = [x[0] for x in sorted(romans_numeral.items(), key=lambda x: x[1], reverse=True)] - def __init__(self, reference_str, unicode=None): + def __init__(self, reference_str: str, unicode: UnicodeHandler = None): """ - - :param reference_str: - :param unicode: + initialize a reference object with a reference string and optional unicode handler + + :param reference_str: reference string to be parsed + :param unicode: unicode handler for text processing """ UserDict.__init__(self, initialdata=self.ref_dict) @@ -116,9 +136,9 @@ def __init__(self, reference_str, unicode=None): self.parsed = False try: self.parse() - except AttributeError: - # xml string was not parsed to create the xml structure with childNodes - pass + except Exception as e: + logger.error(f"Error initializing Reference: {str(e)}") + raise def parse(self): """ @@ -134,19 +154,17 @@ def parse(self): """ raise ReferenceError("Parse method not defined.") - def parse_pages(self, page, ignore="", letters=""): + def parse_pages(self, page: str, ignore: str = "", letters: str = "") -> Tuple[str, str]: """ returns both a page number and a qualifier for that page number. This is done to correctly handle both letters and overlong (>4 chars) page numbers. Unfortunately, this somewhat duplicates what's being done in Bibcode. - Whats the point of ignore? 
- - :param page: - :param ignore: - :param letters: - :return: + :param page: the page string to parse + :param ignore: characters to ignore in parsing + :param letters: characters to treat as qualifiers + :return: a tuple containing the parsed page number and qualifier """ page_num = '' qualifier = None @@ -183,12 +201,13 @@ def parse_pages(self, page, ignore="", letters=""): return page_num, qualifier - def combine_page_qualifier(self, page_num, qualifier): + def combine_page_qualifier(self, page_num: str, qualifier: str) -> str: """ + combines a page number and its qualifier into a single string - :param page_num: - :param qualifier: - :return: + :param page_num: page number + :param qualifier: page qualifier + :return: combined page number and qualifier """ pages = '' for to_combine in [qualifier, page_num]: @@ -196,15 +215,15 @@ def combine_page_qualifier(self, page_num, qualifier): pages += to_combine return pages - def parse_volume(self, volume): + def parse_volume(self, volume: str) -> str: """ returns the first contiguous sequence of digits in the string volume (or an empty string, if there are no digits). For ADS' usual naming convention, this will return a volume number. 
- :param volume: - :return: + :param volume: the volume string to parse + :return: extracted volume number """ vol_num = '' if volume: @@ -218,10 +237,11 @@ def parse_volume(self, volume): return vol_num - def get_parsed_reference(self): + def get_parsed_reference(self) -> Dict: """ + returns a dictionary of parsed reference fields - :return: + :return: a dictionary containing parsed reference fields """ record = {} for dest_key, src_key in self.field_mappings: @@ -230,31 +250,31 @@ def get_parsed_reference(self): record[dest_key] = value return record - def url_decode(self, url_str): + def url_decode(self, url_str: str) -> str: """ - hex-decodes characters in a URL string; this is a naive - version with limited use but allows us to make things work - rather than using urllib.parse() + hex-decodes characters in a URL string. - :param url_str: - :return: + :param url_str: the URL string to decode + :return: the decoded URL string """ - def hex2c(match): - s = match.group(0) - if s[0] != '%' or len(s) != 3: - return s - try: - r = chr(int(s[1:], 16)) - except ValueError: - r = s - return r - return self.re_hex_decode.sub(hex2c, url_str) + # find all matches for %XX (hexadecimal characters) in the URL string + matches = self.re_hex_decode.findall(url_str) + + # for each match, convert it to a character and replace it in the string + for match in matches: + hex_value = match[1:] + # convert hex to character + char = chr(int(hex_value, 16)) + url_str = url_str.replace(match, char) - def match_arxiv_id(self, ref_str): + return url_str + + def match_arxiv_id(self, ref_str: str) -> str: """ - - :param ref_str: - :return: + extracts an arXiv ID from a reference string + + :param ref_str: the reference string to search for an arXiv ID + :return: the arXiv ID if found, otherwise None """ match_start = self.re_arxiv_old_pattern.search(ref_str) if match_start: @@ -262,26 +282,26 @@ def match_arxiv_id(self, ref_str): match_start = self.re_arxiv_new_pattern.search(ref_str) if 
match_start: return match_start.group(1) + '.' + match_start.group(2) - return None - def match_doi(self, ref_str): + def match_doi(self, ref_str: str) -> str: """ - - :param ref_str: - :return: + extracts a DOI from a reference string + + :param ref_str: the reference string to search for a DOI + :return: the DOI if found """ + match_start = self.re_doi.search(ref_str) or self.re_doi_xml.search(ref_str) or \ self.re_doi_url.search(ref_str) or self.re_doi_prm.search(ref_str) if match_start: return match_start.group(1) - - def match_int(self, ref_str): + def match_int(self, ref_str: str) -> str: """ extracts the first integer found in a string - - :param ref_str: - :return: + + :param ref_str: the reference string to search for an integer + :return: the extracted integer as a string """ if ref_str: if isinstance(ref_str, list): @@ -289,15 +309,14 @@ def match_int(self, ref_str): match = re.match(r'.*?(\d+)', ref_str) if match: return match.group(1) - return '' - def match_year(self, refstr): + def match_year(self, refstr: str) -> str: """ xtracts a 4-digit year in an input string, if there is only one if there are more than one 4-digit year, see if one is in parentheses - :param refstr: - :return: + :param refstr: the reference string to search for a year + :return: the extracted year if found """ match = list(set(self.re_year.findall(refstr))) if len(match) == 1: @@ -306,13 +325,14 @@ def match_year(self, refstr): match = self.re_year_parentheses.search(refstr) if match: return match.group(1) - return None - def int2roman(self, int_value): + def int2roman(self, int_value: int) -> str: """ + converts an integer to a Roman numeral - :param int_value: - :return: + :param int_value: the integer to convert + :raises ReferenceError: if the integer is out of the valid range + :return: the Roman numeral representation """ result = '' if int_value < 1 or int_value > 4000: @@ -323,11 +343,13 @@ def int2roman(self, int_value): int_value = int_value - self.romans_numeral[i] 
return result - def roman2int(self, roman_value): + def roman2int(self, roman_value: str) -> int: """ + converts a Roman numeral to an integer - :param roman_value: - :return: + :param roman_value: the Roman numeral to convert + :return: the integer representation + :raises ReferenceError: if the input is not a valid Roman numeral """ roman_value = roman_value.upper() @@ -358,22 +380,24 @@ class XMLreference(Reference): appropriate fields to be used by the resolver by walking it. """ + # to match valid reference strings (word of at least 3 characters) re_valid_refstr = [ re.compile(r'\w{3,}'), re.compile(r'\b[12][098]\d\d\w?\b|\d+(st|nd|rd|th)+') ] + + # to match unstructured URLs re_unstructured_url = re.compile(r'http\S+') + + # to match and remove extra whitespace re_extra_whitespace = re.compile(r"\s+") - def __init__(self, reference_str, unicode=None): + def __init__(self, reference_str: str, unicode: UnicodeHandler = None): """ - simply forwards the request to the superclass with the - exception that if we are passed a plain string in input - (rather than an XmlList object), an XmlList is created - on the fly via XmlString() - - :param reference_str: - :param unicode: + initializes the XMLReference object, parsing the input string if necessary + + :param reference_str: the reference string to parse + :param unicode: optional unicode string for additional processing """ if not reference_str: raise ReferenceError("XMLReference must have a non-empty input reference") @@ -389,36 +413,32 @@ def __init__(self, reference_str, unicode=None): Reference.__init__(self, reference_str, unicode) - def __str__(self): + def __str__(self) -> str: """ - - :return: + returns a string representation of the XMLReference object + + :return: the string representation of the object """ - if self.is_types_stringtypes(self.reference_str): - return self.reference_str - else: + if not self.is_types_stringtypes(self.reference_str): try: return 
self.unicode.u2ent(self.reference_str.toxml()) except: return '' - def is_types_stringtypes(self, obj): + def is_types_stringtypes(self, obj: Any) -> bool: """ + checks if the given object is a string type - :param obj: - :return: + :param obj: the object to check + :return: True if the object is a string, False otherwise """ - try: - return isinstance(obj, str) - except NameError: - return isinstance(obj, str) + return isinstance(obj, str) - def get_reference_str(self): + def get_reference_str(self) -> str: """ - format and return the refstr from extracted fields - if necessary fields have not been parsed, return an empty string + formats and returns the reference string from extracted fields - :return: + :return: the formatted reference string """ refstr = None try: @@ -475,11 +495,12 @@ def get_reference_str(self): return refstr - def get_reference_plain_text(self, refstr): + def get_reference_plain_text(self, refstr: str) -> str: """ + removes URLs from the reference string and formats it - :param refstr: - :return: + :param refstr: the reference string to process + :return: the cleaned and formatted reference string """ # remove any url from unstructured string if any refstr = self.re_unstructured_url.sub('', refstr).strip() @@ -492,15 +513,14 @@ def get_reference_plain_text(self, refstr): return self.re_extra_whitespace.sub(' ', refstr) return self.re_extra_whitespace.sub(' ', refstr) + config['INCOMPLETE_REFERENCE'] - def xmlnode_nodecontents(self, name, keepxml=0, attrs={}): + def xmlnode_nodecontents(self, name: str, keepxml: int = 0, attrs: Dict[str, str] = {}) -> str: """ - returns the text content of the first non-empty element in the DOM tree - which matches 'name' and has all the attributes and values passed - - :param name: - :param keepxml: - :param attrs: - :return: + returns the text content of the first non-empty element matching 'name' with given attributes + + :param name: the name of the element to search for + :param keepxml: flag to keep XML 
tags in the output + :param attrs: dictionary of attributes and values to match in the element + :return: the content of the element as a string """ contents = '' if not name: @@ -524,21 +544,18 @@ def xmlnode_nodecontents(self, name, keepxml=0, attrs={}): contents = self.re_remove_xml_tag.sub(' ', contents) try: contents = self.unicode.ent2asc(contents) - except Exception as e: + except: contents = self.unicode.cleanall(contents) return contents.strip() - def xmlnode_nodescontents(self, name, keepxml=0, attrs={}): + def xmlnode_nodescontents(self, name: str, keepxml: int = 0, attrs: Dict[str, str] = {}) -> List[str]: """ - returns an array of plain text strings representing the contents - of all the elements matching 'name' and with all the attributes - and values passed in the 'attrs' dictionary. - If no name is given a representation of the whole string is returned. - - :param name: - :param keepxml: - :param attrs: - :return: + returns a list of plain text strings representing the contents of all matching elements + + :param name: the name of the element to search for + :param keepxml: flag to keep XML tags in the output + :param attrs: dictionary of attributes and values to match in the element + :return: a list of text contents of matching elements """ if not name: return self.xmlnode_nodecontents(None) @@ -568,24 +585,14 @@ def xmlnode_nodescontents(self, name, keepxml=0, attrs={}): contents.append(self.unicode.cleanall(content.strip().replace('__amp__', '&'))) return contents - def xmlnode_textcontents(self, name, subels=[], attrs={}): + def xmlnode_textcontents(self, name: str, subels: List[str] = [], attrs: Dict[str, str] = {}) -> str: """ - - returns a plain text string containing just the contents from - the text node subelements. For instance, for the XML fragment: - This is a string foo bar, ok? - it will return: - This is a string , ok? - If a list of subelements is given, then the contents of - the named subelements are also returned. 
For instance, calling - self.xmlnode_textcontents('string', [ 'c' ]) - will return: - This is a string bar, ok? - - :param name: - :param subels: - :param attrs: - :return: + returns a plain text string containing contents from the text node subelements + + :param name: the name of the element to search for + :param subels: list of subelement names to include in the contents + :param attrs: dictionary of attributes and values to match in the element + :return: the combined text content of the element and subelements """ contents = '' required_attrs = set(attrs.items()) @@ -603,23 +610,20 @@ def xmlnode_textcontents(self, name, subels=[], attrs={}): for n in element.childNodes: if n.nodeType == n.TEXT_NODE: contents = contents + n.data - elif subels and \ - n.nodeType == n.ELEMENT_NODE and \ - n.nodeName in subels: + elif subels and n.nodeType == n.ELEMENT_NODE and n.nodeName in subels: for m in n.childNodes: if m.nodeType == m.TEXT_NODE: contents = contents + m.data return contents.strip() - def xmlnode_attribute(self, name, attrname): + def xmlnode_attribute(self, name: str, attrname: str) -> str: """ - returns the contents of an attribute of the given element name - as plain text. - - :param name: - :param attrname: - :return: + returns the contents of an attribute of the given element as plain text + + :param name: the name of the element + :param attrname: the name of the attribute + :return: the attribute value as a string """ if not name or not attrname: return '' @@ -637,14 +641,13 @@ def xmlnode_attribute(self, name, attrname): return contents.strip() - def xmlnode_attributes(self, name, attrname): + def xmlnode_attributes(self, name: str, attrname: str) -> Dict[str, str]: """ - returns the contents of an attribute of the given element name - as a dict. 
+ returns a dictionary of attribute values from all matching elements - :param name: - :param attrname: - :return: + :param name: the name of the element + :param attrname: the name of the attribute + :return: a dictionary of attribute values and their corresponding content """ if not name or not attrname: return {} @@ -659,16 +662,16 @@ def xmlnode_attributes(self, name, attrname): contents[attr_value] = tag_value return contents - def xmlnode_attribute_match_return(self, name, attr_match, attrname_return): + def xmlnode_attribute_match_return(self, name: str, attr_match: Dict[str, str], attrname_return: str) -> str: """ - returns the contents of a return attribute of the given element name - as plain text if a match attribute matches the other attribute. + returns the contents of a return attribute if a match attribute matches the other attribute - :param name: - :param attr_match: this is a dict - :param attrname_return: this is a single value - :return: + :param name: the name of the element + :param attr_match: dictionary of attribute names and values to match + :param attrname_return: the attribute name whose value is returned if the match is found + :return: the attribute value or an empty string if no match is found """ + if not name or not attr_match or not attrname_return: return '' element = self.reference_str.getElementsByTagName(name) @@ -680,17 +683,29 @@ def xmlnode_attribute_match_return(self, name, attr_match, attrname_return): return e.getAttribute(attrname_return) return '' - def strip_tags(self, refstr, change=' '): + def strip_tags(self, refstr: str, change: str = ' ') -> str: """ - strips all XML tags from input string, keeping text between them + strips all XML tags from the input string, keeping text between them + + :param refstr: the reference string to clean + :param change: the string to replace the tags with + :return: the reference string with XML tags removed """ return self.re_remove_xml_tag.sub(change, refstr).strip() - def 
extract_tag(self, refstr, tag, remove=1, keeptag=0, greedy=0, foldcase=0, attr=0, join=''): + def extract_tag(self, refstr: str, tag: str, remove: int = 1, keeptag: int = 0, greedy: int = 0, foldcase: int = 0, attr: int = 0, join: str = '') -> Tuple[str, str]: """ - extracts an XML tag from the input reference string - and returns the (potentially) modified input string - as well as the extracted tag + extracts an XML tag from the input reference string and returns the (modified) string and the extracted tag + + :param refstr: the reference string to process + :param tag: the XML tag to extract + :param remove: flag to remove the tag from the reference string + :param keeptag: flag to keep the tag in the returned result + :param greedy: flag for greedy matching + :param foldcase: flag to fold case in matching + :param attr: flag to consider attributes in matching + :param join: string to join with if the tag is removed + :return: the modified reference string and the extracted tag """ if not refstr: return '', None @@ -714,18 +729,21 @@ def extract_tag(self, refstr, tag, remove=1, keeptag=0, greedy=0, foldcase=0, at refstr = refstr[:match_start.start()] + join + refstr[match_start.end():] return refstr, substr - def dexml(self, refstr): + def dexml(self, refstr: str) -> str: """ returns a poor man's ASCII version of the input XML string + + :param refstr: the XML reference string to process + :return: the ASCII version of the string """ return self.unicode.ent2asc(self.strip_tags(refstr)).strip() - - def to_ascii(self, text): + def to_ascii(self, text: str) -> str: """ + converts the input text to ASCII encoding - :param text: - :return: + :param text: the text to convert + :return: the ASCII encoded text """ return self.unicode.ent2asc(self.unicode.u2asc(text.replace('amp', '&'))) @@ -737,78 +755,85 @@ class LatexReference(Reference): macro substitutions when stringifying the object. 
""" + # path to the LaTeX macro file containing AAS-specific macros macro_filename = os.path.dirname(__file__) + '/data_files/aas_latex.dat' + # list of macros read from the LaTeX macro file aas_macros = open(macro_filename).readlines() + # dictionary to store AAS macros and their definitions aas_macro_dict = {} + # populate the aas_macro_dict with macros and their corresponding definitions for line in aas_macros: line = line.strip() macro, means = line.split(None, 1) - # force macro to match at word boundary; this prevents incorrect - # translation of latex commands such as \natexlab{b} (which would - # be translated to Natureexlab{b} due to the \nat macro) - # AA 8/6/02 aas_macro_dict[macro] = means - - aas_macro_keys = aas_macro_dict.keys() - aas_macro_keys = sorted(aas_macro_keys, key=len) + # sorted keys of the AAS macros for regex matching + aas_macro_keys = sorted(aas_macro_dict.keys(), key=len) + # regex pattern to match AAS macros in a string re_aas_macro = re.compile(r'\b|'.join(map(re.escape, aas_macro_keys)) + r'\b') - + # dictionary of LaTeX macros and their replacements latex_macro_dict = {'newline': ' ', 'newblock': ' ', 'etal': 'et al.', 'i': 'i', '-': '-'} + # to match LaTeX macros re_latex_macro = re.compile(r'\\(?P%s)' % '|'.join(latex_macro_dict.keys())) - + + # list 1 of substitutions for cleaning up LaTeX references reference_cleanup_1 = [ (re.compile(r'\\[\w\W]{1}\{([A-Za-z]{1})\}'), r'\1'), - (re.compile(r'\\&'), '&'), - (re.compile(r'&'), '&'), - (re.compile(r'\samp\s'), '&'), - (re.compile(r'(:?\'\')|(:?\`\`)'), ''), # quotes - (re.compile(r'\\[\^\"\'\`\.\~]'), ''), # accent - (re.compile(r'\\[vH]\s?'), ''), # euaccent - (re.compile(r'\\([clL])'), ''), # lslash - (re.compile(r'\\[\ ]'), ' '), # space - (re.compile(r'\{\\(it|bf|em) (.*?)\}'), r'\2'), # style - (re.compile(r'\\(textbf|textit|emph)\{(.*?)\}'), r'\2'), # font - (re.compile(r'\\(textbf|textit|emph|sl|bf) '), ' '), # more fonts - (re.compile(r'%'), ' '), # tab - 
(re.compile(r'[\{\}]'), ''), # curly brakets + (re.compile(r'\\&'), '&'), # and + (re.compile(r'&'), '&'), # and + (re.compile(r'\samp\s'), '&'), # and + (re.compile(r'(:?\'\')|(:?\`\`)'), ''), # quotes + (re.compile(r'\\[\^\"\'\`\.\~]'), ''), # accent + (re.compile(r'\\[vH]\s?'), ''), # euaccent + (re.compile(r'\\([clL])'), ''), # lslash + (re.compile(r'\\[\ ]'), ' '), # space + (re.compile(r'\{\\(it|bf|em) (.*?)\}'), r'\2'), # style + (re.compile(r'\\(textbf|textit|emph)\{(.*?)\}'), r'\2'), # font + (re.compile(r'\\(textbf|textit|emph|sl|bf) '), ' '), # more fonts + (re.compile(r'%'), ' '), # tab + (re.compile(r'[\{\}]'), ''), # curly brakets ] - + + # list 2 of substitutions for cleaning up LaTeX references reference_cleanup_2 = [ - (re.compile(r'\s\s+'), ' '), # multi-space + (re.compile(r'\s\s+'), ' '), # multi-space ] def __init__(self, reference_str, unicode=None): """ + initialize the LatexReference object - :param reference_str: - :param unicode: + :param reference_str: the reference string to initialize the object with + :param unicode: optional unicode parameter (default is None) """ Reference.__init__(self, reference_str, unicode) def parse(self): """ + parse the reference string :return: """ self.parsed = True - def __str__(self): + def __str__(self) -> str: """ + return the string representation of the reference object - :return: + :return: the cleaned reference string """ reference_str = Reference.__str__(self) return self.cleanup(reference_str).strip() - def cleanup(self, reference): + def cleanup(self, reference: str) -> str: """ + clean up the given reference string by applying various regex substitutions - :param reference: - :return: + :param reference: the reference string to clean up + :return: the cleaned reference string """ reference = self.re_aas_macro.sub(lambda match: self.aas_macro_dict[match.group(0)], reference) for (compiled_re, replace_str) in self.reference_cleanup_1: diff --git a/adsrefpipe/refparsers/toREFs.py 
b/adsrefpipe/refparsers/toREFs.py index e6aabb5..eea9f99 100644 --- a/adsrefpipe/refparsers/toREFs.py +++ b/adsrefpipe/refparsers/toREFs.py @@ -2,59 +2,76 @@ import os import regex as re -import string, operator from abc import abstractmethod +from typing import List, Dict, Tuple from adsputils import setup_logging, load_config from adsrefpipe.refparsers.reference import unicode_handler -logger = setup_logging('toREFs') +logger = setup_logging('refparsers') config = {} config.update(load_config()) class toREFs(): + """ + base class for reference extraction and processing + """ + + # to match ADS bibcode in XML format re_format_xml = re.compile(r'(?P.*?)\s*') - re_format_text = re.compile(r'\\adsbibcode\{(?P.*?)\}\s*') + # to match ADS bibcode in LaTeX format + re_format_tex = re.compile(r'\\adsbibcode\{(?P.*?)\}\s*') + # to match ADS bibcode in custom tag format re_format_tag = re.compile(r'(((^|\n)\%R\s+)|(\sbibcode="))(?P\S{18,19})[\s+"]') - format_pattern = {'xml': re_format_xml, 'tex': re_format_text, 'tag': re_format_tag} + + # dictionary mapping reference formats to their corresponding regular expressions + format_pattern = {'xml': re_format_xml, 'tex': re_format_tex, 'tag': re_format_tag} + # list of supported reference formats reference_format = format_pattern.keys() + # template strings for formatting references in different formats format_identifier_pattern = {'xml': '%s\n%s', 'tex': '\\adsbibcode{%s}\n%s', 'tag': '%%R %s\n%s'} + + # header patterns for different reference formats format_header_pattern = {'xml': '''''', 'tex': '', 'tag': ''} + # to match and validate Bibcodes re_bibcode = re.compile(r"^(bibcode)?.*([12][089]\d\d[A-Za-z\.0-9&+]{14}[A-Z\.])$", re.IGNORECASE) - arxiv_category = ['acc-phys', 'adap-org', 'alg-geom', 'ao-sci', 'astro-ph', 'atom-ph', 'bayes-an', 'chao-dyn', 'chem-ph', - 'cmp-lg', 'comp-gas', 'cond-mat', 'cs', 'dg-ga', 'funct-an', 'gr-qc', 'hep-ex', 'hep-lat', 'hep-ph', - 'hep-th', 'math', 'math-ph', 'mtrl-th', 'nlin', 'nucl-ex', 
'nucl-th', 'patt-sol', 'physics', 'plasm-ph', - 'q-alg', 'q-bio', 'quant-ph', 'solv-int', 'supr-con'] - re_arxiv_old_pattern = re.compile( - r'\b(?:arXiv\W*)?(' + "|".join(arxiv_category) + r')(\.[A-Z]{2})?/(\d{7})(:?v\d+)?\b', re.IGNORECASE) - re_arxiv_new_pattern = re.compile(r'\b(?:(?:arXiv\s*\W?\s*)|(?:(?:' + "|".join( - arxiv_category) + r')\s*[:/]?\s*)|(?:http://.*?/abs/)|(?:))(\d{4})\.(\d{4,5})(?:v\d+)?\b', re.IGNORECASE) + # list of arXiv categories used in the arXiv identifier format + arxiv_category = ['acc-phys', 'adap-org', 'alg-geom', 'ao-sci', 'astro-ph', 'atom-ph', 'bayes-an', 'chao-dyn', + 'chem-ph', 'cmp-lg', 'comp-gas', 'cond-mat', 'cs', 'dg-ga', 'funct-an', 'gr-qc', 'hep-ex', + 'hep-lat', 'hep-ph', 'hep-th', 'math', 'math-ph', 'mtrl-th', 'nlin', 'nucl-ex', 'nucl-th', + 'patt-sol', 'physics', 'plasm-ph', 'q-alg', 'q-bio', 'quant-ph', 'solv-int', 'supr-con'] + + # to match the old arXiv identifier format + re_arxiv_old_pattern = re.compile(r'\b(?:arXiv\W*)?(' + "|".join(arxiv_category) + r')(\.[A-Z]{2})?/(\d{7})(:?v\d+)?\b', re.IGNORECASE) + # to match the new arXiv identifier format + re_arxiv_new_pattern = re.compile(r'\b(?:(?:arXiv\s*\W?\s*)|(?:(?:' + "|".join(arxiv_category) + r')\s*[:/]?\s*)|(?:http://.*?/abs/)|(?:))(\d{4})\.(\d{4,5})(?:v\d+)?\b', re.IGNORECASE) def __init__(self): """ - + initializes an empty list to store raw references """ self.raw_references = [] - def is_bibcode(self, text): + def is_bibcode(self, text: str) -> bool: """ verify that text is a bibcode - :param text: - :return: + :param text: input text to be checked + :return: true if the text matches a bibcode pattern, false otherwise """ return self.re_bibcode.match(text) - def get_bibcodes(self): + def get_bibcodes(self) -> List: """ + extract bibcodes from stored raw references - :return: + :return: list of bibcodes extracted from raw references """ bibcodes = [] for block in self.raw_references: @@ -63,20 +80,25 @@ def get_bibcodes(self): @abstractmethod def 
prcess_and_dispatch(self): + """ + abstract method for processing and dispatching references + """ return def dispatch(self): """ this function just calls the parser - :return: + + :return: result of process_and_dispatch method """ return self.process_and_dispatch() - def has_arXiv_id(self, reference): + def has_arXiv_id(self, reference: str) -> bool: """ + check if a reference contains an arXiv identifier - :param reference: - :return: + :param reference: reference string to be checked + :return: true if an arXiv ID is found, false otherwise """ if self.re_arxiv_old_pattern.search(reference): return True @@ -84,14 +106,13 @@ def has_arXiv_id(self, reference): return True return False - def any_item_num(self, item_nums, idx): + def any_item_num(self, item_nums: List, idx: int) -> Dict: """ - when references are reprocess, the original item_num is used - if references are being processed for the first time, there is no item_num + retrieve the original item number of a reference if available - :param item_nums: - :param idx: - :return: + :param item_nums: list of item numbers + :param idx: index of the item number to retrieve + :return: dictionary containing the item number if available """ try: item_num = item_nums[idx] @@ -100,48 +121,59 @@ def any_item_num(self, item_nums, idx): pass return {} - def merge(self, dict1, dict2): + def merge(self, dict1: Dict, dict2: Dict) -> Dict: """ combine dict2 into dict1 and return dict1 - :param dict1: - :param dict2: - :return: + :param dict1: primary dictionary + :param dict2: secondary dictionary to be merged into dict1 + :return: updated dict1 after merging with dict2 """ dict1.update(dict2) return dict1 class TXTtoREFs(toREFs): + """ + class for processing references in TXT format + """ + # to match the "http://stacks.iop.org" URL pattern re_stacks_iop_org = re.compile('http://stacks.iop.org') + # list of tuples containing regular expressions for cleaning up unwanted elements in reference blocks block_cleanup = [ - 
(re.compile(r'°'), ' '), - (re.compile(r'�'), ' '), - (re.compile(r'�'), ' '), - (re.compile(r''), ' '), - (re.compile(r''), ''), + (re.compile(r'°'), ' '), # replace degree symbol with a space + (re.compile(r'�'), ' '), # replace the degree character with a space + (re.compile(r'�'), ' '), # replace the character '�' with a space + (re.compile(r''), ' '), # remove HTML anchor tags + (re.compile(r''), ''), # remove closing HTML anchor tags ] + # list of regular expressions for cleaning up specific parts of a reference, like URLs or LaTeX commands. reference_cleanup_1 = [ - (re.compile('http://dx.doi.org/'), 'doi:'), - (re.compile(r'\\emph\{([^\}]*)\}'), r'\1'), - (re.compile(r'[\{\}]'), ''), - (re.compile(r'\\(it|bf|em)', flags=re.IGNORECASE), ''), - (re.compile(r'\\(textit|textbf)'), ''), - (re.compile(r'&'), r'&'), - (re.compile(r' '), ' '), - (re.compile('(�)+'), ''), - (re.compile(r'', flags=re.IGNORECASE), ''), # remove SUB/SUP tags - (re.compile(r'\\ibidrule'), '--- '), + (re.compile('http://dx.doi.org/'), 'doi:'), # replace DOI URL with "doi:" + (re.compile(r'\\emph\{([^\}]*)\}'), r'\1'), # remove LaTeX emphasis tags + (re.compile(r'[\{\}]'), ''), # remove curly braces + (re.compile(r'\\(it|bf|em)', flags=re.IGNORECASE), ''), # remove LaTeX font style commands + (re.compile(r'\\(textit|textbf)'), ''), # remove LaTeX font style commands + (re.compile(r'&'), r'&'), # replace "&" with "&" + (re.compile(r' '), ' '), # replace non-breaking space with a regular space + (re.compile('(�)+'), ''), # remove invalid characters + (re.compile(r'', flags=re.IGNORECASE), ''), # remove LaTeX subscript and superscript tags + (re.compile(r'\\ibidrule'), '--- '), # replace LaTeX "\ibidrule" with a dash ] + + # list of regular expressions for additional cleanup tasks after the first round of cleaning reference_cleanup_2 = [ - (re.compile(r'–'), '-'), - (re.compile(r'�'), '-') + (re.compile(r'–'), '-'), # replace the hex code for an en dash with a regular dash + 
(re.compile(r'�'), '-'), # replace character '�' with a dash ] + # regular expression to match multiple spaces and replace them with a single space. re_remove_spaces = re.compile(r'\s\s+') + + # regular expression to match enumeration patterns, like numbering or item list formats re_enumeration = re.compile(r'^(\s*\d{1,3}\.?|' r'\s*\(\d{1,3}\)|' r'\s*\[\d{1,3}\]|' @@ -149,8 +181,11 @@ class TXTtoREFs(toREFs): r'[\s\t]+)' r'([\sA-Zdv]+|' r'[&h]+[a-z]+)') + + # regular expression to match multi-references, where multiple references are combined in one line re_multi_references = re.compile(r'(\d+\W*;\s*\(?[12]\d\d\d[a-z]*\)?\W+)(?=.*[\w\d]+)') + # pattern for matching multiple enumerated references, includes various types of numbering formats multi_enumerated_references_pattern = r'(' \ r'(?:^|[;,]+\s+)\(\d{1,3}\)\s+|' \ r'(?:^|[.,]+\s+)\d{1,3}\)\s+|' \ @@ -165,36 +200,44 @@ class TXTtoREFs(toREFs): r'(?:^|;\s*)\d{1,3}\-\s+|' \ r'(?:^|;\s*)\d{1,3}\-\s*' \ r')' - # this will decide if there are multiple enumeration on the same line, needs to lookahead and determine the count if they include year + + # for identifying multi-references that contain a year in their structure, with a lookahead re_multi_enumerated_references_w_year_lookahead = re.compile(r'%s%s' % (multi_enumerated_references_pattern, r'(?=\s*[A-Z]+[\w\W]{2,}\s+[A-Za-z]+)(?=.*[12]\d\d\d[a-z]*\b)')) - # this is used to split multiple enumeration, this should not include the year, since if there is a reference missing a year, it needs to be split and - # later it is checked if it is a valid reference including if it has the year + # for splitting multi-references that don't include a year re_multi_enumerated_references = re.compile(multi_enumerated_references_pattern) - + # placeholder for matching author lists with a placeholder pattern (e.g., "--" or "__") re_author_list_placeholder = re.compile(r'[-_]{2,}\.?') - re_prior_year = re.compile(r'(.*)(?=[\s\(]+[12]+\d\d\d[a-z]*)') + # to match prior a year in a 
reference following author list, used to extract the author list from previous references if necessary + re_prior_year = re.compile(r'((\S+\s+){2,})(?=[\s\(\[]+[12]+[09]+\d\d(\S+\s+){2,})') + # to match 4-digit years, possibly with lowercase letters (e.g., 2020a) re_year = re.compile(r'([12]+\d\d\d[a-z]*)') + # to match DOI patterns in a reference re_doi = re.compile(r'doi:(.*?)', re.IGNORECASE) + # to match the bibcode format in a reference, ensuring it matches a 19-character format re_bibcode = re.compile(r'(^\d{4}[\w\.&+]{14}[A-Z\.]{1})') - re_a_reference = re.compile(r'^(\s*[A-Z][a-z]+,?\s+[A-Z]+\.?|[A-Z]+\.?\s+[A-Z][a-z]+,)+\s+[^\d]*.*?(\d+)\W+(\d+)') + # to match the author part of a reference, often used in citation styles with year and volume/page info + re_a_reference = re.compile(r'^(\s*[A-Z][a-z]+,?\s+[A-Z]+\.,?)(?:\s+and\s+[A-Z][a-z]+,?\s+[A-Z]+\.?)*\s+[^\d]*.*?(\d+)\W+(\d+)') - def __init__(self, filename, buffer, parsername, cleanup=None, encoding='UTF-8'): + def __init__(self, filename: str, buffer: Dict, parsername: str, cleanup: List = None, encoding: str = 'UTF-8'): """ + initializes the TXTtoREFs object and processes the reference file - :param filename: - :param buffer: - :param parsername: - :param cleanup: - :param encoding: + :param filename: path to the TXT file + :param buffer: dictionary containing buffer data + :param parsername: name of the parser + :param cleanup: optional list of regex patterns for cleanup + :param encoding: character encoding for the file """ toREFs.__init__(self) + self.raw_references = [] if buffer: self.filename = buffer['source_filename'] self.parsername = buffer['parser_name'] - block_references, item_nums = [[b['refraw'] for b in buffer['references']], [b['item_num'] for b in buffer['references']]] - self.raw_references.append({'bibcode': buffer['source_bibcode'], 'block_references': block_references, 'item_nums':item_nums}) + for buf in buffer['block_references']: + block_references, item_nums = [[ref['refraw'] 
for ref in buf['references']], [ref['item_num'] for ref in buf['references']]] + self.raw_references.append({'bibcode': buf['source_bibcode'], 'block_references': block_references, 'item_nums': item_nums}) else: self.filename = filename self.parsername = parsername @@ -210,15 +253,16 @@ def __init__(self, filename, buffer, parsername, cleanup=None, encoding='UTF-8') if cleanup: for (compiled_re, replace_str) in cleanup: - references = compiled_re.sub(replace_str, references) + references = [compiled_re.sub(replace_str, ref) for ref in references] self.raw_references.append({'bibcode': bibcode, 'block_references': references}) - def cleanup(self, reference): + def cleanup(self, reference: str) -> str: """ + clean up the reference string by applying various replacements - :param reference: - :return: + :param reference: the reference string to be cleaned up + :return: cleaned reference string """ if 'stacks.iop.org' in reference: reference = self.re_stacks_iop_org.sub('doi:10.1088', reference).replace('i=', '').replace('a=', '') @@ -229,16 +273,17 @@ def cleanup(self, reference): reference = compiled_re.sub(replace_str, reference) return reference - def process_a_reference(self, is_enumerated, line, next_line, reference, prev_reference, block_references): + def process_a_reference(self, is_enumerated: bool, line: str, next_line: str, reference: str, prev_reference: str, block_references: List) -> Tuple: """ + process a single reference, splitting it if necessary - :param is_enumerated: True if the entire reference list is enumerated - :param line: - :param next_line: - :param reference: - :param prev_reference: - :param block_references: - :return: + :param is_enumerated: true if references are enumerated + :param line: current line of the reference + :param next_line: next line in the reference block + :param reference: current reference being processed + :param prev_reference: previous reference for inheritance checks + :param block_references: list to store 
processed references + :return: updated reference, previous reference, and block of references """ # ignore anything after % line = line.split('%')[0].replace('\n', '') @@ -263,12 +308,13 @@ def process_a_reference(self, is_enumerated, line, next_line, reference, prev_re return reference, prev_reference, block_references - def process_enumeration(self, line, block_references): + def process_enumeration(self, line: str, block_references: List) -> List: """ + process enumerated references - :param line: - :param block_references: - :return: + :param line: line containing the references + :param block_references: list to store processed references + :return: list of processed references """ enumerated_references = [ref.strip() for ref in self.re_multi_enumerated_references.split(line) if ref] if enumerated_references: @@ -280,14 +326,14 @@ def process_enumeration(self, line, block_references): prev_reference = enumerated_reference return block_references - def get_references(self, filename, encoding="ISO-8859-1"): + def get_references(self, filename: str, encoding: str = "ISO-8859-1") -> List: """ - read reference file for this text format + read reference file and extract references - :param filename: - :return: + :param filename: path to the TXT file + :param encoding: character encoding for the file + :return: list of references extracted from the file """ - try: references = [] @@ -329,41 +375,38 @@ def get_references(self, filename, encoding="ISO-8859-1"): references.append([bibcode, block_references]) if len(references) > 0: - logger.debug("Read source file %s, and got %d references to resolve for bibcode %s." % ( - filename, len(references), bibcode)) + logger.debug("Read source file %s, and got %d references to resolve for bibcode %s." % (filename, len(references), bibcode)) elif len(references) == 0: logger.error('No references found in reference file %s.' 
% (filename)) return references except Exception as e: - logger.error('Exception: %s' % (str(e))) + logger.error(f'Exception: {str(e)}') return [] - def fix_inheritance(self, cur_refstr, prev_refstr): + def fix_inheritance(self, cur_refstr: str, prev_refstr: str) -> str: """ - if author list is the same as the reference above it, a dash is inserted - get the list of authors from the previous reference and add it to the current one + checks if the author list in the current reference is the same as the one in the previous reference, + and if so, appends the previous authors to the current reference. A dash is inserted to separate the authors - :param cur_refstr: - :param prev_refstr: - :return: + :param cur_refstr: The current reference string that may need author inheritance + :param prev_refstr: The previous reference string from which authors might be inherited + :return: The modified current reference string with authors inherited from the previous reference, if applicable """ match = self.re_author_list_placeholder.match(cur_refstr) if match and prev_refstr and len(prev_refstr) > 1: - try: - # find the year and return everything that came before it - prev_authors = self.re_prior_year.match(prev_refstr) - if prev_authors: - cur_refstr = prev_authors.group().strip() + " " + cur_refstr[match.end():].strip() - except TypeError: - pass + # find the year and return everything that came before it + prev_authors = self.re_prior_year.match(prev_refstr) + if prev_authors: + cur_refstr = prev_authors.group().strip() + " " + cur_refstr[match.end():].strip() return cur_refstr - def is_reference(self, reference): + def is_reference(self, reference: str) -> bool: """ - a reference has either year or doi or have at least author/volume/page + determines if a given reference string is a valid reference by checking for a year, DOI, or + sufficient author/volume/page information - :param reference: - :return: + :param reference: The reference string to be validated + :return: True 
if the reference is valid, otherwise False """ if self.re_year.search(reference) or self.re_doi.search(reference) or self.has_arXiv_id(reference): return True @@ -375,16 +418,20 @@ def is_reference(self, reference): class XMLtoREFs(toREFs): - def __init__(self, filename, buffer, parsername, tag=None, cleanup=None, encoding=None): + """ + class for processing references in XML format + """ + + def __init__(self, filename: str, buffer: Dict, parsername: str, tag: str = None, cleanup: List = None, encoding: str = None): """ + initializes the XMLtoREFs object and processes the XML reference file - :param filename: - :param buffer: - :param parsername: - :param tag: - :param cleanup: - :param encoding: - :param method_identifiers: + :param filename: path to the XML file + :param buffer: dictionary containing buffer data + :param parsername: name of the parser + :param tag: optional XML tag for processing + :param cleanup: optional list of regex patterns for cleanup + :param encoding: optional character encoding for the file """ toREFs.__init__(self) @@ -392,8 +439,9 @@ def __init__(self, filename, buffer, parsername, tag=None, cleanup=None, encodin self.filename = buffer['source_filename'] self.parsername = buffer['parser_name'] - block_references, item_nums = [[b['refraw'] for b in buffer['references']], [b['item_num'] for b in buffer['references']]] - self.raw_references.append({'bibcode': buffer['source_bibcode'], 'block_references': block_references, 'item_nums':item_nums}) + for buf in buffer['block_references']: + block_references, item_nums = [[ref['refraw'] for ref in buf['references']], [ref['item_num'] for ref in buf['references']]] + self.raw_references.append({'bibcode': buf['source_bibcode'], 'block_references': block_references, 'item_nums': item_nums}) else: self.filename = filename self.parsername = parsername @@ -401,48 +449,44 @@ def __init__(self, filename, buffer, parsername, tag=None, cleanup=None, encodin pairs = 
self.get_references(filename=filename) for pair in pairs: bibcode = pair[0] - buffer = pair[1] + references = pair[1] if len(bibcode) != 19: - logger.error( - "Error in getting a bibcode along with the reference strings from reference file %s. Returned %s for bibcode. Skipping!" % (filename, bibcode)) + logger.error("Error in getting a bibcode along with the reference strings from reference file %s. Returned %s for bibcode. Skipping!" % (filename, bibcode)) continue if cleanup: for (compiled_re, replace_str) in cleanup: - buffer = compiled_re.sub(replace_str, buffer) + references = compiled_re.sub(replace_str, references) - block_references = self.get_xml_block(buffer, tag, encoding) + block_references = self.get_xml_block(references, tag, encoding) self.raw_references.append({'bibcode': bibcode, 'block_references': block_references}) - def get_references(self, filename, encoding="utf8"): + def get_references(self, filename: str, encoding: str = "utf8") -> List: """ - returns an array of bibcode and reference text blobs - parsed from the input file + extract references from an XML file - :param filename: - :param buffer: - :param encoding: - :return: + :param filename: path to the XML file + :param encoding: character encoding for the file + :return: list of references extracted from the file """ - if filename: - try: - buffer = open(filename, encoding=encoding, errors='ignore').read() - except Exception as error: - logger.error("Unable to open file %s. Exception %s." % (filename, error)) + try: + buffer = open(filename, encoding=encoding, errors='ignore').read() + if not buffer: + logger.error(f"File {filename} is empty.") return [] - if not buffer: - logger.error("File %s is empty." % filename) - return [] - return self.get_reference_blob(buffer, self.detect_ref_format(buffer)) + return self.get_reference_blob(buffer, self.detect_ref_format(buffer)) + except Exception as e: + logger.error(f"Unable to open file {filename}. 
Exception {str(e)}.") + return [] - def detect_ref_format(self, text): + def detect_ref_format(self, text: str) -> str: """ - attempts to detect reference format used in text + detect the reference format used in the XML text - :param text: - :return: + :param text: XML text to detect the format from + :return: reference format (xml, tex, tag) """ for format in self.reference_format: pattern = self.format_pattern[format] @@ -450,14 +494,13 @@ def detect_ref_format(self, text): return format return None - def get_reference_blob(self, buffer, format): + def get_reference_blob(self, buffer: str, format: str) -> List: """ - returns an array of bibcode and reference text blobs - extracted from input buffer + extract references from a buffer based on the detected format - :param buffer: - :param format: - :return: + :param buffer: buffer containing the XML content + :param format: detected reference format + :return: list of references extracted from the buffer """ result = [] @@ -478,17 +521,15 @@ def get_reference_blob(self, buffer, format): return result - def get_xml_block(self, buffer, tag, encoding=None, strip=0): + def get_xml_block(self, buffer: str, tag: str, encoding: str = None, strip: int = 0) -> List: """ - returns XML fragments obtained by splitting the input buffer on - we do this with regexps rather than a real XML parser for efficiency - (and because the XML may be just fragments) + extract XML fragments from the buffer based on a specified tag - :param buffer: - :param tag: - :param encoding: - :param strip: - :return: + :param buffer: buffer containing the XML content + :param tag: XML tag to extract the content from + :param encoding: optional encoding for the XML content + :param strip: flag to indicate whether to strip the XML tags + :return: list of extracted XML fragments """ start_tag = '<' + tag + r'\s*[\s>]' end_tag = '' @@ -498,18 +539,16 @@ def get_xml_block(self, buffer, tag, encoding=None, strip=0): header = 
self.format_header_pattern['xml'] % encoding return list(map(lambda a: header + a, self.cut_apart(buffer, start_tag, end_tag, strip))) - def cut_apart(self, buffer, start_tag, end_tag, strip): + def cut_apart(self, buffer: str, start_tag: str, end_tag: str, strip: int) -> List: """ - this is the main function that uses regular expressions to break - up a reference section into individual references; - some post-processing of the output may be necessary to join/split - lines depending on what the source is + this function uses regular expressions to break up a reference section into individual references + some post-processing of the output may be necessary to join or split lines depending on the source - :param buffer: - :param start_tag: - :param end_tag: - :param strip: - :return: + :param buffer: containing the reference section to be processed + :param start_tag: regular expression for the start tag of the reference + :param end_tag: regular expression for the end tag of the reference + :param strip: if set to 1, the tag will be stripped from the reference, otherwise, it will remain + :return: list of references as strings, extracted from the input buffer """ references = [] @@ -538,14 +577,15 @@ def cut_apart(self, buffer, start_tag, end_tag, strip): return references - def strip_tag(self, strip, match, side): + def strip_tag(self, strip: int, match, side: str) -> int: """ - if strip is set to 1, then the tag defined in regular expression is removed + this method determines whether to remove the matched tag from the reference string, + based on the 'side' parameter - :param strip: - :param match: - :param side: - :return: + :param strip: if set to 1, the tag defined in regular expression is removed; otherwise, it is not + :param match: the match object from the regular expression search + :param side: the side of the tag to consider ('Left' or 'Right') + :return: the position in the string where the tag should be stripped, or where the reference should be 
split """ if side == 'Left': if strip: @@ -556,11 +596,20 @@ def strip_tag(self, strip, match, side): return match.start() return match.end() - def extract_tag(self, refstr, tag, remove=1, keeptag=0, greedy=0, foldcase=0, attr=0, join=''): + def extract_tag(self, refstr: str, tag: str, remove: int = 1, keeptag: int = 0, greedy: int = 0, foldcase: int = 0, attr: int = 0, join: str = '') -> Tuple: """ - extracts an XML tag from the input reference string - and returns the (potentially) modified input string + extracts an XML tag from the input reference string and returns the (potentially) modified input string, as well as the extracted tag + + :param refstr: input reference string containing XML tags + :param tag: XML tag to extract + :param remove: if set to 1, removes the matched tag; otherwise, leaves it in the string + :param keeptag: if set to 1, keeps the tag in the extracted reference; otherwise, it is removed + :param greedy: if set to 1, uses greedy matching for the regular expression; otherwise, uses non-greedy matching + :param foldcase: if set to 1, makes the regular expression case-insensitive; otherwise, it is case-sensitive + :param attr: if set to 1, matches attributes within the tag; otherwise, it does not + :param join: string to join the parts of the reference if they are split; defaults to an empty string + :return: modified reference string (after tag extraction) and the extracted tag (if found) """ if not refstr: return '', None @@ -586,14 +635,22 @@ def extract_tag(self, refstr, tag, remove=1, keeptag=0, greedy=0, foldcase=0, at class OCRtoREFs(toREFs): + """ + class for processing references in OCR format + """ + # to match a year with optional letters following it re_year = re.compile(r'([l12]+\d\d\d[a-z]*)') + # to match a DOI (Digital Object Identifier) re_doi = re.compile(r'doi:(.*?)', re.IGNORECASE) + # to match a reference citation with author names and year re_a_reference = 
re.compile(r'([A-Z][a-z]+,?\s+[A-Z]+\.?|[A-Z]+\.?\s+[A-Z][a-z]+,)+[^\d]*.*?(\d+)\W+(\d+)') - + # to match author list placeholders, such as dashes or asterisks re_author_list_placeholder = re.compile(r'\s*([-_]{2,}\.?|[-_*]{1,}\s*:)') - re_prior_year = re.compile(r'(.*)(?=[\s\(]*[l12]+\d\d\d[a-z]*)') + # to match a prior year in a reference string for author inheritance + re_prior_year = re.compile(r'((\S+\s+){2,})(?=[\s\(]*[l12]+[o09]+\d\d(\S+\s+){2,})') + # patterns and their replacements for cleaning up reference strings re_cleanup = [ (re.compile(r'\[E'), '&'), (re.compile(r'\[H'), '-'), @@ -608,28 +665,37 @@ class OCRtoREFs(toREFs): (re.compile("Co ?11"), "Coll"), ] + # to match a bibcode in the format used in ADS references re_bibcode = re.compile(r'(^\d{4}[\w\.&+]{14}[A-Z\.]{1})') + # all punctuation characters for enumeration matching punctuations = r'!\"#\$%&\'\(\)\*\+,-\./:;<=>\?@\[\]\^_`{\|}~\\' + # to match enumeration with optional punctuations and numbers enumeration = r'^(?:\s{0,1}|\x0f)[%s]*\d{1,3}[a-z]{0,1}[%s\s]+' % (punctuations, punctuations) + # to lookahead for references with uppercase letters or four-digit years enumeration_lookahead = r'(?=.*[A-Z]{1}[\.\s]+)(?=.*[12]\d\d\d[a-z]*)?' 
+ + # to match the start of a reference with enumeration and lookahead re_reference_start = re.compile(r'(%s)%s' % (enumeration, enumeration_lookahead)) + # to remove enumeration from reference lines re_remove_enumeration = re.compile(r'%s%s' % (enumeration, enumeration_lookahead)) + # to match multi-enumerated references, such as when multiple references are in one line re_multi_enumerated_references = re.compile(r'((?:^|[.;\s]+)[\(\[~-]*\d{1,3}[\)\]\.]+\s*)' r'(?=.*[A-Z\d]+[\w\W]{2,}\s+|[A-Z]+[a-z\.\'~]+)(?=.*[12]\d\d\d[a-z]*\b)') + # to match the continuation of a reference in the next line re_reference_continue = re.compile(r'^(\s{2,}|\t)(.*)$') - + # to match the first line of references (e.g., 'References Cited' or similar variations) re_first_line = re.compile(r'(\s*References cited[:]|\s*Reference[s:.-\s]*|\s*Ref[\w\s~]+es)', re.IGNORECASE) - def __init__(self, filename, buffer, parsername, cleanup=None, encoding='UTF-8'): + def __init__(self, filename: str, buffer: Dict, parsername: str, cleanup: List = None, encoding: str = 'UTF-8'): """ + initializes the OCRtoREFs object and processes the OCR reference file - :param filename: - :param buffer: - :param parsername: - :param cleanup: - :param encoding: - :param method_identifiers: + :param filename: path to the OCR file + :param buffer: dictionary containing buffer data + :param parsername: name of the parser + :param cleanup: optional list of regex patterns for cleanup + :param encoding: character encoding for the file """ toREFs.__init__(self) @@ -640,8 +706,9 @@ def __init__(self, filename, buffer, parsername, cleanup=None, encoding='UTF-8') self.filename = buffer['source_filename'] self.parsername = buffer['parser_name'] - block_references, item_nums = [[b['refraw'] for b in buffer['references']], [b['item_num'] for b in buffer['references']]] - self.raw_references.append({'bibcode': buffer['source_bibcode'], 'block_references': block_references, 'item_nums':item_nums}) + for buf in 
buffer['block_references']: + block_references, item_nums = [[ref['refraw'] for ref in buf['references']], [ref['item_num'] for ref in buf['references']]] + self.raw_references.append({'bibcode': buf['source_bibcode'], 'block_references': block_references, 'item_nums':item_nums}) else: self.filename = filename self.parsername = parsername @@ -657,19 +724,18 @@ def __init__(self, filename, buffer, parsername, cleanup=None, encoding='UTF-8') if cleanup: for (compiled_re, replace_str) in cleanup: - for i in range(len(references)): - references[i] = compiled_re.sub(replace_str, references[i]) + references = [compiled_re.sub(replace_str, ref) for ref in references] self.raw_references.append({'bibcode': bibcode, 'block_references': references}) - def verify_accept(self, block_references, current_reference, prev_reference): + def verify_accept(self, block_references: List, current_reference: str, prev_reference: str) -> Tuple: """ - verify that this is a complete reference, fix author inheritance if need to, and append it to the structure + verify that a reference is complete, and handle author inheritance if needed - :param block_references: - :param current_reference: - :param prev_reference: - :return: + :param block_references: list of references to be updated + :param current_reference: current reference being processed + :param prev_reference: previous reference for inheritance checks + :return: updated block references, current reference, and previous reference """ if self.is_reference(current_reference): reference = self.fix_inheritance(current_reference, prev_reference) @@ -678,14 +744,12 @@ def verify_accept(self, block_references, current_reference, prev_reference): current_reference = '' return block_references, current_reference, prev_reference - def merge_split_process(self, reader): + def merge_split_process(self, reader: List) -> List: """ - some of the reference files contain references that are split in multiple line, and where - one reference finishes 
another starts. For these it is best to merge all lines, and then - split on the enumeration, and process them. + merge and process references that are split across multiple lines - :param reader: - :return: + :param reader: list of lines in the reference file + :return: processed block references """ buffer = [line.strip().rstrip('-') for line in reader] buffer = ' '.join(buffer).replace('\n', ' ').replace('\r', ' ') @@ -697,20 +761,19 @@ def merge_split_process(self, reader): block_references, _, prev_reference = self.verify_accept(block_references, line, prev_reference) return block_references - def process_with_header_line(self, reader): + def process_with_header_line(self, reader: List) -> List: """ - process reference files that have a header `References` or `References Cited`. + process reference files with a header line (e.g., 'References') - :param lines: - :return: + :param reader: list of lines in the reference file + :return: processed block references """ block_references = [] - prev_reference = '' # remove the section header, if any for i in range(len(reader)): if not reader[i].strip(): continue - # first non empty line, does it start with References/References Cited? + # first non-empty line, does it start with References/References Cited? 
if self.re_first_line.search(reader[i].strip()): reader[i] = self.re_first_line.sub('', reader[i]).strip() # if enumerated, combine them into a single line @@ -723,20 +786,20 @@ def process_with_header_line(self, reader): break return block_references - def remove_enumeration(self, line, enumeration_status): + def remove_enumeration(self, line: str, enumeration_status: int) -> Tuple: """ + remove enumeration from a reference line - :param line: - :param enumeration_status: - :return: + :param line: reference line to process + :param enumeration_status: current enumeration status + :return: updated line and enumeration status """ - # remove any enumeration try: match = self.re_reference_start.search(line) # if there is an enumeration if match: enumeration_status = 1 - # if there was a enumerated reference, show that this is now the continuation + # if there was an enumerated reference, show that this is now the continuation elif not match and enumeration_status in [1, -1]: enumeration_status = -1 # not enumerated @@ -745,18 +808,19 @@ def remove_enumeration(self, line, enumeration_status): if enumeration_status == 1: line = list(filter(None, self.re_remove_enumeration.split(line)))[0] - except: + except (IndexError, Exception) as e: enumeration_status = 0 - pass + logger.error(f"Error while removing enumeration. 
Exception {str(e)}") return line, enumeration_status - def get_references(self, filename, encoding="ISO-8859-1"): + def get_references(self, filename: str, encoding: str = "ISO-8859-1") -> List: """ - read reference file for this text format + read reference file and extract references - :param filename: - :return: + :param filename: path to the OCR file + :param encoding: character encoding for the file + :return: list of references extracted from the file """ try: references = [] @@ -835,43 +899,39 @@ def get_references(self, filename, encoding="ISO-8859-1"): if bibcode and block_references: references.append([bibcode, block_references]) else: - logger.error("Error in getting the bibcode from the reference file name %s. Skipping!" % (filename)) + logger.error(f'Error in getting the bibcode from the reference file name {filename}. Skipping!') if len(references) > 0: - logger.debug("Read source file %s, and got %d references to resolve for bibcode %s." % (filename, len(references), bibcode)) + logger.debug(f'Read source file {filename}, and got {len(references)} references to resolve for bibcode {bibcode}.') elif len(references) == 0: - logger.error('No references found in reference file %s.' 
% (filename)) + logger.error(f'No references found in reference file {filename}.') return references except Exception as e: - logger.error('Exception: %s' % (str(e))) + logger.error(f'Exception: {str(e)}') return [] - def fix_inheritance(self, cur_refstr, prev_refstr): + def fix_inheritance(self, cur_refstr: str, prev_refstr: str) -> str: """ - if author list is the same as the reference above it, a dash is inserted - get the list of authors from the previous reference and add it to the current one + handle inheritance of author list when the current reference is similar to the previous one - :param cur_refstr: - :param prev_refstr: - :return: + :param cur_refstr: current reference string + :param prev_refstr: previous reference string + :return: updated current reference string """ match = self.re_author_list_placeholder.match(cur_refstr) if match and prev_refstr and len(prev_refstr) > 1: - try: - # find the year and return everything that came before it - prev_authors = self.re_prior_year.match(prev_refstr) - if prev_authors: - cur_refstr = prev_authors.group().strip() + " " + cur_refstr[match.end():].strip() - except TypeError: - pass + # find the year and return everything that came before it + prev_authors = self.re_prior_year.match(prev_refstr) + if prev_authors: + cur_refstr = prev_authors.group().strip() + " " + cur_refstr[match.end():].strip() return cur_refstr - def is_reference(self, reference): + def is_reference(self, reference: str) -> bool: """ - a reference has either year or doi or have at least author/volume/page + determine if a reference is valid based on year, DOI, or other criteria - :param reference: - :return: + :param reference: reference string to be validated + :return: true if the reference is valid, false otherwise """ if self.re_year.search(reference) or self.re_doi.search(reference) or self.has_arXiv_id(reference): return True @@ -883,62 +943,95 @@ def is_reference(self, reference): class TEXtoREFs(toREFs): + """ + class for 
processing references in LaTeX (TEX) format + """ + # to match the start of reference block, including LaTeX keywords like \begin{references} or \begin{thebibliography} reference_block_specifier = r'^(\\begin{references}|(?:%Z\s*)?\\begin{thebibliography}|%Z|\\begin)' + # to match the reference block specifier re_reference_block_specifier = re.compile(reference_block_specifier) - re_reference_block_specifier_to_ignore = re.compile(r'%s({[^\s]*}|$)'%reference_block_specifier) + # to match and ignore certain reference block specifiers + re_reference_block_specifier_to_ignore = re.compile(r'%s({[^\s]*}|$)' % reference_block_specifier) + # to match LaTeX reference start identifiers like \bibitem, \reference, \item, \refb reference_start_reference = r'(\\?bibitem|\\?reference|\\item|\\refb)' + # to match the reference start specifier like \bibitem or \reference reference_block_specifier_and_start_reference = re.compile(r'^(\\bibitem|\\reference|\\refb)') + # to match the start of a reference line in LaTeX, like \bibitem or \reference re_reference_line_start = re.compile(reference_start_reference) + + # to match the full reference block including optional content inside brackets and curly braces re_reference_block = re.compile( - r'%s?' # beginning of reference block (sometimes appear on the same line as first reference) - r'\s*' # optional spaces - r'\\?%s' # latex keyword for start of reference - r'\s*' # optional spaces - r'(\[([^\]]*)\])?' # optional brackets block - r'\s*' # optional spaces - r'({([^}]*)})?' # optional curly brackets block - r'\s*' # optional spaces - r'(?P[^\n%%]*)' # the rest + r'%s?' # beginning of reference block (sometimes appears on the same line as the first reference) + r'\s*' # optional spaces + r'\\?%s' # LaTeX keyword for the start of reference + r'\s*' # optional spaces + r'(\[([^\]]*)\])?' # optional brackets block + r'\s*' # optional spaces + r'({([^}]*)})?' 
# optional curly brackets block + r'\s*' # optional spaces + r'(?P[^\n%%]*)' # the rest of the content % (reference_block_specifier, reference_start_reference) ) + + # to match reference blocks entirely surrounded by brackets re_reference_block_all_bracketed = re.compile( - r'%s' # latex keyword for start of reference - r'\s*' # optional spaces - r'({)(?P.*)(})\W*$' # content is in brackets + r'%s' # LaTeX keyword for the start of reference + r'\s*' # optional spaces + r'({)(?P.*)(})\W*$' # content inside curly brackets % (reference_start_reference) ) + + # to match reference blocks without content, only brackets re_reference_block_no_content = re.compile( - r'^%s' # latex keyword for start of reference - r'\s*' # optional spaces - r'(?P.*)' # content is in brackets + r'^%s' # LaTeX keyword for the start of reference + r'\s*' # optional spaces + r'(?P.*)' # content inside the block % (reference_start_reference) ) + + # to match only the citation key in the reference block (one word only) re_reference_block_citiation_key_only = re.compile( - r'%s?' # beginning of reference block (sometimes appear on the same line as first reference) - r'\s*' # optional spaces - r'\\?%s' # latex keyword for start of reference - r'\s*' # optional spaces + r'%s?' # beginning of reference block + r'\s*' # optional spaces + r'\\?%s' # LaTeX keyword for the start of reference + r'\s*' # optional spaces r'(\[([^\]]*)\])?' 
# optional brackets block - r'\s*' # optional spaces - r'({[^\s]+}|$)' # citiation key, one word only + r'\s*' # optional spaces + r'({[^\s]+}|$)' # citation key (one word only) % (reference_block_specifier, reference_start_reference) ) + + # to match the reference document block and extract bibcode re_reference_doc_block = re.compile(r'(?:%R\s+|\\adsbibcode)\b[\s\{]*(?P[^\n\}]*)') + # to add a starting block for bibcode with a \bibitem tag re_add_start_block = re.compile(r'(\\adsbibcode\{[\w\d\W]{19}\})\n(\\bibitem)', flags=re.MULTILINE) + # to detect duplicated references in the document re_duplicate = re.compile(r'%s\s*\1\s*' % reference_start_reference) + # to match the beginning of references (bibitem, reference, or item) re_start_reference = re.compile(r'(\.|{\\)(bibitem|reference|item)') + # to match extra elements in the reference line, like newblock, jcd, or other tags re_extras = re.compile(r'(\\newblock\s*|\\jcd[,]|(? str: """ - do not depend on linefeed to have a single or part of single reference - multi references can appear with the latex identifier (ie, \bibitem) in a single line + split a reference string if it contains multiple references - :param reference: - :return: + :param reference: reference string to be split + :return: split reference string """ key = self.re_multi_reference.search(reference) if not key: @@ -1016,11 +1108,12 @@ def split(self, reference): if results: yield ' '.join(results) - def cleanup(self, reference): + def cleanup(self, reference: str) -> str: """ + clean up the reference string by applying various replacements - :param reference: - :return: + :param reference: the reference string to be cleaned up + :return: cleaned reference string """ for (compiled_re, replace_str) in self.re_cleanup: reference = compiled_re.sub(replace_str, reference) @@ -1034,44 +1127,45 @@ def cleanup(self, reference): references.append(clean_reference) return references - def debraket(self, reference): + def debraket(self, reference: str) -> 
str: """ + remove LaTeX-specific bracket formatting from the reference - :param reference: - :return: + :param reference: reference string to be processed + :return: de-bracketed reference string """ for (compiled_re, replace_str) in self.re_reference_debraket: reference = compiled_re.sub(replace_str, reference) return reference - def get_references(self, filename, encoding): + def append(self, reference: str, bibcode: str, block_references: List, references: List) -> Tuple: """ - read reference file of text format - this is a generic function + append a reference to the list of references - :param filename: - :return: + :param reference: reference string to be appended + :param bibcode: bibcode associated with the reference + :param block_references: list of references to update + :param references: final list of references + :return: updated reference, bibcode, block references, and references list """ + if reference.strip(): + for ref in self.cleanup(reference.strip()): + block_references.append(ref) + reference = '' + if bibcode and block_references: + references.append([bibcode, block_references]) + bibcode = '' + block_references = [] + return reference, bibcode, block_references, references - def append(reference, bibcode, block_references, references): - """ - - :param reference: - :param bibcode: - :param block_references: - :param references: - :return: - """ - if reference.strip(): - for ref in self.cleanup(reference.strip()): - block_references.append(ref) - reference = '' - if bibcode and block_references: - references.append([bibcode, block_references]) - bibcode = '' - block_references = [] - return reference, bibcode, block_references, references + def get_references(self, filename: str, encoding: str) -> List: + """ + read LaTeX reference file and extract references + :param filename: path to the LaTeX file + :param encoding: character encoding for the file + :return: list of references extracted from the file + """ try: references = [] with 
open(filename, 'r', encoding=encoding, errors='ignore') as f: @@ -1093,7 +1187,7 @@ def append(reference, bibcode, block_references, references): if match: # add anything already read to the returned structure # to move on to this doc - reference, bibcode, block_references, references = append(reference, bibcode, block_references, references) + reference, bibcode, block_references, references = self.append(reference, bibcode, block_references, references) a_block = False bibcode = match.group('bibcode') # is it the beginning of reference block @@ -1140,13 +1234,16 @@ def append(reference, bibcode, block_references, references): # A.~V. 2012, Astrophys. Bull., 67, 147 # however need to distinguish between that and # %Z \reference {Conselice, C. J., Gallagher, J. S., \& Wyse, R. F. G. 2001, AJ, 122, 2281}\ - elif line and self.re_reference_block_specifier.search(line): - match = self.re_reference_block_all_bracketed.search(line) - if match: - reference = match.group('content') - elif self.re_reference_line_start.search(line): - reference = ' ' - reference, bibcode, block_references, references = append(reference, bibcode, block_references, references) + # golnaz -- while adding unittests 3/11/2025 not able to get to this, + # I am sure this block is never going to be reached, so commenting it + # but not removing it + # elif line and self.re_reference_block_specifier.search(line): + # match = self.re_reference_block_all_bracketed.search(line) + # if match: + # reference = match.group('content') + # elif self.re_reference_line_start.search(line): + # reference = ' ' + reference, bibcode, block_references, references = self.append(reference, bibcode, block_references, references) if len(references): logger.debug("Read source file %s, and got %d references to resolve for bibcode %s." % (filename, len(references), bibcode)) @@ -1154,40 +1251,54 @@ def append(reference, bibcode, block_references, references): logger.error('No references found in reference file %s.' 
% (filename)) return references except Exception as e: - logger.error('Exception: %s' % (str(e))) + logger.error(f'Exception: {str(e)}') return [] class HTMLtoREFs(toREFs): + """ + class for processing references in HTML format + """ + # to match bibcode format re_bibcode = re.compile(r'(^\d{4}[\w\.&+]{14}[A-Z\.]{1})') + # to match DOI in the reference string re_doi = re.compile(r'doi:(.*?)', re.IGNORECASE) + # to match reference block with ADS bibcode re_reference_block = re.compile(r'(.*?)(?=|$)') + # to extract bibcode from reference block re_block_bibcode = re.compile(r'(.*?)') + + # list of tuples for cleaning up reference strings block_cleanup = [ - (re.compile(r'(||||||||
|
|
|
||||||)', re.I), ''), + (re.compile(r'(||||||||
|
|
|
||||||)',re.I), ''), (re.compile(r'&'), '&'), (re.compile(r' '), ' '), ] + # to match placeholder for author list in references re_author_list_placeholder = re.compile(r'[-_]{2,}\.?') - re_prior_year = re.compile(r'(.*)(?=\b[12]+\d\d\d[a-z]*)') - + # to capture prior year in references + re_prior_year = re.compile(r'((\S+\s+){2,})(?=[\s\(\[]*[12]+[09]+\d\d(\S+\s+){2,})') + # to match year in reference string re_year = re.compile(r'([12]+\d\d\d[a-z]*)') + # to match author and year format in reference string re_a_reference = re.compile(r'([A-Z][a-z]+,?\s+[A-Z]+\.?|[A-Z]+\.?\s+[A-Z][a-z]+,)+[^\d]*.*?(\d+)\W+(\d+)') + # constants to identify single or multi bibcode types single_bibcode, multi_bibcode = range(2) - def __init__(self, filename, buffer, parsername, tag, file_type, cleanup=None, encoding='UTF-8'): + def __init__(self, filename: str, buffer: Dict, parsername: str, tag: str, file_type: int, cleanup: List = None, encoding: str = 'UTF-8'): """ + initializes the HTMLtoREFs object and processes the HTML reference file - :param filename: - :param buffer: - :param parsername: - :param tag: - :param file_type: - :param cleanup: - :param encoding: + :param filename: path to the HTML file + :param buffer: dictionary containing buffer data + :param parsername: name of the parser + :param tag: HTML tag for extracting references + :param file_type: type of the file (single or multiple bibcodes) + :param cleanup: optional list of regex patterns for cleanup + :param encoding: character encoding for the file """ toREFs.__init__(self) @@ -1197,8 +1308,9 @@ def __init__(self, filename, buffer, parsername, tag, file_type, cleanup=None, e self.filename = buffer['source_filename'] self.parsername = buffer['parser_name'] - block_references, item_nums = [[b['refraw'] for b in buffer['references']], [b['item_num'] for b in buffer['references']]] - self.raw_references.append({'bibcode': buffer['source_bibcode'], 'block_references': block_references, 'item_nums':item_nums}) + for 
buf in buffer['block_references']: + block_references, item_nums = [[ref['refraw'] for ref in buf['references']], [ref['item_num'] for ref in buf['references']]] + self.raw_references.append({'bibcode': buf['source_bibcode'], 'block_references': block_references, 'item_nums':item_nums}) else: self.filename = filename self.parsername = parsername @@ -1214,17 +1326,17 @@ def __init__(self, filename, buffer, parsername, tag, file_type, cleanup=None, e self.raw_references.append({'bibcode': bibcode, 'block_references': references}) - def get_references(self, filename, encoding, tag, file_type): + def get_references(self, filename: str, encoding: str, tag: str, file_type: int) -> List: """ - read reference file of html format + extract references from an HTML file based on the file type - :param filename: - :param encoding: - :param tag: - :param file_type: - :return: + :param filename: path to the HTML file + :param encoding: character encoding for the file + :param tag: HTML tag for extracting references + :param file_type: type of the file (single or multiple bibcodes) + :return: list of references extracted from the file """ - # some html references contain multiple manuscripts and have the bibcode for each record in the file + # some html references contain multiple manuscripts and have the bibcode for each record in the file # on the other hand, some html references contain only one manuscript, and the bibcode is in the filename if file_type == self.single_bibcode: match = self.re_bibcode.match(os.path.basename(filename)) @@ -1232,27 +1344,30 @@ def get_references(self, filename, encoding, tag, file_type): return self.get_references_single_record(filename, encoding, tag, bibcode=match.group(1)) if file_type == self.multi_bibcode: return self.get_references_multi_records(filename, encoding, tag) - return None + return [] - def cleanup(self, reference, reference_cleanup): + def cleanup(self, reference: str, reference_cleanup: List) -> str: """ + clean up a
reference string by applying the provided cleanup rules - :param reference: - :return: + :param reference: reference string to be cleaned up + :param reference_cleanup: list of cleanup rules (regex replacements) + :return: cleaned reference string """ if reference_cleanup: for (compiled_re, replace_str) in reference_cleanup: reference = compiled_re.sub(replace_str, reference) return reference - def get_references_single_record(self, filename, encoding, tag, bibcode): + def get_references_single_record(self, filename: str, encoding: str, tag: str, bibcode: str) -> List: """ + extract references from a single record in the HTML file - :param filename: - :param encoding: - :param tag: - :param bibcode: - :return: + :param filename: path to the HTML file + :param encoding: character encoding for the file + :param tag: HTML tag for extracting references + :param bibcode: bibcode for the reference + :return: list of references extracted from the file """ if not bibcode: logger.error('No bibcode extracted in reference file %s.' % (filename)) @@ -1278,27 +1393,28 @@ def get_references_single_record(self, filename, encoding, tag, bibcode): block_references.append(reference) prev_reference = reference else: - logger.debug("Unable to parse source file %s, no tag was provided." % (filename)) + logger.debug(f"Unable to parse source file {filename}, no tag was provided.") if bibcode and block_references: references.append([bibcode, block_references]) if len(references): - logger.debug("Read source file %s, and got %d references to resolve for bibcode %s." % (filename, len(references), bibcode)) + logger.debug(f"Read source file {filename}, and got {len(references)} references to resolve for bibcode {bibcode}.") elif len(references) == 0: - logger.error('No references found in reference file %s.' 
% (filename)) + logger.error(f'No references found in reference file {filename}.') return references except Exception as e: - logger.error('Exception: %s' % (str(e))) + logger.error(f'Exception: {str(e)}') return [] - def get_references_multi_records(self, filename, encoding, tag): + def get_references_multi_records(self, filename: str, encoding: str, tag: str) -> List: """ + extract references from multiple records in the HTML file - :param filename: - :param encoding: - :param tag: - :return: + :param filename: path to the HTML file + :param encoding: character encoding for the file + :param tag: HTML tag for extracting references + :return: list of references extracted from the file """ try: references = [] @@ -1328,40 +1444,37 @@ def get_references_multi_records(self, filename, encoding, tag): if bibcode and block_references: references.append([bibcode, block_references]) if len(references): - logger.debug("Read source file %s, and got %d references to resolve for bibcode %s." % (filename, len(references), bibcode)) + logger.debug(f"Read source file {filename}, and got {len(references)} references to resolve for bibcode {bibcode}.") elif len(references) == 0: - logger.error('No references found in reference file %s.' 
% (filename)) + logger.error(f'No references found in reference file {filename}.') return references except Exception as e: - logger.error('Exception: %s' % (str(e))) + logger.error(f'Exception: {str(e)}') return [] - def fix_inheritance(self, cur_refstr, prev_refstr): + def fix_inheritance(self, cur_refstr: str, prev_refstr: str) -> str: """ if author list is the same as the reference above it, a dash is inserted get the list of authors from the previous reference and add it to the current one - :param cur_refstr: - :param prev_refstr: - :return: + :param cur_refstr: the current reference string that may need author inheritance + :param prev_refstr: the previous reference string from which authors might be inherited + :return: the modified current reference string with authors inherited from the previous reference, if applicable """ match = self.re_author_list_placeholder.match(cur_refstr) if match and prev_refstr and len(prev_refstr) > 1: - try: - # find the year and return everything that came before it - prev_authors = self.re_prior_year.match(prev_refstr) - if prev_authors: - cur_refstr = prev_authors.group().strip() + " " + cur_refstr[match.end():].strip() - except TypeError as error: - pass + # find the year and return everything that came before it + prev_authors = self.re_prior_year.match(prev_refstr) + if prev_authors: + cur_refstr = prev_authors.group().strip() + " " + cur_refstr[match.end():].strip() return cur_refstr - def is_reference(self, reference): + def is_reference(self, reference: str) -> bool: """ - a reference has either year or doi or have at least author/volume/page + a reference has either year or doi or has at least author/volume/page - :param reference: - :return: + :param reference: the reference string to be validated + :return: True if the reference is valid, otherwise False """ if self.re_year.search(reference) or self.re_doi.search(reference) or self.has_arXiv_id(reference): return True diff --git 
a/adsrefpipe/tests/unittests/test_ref_parsers_base.py b/adsrefpipe/tests/unittests/test_ref_parsers_base.py new file mode 100644 index 0000000..e69de29 From 6dcc83bb039abcfa72ba5623aa3fbb55e6891852 Mon Sep 17 00:00:00 2001 From: golnazads <28757512+golnazads@users.noreply.github.com> Date: Wed, 26 Mar 2025 10:18:25 -0400 Subject: [PATCH 6/9] more unittests and docstrings --- adsrefpipe/refparsers/AASxml.py | 29 +- adsrefpipe/refparsers/ADShtml.py | 225 ++- adsrefpipe/refparsers/ADSocr.py | 60 +- adsrefpipe/refparsers/ADStex.py | 34 +- adsrefpipe/refparsers/ADStxt.py | 164 +- adsrefpipe/refparsers/AGUxml.py | 40 +- adsrefpipe/refparsers/AIPxml.py | 68 +- adsrefpipe/refparsers/APSxml.py | 67 +- adsrefpipe/refparsers/AnAxml.py | 40 +- adsrefpipe/refparsers/BlackwellXML.py | 101 +- adsrefpipe/refparsers/ElsevierXML.py | 98 +- adsrefpipe/refparsers/IPAPxml.py | 58 +- adsrefpipe/refparsers/IcarusXML.py | 105 +- adsrefpipe/refparsers/VERSITAxml.py | 76 +- adsrefpipe/refparsers/WileyXML.py | 132 +- adsrefpipe/refparsers/arXivTXT.py | 39 +- adsrefpipe/refparsers/handler.py | 7 +- adsrefpipe/refparsers/xmlFile.py | 51 +- .../tests/unittests/test_ref_parsers.py | 13 - .../tests/unittests/test_ref_parsers_base.py | 1310 +++++++++++++++ .../tests/unittests/test_ref_parsers_xml.py | 1440 ++++++++++++++++- adsrefpipe/tests/unittests/test_tasks.py | 9 +- 22 files changed, 3569 insertions(+), 597 deletions(-) diff --git a/adsrefpipe/refparsers/AASxml.py b/adsrefpipe/refparsers/AASxml.py index 5f91afd..2080c5f 100644 --- a/adsrefpipe/refparsers/AASxml.py +++ b/adsrefpipe/refparsers/AASxml.py @@ -1,7 +1,7 @@ import sys, os -import regex as re import argparse +from typing import List, Dict from adsputils import setup_logging, load_config @@ -15,9 +15,14 @@ class AASreference(XMLreference): + """ + This class handles parsing AAS references in XML format. It extracts identifiers like bibcodes, DOIs, and arXiv IDs + from the XML reference and stores the parsed information. 
+ """ def parse(self): """ + parse the AAS reference :return: """ @@ -50,22 +55,26 @@ def parse(self): class AAStoREFs(XMLtoREFs): + """ + This class converts AAS XML references to a standardized reference format. It processes raw AAS references from either + a file or a buffer and outputs parsed references, including bibcodes, DOIs, and eprints. + """ - def __init__(self, filename, buffer): + def __init__(self, filename: str, buffer: str): """ + initialize the AAStoREFs object - :param filename: - :param buffer: - :param unicode: - :param tag: + :param filename: the path to the source file + :param buffer: the xml references as a buffer """ XMLtoREFs.__init__(self, filename, buffer, parsername=AAStoREFs, tag='CITATION') - def process_and_dispatch(self): + def process_and_dispatch(self) -> List[Dict[str, List[Dict[str, str]]]]: """ + process the raw references and dispatch parsed references - :return: + :return: list of dictionaries, each containing a bibcode and a list of parsed references """ references = [] for raw_block_references in self.raw_references: @@ -90,6 +99,10 @@ def process_and_dispatch(self): return references +# This is the main program used for manual testing and verification of AASxml references. +# It allows parsing references from either a file or a buffer, and if no input is provided, +# it runs a source test file to verify the functionality against expected parsed results. +# The test results are printed to indicate whether the parsing is successful or not. 
from adsrefpipe.tests.unittests.stubdata import parsed_references if __name__ == '__main__': # pragma: no cover parser = argparse.ArgumentParser(description='Parse AAS references') diff --git a/adsrefpipe/refparsers/ADShtml.py b/adsrefpipe/refparsers/ADShtml.py index 5ee3f55..598ab8b 100644 --- a/adsrefpipe/refparsers/ADShtml.py +++ b/adsrefpipe/refparsers/ADShtml.py @@ -3,10 +3,7 @@ import regex as re import argparse import urllib.parse - -from adsrefpipe.refparsers.toREFs import HTMLtoREFs -from adsrefpipe.refparsers.reference import unicode_handler -from adsrefpipe.utils import get_bibcode as get_bibcode_from_doi, verify_bibcode +from typing import List, Dict from adsputils import setup_logging, load_config @@ -14,9 +11,18 @@ config = {} config.update(load_config()) +from adsrefpipe.refparsers.toREFs import HTMLtoREFs +from adsrefpipe.refparsers.reference import unicode_handler +from adsrefpipe.utils import get_bibcode as get_bibcode_from_doi, verify_bibcode + class ADSHTMLtoREFs(HTMLtoREFs): + """ + This class processes ADS HTML references and converts them into a standardized reference format. + It handles reference cleanup and parsing of citation information like authors, title, year, journal, volume, pages, DOI, and eprint. 
+ """ + # list of regex patterns to clean the HTML references reference_cleanup = [ (re.compile(r'()'), ''), (re.compile(r'()', re.I), ''), @@ -38,23 +44,23 @@ class ADSHTMLtoREFs(HTMLtoREFs): (re.compile(r'()'), ''), # if there was a nested tag (ie, href inside comment ] - def __init__(self, filename, buffer, parsername, tag, file_type, cleanup=None, encoding='UTF-8'): + def __init__(self, filename: str, buffer: str, parsername: str, tag: str, file_type: str, cleanup=None, encoding='UTF-8'): """ - - :param filename: - :param buffer: - :param parsername: - :param tag: + :param filename: path to the reference file + :param buffer: buffer containing the references + :param parsername: name of the parser + :param tag: regex tag for parsing + :param file_type: type of the file (single or multiple bibcodes) """ if not cleanup: cleanup = self.reference_cleanup HTMLtoREFs.__init__(self, filename, buffer, parsername=parsername, tag=tag, file_type=file_type, cleanup=cleanup, encoding=encoding) - def process_and_dispatch(self): + def process_and_dispatch(self) -> List[Dict[str, List[Dict[str, str]]]]: """ - this function does reference cleaning and then calls the parser + clean references and call the parser to process and dispatch them - :return: + :return: list of references with bibcode and parsed reference details """ references = [] for raw_block_references in self.raw_references: @@ -75,13 +81,15 @@ def process_and_dispatch(self): class AnnRevHTMLtoREFs(ADSHTMLtoREFs): """ - This is to process Annual Review references. There are + This class processes Annual Review references. + They are AnRFM/*/annurev.fluid AREPS/*/annurev.earth ARA+A/*/annurev.astro """ + # to clean up html references reference_cleanup = [ (re.compile(r'()'), ''), (re.compile(r'(|)', re.I), ''), @@ -101,6 +109,7 @@ class AnnRevHTMLtoREFs(ADSHTMLtoREFs): (re.compile(r'–'), '-'), (re.compile(r'(\<\d+:[A-Z]+\>)'), '') ] + # to clean up block of html references block_cleanup = [ (re.compile(r'(||||||||
|
|
|
||||||)', re.I), ''), (re.compile(r'&'), '&'), @@ -109,28 +118,33 @@ class AnnRevHTMLtoREFs(ADSHTMLtoREFs): (re.compile(r'(\s*', re.DOTALL), '') ] - # re_tag = re.compile(r'((?:|]*>)\s*([A-Z][a-z]+.*?)|(?:).*?)(?:|$)', (re.IGNORECASE | re.DOTALL)) + # to match tags in the reference block re_tag = re.compile(r'(?:(?:|]*>)\s*([A-Z][a-z]+.*?)|(?:)(.*?))(?:|$)', (re.IGNORECASE | re.DOTALL)) + # to match DOI in the format re_doi = re.compile(r'\(doi:(.*?)\)', re.IGNORECASE) + # to extract the bibcode re_bibcode = re.compile(r'(.*)') + # to match the reference text before a