From 0bbfc00661d59ce55603d70ea304b73adf29abe8 Mon Sep 17 00:00:00 2001 From: thomasallen Date: Mon, 5 Jan 2026 10:18:15 -0800 Subject: [PATCH 1/8] add external_identifier column --- adsrefpipe/models.py | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/adsrefpipe/models.py b/adsrefpipe/models.py index 5e2c725..046e281 100755 --- a/adsrefpipe/models.py +++ b/adsrefpipe/models.py @@ -2,7 +2,7 @@ from sqlalchemy import Integer, String, Column, ForeignKey, DateTime, func, Numeric, ForeignKeyConstraint -from sqlalchemy.dialects.postgresql import JSONB +from sqlalchemy.dialects.postgresql import JSONB, ARRAY from sqlalchemy.ext.declarative import declarative_base @@ -213,6 +213,7 @@ class ResolvedReference(Base): bibcode = Column(String) score = Column(Numeric) reference_raw = Column(String) + external_identifier = Column(ARRAY(String)) def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode: str, score: float, reference_raw: str): """ @@ -224,6 +225,7 @@ def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode: :param bibcode: resolved bibcode :param score: confidence score of the resolved reference :param reference_raw: raw reference string + :param external_identifier: list of external identifiers associated with the reference, e.g. 
["doi:...", "arxiv:...", "ascl:..."] """ self.history_id = history_id self.item_num = item_num @@ -231,6 +233,7 @@ def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode: self.bibcode = bibcode self.score = score self.reference_raw = reference_raw + self.external_identifier = external_identifier def toJSON(self) -> dict: """ @@ -244,7 +247,8 @@ def toJSON(self) -> dict: 'bibcode': self.bibcode, 'score': self.score, 'item_num': self.item_num, - **({'reference_raw': self.reference_raw} if self.reference_raw else {}) + **({'reference_raw': self.reference_raw} if self.reference_raw else {}), + 'external_identifier': self.external_identifier } From a66c1a673c124db889a5e752166784bbbb456806 Mon Sep 17 00:00:00 2001 From: thomasallen Date: Mon, 5 Jan 2026 11:21:17 -0800 Subject: [PATCH 2/8] alembic upgrade add external identifier --- .../08ca70bd6f5f_add_external_identifier.py | 24 +++++++++++++++++++ 1 file changed, 24 insertions(+) create mode 100644 alembic/versions/08ca70bd6f5f_add_external_identifier.py diff --git a/alembic/versions/08ca70bd6f5f_add_external_identifier.py b/alembic/versions/08ca70bd6f5f_add_external_identifier.py new file mode 100644 index 0000000..62e9caa --- /dev/null +++ b/alembic/versions/08ca70bd6f5f_add_external_identifier.py @@ -0,0 +1,24 @@ +"""add_external_identifier + +Revision ID: 08ca70bd6f5f +Revises: e3d6e15c3b8c +Create Date: 2026-01-05 11:16:27.454389 + +""" +from alembic import op +import sqlalchemy as sa + + +# revision identifiers, used by Alembic. 
+revision = '08ca70bd6f5f' +down_revision = 'e3d6e15c3b8c' +branch_labels = None +depends_on = None + + +def upgrade(): + pass + + +def downgrade(): + pass From 4ca1862f96abff2ade6e8575bb212f7420865513 Mon Sep 17 00:00:00 2001 From: thomasallen Date: Mon, 5 Jan 2026 14:35:31 -0800 Subject: [PATCH 3/8] alembic update external_identifier --- alembic/versions/08ca70bd6f5f_add_external_identifier.py | 9 +++++++-- 1 file changed, 7 insertions(+), 2 deletions(-) diff --git a/alembic/versions/08ca70bd6f5f_add_external_identifier.py b/alembic/versions/08ca70bd6f5f_add_external_identifier.py index 62e9caa..3a360a0 100644 --- a/alembic/versions/08ca70bd6f5f_add_external_identifier.py +++ b/alembic/versions/08ca70bd6f5f_add_external_identifier.py @@ -7,6 +7,7 @@ """ from alembic import op import sqlalchemy as sa +from sqlalchemy.dialects import postgresql # revision identifiers, used by Alembic. @@ -17,8 +18,12 @@ def upgrade(): - pass + op.add_column('resolved_reference', + sa.Column("external_identifier", + postgresql.ARRAY(sa.String())) + ) def downgrade(): - pass + op.drop_column('resolved_reference', 'external_identifier') + From 82e66efd4a3593eebf7c6f4a1125a988f726733f Mon Sep 17 00:00:00 2001 From: thomasallen Date: Thu, 8 Jan 2026 11:48:19 -0800 Subject: [PATCH 4/8] add external identifier to unittest mock data --- adsrefpipe/tests/unittests/test_app.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/adsrefpipe/tests/unittests/test_app.py b/adsrefpipe/tests/unittests/test_app.py index c21cff3..80fce1d 100644 --- a/adsrefpipe/tests/unittests/test_app.py +++ b/adsrefpipe/tests/unittests/test_app.py @@ -811,7 +811,8 @@ def test_populate_tables(self): "bibcode": "2011LRR....14....2U", "refstring": "J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. ", "refraw": "J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. 
", - "id": "H1I1" + "id": "H1I1", + "ext_id": "ExtID1" }, { "score": "1.0", @@ -819,6 +820,7 @@ def test_populate_tables(self): "refstring": "C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.", "refraw": "C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.", "id": "H1I2", + "ext_id": "ExtID2" } ] arXiv_stubdata_dir = os.path.join(self.test_dir, 'unittests/stubdata/txt/arXiv/0/') From 0835c731a9e56ebbb17fbad143e31d5a52f34e58 Mon Sep 17 00:00:00 2001 From: thomasallen Date: Thu, 15 Jan 2026 10:30:03 -0800 Subject: [PATCH 5/8] update for unit tests --- adsrefpipe/app.py | 12 ++++ adsrefpipe/models.py | 4 +- adsrefpipe/tests/unittests/test_app.py | 84 ++++++++++++++++-------- adsrefpipe/tests/unittests/test_tasks.py | 2 + 4 files changed, 72 insertions(+), 30 deletions(-) diff --git a/adsrefpipe/app.py b/adsrefpipe/app.py index 1a587cb..d3f12a8 100755 --- a/adsrefpipe/app.py +++ b/adsrefpipe/app.py @@ -22,6 +22,15 @@ from texttable import Texttable +def _ensure_list(x): + if x is None: + return None + # treat strings as scalars, not iterables + if isinstance(x, (str, bytes)): + return [x] + # already list-like + return list(x) + class ADSReferencePipelineCelery(ADSCelery): """ celery-based pipeline for processing and resolving references @@ -306,6 +315,7 @@ def query_resolved_reference_tbl(self, history_id_list: List = None) -> List: return results + def diagnostic_query(self, bibcode_list: List = None, source_filename_list: List = None) -> List: """ perform a diagnostic query to retrieve combined reference records @@ -315,6 +325,8 @@ def diagnostic_query(self, bibcode_list: List = None, source_filename_list: List :return: List of combined records from multiple tables """ results = [] + bibcode_list = _ensure_list(bibcode_list) + source_filename_list = _ensure_list(source_filename_list) reference_source = 
self.query_reference_source_tbl(bibcode_list, source_filename_list) processed_history = self.query_processed_history_tbl(bibcode_list, source_filename_list) diff --git a/adsrefpipe/models.py b/adsrefpipe/models.py index 046e281..b2db105 100755 --- a/adsrefpipe/models.py +++ b/adsrefpipe/models.py @@ -215,7 +215,7 @@ class ResolvedReference(Base): reference_raw = Column(String) external_identifier = Column(ARRAY(String)) - def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode: str, score: float, reference_raw: str): + def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode: str, score: float, reference_raw: str, external_identifier: list = None): """ initializes a resolved reference object @@ -233,7 +233,7 @@ def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode: self.bibcode = bibcode self.score = score self.reference_raw = reference_raw - self.external_identifier = external_identifier + self.external_identifier = external_identifier or [] def toJSON(self) -> dict: """ diff --git a/adsrefpipe/tests/unittests/test_app.py b/adsrefpipe/tests/unittests/test_app.py index 80fce1d..f118425 100644 --- a/adsrefpipe/tests/unittests/test_app.py +++ b/adsrefpipe/tests/unittests/test_app.py @@ -30,6 +30,8 @@ from adsrefpipe.refparsers.handler import verify from adsrefpipe.tests.unittests.stubdata.dbdata import actions_records, parsers_records +import testing.postgresql + class TestDatabase(unittest.TestCase): @@ -39,18 +41,26 @@ class TestDatabase(unittest.TestCase): maxDiff = None - postgresql_url_dict = { - 'port': 5432, - 'host': '127.0.0.1', - 'user': 'postgres', - 'database': 'postgres' - } - postgresql_url = 'postgresql://{user}:{user}@{host}:{port}/{database}' \ - .format(user=postgresql_url_dict['user'], - host=postgresql_url_dict['host'], - port=postgresql_url_dict['port'], - database=postgresql_url_dict['database'] - ) + # postgresql_url_dict = { + # 'port': 5432, + # 'host': '127.0.0.1', + # 
'user': 'postgres', + # 'database': 'postgres' + # } + # postgresql_url = 'postgresql://{user}:{user}@{host}:{port}/{database}' \ + # .format(user=postgresql_url_dict['user'], + # host=postgresql_url_dict['host'], + # port=postgresql_url_dict['port'], + # database=postgresql_url_dict['database'] + # ) + + _postgresql = testing.postgresql.Postgresql() + postgresql_url = _postgresql.url() + + @classmethod + def tearDownClass(cls): + super().tearDownClass() + cls._postgresql.stop() def setUp(self): self.test_dir = os.path.join(project_home, 'adsrefpipe/tests') @@ -117,8 +127,13 @@ def add_stub_data(self): ] with self.app.session_scope() as session: - session.bulk_save_objects(actions_records) - session.bulk_save_objects(parsers_records) + session.query(Action).delete() + session.query(Parser).delete() + session.commit() + if session.query(Action).count() == 0: + session.bulk_save_objects(actions_records) + if session.query(Parser).count() == 0: + session.bulk_save_objects(parsers_records) session.commit() for i, (a_reference,a_history) in enumerate(zip(reference_source,processed_history)): @@ -745,18 +760,26 @@ class TestDatabaseNoStubdata(unittest.TestCase): maxDiff = None - postgresql_url_dict = { - 'port': 5432, - 'host': '127.0.0.1', - 'user': 'postgres', - 'database': 'postgres' - } - postgresql_url = 'postgresql://{user}:{user}@{host}:{port}/{database}' \ - .format(user=postgresql_url_dict['user'], - host=postgresql_url_dict['host'], - port=postgresql_url_dict['port'], - database=postgresql_url_dict['database'] - ) + # postgresql_url_dict = { + # 'port': 5432, + # 'host': '127.0.0.1', + # 'user': 'postgres', + # 'database': 'postgres' + # } + # postgresql_url = 'postgresql://{user}:{user}@{host}:{port}/{database}' \ + # .format(user=postgresql_url_dict['user'], + # host=postgresql_url_dict['host'], + # port=postgresql_url_dict['port'], + # database=postgresql_url_dict['database'] + # ) + + _postgresql = testing.postgresql.Postgresql() + postgresql_url = 
_postgresql.url() + + @classmethod + def tearDownClass(cls): + super().tearDownClass() + cls._postgresql.stop() def setUp(self): self.test_dir = os.path.join(project_home, 'adsrefpipe/tests') @@ -825,8 +848,13 @@ def test_populate_tables(self): ] arXiv_stubdata_dir = os.path.join(self.test_dir, 'unittests/stubdata/txt/arXiv/0/') with self.app.session_scope() as session: - session.bulk_save_objects(actions_records) - session.bulk_save_objects(parsers_records) + session.query(Action).delete() + session.query(Parser).delete() + session.commit() + if session.query(Action).count() == 0: + session.bulk_save_objects(actions_records) + if session.query(Parser).count() == 0: + session.bulk_save_objects(parsers_records) session.commit() references = self.app.populate_tables_pre_resolved_initial_status( diff --git a/adsrefpipe/tests/unittests/test_tasks.py b/adsrefpipe/tests/unittests/test_tasks.py index fb4ee58..153f041 100755 --- a/adsrefpipe/tests/unittests/test_tasks.py +++ b/adsrefpipe/tests/unittests/test_tasks.py @@ -74,6 +74,8 @@ def add_stub_data(self): ] with self.app.session_scope() as session: + session.query(Action).delete() + session.query(Parser).delete() session.bulk_save_objects(actions_records) session.bulk_save_objects(parsers_records) session.commit() From 7d7a9a12a66d8089b3b1bcfc4cdd0442455a3f15 Mon Sep 17 00:00:00 2001 From: thomasallen Date: Tue, 27 Jan 2026 10:55:05 -0800 Subject: [PATCH 6/8] check external_identifier column --- adsrefpipe/tests/unittests/test_app.py | 368 +++++++++++++++---------- 1 file changed, 227 insertions(+), 141 deletions(-) diff --git a/adsrefpipe/tests/unittests/test_app.py b/adsrefpipe/tests/unittests/test_app.py index f118425..8c21957 100644 --- a/adsrefpipe/tests/unittests/test_app.py +++ b/adsrefpipe/tests/unittests/test_app.py @@ -33,6 +33,11 @@ import testing.postgresql +def _get_external_identifier(rr_obj): + """Return external_identifier from a ResolvedReference ORM object.""" + return getattr(rr_obj, 
"external_identifier", None) + + class TestDatabase(unittest.TestCase): """ @@ -41,19 +46,6 @@ class TestDatabase(unittest.TestCase): maxDiff = None - # postgresql_url_dict = { - # 'port': 5432, - # 'host': '127.0.0.1', - # 'user': 'postgres', - # 'database': 'postgres' - # } - # postgresql_url = 'postgresql://{user}:{user}@{host}:{port}/{database}' \ - # .format(user=postgresql_url_dict['user'], - # host=postgresql_url_dict['host'], - # port=postgresql_url_dict['port'], - # database=postgresql_url_dict['database'] - # ) - _postgresql = testing.postgresql.Postgresql() postgresql_url = _postgresql.url() @@ -85,9 +77,12 @@ def add_stub_data(self): self.arXiv_stubdata_dir = os.path.join(self.test_dir, 'unittests/stubdata/txt/arXiv/0/') reference_source = [ - ('0001arXiv.........Z',os.path.join(self.arXiv_stubdata_dir,'00001.raw'),os.path.join(self.arXiv_stubdata_dir,'00001.raw.result'),'arXiv'), - ('0002arXiv.........Z',os.path.join(self.arXiv_stubdata_dir,'00002.raw'),os.path.join(self.arXiv_stubdata_dir,'00002.raw.result'),'arXiv'), - ('0003arXiv.........Z',os.path.join(self.arXiv_stubdata_dir,'00003.raw'),os.path.join(self.arXiv_stubdata_dir,'00003.raw.result'),'arXiv') + ('0001arXiv.........Z', os.path.join(self.arXiv_stubdata_dir, '00001.raw'), + os.path.join(self.arXiv_stubdata_dir, '00001.raw.result'), 'arXiv'), + ('0002arXiv.........Z', os.path.join(self.arXiv_stubdata_dir, '00002.raw'), + os.path.join(self.arXiv_stubdata_dir, '00002.raw.result'), 'arXiv'), + ('0003arXiv.........Z', os.path.join(self.arXiv_stubdata_dir, '00003.raw'), + os.path.join(self.arXiv_stubdata_dir, '00003.raw.result'), 'arXiv') ] processed_history = [ @@ -96,33 +91,41 @@ def add_stub_data(self): ('2020-04-03 18:08:32', '2020-05-11 11:14:28', '128', '109') ] + # Add external identifiers for each resolved reference to verify persistence in DB + # Each tuple: (reference_str, bibcode, score, external_identifier_list) resolved_reference = [ [ - ('J.-P. 
Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. ','2011LRR....14....2U',1.0), - ('C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.','2017RPPh...80l6902M',1.0) + ('J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. ', + '2011LRR....14....2U', 1.0, ['arxiv:1009.5514']), + ('C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.', + '2017RPPh...80l6902M', 1.0, ['arxiv:1709.02923']) ], [ - ('Alsubai, K. A., Parley, N. R., Bramich, D. M., et al. 2011, MNRAS, 417, 709.','2011MNRAS.417..709A',1.0), - ('Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 2019, A&A, 625, A136 ','2019A&A...625A.136A',1.0) + ('Alsubai, K. A., Parley, N. R., Bramich, D. M., et al. 2011, MNRAS, 417, 709.', + '2011MNRAS.417..709A', 1.0, ['doi:10.0000/mnras.417.709']), + ('Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 2019, A&A, 625, A136 ', + '2019A&A...625A.136A', 1.0, ['doi:10.0000/aa.625.A136']) ], [ - ('Abellan, F. J., Indebetouw, R., Marcaide, J. M., et al. 2017, ApJL, 842, L24','2017ApJ...842L..24A',1.0), - ('Ackermann, M., Albert, A., Atwood, W. B., et al. 2016, A&A, 586, A71 ','2016A&A...586A..71A',1.0) + ('Abellan, F. J., Indebetouw, R., Marcaide, J. M., et al. 2017, ApJL, 842, L24', + '2017ApJ...842L..24A', 1.0, ['ascl:1701.001']), + ('Ackermann, M., Albert, A., Atwood, W. B., et al. 
2016, A&A, 586, A71 ', + '2016A&A...586A..71A', 1.0, ['doi:10.0000/aa.586.A71']) ], ] compare_classic = [ [ - ('2010arXiv1009.5514U',1,'DIFF'), - ('2017arXiv170902923M',1,'DIFF') + ('2010arXiv1009.5514U', 1, 'DIFF'), + ('2017arXiv170902923M', 1, 'DIFF') ], [ - ('2011MNRAS.417..709A',1,'MATCH'), - ('2019A&A...625A.136A',1,'MATCH') + ('2011MNRAS.417..709A', 1, 'MATCH'), + ('2019A&A...625A.136A', 1, 'MATCH') ], [ - ('2017ApJ...842L..24A',1,'MATCH'), - ('2016A&A...586A..71A',1,'MATCH') + ('2017ApJ...842L..24A', 1, 'MATCH'), + ('2016A&A...586A..71A', 1, 'MATCH') ] ] @@ -136,44 +139,56 @@ def add_stub_data(self): session.bulk_save_objects(parsers_records) session.commit() - for i, (a_reference,a_history) in enumerate(zip(reference_source,processed_history)): - reference_record = ReferenceSource(bibcode=a_reference[0], - source_filename=a_reference[1], - resolved_filename=a_reference[2], - parser_name=a_reference[3]) + for i, (a_reference, a_history) in enumerate(zip(reference_source, processed_history)): + reference_record = ReferenceSource( + bibcode=a_reference[0], + source_filename=a_reference[1], + resolved_filename=a_reference[2], + parser_name=a_reference[3] + ) bibcode, source_filename = self.app.insert_reference_source_record(session, reference_record) self.assertTrue(bibcode == a_reference[0]) self.assertTrue(source_filename == a_reference[1]) - history_record = ProcessedHistory(bibcode=bibcode, - source_filename=source_filename, - source_modified=a_history[0], - status=Action().get_status_new(), - date=a_history[1], - total_ref=a_history[2]) + history_record = ProcessedHistory( + bibcode=bibcode, + source_filename=source_filename, + source_modified=a_history[0], + status=Action().get_status_new(), + date=a_history[1], + total_ref=a_history[2] + ) history_id = self.app.insert_history_record(session, history_record) self.assertTrue(history_id != -1) resolved_records = [] compare_records = [] - for j, (service,classic) in 
enumerate(zip(resolved_reference[i],compare_classic[i])): - resolved_record = ResolvedReference(history_id=history_id, - item_num=j+1, - reference_str=service[0], - bibcode=service[1], - score=service[2], - reference_raw=service[0]) + for j, (service, classic) in enumerate(zip(resolved_reference[i], compare_classic[i])): + refstr, bib, sc, ext_ids = service + resolved_record = ResolvedReference( + history_id=history_id, + item_num=j + 1, + reference_str=refstr, + bibcode=bib, + score=sc, + reference_raw=refstr, + external_identifier=ext_ids + ) resolved_records.append(resolved_record) - compare_record = CompareClassic(history_id=history_id, - item_num=j+1, - bibcode=classic[0], - score=classic[1], - state=classic[2]) + + compare_record = CompareClassic( + history_id=history_id, + item_num=j + 1, + bibcode=classic[0], + score=classic[1], + state=classic[2] + ) compare_records.append(compare_record) + success = self.app.insert_resolved_reference_records(session, resolved_records) - self.assertTrue(success == True) + self.assertTrue(success is True) success = self.app.insert_compare_records(session, compare_records) - self.assertTrue(success == True) + self.assertTrue(success is True) session.commit() def test_query_reference_tbl(self): @@ -181,8 +196,8 @@ def test_query_reference_tbl(self): result_expected = [ { 'bibcode': '0001arXiv.........Z', - 'source_filename': os.path.join(self.arXiv_stubdata_dir,'00001.raw'), - 'resolved_filename': os.path.join(self.arXiv_stubdata_dir,'00001.raw.result'), + 'source_filename': os.path.join(self.arXiv_stubdata_dir, '00001.raw'), + 'resolved_filename': os.path.join(self.arXiv_stubdata_dir, '00001.raw.result'), 'parser_name': 'arXiv', 'num_runs': 1, 'last_run_date': '2020-05-11 11:13:36', @@ -190,8 +205,8 @@ def test_query_reference_tbl(self): 'last_run_num_resolved_references': 2 }, { 'bibcode': '0002arXiv.........Z', - 'source_filename': os.path.join(self.arXiv_stubdata_dir,'00002.raw'), - 'resolved_filename': 
os.path.join(self.arXiv_stubdata_dir,'00002.raw.result'), + 'source_filename': os.path.join(self.arXiv_stubdata_dir, '00002.raw'), + 'resolved_filename': os.path.join(self.arXiv_stubdata_dir, '00002.raw.result'), 'parser_name': 'arXiv', 'num_runs': 1, 'last_run_date': '2020-05-11 11:13:53', @@ -199,8 +214,8 @@ def test_query_reference_tbl(self): 'last_run_num_resolved_references': 2 }, { 'bibcode': '0003arXiv.........Z', - 'source_filename': os.path.join(self.arXiv_stubdata_dir,'00003.raw'), - 'resolved_filename': os.path.join(self.arXiv_stubdata_dir,'00003.raw.result'), + 'source_filename': os.path.join(self.arXiv_stubdata_dir, '00003.raw'), + 'resolved_filename': os.path.join(self.arXiv_stubdata_dir, '00003.raw.result'), 'parser_name': 'arXiv', 'num_runs': 1, 'last_run_date': '2020-05-11 11:14:28', @@ -215,9 +230,11 @@ def test_query_reference_tbl(self): self.assertTrue(result_expected == result_got) # test querying filenames - filenames = [os.path.join(self.arXiv_stubdata_dir,'00001.raw'), - os.path.join(self.arXiv_stubdata_dir,'00002.raw'), - os.path.join(self.arXiv_stubdata_dir,'00003.raw')] + filenames = [ + os.path.join(self.arXiv_stubdata_dir, '00001.raw'), + os.path.join(self.arXiv_stubdata_dir, '00002.raw'), + os.path.join(self.arXiv_stubdata_dir, '00003.raw') + ] result_got = self.app.diagnostic_query(source_filename_list=filenames) self.assertTrue(result_expected == result_got) @@ -236,11 +253,13 @@ def test_query_reference_tbl_when_non_exits(self): self.assertTrue(self.app.diagnostic_query(bibcode_list=['0004arXiv.........Z']) == []) # test when filename does not exist - self.assertTrue(self.app.diagnostic_query(source_filename_list=os.path.join(self.arXiv_stubdata_dir,'00004.raw')) == []) + self.assertTrue(self.app.diagnostic_query(source_filename_list=os.path.join(self.arXiv_stubdata_dir, '00004.raw')) == []) # test when both bibcode and filename are passed and nothing is returned - 
self.assertTrue(self.app.diagnostic_query(bibcode_list=['0004arXiv.........Z'], - source_filename_list=os.path.join(self.arXiv_stubdata_dir,'00004.raw')) == []) + self.assertTrue(self.app.diagnostic_query( + bibcode_list=['0004arXiv.........Z'], + source_filename_list=os.path.join(self.arXiv_stubdata_dir, '00004.raw') + ) == []) def test_insert_reference_record(self): """ test inserting reference_source record """ @@ -249,13 +268,15 @@ def test_insert_reference_record(self): # see that it is returned without it being inserted with self.app.session_scope() as session: count = self.app.get_count_reference_source_records(session) - reference_record = ReferenceSource(bibcode='0001arXiv.........Z', - source_filename=os.path.join(self.arXiv_stubdata_dir,'00001.raw'), - resolved_filename=os.path.join(self.arXiv_stubdata_dir,'00001.raw.result'), - parser_name=self.app.get_parser(os.path.join(self.arXiv_stubdata_dir,'00001.raw')).get('name')) + reference_record = ReferenceSource( + bibcode='0001arXiv.........Z', + source_filename=os.path.join(self.arXiv_stubdata_dir, '00001.raw'), + resolved_filename=os.path.join(self.arXiv_stubdata_dir, '00001.raw.result'), + parser_name=self.app.get_parser(os.path.join(self.arXiv_stubdata_dir, '00001.raw')).get('name') + ) bibcode, source_filename = self.app.insert_reference_source_record(session, reference_record) self.assertTrue(bibcode == '0001arXiv.........Z') - self.assertTrue(source_filename == os.path.join(self.arXiv_stubdata_dir,'00001.raw')) + self.assertTrue(source_filename == os.path.join(self.arXiv_stubdata_dir, '00001.raw')) self.assertTrue(self.app.get_count_reference_source_records(session) == count) def test_parser_name(self): @@ -274,7 +295,7 @@ def test_parser_name(self): 'AGU': ['/JGR/0101/issD14.agu.xml', AGUtoREFs], 'arXiv': ['/arXiv/2011/00324.raw', ARXIVtoREFs], } - for name,info in parser.items(): + for name, info in parser.items(): self.assertEqual(name, self.app.get_parser(info[0]).get('name')) 
self.assertEqual(info[1], verify(name)) # now verify couple of errors @@ -298,7 +319,7 @@ def test_reference_service_endpoint(self): 'arXiv': '/text', 'AEdRvHTML': '/text', } - for name,endpoint in parser.items(): + for name, endpoint in parser.items(): self.assertEqual(endpoint, self.app.get_reference_service_endpoint(name)) # now verify an error self.assertEqual(self.app.get_reference_service_endpoint('errorname'), '') @@ -316,8 +337,10 @@ def test_stats_compare(self): "| review of the physics, searches and implications, | | | | | | | | | |\n" \ "| 1709.02923. | | | | | | | | | |\n" \ "+--------------------------------------------------------------+---------------------+---------------------+-----------------+-----------------+-------+-------+-------+-------+-------+" - result_got, num_references, num_resolved = self.app.get_service_classic_compare_stats_grid(source_bibcode='0001arXiv.........Z', - source_filename=os.path.join(self.arXiv_stubdata_dir,'00001.raw')) + result_got, num_references, num_resolved = self.app.get_service_classic_compare_stats_grid( + source_bibcode='0001arXiv.........Z', + source_filename=os.path.join(self.arXiv_stubdata_dir, '00001.raw') + ) self.assertEqual(result_got, result_expected) self.assertEqual(num_references, 2) self.assertEqual(num_resolved, 2) @@ -326,7 +349,7 @@ def test_reprocess_references(self): """ test reprocessing references """ result_expected_year = [ {'source_bibcode': '0002arXiv.........Z', - 'source_filename': os.path.join(self.arXiv_stubdata_dir,'00002.raw'), + 'source_filename': os.path.join(self.arXiv_stubdata_dir, '00002.raw'), 'source_modified': datetime(2020, 4, 3, 18, 8, 42), 'parser_name': 'arXiv', 'references': [{'item_num': 2, @@ -335,24 +358,30 @@ def test_reprocess_references(self): ] result_expected_bibstem = [ {'source_bibcode': '0002arXiv.........Z', - 'source_filename': os.path.join(self.arXiv_stubdata_dir,'00002.raw'), + 'source_filename': os.path.join(self.arXiv_stubdata_dir, '00002.raw'), 
'source_modified': datetime(2020, 4, 3, 18, 8, 42), 'parser_name': 'arXiv', 'references': [{'item_num': 2, 'refstr': 'Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 2019, A&A, 625, A136 ', 'refraw': 'Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 2019, A&A, 625, A136 '}] - }, + }, {'source_bibcode': '0003arXiv.........Z', - 'source_filename': os.path.join(self.arXiv_stubdata_dir,'00003.raw'), + 'source_filename': os.path.join(self.arXiv_stubdata_dir, '00003.raw'), 'source_modified': datetime(2020, 4, 3, 18, 8, 32), 'parser_name': 'arXiv', 'references': [{'item_num': 2, 'refstr': 'Ackermann, M., Albert, A., Atwood, W. B., et al. 2016, A&A, 586, A71 ', 'refraw': 'Ackermann, M., Albert, A., Atwood, W. B., et al. 2016, A&A, 586, A71 '}] - } + } ] - self.assertEqual(self.app.get_reprocess_records(ReprocessQueryType.year, match_bibcode='2019', score_cutoff=None, date_cutoff=None), result_expected_year) - self.assertEqual(self.app.get_reprocess_records(ReprocessQueryType.bibstem, match_bibcode='A&A..', score_cutoff=None, date_cutoff=None), result_expected_bibstem) + self.assertEqual( + self.app.get_reprocess_records(ReprocessQueryType.year, match_bibcode='2019', score_cutoff=None, date_cutoff=None), + result_expected_year + ) + self.assertEqual( + self.app.get_reprocess_records(ReprocessQueryType.bibstem, match_bibcode='A&A..', score_cutoff=None, date_cutoff=None), + result_expected_bibstem + ) references_and_ids_year = [ {'id': 'H4I2', 'reference': 'Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 
2019, A&A, 625, A136 '} @@ -361,7 +390,8 @@ def test_reprocess_references(self): source_bibcode=result_expected_year[0]['source_bibcode'], source_filename=result_expected_year[0]['source_filename'], source_modified=result_expected_year[0]['source_modified'], - retry_records=result_expected_year[0]['references']) + retry_records=result_expected_year[0]['references'] + ) self.assertTrue(reprocess_references) self.assertTrue(reprocess_references, references_and_ids_year) current_num_records = [ @@ -427,10 +457,12 @@ def test_populate_tables_pre_resolved_initial_status_exception(self): mock_session.commit.side_effect = SQLAlchemyError("Mocked SQLAlchemyError") with patch.object(self.app.logger, 'error') as mock_error: - results = self.app.populate_tables_pre_resolved_initial_status('0001arXiv.........Z', - os.path.join(self.arXiv_stubdata_dir,'00001.raw'), - 'arXiv', - references=[]) + results = self.app.populate_tables_pre_resolved_initial_status( + '0001arXiv.........Z', + os.path.join(self.arXiv_stubdata_dir, '00001.raw'), + 'arXiv', + references=[] + ) self.assertEqual(results, []) mock_session.rollback.assert_called_once() mock_error.assert_called() @@ -442,10 +474,12 @@ def test_populate_tables_pre_resolved_retry_status_exception(self): mock_session.commit.side_effect = SQLAlchemyError("Mocked SQLAlchemyError") with patch.object(self.app.logger, 'error') as mock_error: - results = self.app.populate_tables_pre_resolved_retry_status('0001arXiv.........Z', - os.path.join(self.arXiv_stubdata_dir,'00001.raw'), - source_modified='', - retry_records=[]) + results = self.app.populate_tables_pre_resolved_retry_status( + '0001arXiv.........Z', + os.path.join(self.arXiv_stubdata_dir, '00001.raw'), + source_modified='', + retry_records=[] + ) self.assertEqual(results, []) mock_session.rollback.assert_called_once() mock_error.assert_called() @@ -457,25 +491,39 @@ def test_populate_tables_post_resolved_exception(self): mock_session.commit.side_effect = SQLAlchemyError("Mocked 
SQLAlchemyError") with patch.object(self.app.logger, 'error') as mock_error: - result = self.app.populate_tables_post_resolved(resolved_reference=[], - source_bibcode='0001arXiv.........Z', - classic_resolved_filename=os.path.join(self.arXiv_stubdata_dir,'00001.raw.results')) + result = self.app.populate_tables_post_resolved( + resolved_reference=[], + source_bibcode='0001arXiv.........Z', + classic_resolved_filename=os.path.join(self.arXiv_stubdata_dir, '00001.raw.results') + ) self.assertEqual(result, False) mock_session.rollback.assert_called_once() mock_error.assert_called() def test_populate_tables_post_resolved_with_classic(self): - """ test populate_tables_post_resolved when resolved_classic is available """ + """ test populate_tables_post_resolved when resolved_classic is available AND external_identifier is set """ resolved_reference = [ - {'id': 'H1I1', 'refstring': 'Reference 1', 'bibcode': '2023A&A...657A...1X', 'score': 1.0}, - {'id': 'H1I2', 'refstring': 'Reference 2', 'bibcode': '2023A&A...657A...2X', 'score': 0.8} + { + 'id': 'H1I1', + 'refstring': 'Reference 1', + 'bibcode': '2023A&A...657A...1X', + 'score': 1.0, + 'external_identifier': ['doi:10.1234/abc', 'arxiv:2301.00001'], + }, + { + 'id': 'H1I2', + 'refstring': 'Reference 2', + 'bibcode': '2023A&A...657A...2X', + 'score': 0.8, + 'external_identifier': ['ascl:2301.001', 'doi:10.9999/xyz'], + } ] source_bibcode = "2023A&A...657A...1X" classic_resolved_filename = "classic_results.txt" classic_resolved_reference = [ - (1, "2023A&A...657A...1X", "1", "MATCH"), - (2, "2023A&A...657A...2X", "1", "MATCH") + (1, "2023A&A...657A...657A...1X", "1", "MATCH"), + (2, "2023A&A...657A...657A...2X", "1", "MATCH") ] with patch.object(self.app, "session_scope"), \ @@ -491,6 +539,12 @@ def test_populate_tables_post_resolved_with_classic(self): mock_insert.assert_called_once() mock_logger.assert_called_with("Updated 2 resolved reference records successfully.") + # Check whether external_identifier is populated 
with correct data + _, resolved_records = mock_update.call_args[0] + self.assertEqual(len(resolved_records), 2) + self.assertEqual(_get_external_identifier(resolved_records[0]), ['doi:10.1234/abc', 'arxiv:2301.00001']) + self.assertEqual(_get_external_identifier(resolved_records[1]), ['ascl:2301.001', 'doi:10.9999/xyz']) + @patch("adsrefpipe.app.ProcessedHistory") @patch("adsrefpipe.app.ResolvedReference") @patch("adsrefpipe.app.CompareClassic") @@ -514,16 +568,28 @@ def test_get_service_classic_compare_tags(self, mock_compare, mock_resolved, moc result1 = self.app.get_service_classic_compare_tags(mock_session, source_bibcode="2023A&A...657A...1X", source_filename="") self.assertEqual(result1, "mock_final_subquery") - expected_filter_bibcode = and_(mock_processed.id == mock_resolved.history_id, literal('"2023A&A...657A...1X').op('~')(mock_processed.bibcode)) - found_bibcode_filter = any(call.args and expected_filter_bibcode.compare(call.args[0]) for call in mock_session.query().filter.call_args_list) + expected_filter_bibcode = and_( + mock_processed.id == mock_resolved.history_id, + literal('"2023A&A...657A...1X').op('~')(mock_processed.bibcode) + ) + found_bibcode_filter = any( + call.args and expected_filter_bibcode.compare(call.args[0]) + for call in mock_session.query().filter.call_args_list + ) self.assertTrue(found_bibcode_filter) # test case 2: Only source_filename are provided result2 = self.app.get_service_classic_compare_tags(mock_session, source_bibcode="", source_filename="some_source_file.txt") self.assertEqual(result2, "mock_final_subquery") - expected_filter_filename = and_(mock_processed.id == mock_resolved.history_id, literal('2023A&A...657A...1X').op('~')(mock_processed.source_filename)) - found_filename_filter = any(call.args and expected_filter_filename.compare(call.args[0]) for call in mock_session.query().filter.call_args_list) + expected_filter_filename = and_( + mock_processed.id == mock_resolved.history_id, + 
literal('2023A&A...657A...1X').op('~')(mock_processed.source_filename) + ) + found_filename_filter = any( + call.args and expected_filter_filename.compare(call.args[0]) + for call in mock_session.query().filter.call_args_list + ) self.assertTrue(found_filename_filter) def test_get_service_classic_compare_stats_grid_error(self): @@ -545,10 +611,15 @@ def test_get_service_classic_compare_stats_grid_error(self): # mock `session.query(...).all()` to return an empty list mock_session.query.return_value.filter.return_value.order_by.return_value.all.return_value = [] - result = self.app.get_service_classic_compare_stats_grid(source_bibcode='0001arXiv.........Z', - source_filename=os.path.join(self.arXiv_stubdata_dir,'00001.raw')) + result = self.app.get_service_classic_compare_stats_grid( + source_bibcode='0001arXiv.........Z', + source_filename=os.path.join(self.arXiv_stubdata_dir, '00001.raw') + ) - self.assertEqual(result, ('Unable to fetch data for reference source file `%s` from database!'%os.path.join(self.arXiv_stubdata_dir,'00001.raw'), -1, -1)) + self.assertEqual( + result, + ('Unable to fetch data for reference source file `%s` from database!' 
% os.path.join(self.arXiv_stubdata_dir, '00001.raw'), -1, -1) + ) @patch("adsrefpipe.app.datetime") def test_filter_reprocess_query(self, mock_datetime): @@ -598,12 +669,9 @@ def test_filter_reprocess_query(self, mock_datetime): # Test case: date_cutoff is applied mock_query.reset_mock() self.app.filter_reprocess_query(mock_query, ReprocessQueryType.score, 0.8, "", 10) - expected_since = datetime(2025, 1, 1) - timedelta(days=10) mock_query.filter.assert_called() called_args, _ = mock_query.filter.call_args compiled_query = called_args[0].compile(dialect=postgresql.dialect()) - print(str(called_args[0])) - print(compiled_query.params) self.assertTrue(str(called_args[0]), 'resolved_reference.score <= :score_1') self.assertTrue(compiled_query.params.get('score_1'), 0.8) @@ -620,9 +688,11 @@ def test_get_reprocess_records(self): # mock query results with same history_id to trigger the else block mock_session.query.return_value.filter.return_value.order_by.return_value.all.return_value = [ - MockRow(history_id=1, item_num=1, refstr="Reference 1", refraw="Raw 1", source_bibcode="2023A&A...657A...1X", + MockRow(history_id=1, item_num=1, refstr="Reference 1", refraw="Raw 1", + source_bibcode="2023A&A...657A...1X", source_filename="some_source_file.txt", source_modified="D1", parser_name="arXiv"), - MockRow(history_id=1, item_num=2, refstr="Reference 2", refraw="Raw 2", source_bibcode="2023A&A...657A...1X", + MockRow(history_id=1, item_num=2, refstr="Reference 2", refraw="Raw 2", + source_bibcode="2023A&A...657A...1X", source_filename="some_source_file.txt", source_modified="D1", parser_name="arXiv"), ] @@ -643,8 +713,10 @@ def test_get_resolved_references_all(self): # mock query results with highest scores mock_session.query.return_value.filter.return_value.order_by.return_value.all.return_value = [ - MockRow(source_bibcode="2023A&A...657A...1X", date=datetime(2025, 1, 1), id=1, resolved_bibcode="0001arXiv.........Z", score=0.95, parser_name="arXiv"), - 
MockRow(source_bibcode="2023A&A...657A...1X", date=datetime(2025, 1, 2), id=2, resolved_bibcode="0002arXiv.........Z", score=0.85, parser_name="arXiv"), + MockRow(source_bibcode="2023A&A...657A...1X", date=datetime(2025, 1, 1), id=1, + resolved_bibcode="0001arXiv.........Z", score=0.95, parser_name="arXiv"), + MockRow(source_bibcode="2023A&A...657A...1X", date=datetime(2025, 1, 2), id=2, + resolved_bibcode="0002arXiv.........Z", score=0.85, parser_name="arXiv"), ] results = self.app.get_resolved_references_all("2023A&A...657A...1X") @@ -667,12 +739,16 @@ def test_get_resolved_references(self): mock_session = mock_session_scope.return_value.__enter__.return_value # Define a mock SQLAlchemy row with namedtuple - MockRow = namedtuple("MockRow", ["source_bibcode", "date", "id", "resolved_bibcode", "score", "parser_name", "parser_priority"]) + MockRow = namedtuple("MockRow", + ["source_bibcode", "date", "id", "resolved_bibcode", "score", "parser_name", + "parser_priority"]) # Mock query results with highest-ranked records mock_session.query.return_value.filter.return_value.order_by.return_value.all.return_value = [ - MockRow(source_bibcode="2023A&A...657A...1X", date=datetime(2025, 1, 1), id=1, resolved_bibcode="0001arXiv.........Z", score=0.95, parser_name="arXiv", parser_priority=1), - MockRow(source_bibcode="2023A&A...657A...1X", date=datetime(2025, 1, 2), id=2, resolved_bibcode="0002arXiv.........Z", score=0.85, parser_name="arXiv", parser_priority=1), + MockRow(source_bibcode="2023A&A...657A...1X", date=datetime(2025, 1, 1), id=1, + resolved_bibcode="0001arXiv.........Z", score=0.95, parser_name="arXiv", parser_priority=1), + MockRow(source_bibcode="2023A&A...657A...1X", date=datetime(2025, 1, 2), id=2, + resolved_bibcode="0002arXiv.........Z", score=0.85, parser_name="arXiv", parser_priority=1), ] results = self.app.get_resolved_references("2023A&A...657A...1X") @@ -741,7 +817,8 @@ def test_compare_classic_toJSON(self): item_num=2, bibcode="0001arXiv.........Z", 
score=1, - state="MATCH") + state="MATCH" + ) expected_json = { "history_id": 1, "item_num": 2, @@ -760,19 +837,6 @@ class TestDatabaseNoStubdata(unittest.TestCase): maxDiff = None - # postgresql_url_dict = { - # 'port': 5432, - # 'host': '127.0.0.1', - # 'user': 'postgres', - # 'database': 'postgres' - # } - # postgresql_url = 'postgresql://{user}:{user}@{host}:{port}/{database}' \ - # .format(user=postgresql_url_dict['user'], - # host=postgresql_url_dict['host'], - # port=postgresql_url_dict['port'], - # database=postgresql_url_dict['database'] - # ) - _postgresql = testing.postgresql.Postgresql() postgresql_url = _postgresql.url() @@ -811,7 +875,8 @@ def test_populate_tables(self): references = [ { "refstr": "J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. ", - "refraw": "J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. "}, + "refraw": "J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. " + }, { "refstr": "C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.", "refraw": "C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923." @@ -826,8 +891,11 @@ def test_populate_tables(self): { "refstr": "C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.", "refraw": "C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.", - "id": "H1I2"} + "id": "H1I2" + } ] + + # IMPORTANT: use the real column name expected by app/models: external_identifier (list) resolved_references = [ { "score": "1.0", @@ -835,7 +903,7 @@ def test_populate_tables(self): "refstring": "J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. 
", "refraw": "J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. ", "id": "H1I1", - "ext_id": "ExtID1" + "external_identifier": ["arxiv:1009.5514", "doi:10.1234/abc"] }, { "score": "1.0", @@ -843,9 +911,10 @@ def test_populate_tables(self): "refstring": "C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.", "refraw": "C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.", "id": "H1I2", - "ext_id": "ExtID2" + "external_identifier": ["arxiv:1709.02923", "ascl:2301.001"] } ] + arXiv_stubdata_dir = os.path.join(self.test_dir, 'unittests/stubdata/txt/arXiv/0/') with self.app.session_scope() as session: session.query(Action).delete() @@ -857,20 +926,36 @@ def test_populate_tables(self): session.bulk_save_objects(parsers_records) session.commit() - references = self.app.populate_tables_pre_resolved_initial_status( + references_out = self.app.populate_tables_pre_resolved_initial_status( source_bibcode='0001arXiv.........Z', - source_filename=os.path.join(arXiv_stubdata_dir,'00001.raw'), - parsername=self.app.get_parser(os.path.join(arXiv_stubdata_dir,'00001.raw')).get('name'), - references=references) + source_filename=os.path.join(arXiv_stubdata_dir, '00001.raw'), + parsername=self.app.get_parser(os.path.join(arXiv_stubdata_dir, '00001.raw')).get('name'), + references=references + ) - self.assertTrue(references) - self.assertTrue(references == references_and_ids) + self.assertTrue(references_out) + self.assertTrue(references_out == references_and_ids) status = self.app.populate_tables_post_resolved( resolved_reference=resolved_references, source_bibcode='0001arXiv.........Z', - classic_resolved_filename=os.path.join(arXiv_stubdata_dir, '00001.raw.result')) - self.assertTrue(status == True) + classic_resolved_filename=os.path.join(arXiv_stubdata_dir, '00001.raw.result') + ) + 
self.assertTrue(status is True) + + # NEW: Verify external_identifier was persisted for the two updated rows. + # We know history_id should be 1 for the first inserted ProcessedHistory in an empty DB. + rows = ( + session.query(ResolvedReference) + .filter(ResolvedReference.history_id == 1) + .order_by(ResolvedReference.item_num.asc()) + .all() + ) + self.assertEqual(len(rows), 2) + self.assertEqual(rows[0].item_num, 1) + self.assertEqual(rows[1].item_num, 2) + self.assertEqual(rows[0].external_identifier, ["arxiv:1009.5514", "doi:10.1234/abc"]) + self.assertEqual(rows[1].external_identifier, ["arxiv:1709.02923", "ascl:2301.001"]) def test_get_parser_error(self): """ test get_parser when it errors for unrecognized source filename """ @@ -881,3 +966,4 @@ def test_get_parser_error(self): if __name__ == '__main__': unittest.main() + From 33d1382c49e86fd54f9b6115ec4381de4ecacc79 Mon Sep 17 00:00:00 2001 From: thomasallen Date: Tue, 27 Jan 2026 11:24:31 -0800 Subject: [PATCH 7/8] external identifier update --- adsrefpipe/tests/unittests/test_app.py | 284 ++++++++++--------------- 1 file changed, 107 insertions(+), 177 deletions(-) diff --git a/adsrefpipe/tests/unittests/test_app.py b/adsrefpipe/tests/unittests/test_app.py index 8c21957..b99a446 100644 --- a/adsrefpipe/tests/unittests/test_app.py +++ b/adsrefpipe/tests/unittests/test_app.py @@ -33,11 +33,6 @@ import testing.postgresql -def _get_external_identifier(rr_obj): - """Return external_identifier from a ResolvedReference ORM object.""" - return getattr(rr_obj, "external_identifier", None) - - class TestDatabase(unittest.TestCase): """ @@ -77,12 +72,9 @@ def add_stub_data(self): self.arXiv_stubdata_dir = os.path.join(self.test_dir, 'unittests/stubdata/txt/arXiv/0/') reference_source = [ - ('0001arXiv.........Z', os.path.join(self.arXiv_stubdata_dir, '00001.raw'), - os.path.join(self.arXiv_stubdata_dir, '00001.raw.result'), 'arXiv'), - ('0002arXiv.........Z', os.path.join(self.arXiv_stubdata_dir, '00002.raw'), - 
os.path.join(self.arXiv_stubdata_dir, '00002.raw.result'), 'arXiv'), - ('0003arXiv.........Z', os.path.join(self.arXiv_stubdata_dir, '00003.raw'), - os.path.join(self.arXiv_stubdata_dir, '00003.raw.result'), 'arXiv') + ('0001arXiv.........Z',os.path.join(self.arXiv_stubdata_dir,'00001.raw'),os.path.join(self.arXiv_stubdata_dir,'00001.raw.result'),'arXiv'), + ('0002arXiv.........Z',os.path.join(self.arXiv_stubdata_dir,'00002.raw'),os.path.join(self.arXiv_stubdata_dir,'00002.raw.result'),'arXiv'), + ('0003arXiv.........Z',os.path.join(self.arXiv_stubdata_dir,'00003.raw'),os.path.join(self.arXiv_stubdata_dir,'00003.raw.result'),'arXiv') ] processed_history = [ @@ -91,8 +83,6 @@ def add_stub_data(self): ('2020-04-03 18:08:32', '2020-05-11 11:14:28', '128', '109') ] - # Add external identifiers for each resolved reference to verify persistence in DB - # Each tuple: (reference_str, bibcode, score, external_identifier_list) resolved_reference = [ [ ('J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. 
', @@ -116,16 +106,16 @@ def add_stub_data(self): compare_classic = [ [ - ('2010arXiv1009.5514U', 1, 'DIFF'), - ('2017arXiv170902923M', 1, 'DIFF') + ('2010arXiv1009.5514U',1,'DIFF'), + ('2017arXiv170902923M',1,'DIFF') ], [ - ('2011MNRAS.417..709A', 1, 'MATCH'), - ('2019A&A...625A.136A', 1, 'MATCH') + ('2011MNRAS.417..709A',1,'MATCH'), + ('2019A&A...625A.136A',1,'MATCH') ], [ - ('2017ApJ...842L..24A', 1, 'MATCH'), - ('2016A&A...586A..71A', 1, 'MATCH') + ('2017ApJ...842L..24A',1,'MATCH'), + ('2016A&A...586A..71A',1,'MATCH') ] ] @@ -139,56 +129,44 @@ def add_stub_data(self): session.bulk_save_objects(parsers_records) session.commit() - for i, (a_reference, a_history) in enumerate(zip(reference_source, processed_history)): - reference_record = ReferenceSource( - bibcode=a_reference[0], - source_filename=a_reference[1], - resolved_filename=a_reference[2], - parser_name=a_reference[3] - ) + for i, (a_reference,a_history) in enumerate(zip(reference_source,processed_history)): + reference_record = ReferenceSource(bibcode=a_reference[0], + source_filename=a_reference[1], + resolved_filename=a_reference[2], + parser_name=a_reference[3]) bibcode, source_filename = self.app.insert_reference_source_record(session, reference_record) self.assertTrue(bibcode == a_reference[0]) self.assertTrue(source_filename == a_reference[1]) - history_record = ProcessedHistory( - bibcode=bibcode, - source_filename=source_filename, - source_modified=a_history[0], - status=Action().get_status_new(), - date=a_history[1], - total_ref=a_history[2] - ) + history_record = ProcessedHistory(bibcode=bibcode, + source_filename=source_filename, + source_modified=a_history[0], + status=Action().get_status_new(), + date=a_history[1], + total_ref=a_history[2]) history_id = self.app.insert_history_record(session, history_record) self.assertTrue(history_id != -1) resolved_records = [] compare_records = [] - for j, (service, classic) in enumerate(zip(resolved_reference[i], compare_classic[i])): - refstr, bib, sc, 
ext_ids = service - resolved_record = ResolvedReference( - history_id=history_id, - item_num=j + 1, - reference_str=refstr, - bibcode=bib, - score=sc, - reference_raw=refstr, - external_identifier=ext_ids - ) + for j, (service,classic) in enumerate(zip(resolved_reference[i],compare_classic[i])): + resolved_record = ResolvedReference(history_id=history_id, + item_num=j+1, + reference_str=service[0], + bibcode=service[1], + score=service[2], + reference_raw=service[0]) resolved_records.append(resolved_record) - - compare_record = CompareClassic( - history_id=history_id, - item_num=j + 1, - bibcode=classic[0], - score=classic[1], - state=classic[2] - ) + compare_record = CompareClassic(history_id=history_id, + item_num=j+1, + bibcode=classic[0], + score=classic[1], + state=classic[2]) compare_records.append(compare_record) - success = self.app.insert_resolved_reference_records(session, resolved_records) - self.assertTrue(success is True) + self.assertTrue(success == True) success = self.app.insert_compare_records(session, compare_records) - self.assertTrue(success is True) + self.assertTrue(success == True) session.commit() def test_query_reference_tbl(self): @@ -196,8 +174,8 @@ def test_query_reference_tbl(self): result_expected = [ { 'bibcode': '0001arXiv.........Z', - 'source_filename': os.path.join(self.arXiv_stubdata_dir, '00001.raw'), - 'resolved_filename': os.path.join(self.arXiv_stubdata_dir, '00001.raw.result'), + 'source_filename': os.path.join(self.arXiv_stubdata_dir,'00001.raw'), + 'resolved_filename': os.path.join(self.arXiv_stubdata_dir,'00001.raw.result'), 'parser_name': 'arXiv', 'num_runs': 1, 'last_run_date': '2020-05-11 11:13:36', @@ -205,8 +183,8 @@ def test_query_reference_tbl(self): 'last_run_num_resolved_references': 2 }, { 'bibcode': '0002arXiv.........Z', - 'source_filename': os.path.join(self.arXiv_stubdata_dir, '00002.raw'), - 'resolved_filename': os.path.join(self.arXiv_stubdata_dir, '00002.raw.result'), + 'source_filename': 
os.path.join(self.arXiv_stubdata_dir,'00002.raw'), + 'resolved_filename': os.path.join(self.arXiv_stubdata_dir,'00002.raw.result'), 'parser_name': 'arXiv', 'num_runs': 1, 'last_run_date': '2020-05-11 11:13:53', @@ -214,8 +192,8 @@ def test_query_reference_tbl(self): 'last_run_num_resolved_references': 2 }, { 'bibcode': '0003arXiv.........Z', - 'source_filename': os.path.join(self.arXiv_stubdata_dir, '00003.raw'), - 'resolved_filename': os.path.join(self.arXiv_stubdata_dir, '00003.raw.result'), + 'source_filename': os.path.join(self.arXiv_stubdata_dir,'00003.raw'), + 'resolved_filename': os.path.join(self.arXiv_stubdata_dir,'00003.raw.result'), 'parser_name': 'arXiv', 'num_runs': 1, 'last_run_date': '2020-05-11 11:14:28', @@ -230,11 +208,9 @@ def test_query_reference_tbl(self): self.assertTrue(result_expected == result_got) # test querying filenames - filenames = [ - os.path.join(self.arXiv_stubdata_dir, '00001.raw'), - os.path.join(self.arXiv_stubdata_dir, '00002.raw'), - os.path.join(self.arXiv_stubdata_dir, '00003.raw') - ] + filenames = [os.path.join(self.arXiv_stubdata_dir,'00001.raw'), + os.path.join(self.arXiv_stubdata_dir,'00002.raw'), + os.path.join(self.arXiv_stubdata_dir,'00003.raw')] result_got = self.app.diagnostic_query(source_filename_list=filenames) self.assertTrue(result_expected == result_got) @@ -253,13 +229,11 @@ def test_query_reference_tbl_when_non_exits(self): self.assertTrue(self.app.diagnostic_query(bibcode_list=['0004arXiv.........Z']) == []) # test when filename does not exist - self.assertTrue(self.app.diagnostic_query(source_filename_list=os.path.join(self.arXiv_stubdata_dir, '00004.raw')) == []) + self.assertTrue(self.app.diagnostic_query(source_filename_list=os.path.join(self.arXiv_stubdata_dir,'00004.raw')) == []) # test when both bibcode and filename are passed and nothing is returned - self.assertTrue(self.app.diagnostic_query( - bibcode_list=['0004arXiv.........Z'], - source_filename_list=os.path.join(self.arXiv_stubdata_dir, 
'00004.raw') - ) == []) + self.assertTrue(self.app.diagnostic_query(bibcode_list=['0004arXiv.........Z'], + source_filename_list=os.path.join(self.arXiv_stubdata_dir,'00004.raw')) == []) def test_insert_reference_record(self): """ test inserting reference_source record """ @@ -268,15 +242,13 @@ def test_insert_reference_record(self): # see that it is returned without it being inserted with self.app.session_scope() as session: count = self.app.get_count_reference_source_records(session) - reference_record = ReferenceSource( - bibcode='0001arXiv.........Z', - source_filename=os.path.join(self.arXiv_stubdata_dir, '00001.raw'), - resolved_filename=os.path.join(self.arXiv_stubdata_dir, '00001.raw.result'), - parser_name=self.app.get_parser(os.path.join(self.arXiv_stubdata_dir, '00001.raw')).get('name') - ) + reference_record = ReferenceSource(bibcode='0001arXiv.........Z', + source_filename=os.path.join(self.arXiv_stubdata_dir,'00001.raw'), + resolved_filename=os.path.join(self.arXiv_stubdata_dir,'00001.raw.result'), + parser_name=self.app.get_parser(os.path.join(self.arXiv_stubdata_dir,'00001.raw')).get('name')) bibcode, source_filename = self.app.insert_reference_source_record(session, reference_record) self.assertTrue(bibcode == '0001arXiv.........Z') - self.assertTrue(source_filename == os.path.join(self.arXiv_stubdata_dir, '00001.raw')) + self.assertTrue(source_filename == os.path.join(self.arXiv_stubdata_dir,'00001.raw')) self.assertTrue(self.app.get_count_reference_source_records(session) == count) def test_parser_name(self): @@ -295,7 +267,7 @@ def test_parser_name(self): 'AGU': ['/JGR/0101/issD14.agu.xml', AGUtoREFs], 'arXiv': ['/arXiv/2011/00324.raw', ARXIVtoREFs], } - for name, info in parser.items(): + for name,info in parser.items(): self.assertEqual(name, self.app.get_parser(info[0]).get('name')) self.assertEqual(info[1], verify(name)) # now verify couple of errors @@ -319,7 +291,7 @@ def test_reference_service_endpoint(self): 'arXiv': '/text', 
'AEdRvHTML': '/text', } - for name, endpoint in parser.items(): + for name,endpoint in parser.items(): self.assertEqual(endpoint, self.app.get_reference_service_endpoint(name)) # now verify an error self.assertEqual(self.app.get_reference_service_endpoint('errorname'), '') @@ -337,10 +309,8 @@ def test_stats_compare(self): "| review of the physics, searches and implications, | | | | | | | | | |\n" \ "| 1709.02923. | | | | | | | | | |\n" \ "+--------------------------------------------------------------+---------------------+---------------------+-----------------+-----------------+-------+-------+-------+-------+-------+" - result_got, num_references, num_resolved = self.app.get_service_classic_compare_stats_grid( - source_bibcode='0001arXiv.........Z', - source_filename=os.path.join(self.arXiv_stubdata_dir, '00001.raw') - ) + result_got, num_references, num_resolved = self.app.get_service_classic_compare_stats_grid(source_bibcode='0001arXiv.........Z', + source_filename=os.path.join(self.arXiv_stubdata_dir,'00001.raw')) self.assertEqual(result_got, result_expected) self.assertEqual(num_references, 2) self.assertEqual(num_resolved, 2) @@ -349,7 +319,7 @@ def test_reprocess_references(self): """ test reprocessing references """ result_expected_year = [ {'source_bibcode': '0002arXiv.........Z', - 'source_filename': os.path.join(self.arXiv_stubdata_dir, '00002.raw'), + 'source_filename': os.path.join(self.arXiv_stubdata_dir,'00002.raw'), 'source_modified': datetime(2020, 4, 3, 18, 8, 42), 'parser_name': 'arXiv', 'references': [{'item_num': 2, @@ -358,30 +328,24 @@ def test_reprocess_references(self): ] result_expected_bibstem = [ {'source_bibcode': '0002arXiv.........Z', - 'source_filename': os.path.join(self.arXiv_stubdata_dir, '00002.raw'), + 'source_filename': os.path.join(self.arXiv_stubdata_dir,'00002.raw'), 'source_modified': datetime(2020, 4, 3, 18, 8, 42), 'parser_name': 'arXiv', 'references': [{'item_num': 2, 'refstr': 'Arcangeli, J., Desert, J.-M., 
Parmentier, V., et al. 2019, A&A, 625, A136 ', 'refraw': 'Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 2019, A&A, 625, A136 '}] - }, + }, {'source_bibcode': '0003arXiv.........Z', - 'source_filename': os.path.join(self.arXiv_stubdata_dir, '00003.raw'), + 'source_filename': os.path.join(self.arXiv_stubdata_dir,'00003.raw'), 'source_modified': datetime(2020, 4, 3, 18, 8, 32), 'parser_name': 'arXiv', 'references': [{'item_num': 2, 'refstr': 'Ackermann, M., Albert, A., Atwood, W. B., et al. 2016, A&A, 586, A71 ', 'refraw': 'Ackermann, M., Albert, A., Atwood, W. B., et al. 2016, A&A, 586, A71 '}] - } + } ] - self.assertEqual( - self.app.get_reprocess_records(ReprocessQueryType.year, match_bibcode='2019', score_cutoff=None, date_cutoff=None), - result_expected_year - ) - self.assertEqual( - self.app.get_reprocess_records(ReprocessQueryType.bibstem, match_bibcode='A&A..', score_cutoff=None, date_cutoff=None), - result_expected_bibstem - ) + self.assertEqual(self.app.get_reprocess_records(ReprocessQueryType.year, match_bibcode='2019', score_cutoff=None, date_cutoff=None), result_expected_year) + self.assertEqual(self.app.get_reprocess_records(ReprocessQueryType.bibstem, match_bibcode='A&A..', score_cutoff=None, date_cutoff=None), result_expected_bibstem) references_and_ids_year = [ {'id': 'H4I2', 'reference': 'Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 
2019, A&A, 625, A136 '} @@ -390,8 +354,7 @@ def test_reprocess_references(self): source_bibcode=result_expected_year[0]['source_bibcode'], source_filename=result_expected_year[0]['source_filename'], source_modified=result_expected_year[0]['source_modified'], - retry_records=result_expected_year[0]['references'] - ) + retry_records=result_expected_year[0]['references']) self.assertTrue(reprocess_references) self.assertTrue(reprocess_references, references_and_ids_year) current_num_records = [ @@ -457,12 +420,10 @@ def test_populate_tables_pre_resolved_initial_status_exception(self): mock_session.commit.side_effect = SQLAlchemyError("Mocked SQLAlchemyError") with patch.object(self.app.logger, 'error') as mock_error: - results = self.app.populate_tables_pre_resolved_initial_status( - '0001arXiv.........Z', - os.path.join(self.arXiv_stubdata_dir, '00001.raw'), - 'arXiv', - references=[] - ) + results = self.app.populate_tables_pre_resolved_initial_status('0001arXiv.........Z', + os.path.join(self.arXiv_stubdata_dir,'00001.raw'), + 'arXiv', + references=[]) self.assertEqual(results, []) mock_session.rollback.assert_called_once() mock_error.assert_called() @@ -474,12 +435,10 @@ def test_populate_tables_pre_resolved_retry_status_exception(self): mock_session.commit.side_effect = SQLAlchemyError("Mocked SQLAlchemyError") with patch.object(self.app.logger, 'error') as mock_error: - results = self.app.populate_tables_pre_resolved_retry_status( - '0001arXiv.........Z', - os.path.join(self.arXiv_stubdata_dir, '00001.raw'), - source_modified='', - retry_records=[] - ) + results = self.app.populate_tables_pre_resolved_retry_status('0001arXiv.........Z', + os.path.join(self.arXiv_stubdata_dir,'00001.raw'), + source_modified='', + retry_records=[]) self.assertEqual(results, []) mock_session.rollback.assert_called_once() mock_error.assert_called() @@ -491,17 +450,15 @@ def test_populate_tables_post_resolved_exception(self): mock_session.commit.side_effect = SQLAlchemyError("Mocked 
SQLAlchemyError") with patch.object(self.app.logger, 'error') as mock_error: - result = self.app.populate_tables_post_resolved( - resolved_reference=[], - source_bibcode='0001arXiv.........Z', - classic_resolved_filename=os.path.join(self.arXiv_stubdata_dir, '00001.raw.results') - ) + result = self.app.populate_tables_post_resolved(resolved_reference=[], + source_bibcode='0001arXiv.........Z', + classic_resolved_filename=os.path.join(self.arXiv_stubdata_dir,'00001.raw.results')) self.assertEqual(result, False) mock_session.rollback.assert_called_once() mock_error.assert_called() def test_populate_tables_post_resolved_with_classic(self): - """ test populate_tables_post_resolved when resolved_classic is available AND external_identifier is set """ + """ test populate_tables_post_resolved when resolved_classic is available """ resolved_reference = [ { @@ -518,12 +475,12 @@ def test_populate_tables_post_resolved_with_classic(self): 'score': 0.8, 'external_identifier': ['ascl:2301.001', 'doi:10.9999/xyz'], } - ] + source_bibcode = "2023A&A...657A...1X" classic_resolved_filename = "classic_results.txt" classic_resolved_reference = [ - (1, "2023A&A...657A...657A...1X", "1", "MATCH"), - (2, "2023A&A...657A...657A...2X", "1", "MATCH") + (1, "2023A&A...657A...1X", "1", "MATCH"), + (2, "2023A&A...657A...2X", "1", "MATCH") ] with patch.object(self.app, "session_scope"), \ @@ -568,28 +525,16 @@ def test_get_service_classic_compare_tags(self, mock_compare, mock_resolved, moc result1 = self.app.get_service_classic_compare_tags(mock_session, source_bibcode="2023A&A...657A...1X", source_filename="") self.assertEqual(result1, "mock_final_subquery") - expected_filter_bibcode = and_( - mock_processed.id == mock_resolved.history_id, - literal('"2023A&A...657A...1X').op('~')(mock_processed.bibcode) - ) - found_bibcode_filter = any( - call.args and expected_filter_bibcode.compare(call.args[0]) - for call in mock_session.query().filter.call_args_list - ) + expected_filter_bibcode = 
and_(mock_processed.id == mock_resolved.history_id, literal('"2023A&A...657A...1X').op('~')(mock_processed.bibcode)) + found_bibcode_filter = any(call.args and expected_filter_bibcode.compare(call.args[0]) for call in mock_session.query().filter.call_args_list) self.assertTrue(found_bibcode_filter) # test case 2: Only source_filename are provided result2 = self.app.get_service_classic_compare_tags(mock_session, source_bibcode="", source_filename="some_source_file.txt") self.assertEqual(result2, "mock_final_subquery") - expected_filter_filename = and_( - mock_processed.id == mock_resolved.history_id, - literal('2023A&A...657A...1X').op('~')(mock_processed.source_filename) - ) - found_filename_filter = any( - call.args and expected_filter_filename.compare(call.args[0]) - for call in mock_session.query().filter.call_args_list - ) + expected_filter_filename = and_(mock_processed.id == mock_resolved.history_id, literal('2023A&A...657A...1X').op('~')(mock_processed.source_filename)) + found_filename_filter = any(call.args and expected_filter_filename.compare(call.args[0]) for call in mock_session.query().filter.call_args_list) self.assertTrue(found_filename_filter) def test_get_service_classic_compare_stats_grid_error(self): @@ -611,15 +556,10 @@ def test_get_service_classic_compare_stats_grid_error(self): # mock `session.query(...).all()` to return an empty list mock_session.query.return_value.filter.return_value.order_by.return_value.all.return_value = [] - result = self.app.get_service_classic_compare_stats_grid( - source_bibcode='0001arXiv.........Z', - source_filename=os.path.join(self.arXiv_stubdata_dir, '00001.raw') - ) + result = self.app.get_service_classic_compare_stats_grid(source_bibcode='0001arXiv.........Z', + source_filename=os.path.join(self.arXiv_stubdata_dir,'00001.raw')) - self.assertEqual( - result, - ('Unable to fetch data for reference source file `%s` from database!' 
% os.path.join(self.arXiv_stubdata_dir, '00001.raw'), -1, -1) - ) + self.assertEqual(result, ('Unable to fetch data for reference source file `%s` from database!'%os.path.join(self.arXiv_stubdata_dir,'00001.raw'), -1, -1)) @patch("adsrefpipe.app.datetime") def test_filter_reprocess_query(self, mock_datetime): @@ -669,9 +609,12 @@ def test_filter_reprocess_query(self, mock_datetime): # Test case: date_cutoff is applied mock_query.reset_mock() self.app.filter_reprocess_query(mock_query, ReprocessQueryType.score, 0.8, "", 10) + expected_since = datetime(2025, 1, 1) - timedelta(days=10) mock_query.filter.assert_called() called_args, _ = mock_query.filter.call_args compiled_query = called_args[0].compile(dialect=postgresql.dialect()) + print(str(called_args[0])) + print(compiled_query.params) self.assertTrue(str(called_args[0]), 'resolved_reference.score <= :score_1') self.assertTrue(compiled_query.params.get('score_1'), 0.8) @@ -688,11 +631,9 @@ def test_get_reprocess_records(self): # mock query results with same history_id to trigger the else block mock_session.query.return_value.filter.return_value.order_by.return_value.all.return_value = [ - MockRow(history_id=1, item_num=1, refstr="Reference 1", refraw="Raw 1", - source_bibcode="2023A&A...657A...1X", + MockRow(history_id=1, item_num=1, refstr="Reference 1", refraw="Raw 1", source_bibcode="2023A&A...657A...1X", source_filename="some_source_file.txt", source_modified="D1", parser_name="arXiv"), - MockRow(history_id=1, item_num=2, refstr="Reference 2", refraw="Raw 2", - source_bibcode="2023A&A...657A...1X", + MockRow(history_id=1, item_num=2, refstr="Reference 2", refraw="Raw 2", source_bibcode="2023A&A...657A...1X", source_filename="some_source_file.txt", source_modified="D1", parser_name="arXiv"), ] @@ -713,10 +654,8 @@ def test_get_resolved_references_all(self): # mock query results with highest scores mock_session.query.return_value.filter.return_value.order_by.return_value.all.return_value = [ - 
MockRow(source_bibcode="2023A&A...657A...1X", date=datetime(2025, 1, 1), id=1, - resolved_bibcode="0001arXiv.........Z", score=0.95, parser_name="arXiv"), - MockRow(source_bibcode="2023A&A...657A...1X", date=datetime(2025, 1, 2), id=2, - resolved_bibcode="0002arXiv.........Z", score=0.85, parser_name="arXiv"), + MockRow(source_bibcode="2023A&A...657A...1X", date=datetime(2025, 1, 1), id=1, resolved_bibcode="0001arXiv.........Z", score=0.95, parser_name="arXiv"), + MockRow(source_bibcode="2023A&A...657A...1X", date=datetime(2025, 1, 2), id=2, resolved_bibcode="0002arXiv.........Z", score=0.85, parser_name="arXiv"), ] results = self.app.get_resolved_references_all("2023A&A...657A...1X") @@ -739,16 +678,12 @@ def test_get_resolved_references(self): mock_session = mock_session_scope.return_value.__enter__.return_value # Define a mock SQLAlchemy row with namedtuple - MockRow = namedtuple("MockRow", - ["source_bibcode", "date", "id", "resolved_bibcode", "score", "parser_name", - "parser_priority"]) + MockRow = namedtuple("MockRow", ["source_bibcode", "date", "id", "resolved_bibcode", "score", "parser_name", "parser_priority"]) # Mock query results with highest-ranked records mock_session.query.return_value.filter.return_value.order_by.return_value.all.return_value = [ - MockRow(source_bibcode="2023A&A...657A...1X", date=datetime(2025, 1, 1), id=1, - resolved_bibcode="0001arXiv.........Z", score=0.95, parser_name="arXiv", parser_priority=1), - MockRow(source_bibcode="2023A&A...657A...1X", date=datetime(2025, 1, 2), id=2, - resolved_bibcode="0002arXiv.........Z", score=0.85, parser_name="arXiv", parser_priority=1), + MockRow(source_bibcode="2023A&A...657A...1X", date=datetime(2025, 1, 1), id=1, resolved_bibcode="0001arXiv.........Z", score=0.95, parser_name="arXiv", parser_priority=1), + MockRow(source_bibcode="2023A&A...657A...1X", date=datetime(2025, 1, 2), id=2, resolved_bibcode="0002arXiv.........Z", score=0.85, parser_name="arXiv", parser_priority=1), ] results = 
self.app.get_resolved_references("2023A&A...657A...1X") @@ -817,8 +752,7 @@ def test_compare_classic_toJSON(self): item_num=2, bibcode="0001arXiv.........Z", score=1, - state="MATCH" - ) + state="MATCH") expected_json = { "history_id": 1, "item_num": 2, @@ -875,8 +809,7 @@ def test_populate_tables(self): references = [ { "refstr": "J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. ", - "refraw": "J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. " - }, + "refraw": "J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. "}, { "refstr": "C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.", "refraw": "C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923." @@ -891,8 +824,7 @@ def test_populate_tables(self): { "refstr": "C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.", "refraw": "C. J. A. P. 
Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.",
-                "id": "H1I2"
-            }
+                "id": "H1I2"}
         ]
 
         # IMPORTANT: use the real column name expected by app/models: external_identifier (list)
@@ -926,24 +858,22 @@ def test_populate_tables(self):
             session.bulk_save_objects(parsers_records)
             session.commit()
 
-            references_out = self.app.populate_tables_pre_resolved_initial_status(
+            references = self.app.populate_tables_pre_resolved_initial_status(
                 source_bibcode='0001arXiv.........Z',
-                source_filename=os.path.join(arXiv_stubdata_dir, '00001.raw'),
-                parsername=self.app.get_parser(os.path.join(arXiv_stubdata_dir, '00001.raw')).get('name'),
-                references=references
-            )
+                source_filename=os.path.join(arXiv_stubdata_dir,'00001.raw'),
+                parsername=self.app.get_parser(os.path.join(arXiv_stubdata_dir,'00001.raw')).get('name'),
+                references=references)
 
-            self.assertTrue(references_out)
-            self.assertTrue(references_out == references_and_ids)
+            self.assertTrue(references)
+            self.assertEqual(references, references_and_ids)
 
             status = self.app.populate_tables_post_resolved(
                 resolved_reference=resolved_references,
                 source_bibcode='0001arXiv.........Z',
-                classic_resolved_filename=os.path.join(arXiv_stubdata_dir, '00001.raw.result')
-            )
-            self.assertTrue(status is True)
+                classic_resolved_filename=os.path.join(arXiv_stubdata_dir, '00001.raw.result'))
+            self.assertTrue(status is True)
 
-            # NEW: Verify external_identifier was persisted for the two updated rows.
+            # Verify external_identifier was persisted on ResolvedReference rows
             # We know history_id should be 1 for the first inserted ProcessedHistory in an empty DB.
rows = ( session.query(ResolvedReference) From 6895fac016f0250a68dd381f6aaeaed4062ea994 Mon Sep 17 00:00:00 2001 From: thomasallen Date: Tue, 27 Jan 2026 12:03:19 -0800 Subject: [PATCH 8/8] unit test fix --- adsrefpipe/app.py | 28 +++++++++++++++++++++++++- adsrefpipe/tests/unittests/test_app.py | 10 +++++++++ 2 files changed, 37 insertions(+), 1 deletion(-) diff --git a/adsrefpipe/app.py b/adsrefpipe/app.py index d3f12a8..f5c7516 100755 --- a/adsrefpipe/app.py +++ b/adsrefpipe/app.py @@ -416,6 +416,31 @@ def update_resolved_reference_records(self, session: object, resolved_list: List self.logger.debug("Added `ResolvedReference` records successfully.") return True + def update_resolved_reference_records(self, session: object, resolved_list: List[ResolvedReference]) -> bool: + """ + update resolved reference records in the database + """ + mappings = [] + for r in resolved_list: + mappings.append({ + # must include PK columns for bulk_update_mappings + "history_id": r.history_id, + "item_num": r.item_num, + "reference_str": r.reference_str, + + # fields to update + "bibcode": r.bibcode, + "score": r.score, + "reference_raw": r.reference_raw, + "external_identifier": _ensure_list(getattr(r, "external_identifier", None)) or [], + }) + + session.bulk_update_mappings(ResolvedReference, mappings) + session.flush() + self.logger.debug("Added `ResolvedReference` records successfully.") + return True + + def insert_compare_records(self, session: object, compared_list: List[CompareClassic]) -> bool: """ insert records into the compare classic table @@ -549,7 +574,8 @@ def populate_tables_post_resolved(self, resolved_reference: List, source_bibcode reference_str=ref.get('refstring', None), bibcode=ref.get('bibcode', None), score=ref.get('score', None), - reference_raw=ref.get('refstring', None)) + reference_raw=ref.get('refstring', None), + external_identifier=_ensure_list(ref.get('external_identifier', None)) or []) resolved_records.append(resolved_record) if 
resolved_classic: compare_record = CompareClassic(history_id=history_id, diff --git a/adsrefpipe/tests/unittests/test_app.py b/adsrefpipe/tests/unittests/test_app.py index b99a446..f00e7fb 100644 --- a/adsrefpipe/tests/unittests/test_app.py +++ b/adsrefpipe/tests/unittests/test_app.py @@ -32,6 +32,15 @@ import testing.postgresql +def _get_external_identifier(rec): + """ + Works whether rec is a dict (bulk mappings) or an ORM object. + """ + if rec is None: + return [] + if isinstance(rec, dict): + return rec.get("external_identifier") or [] + return getattr(rec, "external_identifier", None) or [] class TestDatabase(unittest.TestCase): @@ -475,6 +484,7 @@ def test_populate_tables_post_resolved_with_classic(self): 'score': 0.8, 'external_identifier': ['ascl:2301.001', 'doi:10.9999/xyz'], } + ] source_bibcode = "2023A&A...657A...1X" classic_resolved_filename = "classic_results.txt"