diff --git a/adsrefpipe/app.py b/adsrefpipe/app.py
index 1a587cb..f5c7516 100755
--- a/adsrefpipe/app.py
+++ b/adsrefpipe/app.py
@@ -22,6 +22,15 @@ from texttable import Texttable
 
 
+def _ensure_list(x):
+    if x is None:
+        return None
+    # treat strings as scalars, not iterables
+    if isinstance(x, (str, bytes)):
+        return [x]
+    # already list-like
+    return list(x)
+
 class ADSReferencePipelineCelery(ADSCelery):
     """
     celery-based pipeline for processing and resolving references
@@ -306,6 +315,7 @@ def query_resolved_reference_tbl(self, history_id_list: List = None) -> List:
 
         return results
 
+
     def diagnostic_query(self, bibcode_list: List = None, source_filename_list: List = None) -> List:
         """
         perform a diagnostic query to retrieve combined reference records
@@ -315,6 +325,8 @@ def diagnostic_query(self, bibcode_list: List = None, source_filename_list: List
         :return: List of combined records from multiple tables
         """
         results = []
+        bibcode_list = _ensure_list(bibcode_list)
+        source_filename_list = _ensure_list(source_filename_list)
 
         reference_source = self.query_reference_source_tbl(bibcode_list, source_filename_list)
         processed_history = self.query_processed_history_tbl(bibcode_list, source_filename_list)
@@ -404,6 +416,31 @@ def update_resolved_reference_records(self, session: object, resolved_list: List
         self.logger.debug("Added `ResolvedReference` records successfully.")
         return True
 
+    def update_resolved_reference_records(self, session: object, resolved_list: List[ResolvedReference]) -> bool:
+        """
+        update resolved reference records in the database
+        """
+        mappings = []
+        for r in resolved_list:
+            mappings.append({
+                # must include PK columns for bulk_update_mappings
+                "history_id": r.history_id,
+                "item_num": r.item_num,
+                "reference_str": r.reference_str,
+
+                # fields to update
+                "bibcode": r.bibcode,
+                "score": r.score,
+                "reference_raw": r.reference_raw,
+                "external_identifier": _ensure_list(getattr(r, "external_identifier", None)) or [],
+            })
+
+        session.bulk_update_mappings(ResolvedReference, mappings)
+        session.flush()
+        self.logger.debug("Updated `ResolvedReference` records successfully.")
+        return True
+
+
     def insert_compare_records(self, session: object, compared_list: List[CompareClassic]) -> bool:
         """
         insert records into the compare classic table
@@ -537,7 +574,8 @@ def populate_tables_post_resolved(self, resolved_reference: List, source_bibcode
                                                      reference_str=ref.get('refstring', None),
                                                      bibcode=ref.get('bibcode', None),
                                                      score=ref.get('score', None),
-                                                     reference_raw=ref.get('refstring', None))
+                                                     reference_raw=ref.get('refstring', None),
+                                                     external_identifier=_ensure_list(ref.get('external_identifier', None)) or [])
                     resolved_records.append(resolved_record)
                     if resolved_classic:
                         compare_record = CompareClassic(history_id=history_id,
diff --git a/adsrefpipe/models.py b/adsrefpipe/models.py
index 5e2c725..b2db105 100755
--- a/adsrefpipe/models.py
+++ b/adsrefpipe/models.py
@@ -2,7 +2,7 @@
 
 from sqlalchemy import Integer, String, Column, ForeignKey, DateTime, func, Numeric, ForeignKeyConstraint
-from sqlalchemy.dialects.postgresql import JSONB
+from sqlalchemy.dialects.postgresql import JSONB, ARRAY
 from sqlalchemy.ext.declarative import declarative_base
@@ -213,8 +213,9 @@ class ResolvedReference(Base):
     bibcode = Column(String)
     score = Column(Numeric)
     reference_raw = Column(String)
+    external_identifier = Column(ARRAY(String))
 
-    def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode: str, score: float, reference_raw: str):
+    def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode: str, score: float, reference_raw: str, external_identifier: list = None):
         """
         initializes a resolved reference object
 
@@ -224,6 +225,7 @@ def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode:
         :param bibcode: resolved bibcode
         :param score: confidence score of the resolved reference
         :param reference_raw: raw reference string
+        :param external_identifier: list of external identifiers associated with the reference, e.g. ["doi:...", "arxiv:...", "ascl:..."]
         """
         self.history_id = history_id
         self.item_num = item_num
@@ -231,6 +233,7 @@ def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode:
         self.bibcode = bibcode
         self.score = score
         self.reference_raw = reference_raw
+        self.external_identifier = external_identifier or []
 
     def toJSON(self) -> dict:
         """
@@ -244,7 +247,8 @@ def toJSON(self) -> dict:
             'bibcode': self.bibcode,
             'score': self.score,
             'item_num': self.item_num,
-            **({'reference_raw': self.reference_raw} if self.reference_raw else {})
+            **({'reference_raw': self.reference_raw} if self.reference_raw else {}),
+            'external_identifier': self.external_identifier
         }
 
diff --git a/adsrefpipe/tests/unittests/test_app.py b/adsrefpipe/tests/unittests/test_app.py
index c21cff3..f00e7fb 100644
--- a/adsrefpipe/tests/unittests/test_app.py
+++ b/adsrefpipe/tests/unittests/test_app.py
@@ -30,6 +30,17 @@ from adsrefpipe.refparsers.handler import verify
 from adsrefpipe.tests.unittests.stubdata.dbdata import actions_records, parsers_records
+import testing.postgresql
+
+def _get_external_identifier(rec):
+    """
+    Works whether rec is a dict (bulk mappings) or an ORM object.
+    """
+    if rec is None:
+        return []
+    if isinstance(rec, dict):
+        return rec.get("external_identifier") or []
+    return getattr(rec, "external_identifier", None) or []
 
 
 class TestDatabase(unittest.TestCase):
@@ -39,18 +50,13 @@ class TestDatabase(unittest.TestCase):
 
     maxDiff = None
 
-    postgresql_url_dict = {
-        'port': 5432,
-        'host': '127.0.0.1',
-        'user': 'postgres',
-        'database': 'postgres'
-    }
-    postgresql_url = 'postgresql://{user}:{user}@{host}:{port}/{database}' \
-        .format(user=postgresql_url_dict['user'],
-                host=postgresql_url_dict['host'],
-                port=postgresql_url_dict['port'],
-                database=postgresql_url_dict['database']
-                )
+    _postgresql = testing.postgresql.Postgresql()
+    postgresql_url = _postgresql.url()
+
+    @classmethod
+    def tearDownClass(cls):
+        super().tearDownClass()
+        cls._postgresql.stop()
 
     def setUp(self):
         self.test_dir = os.path.join(project_home, 'adsrefpipe/tests')
@@ -88,16 +94,22 @@ def add_stub_data(self):
 
         resolved_reference = [
             [
-                ('J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. ','2011LRR....14....2U',1.0),
-                ('C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.','2017RPPh...80l6902M',1.0)
+                ('J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. ',
+                 '2011LRR....14....2U', 1.0, ['arxiv:1009.5514']),
+                ('C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.',
+                 '2017RPPh...80l6902M', 1.0, ['arxiv:1709.02923'])
             ],
             [
-                ('Alsubai, K. A., Parley, N. R., Bramich, D. M., et al. 2011, MNRAS, 417, 709.','2011MNRAS.417..709A',1.0),
-                ('Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 2019, A&A, 625, A136 ','2019A&A...625A.136A',1.0)
+                ('Alsubai, K. A., Parley, N. R., Bramich, D. M., et al. 2011, MNRAS, 417, 709.',
+                 '2011MNRAS.417..709A', 1.0, ['doi:10.0000/mnras.417.709']),
+                ('Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 2019, A&A, 625, A136 ',
+                 '2019A&A...625A.136A', 1.0, ['doi:10.0000/aa.625.A136'])
             ],
             [
-                ('Abellan, F. J., Indebetouw, R., Marcaide, J. M., et al. 2017, ApJL, 842, L24','2017ApJ...842L..24A',1.0),
-                ('Ackermann, M., Albert, A., Atwood, W. B., et al. 2016, A&A, 586, A71 ','2016A&A...586A..71A',1.0)
+                ('Abellan, F. J., Indebetouw, R., Marcaide, J. M., et al. 2017, ApJL, 842, L24',
+                 '2017ApJ...842L..24A', 1.0, ['ascl:1701.001']),
+                ('Ackermann, M., Albert, A., Atwood, W. B., et al. 2016, A&A, 586, A71 ',
+                 '2016A&A...586A..71A', 1.0, ['doi:10.0000/aa.586.A71'])
             ],
         ]
 
@@ -117,8 +129,13 @@ def add_stub_data(self):
         ]
 
         with self.app.session_scope() as session:
-            session.bulk_save_objects(actions_records)
-            session.bulk_save_objects(parsers_records)
+            session.query(Action).delete()
+            session.query(Parser).delete()
+            session.commit()
+            if session.query(Action).count() == 0:
+                session.bulk_save_objects(actions_records)
+            if session.query(Parser).count() == 0:
+                session.bulk_save_objects(parsers_records)
             session.commit()
 
             for i, (a_reference,a_history) in enumerate(zip(reference_source,processed_history)):
@@ -453,9 +470,22 @@ def test_populate_tables_post_resolved_with_classic(self):
         """ test populate_tables_post_resolved when resolved_classic is available """
 
         resolved_reference = [
-            {'id': 'H1I1', 'refstring': 'Reference 1', 'bibcode': '2023A&A...657A...1X', 'score': 1.0},
-            {'id': 'H1I2', 'refstring': 'Reference 2', 'bibcode': '2023A&A...657A...2X', 'score': 0.8}
+            {
+                'id': 'H1I1',
+                'refstring': 'Reference 1',
+                'bibcode': '2023A&A...657A...1X',
+                'score': 1.0,
+                'external_identifier': ['doi:10.1234/abc', 'arxiv:2301.00001'],
+            },
+            {
+                'id': 'H1I2',
+                'refstring': 'Reference 2',
+                'bibcode': '2023A&A...657A...2X',
+                'score': 0.8,
+                'external_identifier': ['ascl:2301.001', 'doi:10.9999/xyz'],
+            }
         ]
+
         source_bibcode = "2023A&A...657A...1X"
         classic_resolved_filename = "classic_results.txt"
         classic_resolved_reference = [
@@ -476,6 +506,12 @@ def test_populate_tables_post_resolved_with_classic(self):
                 mock_insert.assert_called_once()
                 mock_logger.assert_called_with("Updated 2 resolved reference records successfully.")
 
+                # Check whether external_identifier is populated with correct data
+                _, resolved_records = mock_update.call_args[0]
+                self.assertEqual(len(resolved_records), 2)
+                self.assertEqual(_get_external_identifier(resolved_records[0]), ['doi:10.1234/abc', 'arxiv:2301.00001'])
+                self.assertEqual(_get_external_identifier(resolved_records[1]), ['ascl:2301.001', 'doi:10.9999/xyz'])
+
     @patch("adsrefpipe.app.ProcessedHistory")
     @patch("adsrefpipe.app.ResolvedReference")
     @patch("adsrefpipe.app.CompareClassic")
@@ -745,18 +781,13 @@ class TestDatabaseNoStubdata(unittest.TestCase):
 
     maxDiff = None
 
-    postgresql_url_dict = {
-        'port': 5432,
-        'host': '127.0.0.1',
-        'user': 'postgres',
-        'database': 'postgres'
-    }
-    postgresql_url = 'postgresql://{user}:{user}@{host}:{port}/{database}' \
-        .format(user=postgresql_url_dict['user'],
-                host=postgresql_url_dict['host'],
-                port=postgresql_url_dict['port'],
-                database=postgresql_url_dict['database']
-                )
+    _postgresql = testing.postgresql.Postgresql()
+    postgresql_url = _postgresql.url()
+
+    @classmethod
+    def tearDownClass(cls):
+        super().tearDownClass()
+        cls._postgresql.stop()
 
     def setUp(self):
         self.test_dir = os.path.join(project_home, 'adsrefpipe/tests')
@@ -805,13 +836,16 @@ def test_populate_tables(self):
             "refraw": "C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.",
"C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.", "id": "H1I2"} ] + + # IMPORTANT: use the real column name expected by app/models: external_identifier (list) resolved_references = [ { "score": "1.0", "bibcode": "2011LRR....14....2U", "refstring": "J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. ", "refraw": "J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. ", - "id": "H1I1" + "id": "H1I1", + "external_identifier": ["arxiv:1009.5514", "doi:10.1234/abc"] }, { "score": "1.0", @@ -819,12 +853,19 @@ def test_populate_tables(self): "refstring": "C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.", "refraw": "C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.", "id": "H1I2", + "external_identifier": ["arxiv:1709.02923", "ascl:2301.001"] } ] + arXiv_stubdata_dir = os.path.join(self.test_dir, 'unittests/stubdata/txt/arXiv/0/') with self.app.session_scope() as session: - session.bulk_save_objects(actions_records) - session.bulk_save_objects(parsers_records) + session.query(Action).delete() + session.query(Parser).delete() + session.commit() + if session.query(Action).count() == 0: + session.bulk_save_objects(actions_records) + if session.query(Parser).count() == 0: + session.bulk_save_objects(parsers_records) session.commit() references = self.app.populate_tables_pre_resolved_initial_status( @@ -842,6 +883,20 @@ def test_populate_tables(self): classic_resolved_filename=os.path.join(arXiv_stubdata_dir, '00001.raw.result')) self.assertTrue(status == True) + # Verify external_identifier was persisted on ResolvedReference rows + # We know history_id should be 1 for the first inserted ProcessedHistory in an empty DB. 
+            rows = (
+                session.query(ResolvedReference)
+                .filter(ResolvedReference.history_id == 1)
+                .order_by(ResolvedReference.item_num.asc())
+                .all()
+            )
+            self.assertEqual(len(rows), 2)
+            self.assertEqual(rows[0].item_num, 1)
+            self.assertEqual(rows[1].item_num, 2)
+            self.assertEqual(rows[0].external_identifier, ["arxiv:1009.5514", "doi:10.1234/abc"])
+            self.assertEqual(rows[1].external_identifier, ["arxiv:1709.02923", "ascl:2301.001"])
+
     def test_get_parser_error(self):
         """ test get_parser when it errors for unrecognized source filename """
         with patch.object(self.app.logger, 'error') as mock_error:
@@ -851,3 +906,4 @@ def test_get_parser_error(self):
 
 if __name__ == '__main__':
     unittest.main()
+
diff --git a/adsrefpipe/tests/unittests/test_tasks.py b/adsrefpipe/tests/unittests/test_tasks.py
index fb4ee58..153f041 100755
--- a/adsrefpipe/tests/unittests/test_tasks.py
+++ b/adsrefpipe/tests/unittests/test_tasks.py
@@ -74,6 +74,8 @@ def add_stub_data(self):
         ]
 
         with self.app.session_scope() as session:
+            session.query(Action).delete()
+            session.query(Parser).delete()
             session.bulk_save_objects(actions_records)
             session.bulk_save_objects(parsers_records)
             session.commit()
diff --git a/alembic/versions/08ca70bd6f5f_add_external_identifier.py b/alembic/versions/08ca70bd6f5f_add_external_identifier.py
new file mode 100644
index 0000000..3a360a0
--- /dev/null
+++ b/alembic/versions/08ca70bd6f5f_add_external_identifier.py
@@ -0,0 +1,29 @@
+"""add_external_identifier
+
+Revision ID: 08ca70bd6f5f
+Revises: e3d6e15c3b8c
+Create Date: 2026-01-05 11:16:27.454389
+
+"""
+from alembic import op
+import sqlalchemy as sa
+from sqlalchemy.dialects import postgresql
+
+
+# revision identifiers, used by Alembic.
+revision = '08ca70bd6f5f'
+down_revision = 'e3d6e15c3b8c'
+branch_labels = None
+depends_on = None
+
+
+def upgrade():
+    op.add_column('resolved_reference',
+                  sa.Column("external_identifier",
+                            postgresql.ARRAY(sa.String()))
+                  )
+
+
+def downgrade():
+    op.drop_column('resolved_reference', 'external_identifier')
+