Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
40 changes: 39 additions & 1 deletion adsrefpipe/app.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,15 @@

from texttable import Texttable

def _ensure_list(x):
if x is None:
return None
# treat strings as scalars, not iterables
if isinstance(x, (str, bytes)):
return [x]
# already list-like
return list(x)

class ADSReferencePipelineCelery(ADSCelery):
"""
celery-based pipeline for processing and resolving references
Expand Down Expand Up @@ -306,6 +315,7 @@ def query_resolved_reference_tbl(self, history_id_list: List = None) -> List:

return results


def diagnostic_query(self, bibcode_list: List = None, source_filename_list: List = None) -> List:
"""
perform a diagnostic query to retrieve combined reference records
Expand All @@ -315,6 +325,8 @@ def diagnostic_query(self, bibcode_list: List = None, source_filename_list: List
:return: List of combined records from multiple tables
"""
results = []
bibcode_list = _ensure_list(bibcode_list)
source_filename_list = _ensure_list(source_filename_list)

reference_source = self.query_reference_source_tbl(bibcode_list, source_filename_list)
processed_history = self.query_processed_history_tbl(bibcode_list, source_filename_list)
Expand Down Expand Up @@ -404,6 +416,31 @@ def update_resolved_reference_records(self, session: object, resolved_list: List
self.logger.debug("Added `ResolvedReference` records successfully.")
return True

def update_resolved_reference_records(self, session: object, resolved_list: List[ResolvedReference]) -> bool:
    """
    update resolved reference records in the database

    :param session: active SQLAlchemy session used for the bulk update
    :param resolved_list: ResolvedReference objects carrying the new values
    :return: True once the bulk update has been flushed
    """
    mappings = []
    for r in resolved_list:
        mappings.append({
            # must include PK columns for bulk_update_mappings
            "history_id": r.history_id,
            "item_num": r.item_num,
            "reference_str": r.reference_str,

            # fields to update
            "bibcode": r.bibcode,
            "score": r.score,
            "reference_raw": r.reference_raw,
            # normalize so the ARRAY column never receives a bare scalar
            "external_identifier": _ensure_list(getattr(r, "external_identifier", None)) or [],
        })

    session.bulk_update_mappings(ResolvedReference, mappings)
    session.flush()
    # fix: this method updates existing records; the previous message
    # said "Added", copied from the insert method
    self.logger.debug("Updated `ResolvedReference` records successfully.")
    return True


def insert_compare_records(self, session: object, compared_list: List[CompareClassic]) -> bool:
"""
insert records into the compare classic table
Expand Down Expand Up @@ -537,7 +574,8 @@ def populate_tables_post_resolved(self, resolved_reference: List, source_bibcode
reference_str=ref.get('refstring', None),
bibcode=ref.get('bibcode', None),
score=ref.get('score', None),
reference_raw=ref.get('refstring', None))
reference_raw=ref.get('refstring', None),
external_identifier=_ensure_list(ref.get('external_identifier', None)) or [])
resolved_records.append(resolved_record)
if resolved_classic:
compare_record = CompareClassic(history_id=history_id,
Expand Down
10 changes: 7 additions & 3 deletions adsrefpipe/models.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,7 @@


from sqlalchemy import Integer, String, Column, ForeignKey, DateTime, func, Numeric, ForeignKeyConstraint
from sqlalchemy.dialects.postgresql import JSONB
from sqlalchemy.dialects.postgresql import JSONB, ARRAY
from sqlalchemy.ext.declarative import declarative_base


Expand Down Expand Up @@ -213,8 +213,9 @@ class ResolvedReference(Base):
bibcode = Column(String)
score = Column(Numeric)
reference_raw = Column(String)
external_identifier = Column(ARRAY(String))

def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode: str, score: float, reference_raw: str):
def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode: str, score: float, reference_raw: str, external_identifier: list = None):
"""
initializes a resolved reference object

Expand All @@ -224,13 +225,15 @@ def __init__(self, history_id: int, item_num: int, reference_str: str, bibcode:
:param bibcode: resolved bibcode
:param score: confidence score of the resolved reference
:param reference_raw: raw reference string
:param external_identifier: list of external identifiers associated with the reference, e.g. ["doi:...", "arxiv:...", "ascl:..."]
"""
self.history_id = history_id
self.item_num = item_num
self.reference_str = reference_str
self.bibcode = bibcode
self.score = score
self.reference_raw = reference_raw
self.external_identifier = external_identifier or []

def toJSON(self) -> dict:
"""
Expand All @@ -244,7 +247,8 @@ def toJSON(self) -> dict:
'bibcode': self.bibcode,
'score': self.score,
'item_num': self.item_num,
**({'reference_raw': self.reference_raw} if self.reference_raw else {})
**({'reference_raw': self.reference_raw} if self.reference_raw else {}),
'external_identifier': self.external_identifier
}


Expand Down
130 changes: 93 additions & 37 deletions adsrefpipe/tests/unittests/test_app.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,17 @@
from adsrefpipe.refparsers.handler import verify
from adsrefpipe.tests.unittests.stubdata.dbdata import actions_records, parsers_records

import testing.postgresql

def _get_external_identifier(rec):
"""
Works whether rec is a dict (bulk mappings) or an ORM object.
"""
if rec is None:
return []
if isinstance(rec, dict):
return rec.get("external_identifier") or []
return getattr(rec, "external_identifier", None) or []

class TestDatabase(unittest.TestCase):

Expand All @@ -39,18 +50,13 @@ class TestDatabase(unittest.TestCase):

maxDiff = None

postgresql_url_dict = {
'port': 5432,
'host': '127.0.0.1',
'user': 'postgres',
'database': 'postgres'
}
postgresql_url = 'postgresql://{user}:{user}@{host}:{port}/{database}' \
.format(user=postgresql_url_dict['user'],
host=postgresql_url_dict['host'],
port=postgresql_url_dict['port'],
database=postgresql_url_dict['database']
)
_postgresql = testing.postgresql.Postgresql()
postgresql_url = _postgresql.url()

@classmethod
def tearDownClass(cls):
super().tearDownClass()
cls._postgresql.stop()

def setUp(self):
self.test_dir = os.path.join(project_home, 'adsrefpipe/tests')
Expand Down Expand Up @@ -88,16 +94,22 @@ def add_stub_data(self):

resolved_reference = [
[
('J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. ','2011LRR....14....2U',1.0),
('C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.','2017RPPh...80l6902M',1.0)
('J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. ',
'2011LRR....14....2U', 1.0, ['arxiv:1009.5514']),
('C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.',
'2017RPPh...80l6902M', 1.0, ['arxiv:1709.02923'])
],
[
('Alsubai, K. A., Parley, N. R., Bramich, D. M., et al. 2011, MNRAS, 417, 709.','2011MNRAS.417..709A',1.0),
('Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 2019, A&A, 625, A136 ','2019A&A...625A.136A',1.0)
('Alsubai, K. A., Parley, N. R., Bramich, D. M., et al. 2011, MNRAS, 417, 709.',
'2011MNRAS.417..709A', 1.0, ['doi:10.0000/mnras.417.709']),
('Arcangeli, J., Desert, J.-M., Parmentier, V., et al. 2019, A&A, 625, A136 ',
'2019A&A...625A.136A', 1.0, ['doi:10.0000/aa.625.A136'])
],
[
('Abellan, F. J., Indebetouw, R., Marcaide, J. M., et al. 2017, ApJL, 842, L24','2017ApJ...842L..24A',1.0),
('Ackermann, M., Albert, A., Atwood, W. B., et al. 2016, A&A, 586, A71 ','2016A&A...586A..71A',1.0)
('Abellan, F. J., Indebetouw, R., Marcaide, J. M., et al. 2017, ApJL, 842, L24',
'2017ApJ...842L..24A', 1.0, ['ascl:1701.001']),
('Ackermann, M., Albert, A., Atwood, W. B., et al. 2016, A&A, 586, A71 ',
'2016A&A...586A..71A', 1.0, ['doi:10.0000/aa.586.A71'])
],
]

Expand All @@ -117,8 +129,13 @@ def add_stub_data(self):
]

with self.app.session_scope() as session:
session.bulk_save_objects(actions_records)
session.bulk_save_objects(parsers_records)
session.query(Action).delete()
session.query(Parser).delete()
session.commit()
if session.query(Action).count() == 0:
session.bulk_save_objects(actions_records)
if session.query(Parser).count() == 0:
session.bulk_save_objects(parsers_records)
session.commit()

for i, (a_reference,a_history) in enumerate(zip(reference_source,processed_history)):
Expand Down Expand Up @@ -453,9 +470,22 @@ def test_populate_tables_post_resolved_with_classic(self):
""" test populate_tables_post_resolved when resolved_classic is available """

resolved_reference = [
{'id': 'H1I1', 'refstring': 'Reference 1', 'bibcode': '2023A&A...657A...1X', 'score': 1.0},
{'id': 'H1I2', 'refstring': 'Reference 2', 'bibcode': '2023A&A...657A...2X', 'score': 0.8}
{
'id': 'H1I1',
'refstring': 'Reference 1',
'bibcode': '2023A&A...657A...1X',
'score': 1.0,
'external_identifier': ['doi:10.1234/abc', 'arxiv:2301.00001'],
},
{
'id': 'H1I2',
'refstring': 'Reference 2',
'bibcode': '2023A&A...657A...2X',
'score': 0.8,
'external_identifier': ['ascl:2301.001', 'doi:10.9999/xyz'],
}
]

source_bibcode = "2023A&A...657A...1X"
classic_resolved_filename = "classic_results.txt"
classic_resolved_reference = [
Expand All @@ -476,6 +506,12 @@ def test_populate_tables_post_resolved_with_classic(self):
mock_insert.assert_called_once()
mock_logger.assert_called_with("Updated 2 resolved reference records successfully.")

# Check whether external_identifier is populated with correct data
_, resolved_records = mock_update.call_args[0]
self.assertEqual(len(resolved_records), 2)
self.assertEqual(_get_external_identifier(resolved_records[0]), ['doi:10.1234/abc', 'arxiv:2301.00001'])
self.assertEqual(_get_external_identifier(resolved_records[1]), ['ascl:2301.001', 'doi:10.9999/xyz'])

@patch("adsrefpipe.app.ProcessedHistory")
@patch("adsrefpipe.app.ResolvedReference")
@patch("adsrefpipe.app.CompareClassic")
Expand Down Expand Up @@ -745,18 +781,13 @@ class TestDatabaseNoStubdata(unittest.TestCase):

maxDiff = None

postgresql_url_dict = {
'port': 5432,
'host': '127.0.0.1',
'user': 'postgres',
'database': 'postgres'
}
postgresql_url = 'postgresql://{user}:{user}@{host}:{port}/{database}' \
.format(user=postgresql_url_dict['user'],
host=postgresql_url_dict['host'],
port=postgresql_url_dict['port'],
database=postgresql_url_dict['database']
)
_postgresql = testing.postgresql.Postgresql()
postgresql_url = _postgresql.url()

@classmethod
def tearDownClass(cls):
super().tearDownClass()
cls._postgresql.stop()

def setUp(self):
self.test_dir = os.path.join(project_home, 'adsrefpipe/tests')
Expand Down Expand Up @@ -805,26 +836,36 @@ def test_populate_tables(self):
"refraw": "C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.",
"id": "H1I2"}
]

# IMPORTANT: use the real column name expected by app/models: external_identifier (list)
resolved_references = [
{
"score": "1.0",
"bibcode": "2011LRR....14....2U",
"refstring": "J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. ",
"refraw": "J.-P. Uzan, Varying constants, gravitation and cosmology, Living Rev. Rel. 14 (2011) 2, [1009.5514]. ",
"id": "H1I1"
"id": "H1I1",
"external_identifier": ["arxiv:1009.5514", "doi:10.1234/abc"]
},
{
"score": "1.0",
"bibcode": "2017RPPh...80l6902M",
"refstring": "C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.",
"refraw": "C. J. A. P. Martins, The status of varying constants: A review of the physics, searches and implications, 1709.02923.",
"id": "H1I2",
"external_identifier": ["arxiv:1709.02923", "ascl:2301.001"]
}
]

arXiv_stubdata_dir = os.path.join(self.test_dir, 'unittests/stubdata/txt/arXiv/0/')
with self.app.session_scope() as session:
session.bulk_save_objects(actions_records)
session.bulk_save_objects(parsers_records)
session.query(Action).delete()
session.query(Parser).delete()
session.commit()
if session.query(Action).count() == 0:
session.bulk_save_objects(actions_records)
if session.query(Parser).count() == 0:
session.bulk_save_objects(parsers_records)
session.commit()

references = self.app.populate_tables_pre_resolved_initial_status(
Expand All @@ -842,6 +883,20 @@ def test_populate_tables(self):
classic_resolved_filename=os.path.join(arXiv_stubdata_dir, '00001.raw.result'))
self.assertTrue(status == True)

# Verify external_identifier was persisted on ResolvedReference rows
# We know history_id should be 1 for the first inserted ProcessedHistory in an empty DB.
rows = (
session.query(ResolvedReference)
.filter(ResolvedReference.history_id == 1)
.order_by(ResolvedReference.item_num.asc())
.all()
)
self.assertEqual(len(rows), 2)
self.assertEqual(rows[0].item_num, 1)
self.assertEqual(rows[1].item_num, 2)
self.assertEqual(rows[0].external_identifier, ["arxiv:1009.5514", "doi:10.1234/abc"])
self.assertEqual(rows[1].external_identifier, ["arxiv:1709.02923", "ascl:2301.001"])

def test_get_parser_error(self):
""" test get_parser when it errors for unrecognized source filename """
with patch.object(self.app.logger, 'error') as mock_error:
Expand All @@ -851,3 +906,4 @@ def test_get_parser_error(self):

if __name__ == '__main__':
unittest.main()

2 changes: 2 additions & 0 deletions adsrefpipe/tests/unittests/test_tasks.py
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,8 @@ def add_stub_data(self):
]

with self.app.session_scope() as session:
session.query(Action).delete()
session.query(Parser).delete()
session.bulk_save_objects(actions_records)
session.bulk_save_objects(parsers_records)
session.commit()
Expand Down
29 changes: 29 additions & 0 deletions alembic/versions/08ca70bd6f5f_add_external_identifier.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
"""add_external_identifier

Revision ID: 08ca70bd6f5f
Revises: e3d6e15c3b8c
Create Date: 2026-01-05 11:16:27.454389

"""
from alembic import op
import sqlalchemy as sa
from sqlalchemy.dialects import postgresql


# revision identifiers, used by Alembic.
revision = '08ca70bd6f5f'
down_revision = 'e3d6e15c3b8c'
branch_labels = None
depends_on = None


def upgrade():
    """Add the external_identifier ARRAY column to resolved_reference."""
    external_identifier = sa.Column("external_identifier",
                                    postgresql.ARRAY(sa.String()))
    op.add_column('resolved_reference', external_identifier)


def downgrade():
    """Drop the external_identifier column added by this revision."""
    op.drop_column('resolved_reference', 'external_identifier')