From a2c58959ed8165150abef2a5d0ad779efc89c2b1 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Thu, 19 Mar 2026 15:49:20 -0500 Subject: [PATCH 01/49] Install postgres dependencies for tests --- .github/workflows/build.yaml | 7 ++++++- 1 file changed, 6 insertions(+), 1 deletion(-) diff --git a/.github/workflows/build.yaml b/.github/workflows/build.yaml index 3828057a..75f5dbcc 100644 --- a/.github/workflows/build.yaml +++ b/.github/workflows/build.yaml @@ -36,12 +36,17 @@ jobs: mamba install -y -q pip wheel pip install uv + - name: Install Postgres for testing + shell: bash -l {0} + run: | + mamba install -y -q postgresql + - name: Install dependencies shell: bash -l {0} run: | uv pip install -r requirements.txt + uv pip install testing.postgresql - # We have two cores so we can speed up the testing with xdist - name: Install pytest packages shell: bash -l {0} run: | From 0e697d490e80fe9d55221557274ba3e678faf05d Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Tue, 17 Feb 2026 15:45:31 -0600 Subject: [PATCH 02/49] Rename test modules using snakecase --- tests/{test_ppdbBigQuery.py => test_ppdb_bigquery.py} | 0 tests/{test_ppdbSql.py => test_ppdb_sql.py} | 0 2 files changed, 0 insertions(+), 0 deletions(-) rename tests/{test_ppdbBigQuery.py => test_ppdb_bigquery.py} (100%) rename tests/{test_ppdbSql.py => test_ppdb_sql.py} (100%) diff --git a/tests/test_ppdbBigQuery.py b/tests/test_ppdb_bigquery.py similarity index 100% rename from tests/test_ppdbBigQuery.py rename to tests/test_ppdb_bigquery.py diff --git a/tests/test_ppdbSql.py b/tests/test_ppdb_sql.py similarity index 100% rename from tests/test_ppdbSql.py rename to tests/test_ppdb_sql.py From 074b4b7254808a88563a4e1104d4db7d0921e68b Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Tue, 10 Feb 2026 17:33:21 -0600 Subject: [PATCH 03/49] Write update records to JSON file when storing replica chunks --- .../lsst/dax/ppdb/bigquery/ppdb_bigquery.py | 72 +++-- .../lsst/dax/ppdb/bigquery/update_records.py 
| 115 ++++++++ tests/test_ppdb_bigquery.py | 246 +++++++++++++++++- 3 files changed, 408 insertions(+), 25 deletions(-) create mode 100644 python/lsst/dax/ppdb/bigquery/update_records.py diff --git a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py index 877bb229..871531a6 100644 --- a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py +++ b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py @@ -46,6 +46,7 @@ from ..sql import PpdbSqlBase, PpdbSqlBaseConfig from .manifest import Manifest, TableStats from .ppdb_replica_chunk_extended import ChunkStatus, PpdbReplicaChunkExtended +from .update_records import UpdateRecords __all__ = ["ConfigValidationError", "PpdbBigQuery", "PpdbBigQueryConfig"] @@ -178,22 +179,11 @@ def store( # Docstring is inherited. _LOG.info("Processing %s", replica_chunk.id) - # TODO: APDB does not generate ApdbUpdateRecords yet, but we will - # eventually have to add support for it. - if update_records: - raise NotImplementedError("PpdbBigQuery does not support record updates yet.") - try: - chunk_dir = self._get_chunk_path(replica_chunk) - - if chunk_dir.exists(): - if not self.delete_existing_dirs: - raise FileExistsError(f"Directory already exists for {replica_chunk.id}: {chunk_dir}") - _LOG.warning("Overwriting existing directory for %s: %s", replica_chunk.id, chunk_dir) - shutil.rmtree(chunk_dir) + chunk_dir = self._create_chunk_dir(replica_chunk) - chunk_dir.mkdir(parents=True) - _LOG.info("Created directory for %s: %s", replica_chunk.id, chunk_dir) + if update_records: + self._handle_updates(replica_chunk, update_records, chunk_dir) table_dict = { ApdbTables.DiaObject.value: objects, @@ -261,15 +251,32 @@ def store( _LOG.info("Done processing %s", replica_chunk.id) - def _get_chunk_path(self, chunk: ReplicaChunk) -> Path: + def _create_chunk_dir(self, chunk: ReplicaChunk) -> Path: + """Create the directory for the replica chunk based on its last update + time and ID. 
+ + Returns + ------- + chunk_dir + Path to the created directory for the replica chunk. + """ last_update_time = chunk.last_update_time.to_datetime() assert isinstance(last_update_time, datetime.datetime) - path = Path( + chunk_dir = Path( self.replication_path, chunk.last_update_time.strftime("%Y/%m/%d"), str(chunk.id), ) - return path + if chunk_dir.exists(): + if not self.delete_existing_dirs: + raise FileExistsError(f"Directory already exists for {chunk.id}: {chunk_dir}") + _LOG.warning("Overwriting existing directory for %s: %s", chunk.id, chunk_dir) + shutil.rmtree(chunk_dir) + + chunk_dir.mkdir(parents=True) + _LOG.info("Created directory for %s: %s", chunk.id, chunk_dir) + + return chunk_dir def get_replica_chunks(self, start_chunk_id: int | None = None) -> Sequence[PpdbReplicaChunk] | None: # Docstring is inherited. @@ -567,3 +574,34 @@ def validate_config(cls, config: PpdbBigQueryConfig) -> None: check_dataset_exists(config.project_id, config.dataset_id) except Exception as e: raise ConfigValidationError("Failed to validate BigQuery dataset") from e + + def _handle_updates( + self, replica_chunk: ReplicaChunk, apdb_update_records: Collection[ApdbUpdateRecord], chunk_dir: Path + ) -> None: + """Handle updates to existing records in the PPDB. + + Parameters + ---------- + replica_chunk : `ReplicaChunk` + The replica chunk associated with the updates. + update_records : `~collections.abc.Collection` [ `ApdbUpdateRecord` ] + Collection of update records to process. + + Notes + ----- + Serializes the ApdbUpdateRecord objects into a dictionary structure + for processing. 
+ """ + update_records = UpdateRecords( + replica_chunk_id=replica_chunk.id, + records=apdb_update_records, + record_count=len(apdb_update_records), + ) + update_records.to_json_file(chunk_dir / "update_records.json") + + _LOG.info( + "Saved %d update records for %s to %s", + update_records.record_count, + replica_chunk.id, + chunk_dir / "update_records.json", + ) diff --git a/python/lsst/dax/ppdb/bigquery/update_records.py b/python/lsst/dax/ppdb/bigquery/update_records.py new file mode 100644 index 00000000..56e651be --- /dev/null +++ b/python/lsst/dax/ppdb/bigquery/update_records.py @@ -0,0 +1,115 @@ +# This file is part of dax_ppdb +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +from __future__ import annotations + +import json +from pathlib import Path +from typing import Any + +from pydantic import BaseModel, field_serializer, field_validator + +from lsst.dax.apdb.apdbUpdateRecord import ApdbUpdateRecord + + +class UpdateRecords(BaseModel): + """Data model for APDB update records.""" + + replica_chunk_id: int + """Identifier of the replica chunk to which these update records belong.""" + + record_count: int + """Number of update records included in this object.""" + + records: list[ApdbUpdateRecord] + """List of APDB update records included in this object.""" + + @field_serializer("records") + def serialize_update_records( + self, + records: list[ApdbUpdateRecord], + ) -> list[dict[str, Any]]: + """Serialize the ``ApdbUpdateRecord`` objects to JSON. + + Parameters + ---------- + records : `list` [ `ApdbUpdateRecord` ] + The list of APDB update records to serialize. + + Returns + ------- + serialized_records : `list` [ `dict` [ `str`, `Any` ]] + The serialized JSON data. + """ + serialized_records: list[dict[str, Any]] = [] + + for update_record in records: + record_dict: dict[str, Any] = json.loads(update_record.to_json()) + record_dict["update_time_ns"] = update_record.update_time_ns + record_dict["update_order"] = update_record.update_order + serialized_records.append(record_dict) + + return serialized_records + + @field_validator("records", mode="before") + @classmethod + def deserialize_update_records( + cls, + records: list[dict[str, Any]] | list[ApdbUpdateRecord], + ) -> list[ApdbUpdateRecord]: + """Deserialize the JSON data to ``ApdbUpdateRecord`` objects. + + Parameters + ---------- + records : `list` [ `dict` [ `str`, `Any` ] | `ApdbUpdateRecord` ] + The list of serialized JSON data or already deserialized + ApdbUpdateRecord objects. + + Returns + ------- + update_records : `list` [ `ApdbUpdateRecord` ] + The list of APDB update records. 
+ """ + if records and isinstance(records[0], ApdbUpdateRecord): + return records + deserialized_records: list[ApdbUpdateRecord] = [] + for record_dict in records: + record_copy = record_dict.copy() + update_time_ns = record_copy.pop("update_time_ns") + update_order = record_copy.pop("update_order") + json_str = json.dumps(record_copy) + update_record = ApdbUpdateRecord.from_json( + update_time_ns, + update_order, + json_str, + ) + deserialized_records.append(update_record) + return deserialized_records + + def to_json_file(self, path: Path) -> None: + with open(path, "w") as f: + json.dump(self.model_dump(), f, indent=2, default=str) + + @classmethod + def from_json_file(cls, path: Path) -> UpdateRecords: + with open(path) as f: + data = json.load(f) + return cls.model_validate(data) diff --git a/tests/test_ppdb_bigquery.py b/tests/test_ppdb_bigquery.py index 198f0bca..6aa64f52 100644 --- a/tests/test_ppdb_bigquery.py +++ b/tests/test_ppdb_bigquery.py @@ -26,10 +26,17 @@ import unittest from typing import Any -from lsst.dax.apdb import ApdbConfig +from lsst.dax.apdb import ( + Apdb, + ApdbConfig, + ApdbReplica, + apdbUpdateRecord, +) from lsst.dax.apdb.sql import ApdbSql -from lsst.dax.ppdb import PpdbConfig +from lsst.dax.ppdb import Ppdb, PpdbConfig from lsst.dax.ppdb.bigquery import PpdbBigQuery +from lsst.dax.ppdb.bigquery.update_records import UpdateRecords +from lsst.dax.ppdb.replicator import Replicator from lsst.dax.ppdb.tests import PpdbTest try: @@ -50,8 +57,10 @@ } -class SqliteTestCase(PpdbTest, unittest.TestCase): - """A test case for the PpdbBigQuery class using a SQLite backend.""" +class _SqliteMixin: + """Mixin class to provide Sqlite-specific setup/teardown and instance + creation. 
+ """ def setUp(self) -> None: self.tempdir = tempfile.mkdtemp() @@ -86,9 +95,10 @@ def make_apdb_instance(self, **kwargs: Any) -> ApdbConfig: return ApdbSql.init_database(**kw) # type: ignore[arg-type] -@unittest.skipUnless(testing is not None, "testing.postgresql module not found") -class PostgresTestCase(PpdbTest, unittest.TestCase): - """A test case for the PpdbBigQuery class using a Postgres backend.""" +class _PostgresMixin: + """Mixin class to provide Postgres-specific setup/teardown and instance + creation. + """ postgresql: Any @@ -119,7 +129,7 @@ def make_instance(self, **kwargs: Any) -> PpdbConfig: kw = { **TEST_CONFIG, "db_url": self.server.url(), - "db_schema": None, + "db_schema": "ppdb_test", "felis_path": TEST_SCHEMA, "replication_dir": self.tempdir, } @@ -136,3 +146,223 @@ def make_apdb_instance(self, **kwargs: Any) -> ApdbConfig: } kw.update(kwargs) return ApdbSql.init_database(**kw) # type: ignore[arg-type] + + +class SqliteTestCase(_SqliteMixin, PpdbTest, unittest.TestCase): + """A test case for the PpdbBigQuery class using a SQLite backend.""" + + +@unittest.skipUnless(testing is not None, "testing.postgresql module not found") +class PostgresTestCase(_PostgresMixin, PpdbTest, unittest.TestCase): + """A test case for the PpdbBigQuery class using a Postgres backend.""" + + +class UpdateRecordsTestCase(_PostgresMixin, PpdbTest, unittest.TestCase): + """A test case for the PpdbBigQuery class update functionality using a + Postgres backend. + """ + + include_update_records = True + + def setUp(self): + super().setUp() + + # Make APDB instance and fill it with test data. + apdb_config = self.make_apdb_instance() + apdb = Apdb.from_config(apdb_config) + self._fill_apdb(apdb) + apdb_replica = ApdbReplica.from_config(apdb_config) + + # Make PPDB instance. + ppdb_config = self.make_instance() + self.ppdb = Ppdb.from_config(ppdb_config) + assert isinstance(self.ppdb, PpdbBigQuery) + + # Replicate those to PPDB. 
+ replicator = Replicator( + apdb_replica, self.ppdb, update=False, min_wait_time=0, max_wait_time=0, check_interval=0 + ) + + # Copy chunks. + replicator.run(exit_on_empty=True) + + def test_update_records_json_serialization(self) -> None: + """Test that the APDB update records are correctly saved to a JSON file + in the replication output and can be read back correctly as + UpdateRecords objects. + """ + update_records_path = self.ppdb.replication_path / "2021/03/01/1614600000" / "update_records.json" + self.assertTrue(update_records_path.exists(), "Update records file not found in replication output") + + update_records = UpdateRecords.from_json_file(update_records_path) + print("\n" + str(update_records)) + + self.assertEqual( + update_records.replica_chunk_id, + 1614600000, + "Unexpected replica chunk ID in deserialized update records", + ) + + self.assertEqual(update_records.record_count, 3, "Unexpected number of update records deserialized") + + self.assertEqual( + len(update_records.records), 3, "Unexpected number of update records in the deserialized object" + ) + + for record in update_records.records: + self.assertIsInstance( + record, + apdbUpdateRecord.ApdbUpdateRecord, + "Deserialized record is not an instance of ApdbUpdateRecord", + ) + + update_record = update_records.records[0] + self.assertIsInstance( + update_record, + apdbUpdateRecord.ApdbReassignDiaSourceToSSObjectRecord, + "Deserialized record is not an instance of ApdbReassignDiaSourceToSSObjectRecord", + ) + assert isinstance(update_record, apdbUpdateRecord.ApdbReassignDiaSourceToSSObjectRecord) + self.assertEqual( + update_record.diaSourceId, + 700, + "Unexpected diaSourceId in deserialized ApdbReassignDiaSourceToSSObjectRecord", + ) + self.assertEqual( + update_record.ssObjectId, + 1, + "Unexpected ssObjectId in deserialized ApdbReassignDiaSourceToSSObjectRecord", + ) + self.assertEqual( + update_record.update_time_ns, + 1614600037000000000, + "Unexpected update_time_ns in deserialized 
ApdbReassignDiaSourceToSSObjectRecord", + ) + self.assertEqual( + update_record.update_order, + 0, + "Unexpected update_order in deserialized ApdbReassignDiaSourceToSSObjectRecord", + ) + self.assertEqual( + update_record.midpointMjdTai, + 60000.0, + "Unexpected midpointMjdTai in deserialized ApdbReassignDiaSourceToSSObjectRecord", + ) + self.assertEqual( + update_record.ssObjectReassocTimeMjdTai, + 59274.50042824074, + "Unexpected ssObjectReassocTimeMjdTai in deserialized ApdbReassignDiaSourceToSSObjectRecord", + ) + self.assertNotEqual( + update_record.ra, + 0.0, + "Unexpected ra in deserialized ApdbReassignDiaSourceToSSObjectRecord, should not be 0.0", + ) + self.assertNotEqual( + update_record.dec, + 0.0, + "Unexpected dec in deserialized ApdbReassignDiaSourceToSSObjectRecord, should not be 0.0", + ) + + update_record = update_records.records[1] + self.assertIsInstance( + update_record, + apdbUpdateRecord.ApdbCloseDiaObjectValidityRecord, + "Deserialized record is not an instance of ApdbCloseDiaObjectValidityRecord", + ) + self.assertEqual( + update_record.diaObjectId, + 200, + "Unexpected diaObjectId in deserialized ApdbCloseDiaObjectValidityRecord", + ) + self.assertNotEqual( + update_record.ra, + 0.0, + "Unexpected ra in deserialized ApdbCloseDiaObjectValidityRecord, should not be 0.0", + ) + self.assertNotEqual( + update_record.dec, + 0.0, + "Unexpected dec in deserialized ApdbCloseDiaObjectValidityRecord, should not be 0.0", + ) + self.assertEqual( + update_record.update_time_ns, + 1614600037000000000, + "Unexpected update_time_ns in deserialized ApdbCloseDiaObjectValidityRecord", + ) + self.assertEqual( + update_record.update_order, + 1, + "Unexpected update_order in deserialized ApdbCloseDiaObjectValidityRecord", + ) + self.assertEqual( + update_record.validityEndMjdTai, + 59274.50042824074, + "Unexpected validityEndMjdTai in deserialized ApdbCloseDiaObjectValidityRecord", + ) + self.assertIsNone( + update_record.nDiaSources, + "Unexpected nDiaSources in 
deserialized ApdbCloseDiaObjectValidityRecord, expected None", + ) + + update_record = update_records.records[2] + self.assertIsInstance( + update_record, + apdbUpdateRecord.ApdbWithdrawDiaForcedSourceRecord, + "Deserialized record is not an instance of ApdbWithdrawDiaForcedSourceRecord", + ) + self.assertEqual( + update_record.diaObjectId, + 200, + "Unexpected diaObjectId in deserialized ApdbWithdrawDiaForcedSourceRecord", + ) + self.assertEqual( + update_record.visit, + 7, + "Unexpected visit in deserialized ApdbWithdrawDiaForcedSourceRecord", + ) + self.assertEqual( + update_record.detector, + 1, + "Unexpected detector in deserialized ApdbWithdrawDiaForcedSourceRecord", + ) + self.assertNotEqual( + update_record.ra, + 0.0, + "Unexpected ra in deserialized ApdbWithdrawDiaForcedSourceRecord, should not be 0.0", + ) + self.assertNotEqual( + update_record.dec, + 0.0, + "Unexpected dec in deserialized ApdbWithdrawDiaForcedSourceRecord, should not be 0.0", + ) + self.assertEqual( + update_record.midpointMjdTai, + 60000.0, + "Unexpected midpointMjdTai in deserialized ApdbWithdrawDiaForcedSourceRecord", + ) + self.assertEqual( + update_record.update_time_ns, + 1614600037000000000, + "Unexpected update_time_ns in deserialized ApdbWithdrawDiaForcedSourceRecord", + ) + self.assertEqual( + update_record.update_order, + 2, + "Unexpected update_order in deserialized ApdbWithdrawDiaForcedSourceRecord", + ) + self.assertEqual( + update_record.timeWithdrawnMjdTai, + 59274.50042824074, + "Unexpected timeWithdrawnMjdTai in deserialized ApdbWithdrawDiaForcedSourceRecord", + ) + self.assertNotEqual( + update_record.ra, + 0.0, + "Unexpected ra in deserialized ApdbWithdrawDiaForcedSourceRecord, should not be 0.0", + ) + self.assertNotEqual( + update_record.dec, + 0.0, + "Unexpected dec in deserialized ApdbWithdrawDiaForcedSourceRecord, should not be 0.0", + ) From 1cb349534c026abd54119912fd66e56240fd9fb8 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Tue, 10 Feb 2026 17:42:25 
-0600 Subject: [PATCH 04/49] Separate APDB test functionality into a mixin class --- python/lsst/dax/ppdb/tests/_ppdb.py | 90 +++++++++++++++-------------- tests/test_ppdb_bigquery.py | 4 +- 2 files changed, 50 insertions(+), 44 deletions(-) diff --git a/python/lsst/dax/ppdb/tests/_ppdb.py b/python/lsst/dax/ppdb/tests/_ppdb.py index 06a84a37..2535d187 100644 --- a/python/lsst/dax/ppdb/tests/_ppdb.py +++ b/python/lsst/dax/ppdb/tests/_ppdb.py @@ -21,7 +21,7 @@ from __future__ import annotations -__all__ = ["PpdbTest"] +__all__ = ["ApdbMixin", "PpdbTest"] import unittest from abc import ABC, abstractmethod @@ -68,50 +68,14 @@ def _make_region(xyz: tuple[float, float, float] = (1.0, 1.0, -1.0)) -> Region: return region -class PpdbTest(TestCaseMixin, ABC): - """Base class for Ppdb tests that can be specialized for concrete - implementation. - - This can only be used as a mixin class for a unittest.TestCase and it - calls various assert methods. +class ApdbMixin: + """Mixin class containing APDB setuup and record generation for PPDB + testing. """ include_update_records = False """If True then test replication of ApdbUpdateRecords.""" - @abstractmethod - def make_instance(self, **kwargs: Any) -> PpdbConfig: - """Make database instance and return configuration for it. - - Parameters - ---------- - **kwargs : `Any` - Instance-specific parameters for the PPDB database. - """ - raise NotImplementedError() - - @abstractmethod - def make_apdb_instance(self, **kwargs: Any) -> ApdbConfig: - """Make APDB instance and return configuration for it, APDB must have - replication enabled. - - Parameters - ---------- - **kwargs : `Any` - Instance-specific parameters for the APDB. - """ - raise NotImplementedError() - - def test_empty_db(self) -> None: - """Test for instantiation a database and making queries on empty - database. 
- """ - config = self.make_instance() - ppdb = Ppdb.from_config(config) - chunks = ppdb.get_replica_chunks() - if chunks is not None: - self.assertEqual(len(chunks), 0) - def _fill_apdb(self, apdb: Apdb) -> None: """Populate APDB with some data to replicate.""" visit_time = astropy.time.Time("2021-01-01T00:01:00", format="isot", scale="tai") @@ -135,7 +99,7 @@ def _fill_apdb(self, apdb: Apdb) -> None: (astropy.time.Time("2021-03-01T00:02:00", format="isot", scale="tai"), objects2), ] - # Time when apdates are applied. + # Time when updates are applied. update_time = astropy.time.Time("2021-03-01T12:00:00") update_records = [] @@ -147,7 +111,7 @@ def _fill_apdb(self, apdb: Apdb) -> None: start_id += nobj if self.include_update_records and visit == (len(visits) - 1): - # Generate few update records. + # Generate a few update records. update_records = self._make_update_records(sources, fsources, update_time) if self.include_update_records: @@ -218,6 +182,48 @@ def _check_chunks( self.assertEqual(ppdb_chunks[i].last_update_time, apdb_chunks[i].last_update_time) self.assertEqual(ppdb_chunks[i].unique_id, apdb_chunks[i].unique_id) + +class PpdbTest(TestCaseMixin, ApdbMixin, ABC): + """Base class for Ppdb tests that can be specialized for concrete + implementation. + + This can only be used as a mixin class for a unittest.TestCase and it + calls various assert methods. + """ + + @abstractmethod + def make_instance(self, **kwargs: Any) -> PpdbConfig: + """Make database instance and return configuration for it. + + Parameters + ---------- + **kwargs : `Any` + Instance-specific parameters for the PPDB database. + """ + raise NotImplementedError() + + @abstractmethod + def make_apdb_instance(self, **kwargs: Any) -> ApdbConfig: + """Make APDB instance and return configuration for it, APDB must have + replication enabled. + + Parameters + ---------- + **kwargs : `Any` + Instance-specific parameters for the APDB. 
+ """ + raise NotImplementedError() + + def test_empty_db(self) -> None: + """Test for instantiation a database and making queries on empty + database. + """ + config = self.make_instance() + ppdb = Ppdb.from_config(config) + chunks = ppdb.get_replica_chunks() + if chunks is not None: + self.assertEqual(len(chunks), 0) + def test_replication_single(self) -> None: """Test replication from APDB to PPDB using a single chunk option.""" apdb_config = self.make_apdb_instance() diff --git a/tests/test_ppdb_bigquery.py b/tests/test_ppdb_bigquery.py index 6aa64f52..af520e5a 100644 --- a/tests/test_ppdb_bigquery.py +++ b/tests/test_ppdb_bigquery.py @@ -37,7 +37,7 @@ from lsst.dax.ppdb.bigquery import PpdbBigQuery from lsst.dax.ppdb.bigquery.update_records import UpdateRecords from lsst.dax.ppdb.replicator import Replicator -from lsst.dax.ppdb.tests import PpdbTest +from lsst.dax.ppdb.tests import ApdbMixin, PpdbTest try: import testing.postgresql @@ -157,7 +157,7 @@ class PostgresTestCase(_PostgresMixin, PpdbTest, unittest.TestCase): """A test case for the PpdbBigQuery class using a Postgres backend.""" -class UpdateRecordsTestCase(_PostgresMixin, PpdbTest, unittest.TestCase): +class UpdateRecordsTestCase(_PostgresMixin, ApdbMixin, unittest.TestCase): """A test case for the PpdbBigQuery class update functionality using a Postgres backend. 
""" From a4d221778d3ff48d9c304070e76ee1d4ca87d46b Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 11 Feb 2026 16:45:51 -0600 Subject: [PATCH 05/49] Use dax_ppdbx_gcp ticket for development --- requirements.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/requirements.txt b/requirements.txt index 4f213f08..ba5c456c 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,5 @@ lsst-dax-apdb @ git+https://github.com/lsst/dax_apdb@main lsst-utils @ git+https://github.com/lsst/utils@main lsst-resources[s3] @ git+https://github.com/lsst/resources@main lsst-felis @ git+https://github.com/lsst/felis@main -lsst-dax-ppdbx-gcp @ git+https://github.com/lsst-dm/dax_ppdbx_gcp@main +lsst-dax-ppdbx-gcp @ git+https://github.com/lsst-dm/dax_ppdbx_gcp@tickets/DM-54070 lsst-sdm-schemas @ git+https://github.com/lsst/sdm_schemas@main From eb0436733978058885612213d54ffaf0a8a7335b Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 11 Feb 2026 16:47:01 -0600 Subject: [PATCH 06/49] WIP: Add test of GCS upload --- tests/test_ppdb_bigquery.py | 76 ++++++++++++++++++++++++++++++++++--- 1 file changed, 71 insertions(+), 5 deletions(-) diff --git a/tests/test_ppdb_bigquery.py b/tests/test_ppdb_bigquery.py index af520e5a..408a1307 100644 --- a/tests/test_ppdb_bigquery.py +++ b/tests/test_ppdb_bigquery.py @@ -21,11 +21,15 @@ import gc import os +import posixpath import shutil import tempfile import unittest +import uuid from typing import Any +import pytest + from lsst.dax.apdb import ( Apdb, ApdbConfig, @@ -50,8 +54,8 @@ "db_drop": True, "validate_config": False, "delete_existing_dirs": True, - "bucket_name": "test_bucket", - "object_prefix": "test_prefix", + "bucket_name": "ppdb-test", + "object_prefix": "data/test", "dataset_id": "test_dataset", "project_id": "test_project", } @@ -157,6 +161,12 @@ class PostgresTestCase(_PostgresMixin, PpdbTest, unittest.TestCase): """A test case for the PpdbBigQuery class using a Postgres backend.""" +def 
generate_test_bucket_name(test_prefix: str = "ppdb-test") -> str: + """Generate a unique bucket name for testing.""" + test_id = uuid.uuid4().hex[:16] + return f"{test_prefix}-{test_id}" + + class UpdateRecordsTestCase(_PostgresMixin, ApdbMixin, unittest.TestCase): """A test case for the PpdbBigQuery class update functionality using a Postgres backend. @@ -174,8 +184,8 @@ def setUp(self): apdb_replica = ApdbReplica.from_config(apdb_config) # Make PPDB instance. - ppdb_config = self.make_instance() - self.ppdb = Ppdb.from_config(ppdb_config) + self.ppdb_config = self.make_instance() + self.ppdb = Ppdb.from_config(self.ppdb_config) assert isinstance(self.ppdb, PpdbBigQuery) # Replicate those to PPDB. @@ -186,7 +196,7 @@ def setUp(self): # Copy chunks. replicator.run(exit_on_empty=True) - def test_update_records_json_serialization(self) -> None: + def test_json_serialization(self) -> None: """Test that the APDB update records are correctly saved to a JSON file in the replication output and can be read back correctly as UpdateRecords objects. @@ -366,3 +376,59 @@ def test_update_records_json_serialization(self) -> None: 0.0, "Unexpected dec in deserialized ApdbWithdrawDiaForcedSourceRecord, should not be 0.0", ) + + @pytest.mark.skipif( + pytest.importorskip("lsst.dax.ppdbx.gcp", reason="GCP support not installed") is None, + reason="GCP support is not installed", + ) + def test_gcs_upload(self) -> None: + """Test that the replication output, including the update records JSON + file, is correctly uploaded to Google Cloud Storage after replication. + + This will only run if there is GCP support installed. + """ + print("\nTesting GCS upload of replication output...") + + from lsst.dax.ppdb.bigquery.chunk_uploader import ChunkUploader + + # Patch the ChunkUploader to print the message that would be published + # to the Pub/Sub topic, because there is no support for that service in + # a test environment. 
+ class DummyChunkUploader(ChunkUploader): + def _post_to_stage_chunk_topic(self, bucket_name: str, chunk_prefix: str, chunk_id: int) -> None: + message = { + "dataset": self.dataset_id, + "chunk_id": str(chunk_id), + "folder": f"gs://{posixpath.join(bucket_name, chunk_prefix)}", + } + print(f"Dummy publish to Pub/Sub topic: {message}") + + # Configure the uploader to use a unique object prefix to avoid + # conflicts + ppdb_config_copy = self.ppdb_config.model_copy() + ppdb_config_copy.bucket_name = generate_test_bucket_name("ppdb-test-gcs-upload") + + from lsst.dax.ppdbx.gcp.gcs import StorageClient + + # Create the test GCS bucket + storage_client = StorageClient(ppdb_config_copy.bucket_name) + try: + storage_client.create_bucket() + except Exception as e: + self.fail(f"Failed to create test GCS bucket: {e}") + + uploader = DummyChunkUploader( + ppdb_config_copy, + wait_interval=0, + exit_on_empty=True, + exit_on_error=True, + ) + + print(f"Uploader will copy files to {uploader.bucket_name}/{uploader.prefix}/") + uploader.run() + + # Delete the test GCS bucket + try: + storage_client.delete_bucket(force=True) + except Exception as e: + self.fail(f"Failed to delete test GCS bucket: {e}") From ebfad2939366a62ca3b20185898da9ed07e34b0a Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 11 Feb 2026 17:41:12 -0600 Subject: [PATCH 07/49] Upload the JSON file with APDB record updates when present --- .../lsst/dax/ppdb/bigquery/chunk_uploader.py | 36 ++++++++----- python/lsst/dax/ppdb/bigquery/manifest.py | 15 ++++-- .../lsst/dax/ppdb/bigquery/ppdb_bigquery.py | 11 ++-- tests/test_ppdb_bigquery.py | 52 ++++++++++++------- 4 files changed, 76 insertions(+), 38 deletions(-) diff --git a/python/lsst/dax/ppdb/bigquery/chunk_uploader.py b/python/lsst/dax/ppdb/bigquery/chunk_uploader.py index 9efe0e22..d23a123c 100644 --- a/python/lsst/dax/ppdb/bigquery/chunk_uploader.py +++ b/python/lsst/dax/ppdb/bigquery/chunk_uploader.py @@ -237,15 +237,27 @@ def _process_chunk(self, 
replica_chunk: PpdbReplicaChunkExtended) -> None: ) # Make a list of local parquet files to upload. - parquet_files = list(chunk_dir.glob("*.parquet")) + upload_file_list = list(chunk_dir.glob("*.parquet")) + + # Include the update records file if the manifest indicates it should + # exist + if manifest.includes_update_records: + update_records_file = chunk_dir / "update_records.json" + if not update_records_file.exists(): + raise ChunkUploadError( + chunk_id, + f"Manifest indicates update records are included but file does not exist: " + f"{update_records_file}", + ) + upload_file_list.append(update_records_file) # Check if the chunk is expected to be empty. is_empty = manifest.is_empty_chunk() - if not parquet_files and not is_empty: + if not upload_file_list and not is_empty: # There is a mismatch between the manifest and the actual files. # Some processing error may have occurred when exporting. - raise ChunkUploadError(chunk_id, f"No parquet files found in {chunk_dir} for non-empty chunk") + raise ChunkUploadError(chunk_id, f"No files found to upload in {chunk_dir} for non-empty chunk") # Check that all expected parquet files from the manifest are present. for table_name, table_stats in manifest.table_data.items(): @@ -258,19 +270,16 @@ def _process_chunk(self, replica_chunk: PpdbReplicaChunkExtended) -> None: ) try: - # 1) Upload parquet files, which will happen only for non-empty - # chunks. - if parquet_files: - gcs_names = {path: posixpath.join(gcs_prefix, path.name) for path in parquet_files} + # 1) Upload the files to GCS for non-empty chunks. 
+ if upload_file_list: + gcs_names = {path: posixpath.join(gcs_prefix, path.name) for path in upload_file_list} try: - _LOG.info( - "Uploading %d parquet files to GCS under prefix: %s", len(gcs_names), gcs_prefix - ) + _LOG.info("Uploading %d files to GCS under prefix: %s", len(gcs_names), gcs_prefix) with Timer( "upload_files_time", _MON, tags={"prefix": str(gcs_prefix), "chunk_id": str(chunk_id)} ) as timer: self.storage.upload_files(gcs_names) - total_bytes = sum(p.stat().st_size for p in parquet_files) + total_bytes = sum(p.stat().st_size for p in upload_file_list) timer.add_values(file_count=len(gcs_names), total_bytes=total_bytes) except* UploadError as eg: raise ChunkUploadError(chunk_id, f"{len(eg.exceptions)} upload(s) failed") from eg @@ -284,7 +293,8 @@ def _process_chunk(self, replica_chunk: PpdbReplicaChunkExtended) -> None: except UploadError as e: raise ChunkUploadError(chunk_id, "Manifest upload failed") from e - # 3) Update DB status, but not for empty chunks. + # 3) Update status in the database, but not for empty chunks. + # They have already been marked as skipped during export. if not is_empty: try: self._bq.store_chunk(replica_chunk.with_new_status(ChunkStatus.UPLOADED), True) @@ -294,7 +304,7 @@ def _process_chunk(self, replica_chunk: PpdbReplicaChunkExtended) -> None: ) from e # 4) Publish Pub/Sub staging message to trigger BigQuery load, but - # not for empty chunks. (Empty chunks cannot be staged.) + # not for empty chunks. (Empty chunks do not need to be staged.) 
if not is_empty: try: self._post_to_stage_chunk_topic(self.bucket_name, gcs_prefix, chunk_id) diff --git a/python/lsst/dax/ppdb/bigquery/manifest.py b/python/lsst/dax/ppdb/bigquery/manifest.py index b53c6f5a..da0fa456 100644 --- a/python/lsst/dax/ppdb/bigquery/manifest.py +++ b/python/lsst/dax/ppdb/bigquery/manifest.py @@ -79,6 +79,10 @@ class Manifest(BaseModel): """Name of the compression format used for artifacts (e.g., "gzip", "zstd", "snappy", etc.).""" + includes_update_records: bool = False + """Whether the exported data includes update records (e.g., in a separate + file) or not (`bool`).""" + @property def filename(self) -> str: """Generate the filename for this manifest based on the replica chunk @@ -118,12 +122,15 @@ def from_json_file(cls, file_path: Path) -> Manifest: def is_empty_chunk(self) -> bool: """Check if the manifest represents an empty replica chunk in which - all tables have zero rows. + all tables have zero rows and no update records are included. Returns ------- bool - `True` if all tables have zero rows, indicating an empty chunk, - `False` otherwise. + `True` if all tables have zero rows and no update records are + included, indicating an empty chunk, `False` otherwise. 
""" - return all(table.row_count == 0 for table in self.table_data.values()) + return ( + all(table.row_count == 0 for table in self.table_data.values()) + and not self.includes_update_records + ) diff --git a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py index 871531a6..bf421015 100644 --- a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py +++ b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py @@ -151,7 +151,10 @@ def metadata(self) -> ApdbMetadata: return self._metadata def _generate_manifest( - self, replica_chunk: ReplicaChunk, table_dict: dict[str, ApdbTableData] + self, + replica_chunk: ReplicaChunk, + table_dict: dict[str, ApdbTableData], + update_records: Collection[ApdbUpdateRecord], ) -> Manifest: """Generate the manifest data for the replica chunk.""" return Manifest( @@ -164,6 +167,7 @@ def _generate_manifest( table_name: TableStats(row_count=len(data.rows())) for table_name, data in table_dict.items() }, compression_format=self.parq_compression, + includes_update_records=bool(update_records), ) def store( @@ -217,7 +221,7 @@ def store( # Create manifest for the replica chunk. try: - manifest = self._generate_manifest(replica_chunk, table_dict) + manifest = self._generate_manifest(replica_chunk, table_dict, update_records) _LOG.info("Generated manifest for %s: %s", replica_chunk.id, manifest.model_dump_json()) except Exception: _LOG.exception("Failed to generate manifest for %d", replica_chunk.id) @@ -578,7 +582,8 @@ def validate_config(cls, config: PpdbBigQueryConfig) -> None: def _handle_updates( self, replica_chunk: ReplicaChunk, apdb_update_records: Collection[ApdbUpdateRecord], chunk_dir: Path ) -> None: - """Handle updates to existing records in the PPDB. + """Handle updates to existing records in the PPDB by writing a JSON + file with the update information for the replica chunk. 
Parameters ---------- diff --git a/tests/test_ppdb_bigquery.py b/tests/test_ppdb_bigquery.py index 408a1307..21ee600d 100644 --- a/tests/test_ppdb_bigquery.py +++ b/tests/test_ppdb_bigquery.py @@ -20,6 +20,7 @@ # along with this program. If not, see . import gc +import json import os import posixpath import shutil @@ -180,7 +181,7 @@ def setUp(self): # Make APDB instance and fill it with test data. apdb_config = self.make_apdb_instance() apdb = Apdb.from_config(apdb_config) - self._fill_apdb(apdb) + self._fill_apdb(apdb) # FIXME: Only include replica chunks with the updates apdb_replica = ApdbReplica.from_config(apdb_config) # Make PPDB instance. @@ -378,22 +379,30 @@ def test_json_serialization(self) -> None: ) @pytest.mark.skipif( - pytest.importorskip("lsst.dax.ppdbx.gcp", reason="GCP support not installed") is None, - reason="GCP support is not installed", + pytest.importorskip("lsst.dax.ppdbx.gcp", reason="dax_ppdbx_gcp is not installed") is None, + reason="", ) - def test_gcs_upload(self) -> None: - """Test that the replication output, including the update records JSON - file, is correctly uploaded to Google Cloud Storage after replication. + def test_chunk_uploader(self) -> None: + """Test that the update records are correctly uploaded to Google Cloud + Storage after replication. - This will only run if there is GCP support installed. + This will only run if ``dax_ppdbx_gcp`` is installed, which provides + Google Cloud support. Imports are inlined so that the module can run + without it. 
""" + from lsst.dax.ppdb.bigquery.chunk_uploader import ChunkUploader + from lsst.dax.ppdbx.gcp.gcs import StorageClient + print("\nTesting GCS upload of replication output...") - from lsst.dax.ppdb.bigquery.chunk_uploader import ChunkUploader + # Change the configuration to use a unique test bucket name to avoid + # conflicts + ppdb_config_copy = self.ppdb_config.model_copy() + ppdb_config_copy.bucket_name = generate_test_bucket_name("ppdb-test-gcs-upload") # Patch the ChunkUploader to print the message that would be published - # to the Pub/Sub topic, because there is no support for that service in - # a test environment. + # to the Pub/Sub topic instead of publishing, because there is no + # support for that service in the test environment. class DummyChunkUploader(ChunkUploader): def _post_to_stage_chunk_topic(self, bucket_name: str, chunk_prefix: str, chunk_id: int) -> None: message = { @@ -403,13 +412,6 @@ def _post_to_stage_chunk_topic(self, bucket_name: str, chunk_prefix: str, chunk_ } print(f"Dummy publish to Pub/Sub topic: {message}") - # Configure the uploader to use a unique object prefix to avoid - # conflicts - ppdb_config_copy = self.ppdb_config.model_copy() - ppdb_config_copy.bucket_name = generate_test_bucket_name("ppdb-test-gcs-upload") - - from lsst.dax.ppdbx.gcp.gcs import StorageClient - # Create the test GCS bucket storage_client = StorageClient(ppdb_config_copy.bucket_name) try: @@ -417,16 +419,30 @@ def _post_to_stage_chunk_topic(self, bucket_name: str, chunk_prefix: str, chunk_ except Exception as e: self.fail(f"Failed to create test GCS bucket: {e}") + # Configure and run the uploader uploader = DummyChunkUploader( ppdb_config_copy, wait_interval=0, exit_on_empty=True, exit_on_error=True, ) - print(f"Uploader will copy files to {uploader.bucket_name}/{uploader.prefix}/") uploader.run() + # Retrieve the update records file + update_records_files = storage_client.list_files("**/update_records.json") + self.assertEqual( + 
len(update_records_files), + 1, + f"Expected exactly one update_records.json file in GCS, found " + f"{len(update_records_files)}: {update_records_files}", + ) + update_records_str = storage_client.read_as_string(update_records_files[0]) + + # Print the contents of the update records file for debugging + update_records_json = json.loads(update_records_str) + print(f"Contents of update_records.json in GCS:\n{json.dumps(update_records_json, indent=2)}") + # Delete the test GCS bucket try: storage_client.delete_bucket(force=True) From 6d7dca2cac3feb31f5ace9eb729fe9f323d1b1f3 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Fri, 13 Feb 2026 17:58:36 -0600 Subject: [PATCH 08/49] WIP: Add support for expanding update records (modules need to be renamed to camelcase) --- .../lsst/dax/ppdb/bigquery/update_handler.py | 290 ++++++++++++ .../lsst/dax/ppdb/bigquery/update_records.py | 17 +- tests/test_update_handler.py | 446 ++++++++++++++++++ 3 files changed, 748 insertions(+), 5 deletions(-) create mode 100644 python/lsst/dax/ppdb/bigquery/update_handler.py create mode 100644 tests/test_update_handler.py diff --git a/python/lsst/dax/ppdb/bigquery/update_handler.py b/python/lsst/dax/ppdb/bigquery/update_handler.py new file mode 100644 index 00000000..adf0e1fb --- /dev/null +++ b/python/lsst/dax/ppdb/bigquery/update_handler.py @@ -0,0 +1,290 @@ +# This file is part of dax_ppdb +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from __future__ import annotations + +from typing import Any + +from pydantic import BaseModel, Field + +from lsst.dax.apdb.apdbUpdateRecord import ApdbUpdateRecord + +from .update_records import UpdateRecords + + +# TODO: Move to an expandedUpdateRecord.py module +class ExpandedUpdateRecord(BaseModel): + """ + A single normalized (expanded) update row. + + This model represents one field-level update after expanding an + original logical update event into one row per updated field. + It is the canonical shape loaded into the BigQuery updates table. + """ + + table_name: str = Field( + ..., + min_length=1, + description=( + "Logical target table for the update (e.g., 'DiaObject', " + "'DiaSource'). This determines which production table " + "the update will be applied to." + ), + ) + + record_id: int = Field( + ..., + description=( + "Canonical primary key of the record being modified as an integer. " + "For composite keys, a single integer representation must be used." + ), + ) + + field_name: str = Field( + ..., + min_length=1, + description=( + "Name of the target column being updated within the logical table identified by 'table_name'." + ), + ) + + value_json: Any = Field( + ..., + description=( + "JSON-serializable new value for the field, including explicit " + "None to represent setting the column to NULL. This value must " + "be compatible with the BigQuery JSON type and later castable " + "to the target column type during MERGE." + ), + ) + + replica_chunk_id: int = Field( + ..., + ge=0, + description=( + "Source replica chunk identifier associated with this update. 
" + "Used as part of the deterministic ordering rule when resolving " + "multiple updates to the same (record_id, field_name)." + ), + ) + + update_order: int | None = Field( + default=None, + ge=0, + description=( + "Ordering value within the replica chunk or update batch, " + "if provided by the source system. Nullable if not available. " + "Used to break ties between updates within the same chunk." + ), + ) + + update_time_ns: int | None = Field( + default=None, + ge=0, + description=( + "Source event timestamp in nanoseconds since the epoch, " + "if provided. Nullable if not available. Used as an additional " + "ordering signal during deduplication." + ), + ) + + +# Move to `updateRecordExpander.py` to follow camelcase convention +class UpdateRecordExpander: + """Expand APDB update records into individual field-level updates for + BigQuery. + """ + + _UPDATE_FIELD_MAPPING = { + "reassign_diasource_to_diaobject": ["diaObjectId"], + "reassign_diasource_to_ssobject": ["ssObjectId", "ssObjectReassocTimeMjdTai"], + "withdraw_diasource": ["timeWithdrawnMjdTai"], + "withdraw_diaforcedsource": ["timeWithdrawnMjdTai"], + "close_diaobject_validity": ["validityEndMjdTai", "nDiaSources"], + "update_n_dia_sources": ["nDiaSources"], + } + + _RECORD_ID_FIELD_MAPPING = { + "reassign_diasource_to_diaobject": "diaSourceId", + "reassign_diasource_to_ssobject": "diaSourceId", + "withdraw_diasource": "diaSourceId", + "withdraw_diaforcedsource": ["diaObjectId", "visit", "detector"], + "close_diaobject_validity": "diaObjectId", + "update_n_dia_sources": "diaObjectId", + } + + @classmethod + def get_update_fields(cls, update_type: str) -> list[str]: + """Get the names of fields to update for a given update type. + + Parameters + ---------- + update_type : `str` + The type of update record. + + Returns + ------- + field_names : `list` [ `str` ] + List of field names that should be updated for this update type. + + Raises + ------ + ValueError + If the update_type is not recognized. 
+ """ + if update_type not in cls._UPDATE_FIELD_MAPPING: + raise ValueError(f"Unknown update_type: {update_type}") + + return cls._UPDATE_FIELD_MAPPING[update_type] + + @classmethod + def get_record_id_field(cls, update_type: str) -> str | list[str]: + """Get the field name(s) that serve as the record ID for a given update + type. + + Parameters + ---------- + update_type : `str` + The type of update record. + + Returns + ------- + field_name : `str` or `list` [ `str` ] + Name of the field that contains the record ID for this update type, + or list of field names for composite keys. + + Raises + ------ + ValueError + If the update_type is not recognized. + """ + if update_type not in cls._RECORD_ID_FIELD_MAPPING: + raise ValueError(f"Unknown update_type: {update_type}") + + return cls._RECORD_ID_FIELD_MAPPING[update_type] + + @classmethod + def expand_single_record( + cls, update_record: ApdbUpdateRecord, replica_chunk_id: int + ) -> list[ExpandedUpdateRecord]: + """Expand a single APDB update record into ExpandedUpdateRecord + objects. + + Parameters + ---------- + update_record : `ApdbUpdateRecord` + A single APDB update record to expand. + replica_chunk_id : `int` + The replica chunk ID associated with this update record. + + Returns + ------- + expanded_records : `list` [ `ExpandedUpdateRecord` ] + List of ExpandedUpdateRecord objects, one per field being updated. 
+ """ + update_type = update_record.update_type + field_names = cls.get_update_fields(update_type) + + # Get the target table from the update record + table_name = update_record.apdb_table.name + + # Get the record ID + record_id = cls._generate_record_id(update_record) + + expanded_records = [] + for field_name in field_names: + if not hasattr(update_record, field_name): + raise ValueError( + f"Update record of type {update_type} is missing expected field {field_name}" + ) + + value = getattr(update_record, field_name) + + expanded_record = ExpandedUpdateRecord( + table_name=table_name, + record_id=record_id, + field_name=field_name, + value_json=value, + replica_chunk_id=replica_chunk_id, + update_order=update_record.update_order, + update_time_ns=update_record.update_time_ns, + ) + expanded_records.append(expanded_record) + + return expanded_records + + @classmethod + def _generate_record_id(cls, update_record: ApdbUpdateRecord) -> int: + """Generate a record ID integer from an update record. + + Parameters + ---------- + update_record : `ApdbUpdateRecord` + The update record to generate an ID for. + + Returns + ------- + record_id : `int` + Integer representation of the record's primary key. 
+ """ + update_type = update_record.update_type + id_field = cls.get_record_id_field(update_type) + + if isinstance(id_field, list): + # Handle composite key (e.g., DiaForcedSource) + key_values = [] + for field in id_field: + if not hasattr(update_record, field): + raise ValueError( + f"Update record of type {update_type} is missing expected ID field {field}" + ) + key_values.append(getattr(update_record, field)) + # Create a hash of the composite key components + return hash(tuple(key_values)) + else: + # Handle single field key + if not hasattr(update_record, id_field): + raise ValueError( + f"Update record of type {update_type} is missing expected ID field {id_field}" + ) + return int(getattr(update_record, id_field)) + + @classmethod + def expand_updates(cls, update_records: UpdateRecords) -> list[ExpandedUpdateRecord]: + """Expand the APDB update records into a list of individual updates. + + Parameters + ---------- + update_records : `UpdateRecords` + The APDB update records to expand. + + Returns + ------- + expanded_updates : `list` [ `ExpandedUpdateRecord` ] + A list of individual updates derived from the input update records. 
+ """ + expanded_updates = [] + + for update_record in update_records.records: + expanded_records = cls.expand_single_record(update_record, update_records.replica_chunk_id) + expanded_updates.extend(expanded_records) + + return expanded_updates diff --git a/python/lsst/dax/ppdb/bigquery/update_records.py b/python/lsst/dax/ppdb/bigquery/update_records.py index 56e651be..47c63f01 100644 --- a/python/lsst/dax/ppdb/bigquery/update_records.py +++ b/python/lsst/dax/ppdb/bigquery/update_records.py @@ -29,7 +29,11 @@ from lsst.dax.apdb.apdbUpdateRecord import ApdbUpdateRecord +DEFAULT_FILENAME = "update_records.json" +"""Default filename for the update records JSON file.""" + +# Move to `updateRecords.py` to follow camelcase convention class UpdateRecords(BaseModel): """Data model for APDB update records.""" @@ -43,7 +47,7 @@ class UpdateRecords(BaseModel): """List of APDB update records included in this object.""" @field_serializer("records") - def serialize_update_records( + def serialize_records( self, records: list[ApdbUpdateRecord], ) -> list[dict[str, Any]]: @@ -60,18 +64,16 @@ def serialize_update_records( The serialized JSON data. 
""" serialized_records: list[dict[str, Any]] = [] - for update_record in records: record_dict: dict[str, Any] = json.loads(update_record.to_json()) record_dict["update_time_ns"] = update_record.update_time_ns record_dict["update_order"] = update_record.update_order serialized_records.append(record_dict) - return serialized_records @field_validator("records", mode="before") @classmethod - def deserialize_update_records( + def deserialize_records( cls, records: list[dict[str, Any]] | list[ApdbUpdateRecord], ) -> list[ApdbUpdateRecord]: @@ -104,7 +106,7 @@ def deserialize_update_records( deserialized_records.append(update_record) return deserialized_records - def to_json_file(self, path: Path) -> None: + def write_json_file(self, path: Path) -> None: with open(path, "w") as f: json.dump(self.model_dump(), f, indent=2, default=str) @@ -113,3 +115,8 @@ def from_json_file(cls, path: Path) -> UpdateRecords: with open(path) as f: data = json.load(f) return cls.model_validate(data) + + @classmethod + def from_json_string(cls, json_str: str) -> UpdateRecords: + data = json.loads(json_str) + return cls.model_validate(data) diff --git a/tests/test_update_handler.py b/tests/test_update_handler.py new file mode 100644 index 00000000..02ea0162 --- /dev/null +++ b/tests/test_update_handler.py @@ -0,0 +1,446 @@ +# This file is part of dax_ppdb. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (http://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import datetime +import unittest + +import astropy.time + +from lsst.dax.apdb import ( + ApdbCloseDiaObjectValidityRecord, + ApdbReassignDiaSourceToDiaObjectRecord, + ApdbReassignDiaSourceToSSObjectRecord, + ApdbUpdateNDiaSourcesRecord, + ApdbWithdrawDiaForcedSourceRecord, + ApdbWithdrawDiaSourceRecord, +) +from lsst.dax.ppdb.bigquery.update_handler import ExpandedUpdateRecord, UpdateRecordExpander +from lsst.dax.ppdb.bigquery.update_records import UpdateRecords + + +# Move to `test_updateRecordExpander.py` to follow camelcase convention +class TestUpdateRecordExpander(unittest.TestCase): + """Test UpdateRecordExpander functionality.""" + + def setUp(self) -> None: + """Set up test fixtures.""" + # Test time for consistent timestamps + self.update_time = astropy.time.Time("2021-03-01T12:00:00", format="isot", scale="tai") + self.update_time_ns = int(self.update_time.unix_tai * 1e9) + + # Test replica chunk ID + self.replica_chunk_id = 12345 + + def _create_test_update_records(self) -> UpdateRecords: + """Create test UpdateRecords with sample ApdbUpdateRecord instances. + + Based on patterns from _ppdb.py _make_update_records method. 
+ """ + records = [] + + # Reassign DIASource to different DIAObject + records.append( + ApdbReassignDiaSourceToDiaObjectRecord( + update_time_ns=self.update_time_ns, + update_order=0, + diaSourceId=100001, + diaObjectId=300001, + ra=45.0, + dec=-30.0, + midpointMjdTai=60000.0, + ) + ) + + # Reassign DIASource to SSObject + records.append( + ApdbReassignDiaSourceToSSObjectRecord( + update_time_ns=self.update_time_ns, + update_order=1, + diaSourceId=100002, + ssObjectId=2001, + ssObjectReassocTimeMjdTai=float(self.update_time.tai.mjd), + ra=45.0, + dec=-30.0, + midpointMjdTai=60000.0, + ) + ) + + # Withdraw DIASource + records.append( + ApdbWithdrawDiaSourceRecord( + update_time_ns=self.update_time_ns, + update_order=2, + diaSourceId=100003, + timeWithdrawnMjdTai=self.update_time.tai.mjd, + ra=45.0, + dec=-30.0, + midpointMjdTai=60000.0, + ) + ) + + # Withdraw DIAForcedSource + records.append( + ApdbWithdrawDiaForcedSourceRecord( + update_time_ns=self.update_time_ns, + update_order=3, + diaObjectId=200001, + visit=12345, + detector=42, + timeWithdrawnMjdTai=self.update_time.tai.mjd, + ra=45.0, + dec=-30.0, + midpointMjdTai=60000.0, + ) + ) + + # Close DIAObject validity interval + records.append( + ApdbCloseDiaObjectValidityRecord( + update_time_ns=self.update_time_ns, + update_order=4, + diaObjectId=200001, + validityEndMjdTai=self.update_time.tai.mjd, + nDiaSources=5, + ra=45.0, + dec=-30.0, + ) + ) + + # Update DIAObject nDiaSources count + records.append( + ApdbUpdateNDiaSourcesRecord( + update_time_ns=self.update_time_ns, + update_order=5, + diaObjectId=200002, + nDiaSources=10, + ra=45.0, + dec=-30.0, + ) + ) + + return UpdateRecords( + replica_chunk_id=self.replica_chunk_id, + record_count=len(records), + records=records, + file_created_at=datetime.datetime.now(datetime.UTC), + ) + + def test_get_update_fields(self) -> None: + """Test get_update_fields class method.""" + # Test known update types + self.assertEqual( + 
UpdateRecordExpander.get_update_fields("reassign_diasource_to_diaobject"), ["diaObjectId"] + ) + self.assertEqual( + UpdateRecordExpander.get_update_fields("reassign_diasource_to_ssobject"), + ["ssObjectId", "ssObjectReassocTimeMjdTai"], + ) + self.assertEqual( + UpdateRecordExpander.get_update_fields("withdraw_diasource"), ["timeWithdrawnMjdTai"] + ) + self.assertEqual( + UpdateRecordExpander.get_update_fields("withdraw_diaforcedsource"), ["timeWithdrawnMjdTai"] + ) + self.assertEqual( + UpdateRecordExpander.get_update_fields("close_diaobject_validity"), + ["validityEndMjdTai", "nDiaSources"], + ) + self.assertEqual(UpdateRecordExpander.get_update_fields("update_n_dia_sources"), ["nDiaSources"]) + + # Test unknown update type + with self.assertRaises(ValueError) as cm: + UpdateRecordExpander.get_update_fields("unknown_update_type") + self.assertIn("Unknown update_type: unknown_update_type", str(cm.exception)) + + def test_get_record_id_field(self) -> None: + """Test get_record_id_field class method.""" + # Test known update types + self.assertEqual( + UpdateRecordExpander.get_record_id_field("reassign_diasource_to_diaobject"), "diaSourceId" + ) + self.assertEqual( + UpdateRecordExpander.get_record_id_field("reassign_diasource_to_ssobject"), "diaSourceId" + ) + self.assertEqual(UpdateRecordExpander.get_record_id_field("withdraw_diasource"), "diaSourceId") + self.assertEqual( + UpdateRecordExpander.get_record_id_field("withdraw_diaforcedsource"), + ["diaObjectId", "visit", "detector"], + ) + self.assertEqual(UpdateRecordExpander.get_record_id_field("close_diaobject_validity"), "diaObjectId") + self.assertEqual(UpdateRecordExpander.get_record_id_field("update_n_dia_sources"), "diaObjectId") + + # Test unknown update type + with self.assertRaises(ValueError) as cm: + UpdateRecordExpander.get_record_id_field("unknown_update_type") + self.assertIn("Unknown update_type: unknown_update_type", str(cm.exception)) + + def test_expand_single_record_reassign_to_diaobject(self) 
-> None: + """Test expand_single_record with ApdbReassignDiaSourceToDiaObjectRecord.""" + record = ApdbReassignDiaSourceToDiaObjectRecord( + update_time_ns=self.update_time_ns, + update_order=0, + diaSourceId=100001, + diaObjectId=300001, + ra=45.0, + dec=-30.0, + midpointMjdTai=60000.0, + ) + + expanded = UpdateRecordExpander.expand_single_record(record, self.replica_chunk_id) + + # Should expand to 1 record (diaObjectId) + self.assertEqual(len(expanded), 1) + + expanded_record = expanded[0] + self.assertIsInstance(expanded_record, ExpandedUpdateRecord) + self.assertEqual(expanded_record.table_name, "DiaSource") + self.assertEqual(expanded_record.record_id, 100001) + self.assertEqual(expanded_record.field_name, "diaObjectId") + self.assertEqual(expanded_record.value_json, 300001) + self.assertEqual(expanded_record.replica_chunk_id, self.replica_chunk_id) + self.assertEqual(expanded_record.update_order, 0) + self.assertEqual(expanded_record.update_time_ns, self.update_time_ns) + """Test expand_single_record with + ApdbReassignDiaSourceToSSObjectRecord. 
+ """ + record = ApdbReassignDiaSourceToSSObjectRecord( + update_time_ns=self.update_time_ns, + update_order=0, + diaSourceId=100001, + ssObjectId=2001, + ssObjectReassocTimeMjdTai=float(self.update_time.tai.mjd), + ra=45.0, + dec=-30.0, + midpointMjdTai=60000.0, + ) + + expanded = UpdateRecordExpander.expand_single_record(record, self.replica_chunk_id) + + # Should expand to 2 records (ssObjectId and ssObjectReassocTimeMjdTai) + self.assertEqual(len(expanded), 2) + + # Check first expanded record (ssObjectId) + first_record = expanded[0] + self.assertIsInstance(first_record, ExpandedUpdateRecord) + self.assertEqual(first_record.table_name, "DiaSource") + self.assertEqual(first_record.record_id, 100001) + self.assertEqual(first_record.field_name, "ssObjectId") + self.assertEqual(first_record.value_json, 2001) + self.assertEqual(first_record.replica_chunk_id, self.replica_chunk_id) + self.assertEqual(first_record.update_order, 0) + self.assertEqual(first_record.update_time_ns, self.update_time_ns) + + # Check second expanded record (ssObjectReassocTimeMjdTai) + second_record = expanded[1] + self.assertEqual(second_record.table_name, "DiaSource") + self.assertEqual(second_record.record_id, 100001) + self.assertEqual(second_record.field_name, "ssObjectReassocTimeMjdTai") + self.assertEqual(second_record.value_json, float(self.update_time.tai.mjd)) + + def test_expand_single_record_withdraw_diasource(self) -> None: + """Test expand_single_record with ApdbWithdrawDiaSourceRecord.""" + record = ApdbWithdrawDiaSourceRecord( + update_time_ns=self.update_time_ns, + update_order=2, + diaSourceId=100003, + timeWithdrawnMjdTai=self.update_time.tai.mjd, + ra=45.0, + dec=-30.0, + midpointMjdTai=60000.0, + ) + + expanded = UpdateRecordExpander.expand_single_record(record, self.replica_chunk_id) + + # Should expand to 1 record (timeWithdrawnMjdTai) + self.assertEqual(len(expanded), 1) + + expanded_record = expanded[0] + self.assertEqual(expanded_record.table_name, "DiaSource") + 
self.assertEqual(expanded_record.record_id, 100003) + self.assertEqual(expanded_record.field_name, "timeWithdrawnMjdTai") + self.assertEqual(expanded_record.value_json, self.update_time.tai.mjd) + + def test_expand_single_record_reassign_to_ssobject(self) -> None: + """Test expand_single_record with ApdbCloseDiaObjectValidityRecord.""" + record = ApdbCloseDiaObjectValidityRecord( + update_time_ns=self.update_time_ns, + update_order=1, + diaObjectId=200001, + validityEndMjdTai=self.update_time.tai.mjd, + nDiaSources=5, + ra=45.0, + dec=-30.0, + ) + + expanded = UpdateRecordExpander.expand_single_record(record, self.replica_chunk_id) + + # Should expand to 2 records (validityEndMjdTai and nDiaSources) + self.assertEqual(len(expanded), 2) + + # Check first expanded record (validityEndMjdTai) + first_record = expanded[0] + self.assertEqual(first_record.table_name, "DiaObject") + self.assertEqual(first_record.record_id, 200001) + self.assertEqual(first_record.field_name, "validityEndMjdTai") + self.assertEqual(first_record.value_json, self.update_time.tai.mjd) + + # Check second expanded record (nDiaSources) + second_record = expanded[1] + self.assertEqual(second_record.table_name, "DiaObject") + self.assertEqual(second_record.record_id, 200001) + self.assertEqual(second_record.field_name, "nDiaSources") + self.assertEqual(second_record.value_json, 5) + + def test_expand_single_record_update_n_dia_sources(self) -> None: + """Test expand_single_record with ApdbUpdateNDiaSourcesRecord.""" + record = ApdbUpdateNDiaSourcesRecord( + update_time_ns=self.update_time_ns, + update_order=5, + diaObjectId=200002, + nDiaSources=10, + ra=45.0, + dec=-30.0, + ) + + expanded = UpdateRecordExpander.expand_single_record(record, self.replica_chunk_id) + + # Should expand to 1 record (nDiaSources) + self.assertEqual(len(expanded), 1) + + expanded_record = expanded[0] + self.assertEqual(expanded_record.table_name, "DiaObject") + self.assertEqual(expanded_record.record_id, 200002) + 
self.assertEqual(expanded_record.field_name, "nDiaSources") + self.assertEqual(expanded_record.value_json, 10) + + def test_expand_single_record_close_validity(self) -> None: + """Test expand_single_record with ApdbCloseDiaObjectValidityRecord.""" + record = ApdbCloseDiaObjectValidityRecord( + update_time_ns=self.update_time_ns, + update_order=4, + diaObjectId=200001, + validityEndMjdTai=self.update_time.tai.mjd, + nDiaSources=5, + ra=45.0, + dec=-30.0, + ) + + expanded = UpdateRecordExpander.expand_single_record(record, self.replica_chunk_id) + + # Should expand to 2 records (validityEndMjdTai and nDiaSources) + self.assertEqual(len(expanded), 2) + + # Check first expanded record (validityEndMjdTai) + first_record = expanded[0] + self.assertIsInstance(first_record, ExpandedUpdateRecord) + self.assertEqual(first_record.table_name, "DiaObject") + self.assertEqual(first_record.record_id, 200001) + self.assertEqual(first_record.field_name, "validityEndMjdTai") + self.assertEqual(first_record.value_json, self.update_time.tai.mjd) + + # Check second expanded record (nDiaSources) + second_record = expanded[1] + self.assertEqual(second_record.table_name, "DiaObject") + self.assertEqual(second_record.record_id, 200001) + self.assertEqual(second_record.field_name, "nDiaSources") + self.assertEqual(second_record.value_json, 5) + + def test_expand_single_record_withdraw_forcedsource(self) -> None: + """Test expand_single_record with ApdbWithdrawDiaForcedSourceRecord.""" + record = ApdbWithdrawDiaForcedSourceRecord( + update_time_ns=self.update_time_ns, + update_order=2, + diaObjectId=200001, + visit=12345, + detector=42, + timeWithdrawnMjdTai=self.update_time.tai.mjd, + ra=45.0, + dec=-30.0, + midpointMjdTai=60000.0, + ) + + expanded = UpdateRecordExpander.expand_single_record(record, self.replica_chunk_id) + + # Should expand to 1 record (timeWithdrawnMjdTai) + self.assertEqual(len(expanded), 1) + + expanded_record = expanded[0] + self.assertEqual(expanded_record.table_name, 
"DiaForcedSource") + # The record ID should be a hash of the composite key (diaObjectId, + # visit, detector) + expected_record_id = hash((200001, 12345, 42)) + self.assertEqual(expanded_record.record_id, expected_record_id) + self.assertEqual(expanded_record.field_name, "timeWithdrawnMjdTai") + self.assertEqual(expanded_record.value_json, self.update_time.tai.mjd) + + def test_expand_updates_full_integration(self) -> None: + """Test the full expand_updates method with multiple record types.""" + update_records = self._create_test_update_records() + + expanded = UpdateRecordExpander.expand_updates(update_records) + + # Should have 8 total expanded records: + # - 1 from ApdbReassignDiaSourceToDiaObjectRecord + # - 2 from ApdbReassignDiaSourceToSSObjectRecord + # - 1 from ApdbWithdrawDiaSourceRecord + # - 1 from ApdbWithdrawDiaForcedSourceRecord + # - 2 from ApdbCloseDiaObjectValidityRecord + # - 1 from ApdbUpdateNDiaSourcesRecord + self.assertEqual(len(expanded), 8) + + # Verify all expanded records have correct replica_chunk_id + for record in expanded: + self.assertEqual(record.replica_chunk_id, self.replica_chunk_id) + self.assertIsInstance(record.update_time_ns, int) + self.assertIsInstance(record.update_order, int) + + # Check that we have the expected table names + table_names = {record.table_name for record in expanded} + expected_tables = {"DiaSource", "DiaObject", "DiaForcedSource"} + self.assertEqual(table_names, expected_tables) + + # Check that we have the expected field names + field_names = {record.field_name for record in expanded} + expected_fields = { + "diaObjectId", # from reassign to diaobject + "ssObjectId", + "ssObjectReassocTimeMjdTai", # from reassign to ssobject + "timeWithdrawnMjdTai", # from withdraw diasource and withdraw forced source + "validityEndMjdTai", + "nDiaSources", # from close validity and update n dia sources + } + self.assertEqual(field_names, expected_fields) + + def test_expand_updates_empty_records(self) -> None: + """Test 
expand_updates with empty records list.""" + empty_update_records = UpdateRecords( + replica_chunk_id=self.replica_chunk_id, + record_count=0, + records=[], + file_created_at=datetime.datetime.now(datetime.UTC), + ) + + expanded = UpdateRecordExpander.expand_updates(empty_update_records) + self.assertEqual(len(expanded), 0) + + +if __name__ == "__main__": + unittest.main() From dcbf99267363d0353f4c06f9863afcc054f93688 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Fri, 13 Feb 2026 17:58:56 -0600 Subject: [PATCH 09/49] WIP: Updates to ppdb_bigquery and test --- .../lsst/dax/ppdb/bigquery/ppdb_bigquery.py | 2 +- tests/test_ppdb_bigquery.py | 39 ++++++++++++------- 2 files changed, 27 insertions(+), 14 deletions(-) diff --git a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py index bf421015..adc13673 100644 --- a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py +++ b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py @@ -602,7 +602,7 @@ def _handle_updates( records=apdb_update_records, record_count=len(apdb_update_records), ) - update_records.to_json_file(chunk_dir / "update_records.json") + update_records.write_json_file(chunk_dir / "update_records.json") _LOG.info( "Saved %d update records for %s to %s", diff --git a/tests/test_ppdb_bigquery.py b/tests/test_ppdb_bigquery.py index 21ee600d..dd4d0954 100644 --- a/tests/test_ppdb_bigquery.py +++ b/tests/test_ppdb_bigquery.py @@ -169,8 +169,8 @@ def generate_test_bucket_name(test_prefix: str = "ppdb-test") -> str: class UpdateRecordsTestCase(_PostgresMixin, ApdbMixin, unittest.TestCase): - """A test case for the PpdbBigQuery class update functionality using a - Postgres backend. + """A test case for the handling of APDB record updates by PpdbBigQuery and + related classes including the ChunkUploader. 
""" include_update_records = True @@ -189,18 +189,16 @@ def setUp(self): self.ppdb = Ppdb.from_config(self.ppdb_config) assert isinstance(self.ppdb, PpdbBigQuery) - # Replicate those to PPDB. + # Replicate APDB replica chunks to the PPDB. replicator = Replicator( apdb_replica, self.ppdb, update=False, min_wait_time=0, max_wait_time=0, check_interval=0 ) - - # Copy chunks. replicator.run(exit_on_empty=True) def test_json_serialization(self) -> None: """Test that the APDB update records are correctly saved to a JSON file - in the replication output and can be read back correctly as - UpdateRecords objects. + in the replication output and can be read back as valid UpdateRecords + objects. """ update_records_path = self.ppdb.replication_path / "2021/03/01/1614600000" / "update_records.json" self.assertTrue(update_records_path.exists(), "Update records file not found in replication output") @@ -385,16 +383,10 @@ def test_json_serialization(self) -> None: def test_chunk_uploader(self) -> None: """Test that the update records are correctly uploaded to Google Cloud Storage after replication. - - This will only run if ``dax_ppdbx_gcp`` is installed, which provides - Google Cloud support. Imports are inlined so that the module can run - without it. """ from lsst.dax.ppdb.bigquery.chunk_uploader import ChunkUploader from lsst.dax.ppdbx.gcp.gcs import StorageClient - print("\nTesting GCS upload of replication output...") - # Change the configuration to use a unique test bucket name to avoid # conflicts ppdb_config_copy = self.ppdb_config.model_copy() @@ -443,6 +435,27 @@ def _post_to_stage_chunk_topic(self, bucket_name: str, chunk_prefix: str, chunk_ update_records_json = json.loads(update_records_str) print(f"Contents of update_records.json in GCS:\n{json.dumps(update_records_json, indent=2)}") + # Load the update records into the data model and perform a few basic + # checks (test_json_serialization already tests this in detail, so we + # just check a few key fields here). 
+ update_records = UpdateRecords.model_validate(update_records_json) + self.assertEqual( + update_records.replica_chunk_id, + 1614600000, + "Unexpected replica chunk ID in update records file from GCS", + ) + self.assertEqual( + update_records.record_count, + 3, + f"Expected record_count of 3 in update records file from GCS, found " + f"{update_records.record_count}", + ) + self.assertEqual( + len(update_records.records), + 3, + f"Expected 3 update records in the file from GCS, found {len(update_records.records)}", + ) + # Delete the test GCS bucket try: storage_client.delete_bucket(force=True) From 2312aeefa0067a9306f17e47b5612b0f59150e58 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Tue, 17 Feb 2026 15:36:49 -0600 Subject: [PATCH 10/49] Create new package for handling APDB updates --- .../lsst/dax/ppdb/bigquery/ppdb_bigquery.py | 2 +- .../dax/ppdb/bigquery/updates/__init__.py | 23 +++++++++++++++++++ .../bigquery/{ => updates}/update_handler.py | 0 .../bigquery/{ => updates}/update_records.py | 0 tests/test_ppdb_bigquery.py | 2 +- tests/test_update_handler.py | 3 +-- 6 files changed, 26 insertions(+), 4 deletions(-) create mode 100644 python/lsst/dax/ppdb/bigquery/updates/__init__.py rename python/lsst/dax/ppdb/bigquery/{ => updates}/update_handler.py (100%) rename python/lsst/dax/ppdb/bigquery/{ => updates}/update_records.py (100%) diff --git a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py index adc13673..a6c72fa9 100644 --- a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py +++ b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py @@ -46,7 +46,7 @@ from ..sql import PpdbSqlBase, PpdbSqlBaseConfig from .manifest import Manifest, TableStats from .ppdb_replica_chunk_extended import ChunkStatus, PpdbReplicaChunkExtended -from .update_records import UpdateRecords +from .updates.update_records import UpdateRecords __all__ = ["ConfigValidationError", "PpdbBigQuery", "PpdbBigQueryConfig"] diff --git 
a/python/lsst/dax/ppdb/bigquery/updates/__init__.py b/python/lsst/dax/ppdb/bigquery/updates/__init__.py new file mode 100644 index 00000000..dd3a25f1 --- /dev/null +++ b/python/lsst/dax/ppdb/bigquery/updates/__init__.py @@ -0,0 +1,23 @@ +# This file is part of dax_ppdb +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +from .update_records import UpdateRecords +from .update_handler import ExpandedUpdateRecord, UpdateRecordExpander diff --git a/python/lsst/dax/ppdb/bigquery/update_handler.py b/python/lsst/dax/ppdb/bigquery/updates/update_handler.py similarity index 100% rename from python/lsst/dax/ppdb/bigquery/update_handler.py rename to python/lsst/dax/ppdb/bigquery/updates/update_handler.py diff --git a/python/lsst/dax/ppdb/bigquery/update_records.py b/python/lsst/dax/ppdb/bigquery/updates/update_records.py similarity index 100% rename from python/lsst/dax/ppdb/bigquery/update_records.py rename to python/lsst/dax/ppdb/bigquery/updates/update_records.py diff --git a/tests/test_ppdb_bigquery.py b/tests/test_ppdb_bigquery.py index dd4d0954..5bf253bd 100644 --- a/tests/test_ppdb_bigquery.py +++ b/tests/test_ppdb_bigquery.py @@ -40,7 +40,7 @@ from lsst.dax.apdb.sql import ApdbSql from lsst.dax.ppdb import Ppdb, PpdbConfig from lsst.dax.ppdb.bigquery import PpdbBigQuery -from lsst.dax.ppdb.bigquery.update_records import UpdateRecords +from lsst.dax.ppdb.bigquery.updates import UpdateRecords from lsst.dax.ppdb.replicator import Replicator from lsst.dax.ppdb.tests import ApdbMixin, PpdbTest diff --git a/tests/test_update_handler.py b/tests/test_update_handler.py index 02ea0162..06abc73f 100644 --- a/tests/test_update_handler.py +++ b/tests/test_update_handler.py @@ -32,8 +32,7 @@ ApdbWithdrawDiaForcedSourceRecord, ApdbWithdrawDiaSourceRecord, ) -from lsst.dax.ppdb.bigquery.update_handler import ExpandedUpdateRecord, UpdateRecordExpander -from lsst.dax.ppdb.bigquery.update_records import UpdateRecords +from lsst.dax.ppdb.bigquery.updates import ExpandedUpdateRecord, UpdateRecordExpander, UpdateRecords # Move to `test_updateRecordExpander.py` to follow camelcase convention From e307d47eed39be8987bffd3c703f283aa4888dfd Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Tue, 17 Feb 2026 15:43:06 -0600 Subject: [PATCH 11/49] Add expanded_update_record module --- 
.../updates/expanded_update_record.py | 76 +++++++++++++++++ .../ppdb/bigquery/updates/update_handler.py | 83 +------------------ 2 files changed, 77 insertions(+), 82 deletions(-) create mode 100644 python/lsst/dax/ppdb/bigquery/updates/expanded_update_record.py diff --git a/python/lsst/dax/ppdb/bigquery/updates/expanded_update_record.py b/python/lsst/dax/ppdb/bigquery/updates/expanded_update_record.py new file mode 100644 index 00000000..fa5df388 --- /dev/null +++ b/python/lsst/dax/ppdb/bigquery/updates/expanded_update_record.py @@ -0,0 +1,76 @@ +# This file is part of dax_ppdb +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from __future__ import annotations + +from typing import Any + +from pydantic import BaseModel, Field + + +class ExpandedUpdateRecord(BaseModel): + """ + A single normalized (expanded) update row. + + This model represents one field-level update after expanding an + original logical update event into one row per updated field. + It is the canonical shape loaded into the BigQuery updates table. 
+ """ + + table_name: str = Field( + ..., + min_length=1, + description=("Logical target table for the update (e.g., 'DiaObject', 'DiaSource')."), + ) + + record_id: int = Field( + ..., + description=("Canonical identifier of the record being modified."), + ) + + field_name: str = Field( + ..., + min_length=1, + description=("Name of the target column being updated."), + ) + + value_json: Any = Field( + ..., + description=("JSON-serializable new value for the field."), + ) + + replica_chunk_id: int = Field( + ..., + ge=0, + description=("Source replica chunk identifier associated with this update."), + ) + + update_order: int | None = Field( + default=None, + ge=0, + description=("Ordering value within the replica chunk or update batch."), + ) + + update_time_ns: int | None = Field( + default=None, + ge=0, + description=("Source event timestamp in nanoseconds since the epoch."), + ) diff --git a/python/lsst/dax/ppdb/bigquery/updates/update_handler.py b/python/lsst/dax/ppdb/bigquery/updates/update_handler.py index adf0e1fb..15191ca5 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/update_handler.py +++ b/python/lsst/dax/ppdb/bigquery/updates/update_handler.py @@ -21,93 +21,12 @@ from __future__ import annotations -from typing import Any - -from pydantic import BaseModel, Field - from lsst.dax.apdb.apdbUpdateRecord import ApdbUpdateRecord +from .expanded_update_record import ExpandedUpdateRecord from .update_records import UpdateRecords -# TODO: Move to an expandedUpdateRecord.py module -class ExpandedUpdateRecord(BaseModel): - """ - A single normalized (expanded) update row. - - This model represents one field-level update after expanding an - original logical update event into one row per updated field. - It is the canonical shape loaded into the BigQuery updates table. - """ - - table_name: str = Field( - ..., - min_length=1, - description=( - "Logical target table for the update (e.g., 'DiaObject', " - "'DiaSource'). 
This determines which production table " - "the update will be applied to." - ), - ) - - record_id: int = Field( - ..., - description=( - "Canonical primary key of the record being modified as an integer. " - "For composite keys, a single integer representation must be used." - ), - ) - - field_name: str = Field( - ..., - min_length=1, - description=( - "Name of the target column being updated within the logical table identified by 'table_name'." - ), - ) - - value_json: Any = Field( - ..., - description=( - "JSON-serializable new value for the field, including explicit " - "None to represent setting the column to NULL. This value must " - "be compatible with the BigQuery JSON type and later castable " - "to the target column type during MERGE." - ), - ) - - replica_chunk_id: int = Field( - ..., - ge=0, - description=( - "Source replica chunk identifier associated with this update. " - "Used as part of the deterministic ordering rule when resolving " - "multiple updates to the same (record_id, field_name)." - ), - ) - - update_order: int | None = Field( - default=None, - ge=0, - description=( - "Ordering value within the replica chunk or update batch, " - "if provided by the source system. Nullable if not available. " - "Used to break ties between updates within the same chunk." - ), - ) - - update_time_ns: int | None = Field( - default=None, - ge=0, - description=( - "Source event timestamp in nanoseconds since the epoch, " - "if provided. Nullable if not available. Used as an additional " - "ordering signal during deduplication." - ), - ) - - -# Move to `updateRecordExpander.py` to follow camelcase convention class UpdateRecordExpander: """Expand APDB update records into individual field-level updates for BigQuery. 
From 021af3d8eef93018e1b287afe0fb88b728aeb8c5 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Tue, 17 Feb 2026 15:51:35 -0600 Subject: [PATCH 12/49] Rename update_handler module to update_record_expander --- python/lsst/dax/ppdb/bigquery/updates/__init__.py | 3 ++- .../updates/{update_handler.py => update_record_expander.py} | 0 .../{test_update_handler.py => test_update_record_expander.py} | 1 - 3 files changed, 2 insertions(+), 2 deletions(-) rename python/lsst/dax/ppdb/bigquery/updates/{update_handler.py => update_record_expander.py} (100%) rename tests/{test_update_handler.py => test_update_record_expander.py} (99%) diff --git a/python/lsst/dax/ppdb/bigquery/updates/__init__.py b/python/lsst/dax/ppdb/bigquery/updates/__init__.py index dd3a25f1..35cd37ae 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/__init__.py +++ b/python/lsst/dax/ppdb/bigquery/updates/__init__.py @@ -20,4 +20,5 @@ # along with this program. If not, see . from .update_records import UpdateRecords -from .update_handler import ExpandedUpdateRecord, UpdateRecordExpander +from .expanded_update_record import ExpandedUpdateRecord +from .update_record_expander import UpdateRecordExpander diff --git a/python/lsst/dax/ppdb/bigquery/updates/update_handler.py b/python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py similarity index 100% rename from python/lsst/dax/ppdb/bigquery/updates/update_handler.py rename to python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py diff --git a/tests/test_update_handler.py b/tests/test_update_record_expander.py similarity index 99% rename from tests/test_update_handler.py rename to tests/test_update_record_expander.py index 06abc73f..54050e89 100644 --- a/tests/test_update_handler.py +++ b/tests/test_update_record_expander.py @@ -35,7 +35,6 @@ from lsst.dax.ppdb.bigquery.updates import ExpandedUpdateRecord, UpdateRecordExpander, UpdateRecords -# Move to `test_updateRecordExpander.py` to follow camelcase convention class 
TestUpdateRecordExpander(unittest.TestCase): """Test UpdateRecordExpander functionality.""" From 465cf32e2d18ebc769ea70a9d3f7cd46b6a6167d Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Tue, 17 Feb 2026 16:33:12 -0600 Subject: [PATCH 13/49] Model record_id as a list of integers --- .../updates/expanded_update_record.py | 8 +- .../updates/update_record_expander.py | 74 ++++++++++++------- tests/test_update_record_expander.py | 61 ++++++++------- 3 files changed, 89 insertions(+), 54 deletions(-) diff --git a/python/lsst/dax/ppdb/bigquery/updates/expanded_update_record.py b/python/lsst/dax/ppdb/bigquery/updates/expanded_update_record.py index fa5df388..d23c87c2 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/expanded_update_record.py +++ b/python/lsst/dax/ppdb/bigquery/updates/expanded_update_record.py @@ -41,9 +41,13 @@ class ExpandedUpdateRecord(BaseModel): description=("Logical target table for the update (e.g., 'DiaObject', 'DiaSource')."), ) - record_id: int = Field( + record_id: list[int] = Field( ..., - description=("Canonical identifier of the record being modified."), + description=( + "Identifier of the record being updated. For update types with a single record ID, this " + "will be a list of one element. For updates on records with a composite key " + "(e.g., DiaForcedSource), this will include all components of the key, in order." 
+ ), ) field_name: str = Field( diff --git a/python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py b/python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py index 15191ca5..18c632c9 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py +++ b/python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py @@ -75,7 +75,7 @@ def get_update_fields(cls, update_type: str) -> list[str]: return cls._UPDATE_FIELD_MAPPING[update_type] @classmethod - def get_record_id_field(cls, update_type: str) -> str | list[str]: + def get_record_id_field_names(cls, update_type: str) -> str | list[str]: """Get the field name(s) that serve as the record ID for a given update type. @@ -100,6 +100,32 @@ def get_record_id_field(cls, update_type: str) -> str | list[str]: return cls._RECORD_ID_FIELD_MAPPING[update_type] + @classmethod + def get_record_id_field(cls, update_type: str) -> str | list[str]: + """Get the field name(s) that serve as the record ID for a given update + type. + + This method is an alias for get_record_id_field_names for backward + compatibility. + + Parameters + ---------- + update_type : `str` + The type of update record. + + Returns + ------- + field_name : `str` or `list` [ `str` ] + Name of the field that contains the record ID for this update type, + or list of field names for composite keys. + + Raises + ------ + ValueError + If the update_type is not recognized. 
+ """ + return cls.get_record_id_field_names(update_type) + @classmethod def expand_single_record( cls, update_record: ApdbUpdateRecord, replica_chunk_id: int @@ -126,7 +152,7 @@ def expand_single_record( table_name = update_record.apdb_table.name # Get the record ID - record_id = cls._generate_record_id(update_record) + record_id = cls._get_record_id(update_record) expanded_records = [] for field_name in field_names: @@ -151,8 +177,8 @@ def expand_single_record( return expanded_records @classmethod - def _generate_record_id(cls, update_record: ApdbUpdateRecord) -> int: - """Generate a record ID integer from an update record. + def _get_record_id(cls, update_record: ApdbUpdateRecord) -> list[int]: + """Generate a record ID from an update record. Parameters ---------- @@ -161,30 +187,28 @@ def _generate_record_id(cls, update_record: ApdbUpdateRecord) -> int: Returns ------- - record_id : `int` - Integer representation of the record's primary key. + record_id : `list` [ `int` ] + The record ID as a list of integers. For simple keys, a + single-element list. For composite keys, a multi-element list. 
""" update_type = update_record.update_type - id_field = cls.get_record_id_field(update_type) - - if isinstance(id_field, list): - # Handle composite key (e.g., DiaForcedSource) - key_values = [] - for field in id_field: - if not hasattr(update_record, field): - raise ValueError( - f"Update record of type {update_type} is missing expected ID field {field}" - ) - key_values.append(getattr(update_record, field)) - # Create a hash of the composite key components - return hash(tuple(key_values)) + id_fields = cls.get_record_id_field_names(update_type) + + # Handle both single field (string) and composite fields (list) + if isinstance(id_fields, str): + # Single field key + field = id_fields + if not hasattr(update_record, field): + raise ValueError(f"Update record of type {update_type} is missing expected ID field {field}") + return [int(getattr(update_record, field))] else: - # Handle single field key - if not hasattr(update_record, id_field): - raise ValueError( - f"Update record of type {update_type} is missing expected ID field {id_field}" - ) - return int(getattr(update_record, id_field)) + # Composite key (list of fields) + record_id = [] + for field in id_fields: + if not hasattr(update_record, field): + raise ValueError(f"Update record of type {update_type} is missing expected ID field {field}") + record_id.append(int(getattr(update_record, field))) + return record_id @classmethod def expand_updates(cls, update_records: UpdateRecords) -> list[ExpandedUpdateRecord]: diff --git a/tests/test_update_record_expander.py b/tests/test_update_record_expander.py index 54050e89..b9f487a4 100644 --- a/tests/test_update_record_expander.py +++ b/tests/test_update_record_expander.py @@ -191,7 +191,9 @@ def test_get_record_id_field(self) -> None: self.assertIn("Unknown update_type: unknown_update_type", str(cm.exception)) def test_expand_single_record_reassign_to_diaobject(self) -> None: - """Test expand_single_record with ApdbReassignDiaSourceToDiaObjectRecord.""" + """Test 
expand_single_record with + ApdbReassignDiaSourceToDiaObjectRecord. + """ record = ApdbReassignDiaSourceToDiaObjectRecord( update_time_ns=self.update_time_ns, update_order=0, @@ -210,7 +212,7 @@ def test_expand_single_record_reassign_to_diaobject(self) -> None: expanded_record = expanded[0] self.assertIsInstance(expanded_record, ExpandedUpdateRecord) self.assertEqual(expanded_record.table_name, "DiaSource") - self.assertEqual(expanded_record.record_id, 100001) + self.assertEqual(expanded_record.record_id, [100001]) self.assertEqual(expanded_record.field_name, "diaObjectId") self.assertEqual(expanded_record.value_json, 300001) self.assertEqual(expanded_record.replica_chunk_id, self.replica_chunk_id) @@ -239,7 +241,7 @@ def test_expand_single_record_reassign_to_diaobject(self) -> None: first_record = expanded[0] self.assertIsInstance(first_record, ExpandedUpdateRecord) self.assertEqual(first_record.table_name, "DiaSource") - self.assertEqual(first_record.record_id, 100001) + self.assertEqual(first_record.record_id, [100001]) self.assertEqual(first_record.field_name, "ssObjectId") self.assertEqual(first_record.value_json, 2001) self.assertEqual(first_record.replica_chunk_id, self.replica_chunk_id) @@ -249,7 +251,7 @@ def test_expand_single_record_reassign_to_diaobject(self) -> None: # Check second expanded record (ssObjectReassocTimeMjdTai) second_record = expanded[1] self.assertEqual(second_record.table_name, "DiaSource") - self.assertEqual(second_record.record_id, 100001) + self.assertEqual(second_record.record_id, [100001]) self.assertEqual(second_record.field_name, "ssObjectReassocTimeMjdTai") self.assertEqual(second_record.value_json, float(self.update_time.tai.mjd)) @@ -272,40 +274,45 @@ def test_expand_single_record_withdraw_diasource(self) -> None: expanded_record = expanded[0] self.assertEqual(expanded_record.table_name, "DiaSource") - self.assertEqual(expanded_record.record_id, 100003) + self.assertEqual(expanded_record.record_id, [100003]) 
self.assertEqual(expanded_record.field_name, "timeWithdrawnMjdTai") self.assertEqual(expanded_record.value_json, self.update_time.tai.mjd) def test_expand_single_record_reassign_to_ssobject(self) -> None: - """Test expand_single_record with ApdbCloseDiaObjectValidityRecord.""" - record = ApdbCloseDiaObjectValidityRecord( + """Test expand_single_record with ApdbReassignDiaSourceToSSObjectRecord.""" + record = ApdbReassignDiaSourceToSSObjectRecord( update_time_ns=self.update_time_ns, update_order=1, - diaObjectId=200001, - validityEndMjdTai=self.update_time.tai.mjd, - nDiaSources=5, + diaSourceId=100002, + ssObjectId=2001, + ssObjectReassocTimeMjdTai=float(self.update_time.tai.mjd), ra=45.0, dec=-30.0, + midpointMjdTai=60000.0, ) expanded = UpdateRecordExpander.expand_single_record(record, self.replica_chunk_id) - # Should expand to 2 records (validityEndMjdTai and nDiaSources) + # Should expand to 2 records (ssObjectId and ssObjectReassocTimeMjdTai) self.assertEqual(len(expanded), 2) - # Check first expanded record (validityEndMjdTai) + # Check first expanded record (ssObjectId) first_record = expanded[0] - self.assertEqual(first_record.table_name, "DiaObject") - self.assertEqual(first_record.record_id, 200001) - self.assertEqual(first_record.field_name, "validityEndMjdTai") - self.assertEqual(first_record.value_json, self.update_time.tai.mjd) + self.assertIsInstance(first_record, ExpandedUpdateRecord) + self.assertEqual(first_record.table_name, "DiaSource") + self.assertEqual(first_record.record_id, [100002]) + self.assertEqual(first_record.field_name, "ssObjectId") + self.assertEqual(first_record.value_json, 2001) + self.assertEqual(first_record.replica_chunk_id, self.replica_chunk_id) + self.assertEqual(first_record.update_order, 1) + self.assertEqual(first_record.update_time_ns, self.update_time_ns) - # Check second expanded record (nDiaSources) + # Check second expanded record (ssObjectReassocTimeMjdTai) second_record = expanded[1] - 
self.assertEqual(second_record.table_name, "DiaObject") - self.assertEqual(second_record.record_id, 200001) - self.assertEqual(second_record.field_name, "nDiaSources") - self.assertEqual(second_record.value_json, 5) + self.assertEqual(second_record.table_name, "DiaSource") + self.assertEqual(second_record.record_id, [100002]) + self.assertEqual(second_record.field_name, "ssObjectReassocTimeMjdTai") + self.assertEqual(second_record.value_json, float(self.update_time.tai.mjd)) def test_expand_single_record_update_n_dia_sources(self) -> None: """Test expand_single_record with ApdbUpdateNDiaSourcesRecord.""" @@ -325,7 +332,7 @@ def test_expand_single_record_update_n_dia_sources(self) -> None: expanded_record = expanded[0] self.assertEqual(expanded_record.table_name, "DiaObject") - self.assertEqual(expanded_record.record_id, 200002) + self.assertEqual(expanded_record.record_id, [200002]) self.assertEqual(expanded_record.field_name, "nDiaSources") self.assertEqual(expanded_record.value_json, 10) @@ -350,14 +357,14 @@ def test_expand_single_record_close_validity(self) -> None: first_record = expanded[0] self.assertIsInstance(first_record, ExpandedUpdateRecord) self.assertEqual(first_record.table_name, "DiaObject") - self.assertEqual(first_record.record_id, 200001) + self.assertEqual(first_record.record_id, [200001]) self.assertEqual(first_record.field_name, "validityEndMjdTai") self.assertEqual(first_record.value_json, self.update_time.tai.mjd) # Check second expanded record (nDiaSources) second_record = expanded[1] self.assertEqual(second_record.table_name, "DiaObject") - self.assertEqual(second_record.record_id, 200001) + self.assertEqual(second_record.record_id, [200001]) self.assertEqual(second_record.field_name, "nDiaSources") self.assertEqual(second_record.value_json, 5) @@ -382,9 +389,9 @@ def test_expand_single_record_withdraw_forcedsource(self) -> None: expanded_record = expanded[0] self.assertEqual(expanded_record.table_name, "DiaForcedSource") - # The record ID 
should be a hash of the composite key (diaObjectId, - # visit, detector) - expected_record_id = hash((200001, 12345, 42)) + # The record ID should be a list of the composite key components + # [diaObjectId, visit, detector] for BigQuery compatibility + expected_record_id = [200001, 12345, 42] self.assertEqual(expanded_record.record_id, expected_record_id) self.assertEqual(expanded_record.field_name, "timeWithdrawnMjdTai") self.assertEqual(expanded_record.value_json, self.update_time.tai.mjd) From 4a9df5d54bb9b137029e9a693bdd98daf62990ca Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Tue, 17 Feb 2026 17:52:51 -0600 Subject: [PATCH 14/49] Add insertion of update records into BigQuery --- .../dax/ppdb/bigquery/updates/__init__.py | 1 + .../updates/expanded_update_record.py | 2 +- .../updates/update_record_expander.py | 6 +- .../ppdb/bigquery/updates/updates_table.py | 154 ++++++++++++++++ python/lsst/dax/ppdb/tests/_updates.py | 130 +++++++++++++ tests/test_update_record_expander.py | 174 +++--------------- tests/test_updates_table.py | 165 +++++++++++++++++ 7 files changed, 481 insertions(+), 151 deletions(-) create mode 100644 python/lsst/dax/ppdb/bigquery/updates/updates_table.py create mode 100644 python/lsst/dax/ppdb/tests/_updates.py create mode 100644 tests/test_updates_table.py diff --git a/python/lsst/dax/ppdb/bigquery/updates/__init__.py b/python/lsst/dax/ppdb/bigquery/updates/__init__.py index 35cd37ae..a21b1add 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/__init__.py +++ b/python/lsst/dax/ppdb/bigquery/updates/__init__.py @@ -22,3 +22,4 @@ from .update_records import UpdateRecords from .expanded_update_record import ExpandedUpdateRecord from .update_record_expander import UpdateRecordExpander +from .updates_table import UpdatesTable diff --git a/python/lsst/dax/ppdb/bigquery/updates/expanded_update_record.py b/python/lsst/dax/ppdb/bigquery/updates/expanded_update_record.py index d23c87c2..d59c9785 100644 --- 
a/python/lsst/dax/ppdb/bigquery/updates/expanded_update_record.py +++ b/python/lsst/dax/ppdb/bigquery/updates/expanded_update_record.py @@ -1,4 +1,4 @@ -# This file is part of dax_ppdb +# This file is part of dax_ppdb. # # Developed for the LSST Data Management System. # This product includes software developed by the LSST Project diff --git a/python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py b/python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py index 18c632c9..e9b6e911 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py +++ b/python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py @@ -1,4 +1,4 @@ -# This file is part of dax_ppdb +# This file is part of dax_ppdb. # # Developed for the LSST Data Management System. # This product includes software developed by the LSST Project @@ -206,7 +206,9 @@ def _get_record_id(cls, update_record: ApdbUpdateRecord) -> list[int]: record_id = [] for field in id_fields: if not hasattr(update_record, field): - raise ValueError(f"Update record of type {update_type} is missing expected ID field {field}") + raise ValueError( + f"Update record of type {update_type} is missing expected ID field {field}" + ) record_id.append(int(getattr(update_record, field))) return record_id diff --git a/python/lsst/dax/ppdb/bigquery/updates/updates_table.py b/python/lsst/dax/ppdb/bigquery/updates/updates_table.py new file mode 100644 index 00000000..c39d1592 --- /dev/null +++ b/python/lsst/dax/ppdb/bigquery/updates/updates_table.py @@ -0,0 +1,154 @@ +# This file is part of dax_ppdb. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. 
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from __future__ import annotations + +from collections.abc import Iterable +from typing import Any + +from google.cloud import bigquery + +from .expanded_update_record import ExpandedUpdateRecord + + +class UpdatesTable: + """ + Manage a BigQuery updates table for `ExpandedUpdateRecord` rows. + + This class is responsible for creating the updates table with the correct + schema and appending expanded update records into it. + """ + + def __init__(self, client: bigquery.Client, table_fqn: str) -> None: + """ + Parameters + ---------- + client + BigQuery client. + table_fqn + Fully-qualified table name in the form ``"project.dataset.table"``. + """ + self._client: bigquery.Client = client + self._table_fqn: str = table_fqn + + @property + def table_fqn(self) -> str: + """ + Fully-qualified BigQuery table name. + + Returns + ------- + str + Table name in the form ``"project.dataset.table"``. + """ + return self._table_fqn + + def create(self) -> bigquery.Table: + """ + Create the updates table. + + Returns + ------- + google.cloud.bigquery.Table + The created table. + + Raises + ------ + google.api_core.exceptions.Conflict + If the table already exists. 
+ + Notes + ----- + Schema: + + - table_name: STRING (REQUIRED) + - record_id: ARRAY (REQUIRED) + - field_name: STRING (REQUIRED) + - value_json: JSON (REQUIRED) + - replica_chunk_id: INT64 (REQUIRED) + - update_order: INT64 (NULLABLE) + - update_time_ns: INT64 (NULLABLE) + """ + schema: list[bigquery.SchemaField] = [ + bigquery.SchemaField("table_name", "STRING", mode="REQUIRED"), + bigquery.SchemaField("record_id", "INT64", mode="REPEATED"), + bigquery.SchemaField("field_name", "STRING", mode="REQUIRED"), + bigquery.SchemaField("value_json", "JSON", mode="REQUIRED"), + bigquery.SchemaField("replica_chunk_id", "INT64", mode="REQUIRED"), + bigquery.SchemaField("update_order", "INT64", mode="NULLABLE"), + bigquery.SchemaField("update_time_ns", "INT64", mode="NULLABLE"), + ] + + table = bigquery.Table(self._table_fqn, schema=schema) + return self._client.create_table(table) + + def append(self, records: Iterable[ExpandedUpdateRecord]) -> bigquery.LoadJob: + """ + Append `ExpandedUpdateRecord` rows into the updates table. + + Parameters + ---------- + records + Iterable of update records to append. + + Returns + ------- + google.cloud.bigquery.LoadJob + Completed BigQuery load job. + + Raises + ------ + RuntimeError + If the BigQuery load job completes with errors. + + Notes + ----- + This uses a batch load via `Client.load_table_from_json` (not streaming + inserts). The table must already exist. 
+ """ + rows: list[dict[str, Any]] = [ + { + "table_name": r.table_name, + "record_id": r.record_id, + "field_name": r.field_name, + "value_json": r.value_json, + "replica_chunk_id": r.replica_chunk_id, + "update_order": r.update_order, + "update_time_ns": r.update_time_ns, + } + for r in records + ] + + print("Appending rows to BigQuery:", rows) # Debug print to verify the data being loaded + + job = self._client.load_table_from_json( + rows, + self._table_fqn, + job_config=bigquery.LoadJobConfig( + write_disposition=bigquery.WriteDisposition.WRITE_APPEND, + ), + ) + job.result() + + if job.errors: + raise RuntimeError(f"BigQuery load failed: {job.errors}") + + return job diff --git a/python/lsst/dax/ppdb/tests/_updates.py b/python/lsst/dax/ppdb/tests/_updates.py new file mode 100644 index 00000000..52069359 --- /dev/null +++ b/python/lsst/dax/ppdb/tests/_updates.py @@ -0,0 +1,130 @@ +# This file is part of dax_ppdb. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (http://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +import datetime + +from lsst.dax.apdb import ( + ApdbCloseDiaObjectValidityRecord, + ApdbReassignDiaSourceToDiaObjectRecord, + ApdbReassignDiaSourceToSSObjectRecord, + ApdbUpdateNDiaSourcesRecord, + ApdbWithdrawDiaForcedSourceRecord, + ApdbWithdrawDiaSourceRecord, +) + +from ..bigquery.updates import UpdateRecords + + +def _create_test_update_records() -> UpdateRecords: + """Create test UpdateRecords with sample ApdbUpdateRecord instances.""" + records = [] + + # Hardcoded test values + test_update_time_ns = 1640995200000000000 # 2022-01-01 00:00:00 UTC in nanoseconds + test_mjd_tai = 59580.0 # Corresponding MJD TAI for 2022-01-01 + test_replica_chunk_id = 12345 + + # Reassign DIASource to different DIAObject + records.append( + ApdbReassignDiaSourceToDiaObjectRecord( + update_time_ns=test_update_time_ns, + update_order=0, + diaSourceId=100001, + diaObjectId=300001, + ra=45.0, + dec=-30.0, + midpointMjdTai=60000.0, + ) + ) + + # Reassign DIASource to SSObject + records.append( + ApdbReassignDiaSourceToSSObjectRecord( + update_time_ns=test_update_time_ns, + update_order=1, + diaSourceId=100002, + ssObjectId=2001, + ssObjectReassocTimeMjdTai=test_mjd_tai, + ra=45.0, + dec=-30.0, + midpointMjdTai=60000.0, + ) + ) + + # Withdraw DIASource + records.append( + ApdbWithdrawDiaSourceRecord( + update_time_ns=test_update_time_ns, + update_order=2, + diaSourceId=100003, + timeWithdrawnMjdTai=test_mjd_tai, + ra=45.0, + dec=-30.0, + midpointMjdTai=60000.0, + ) + ) + + # Withdraw DIAForcedSource + records.append( + ApdbWithdrawDiaForcedSourceRecord( + update_time_ns=test_update_time_ns, + update_order=3, + diaObjectId=200001, + visit=12345, + detector=42, + timeWithdrawnMjdTai=test_mjd_tai, + ra=45.0, + dec=-30.0, + midpointMjdTai=60000.0, + ) + ) + + # Close DIAObject validity interval + records.append( + ApdbCloseDiaObjectValidityRecord( + update_time_ns=test_update_time_ns, + update_order=4, + diaObjectId=200001, + validityEndMjdTai=test_mjd_tai, + nDiaSources=5, + 
ra=45.0, + dec=-30.0, + ) + ) + + # Update DIAObject nDiaSources count + records.append( + ApdbUpdateNDiaSourcesRecord( + update_time_ns=test_update_time_ns, + update_order=5, + diaObjectId=200002, + nDiaSources=10, + ra=45.0, + dec=-30.0, + ) + ) + + return UpdateRecords( + replica_chunk_id=test_replica_chunk_id, + record_count=len(records), + records=records, + file_created_at=datetime.datetime.now(datetime.UTC), + ) diff --git a/tests/test_update_record_expander.py b/tests/test_update_record_expander.py index b9f487a4..6da65484 100644 --- a/tests/test_update_record_expander.py +++ b/tests/test_update_record_expander.py @@ -33,9 +33,10 @@ ApdbWithdrawDiaSourceRecord, ) from lsst.dax.ppdb.bigquery.updates import ExpandedUpdateRecord, UpdateRecordExpander, UpdateRecords +from lsst.dax.ppdb.tests._updates import _create_test_update_records -class TestUpdateRecordExpander(unittest.TestCase): +class UpdateRecordExpanderTestCase(unittest.TestCase): """Test UpdateRecordExpander functionality.""" def setUp(self) -> None: @@ -47,100 +48,6 @@ def setUp(self) -> None: # Test replica chunk ID self.replica_chunk_id = 12345 - def _create_test_update_records(self) -> UpdateRecords: - """Create test UpdateRecords with sample ApdbUpdateRecord instances. - - Based on patterns from _ppdb.py _make_update_records method. 
- """ - records = [] - - # Reassign DIASource to different DIAObject - records.append( - ApdbReassignDiaSourceToDiaObjectRecord( - update_time_ns=self.update_time_ns, - update_order=0, - diaSourceId=100001, - diaObjectId=300001, - ra=45.0, - dec=-30.0, - midpointMjdTai=60000.0, - ) - ) - - # Reassign DIASource to SSObject - records.append( - ApdbReassignDiaSourceToSSObjectRecord( - update_time_ns=self.update_time_ns, - update_order=1, - diaSourceId=100002, - ssObjectId=2001, - ssObjectReassocTimeMjdTai=float(self.update_time.tai.mjd), - ra=45.0, - dec=-30.0, - midpointMjdTai=60000.0, - ) - ) - - # Withdraw DIASource - records.append( - ApdbWithdrawDiaSourceRecord( - update_time_ns=self.update_time_ns, - update_order=2, - diaSourceId=100003, - timeWithdrawnMjdTai=self.update_time.tai.mjd, - ra=45.0, - dec=-30.0, - midpointMjdTai=60000.0, - ) - ) - - # Withdraw DIAForcedSource - records.append( - ApdbWithdrawDiaForcedSourceRecord( - update_time_ns=self.update_time_ns, - update_order=3, - diaObjectId=200001, - visit=12345, - detector=42, - timeWithdrawnMjdTai=self.update_time.tai.mjd, - ra=45.0, - dec=-30.0, - midpointMjdTai=60000.0, - ) - ) - - # Close DIAObject validity interval - records.append( - ApdbCloseDiaObjectValidityRecord( - update_time_ns=self.update_time_ns, - update_order=4, - diaObjectId=200001, - validityEndMjdTai=self.update_time.tai.mjd, - nDiaSources=5, - ra=45.0, - dec=-30.0, - ) - ) - - # Update DIAObject nDiaSources count - records.append( - ApdbUpdateNDiaSourcesRecord( - update_time_ns=self.update_time_ns, - update_order=5, - diaObjectId=200002, - nDiaSources=10, - ra=45.0, - dec=-30.0, - ) - ) - - return UpdateRecords( - replica_chunk_id=self.replica_chunk_id, - record_count=len(records), - records=records, - file_created_at=datetime.datetime.now(datetime.UTC), - ) - def test_get_update_fields(self) -> None: """Test get_update_fields class method.""" # Test known update types @@ -168,29 +75,34 @@ def test_get_update_fields(self) -> None: 
UpdateRecordExpander.get_update_fields("unknown_update_type") self.assertIn("Unknown update_type: unknown_update_type", str(cm.exception)) - def test_get_record_id_field(self) -> None: + def test_get_record_id_field_names(self) -> None: """Test get_record_id_field class method.""" - # Test known update types self.assertEqual( - UpdateRecordExpander.get_record_id_field("reassign_diasource_to_diaobject"), "diaSourceId" + UpdateRecordExpander.get_record_id_field_names("reassign_diasource_to_diaobject"), ["diaSourceId"] + ) + self.assertEqual( + UpdateRecordExpander.get_record_id_field_names("reassign_diasource_to_ssobject"), ["diaSourceId"] ) self.assertEqual( - UpdateRecordExpander.get_record_id_field("reassign_diasource_to_ssobject"), "diaSourceId" + UpdateRecordExpander.get_record_id_field_names("withdraw_diasource"), ["diaSourceId"] ) - self.assertEqual(UpdateRecordExpander.get_record_id_field("withdraw_diasource"), "diaSourceId") self.assertEqual( - UpdateRecordExpander.get_record_id_field("withdraw_diaforcedsource"), + UpdateRecordExpander.get_record_id_field_names("withdraw_diaforcedsource"), ["diaObjectId", "visit", "detector"], ) - self.assertEqual(UpdateRecordExpander.get_record_id_field("close_diaobject_validity"), "diaObjectId") - self.assertEqual(UpdateRecordExpander.get_record_id_field("update_n_dia_sources"), "diaObjectId") + self.assertEqual( + UpdateRecordExpander.get_record_id_field_names("close_diaobject_validity"), ["diaObjectId"] + ) + self.assertEqual( + UpdateRecordExpander.get_record_id_field_names("update_n_dia_sources"), ["diaObjectId"] + ) # Test unknown update type with self.assertRaises(ValueError) as cm: - UpdateRecordExpander.get_record_id_field("unknown_update_type") + UpdateRecordExpander.get_record_id_field_names("unknown_update_type") self.assertIn("Unknown update_type: unknown_update_type", str(cm.exception)) - def test_expand_single_record_reassign_to_diaobject(self) -> None: + def test_reassign_diasource_to_diaobject(self) -> 
None: """Test expand_single_record with ApdbReassignDiaSourceToDiaObjectRecord. """ @@ -218,6 +130,8 @@ def test_expand_single_record_reassign_to_diaobject(self) -> None: self.assertEqual(expanded_record.replica_chunk_id, self.replica_chunk_id) self.assertEqual(expanded_record.update_order, 0) self.assertEqual(expanded_record.update_time_ns, self.update_time_ns) + + def test_reassign_diasource_to_ssobject(self) -> None: """Test expand_single_record with ApdbReassignDiaSourceToSSObjectRecord. """ @@ -255,7 +169,7 @@ def test_expand_single_record_reassign_to_diaobject(self) -> None: self.assertEqual(second_record.field_name, "ssObjectReassocTimeMjdTai") self.assertEqual(second_record.value_json, float(self.update_time.tai.mjd)) - def test_expand_single_record_withdraw_diasource(self) -> None: + def test_withdraw_diasource(self) -> None: """Test expand_single_record with ApdbWithdrawDiaSourceRecord.""" record = ApdbWithdrawDiaSourceRecord( update_time_ns=self.update_time_ns, @@ -278,43 +192,7 @@ def test_expand_single_record_withdraw_diasource(self) -> None: self.assertEqual(expanded_record.field_name, "timeWithdrawnMjdTai") self.assertEqual(expanded_record.value_json, self.update_time.tai.mjd) - def test_expand_single_record_reassign_to_ssobject(self) -> None: - """Test expand_single_record with ApdbReassignDiaSourceToSSObjectRecord.""" - record = ApdbReassignDiaSourceToSSObjectRecord( - update_time_ns=self.update_time_ns, - update_order=1, - diaSourceId=100002, - ssObjectId=2001, - ssObjectReassocTimeMjdTai=float(self.update_time.tai.mjd), - ra=45.0, - dec=-30.0, - midpointMjdTai=60000.0, - ) - - expanded = UpdateRecordExpander.expand_single_record(record, self.replica_chunk_id) - - # Should expand to 2 records (ssObjectId and ssObjectReassocTimeMjdTai) - self.assertEqual(len(expanded), 2) - - # Check first expanded record (ssObjectId) - first_record = expanded[0] - self.assertIsInstance(first_record, ExpandedUpdateRecord) - self.assertEqual(first_record.table_name, 
"DiaSource") - self.assertEqual(first_record.record_id, [100002]) - self.assertEqual(first_record.field_name, "ssObjectId") - self.assertEqual(first_record.value_json, 2001) - self.assertEqual(first_record.replica_chunk_id, self.replica_chunk_id) - self.assertEqual(first_record.update_order, 1) - self.assertEqual(first_record.update_time_ns, self.update_time_ns) - - # Check second expanded record (ssObjectReassocTimeMjdTai) - second_record = expanded[1] - self.assertEqual(second_record.table_name, "DiaSource") - self.assertEqual(second_record.record_id, [100002]) - self.assertEqual(second_record.field_name, "ssObjectReassocTimeMjdTai") - self.assertEqual(second_record.value_json, float(self.update_time.tai.mjd)) - - def test_expand_single_record_update_n_dia_sources(self) -> None: + def test_update_n_dia_sources(self) -> None: """Test expand_single_record with ApdbUpdateNDiaSourcesRecord.""" record = ApdbUpdateNDiaSourcesRecord( update_time_ns=self.update_time_ns, @@ -336,7 +214,7 @@ def test_expand_single_record_update_n_dia_sources(self) -> None: self.assertEqual(expanded_record.field_name, "nDiaSources") self.assertEqual(expanded_record.value_json, 10) - def test_expand_single_record_close_validity(self) -> None: + def test_close_diaobject_validity(self) -> None: """Test expand_single_record with ApdbCloseDiaObjectValidityRecord.""" record = ApdbCloseDiaObjectValidityRecord( update_time_ns=self.update_time_ns, @@ -368,7 +246,7 @@ def test_expand_single_record_close_validity(self) -> None: self.assertEqual(second_record.field_name, "nDiaSources") self.assertEqual(second_record.value_json, 5) - def test_expand_single_record_withdraw_forcedsource(self) -> None: + def test_withdraw_diaforcedsource(self) -> None: """Test expand_single_record with ApdbWithdrawDiaForcedSourceRecord.""" record = ApdbWithdrawDiaForcedSourceRecord( update_time_ns=self.update_time_ns, @@ -396,9 +274,9 @@ def test_expand_single_record_withdraw_forcedsource(self) -> None: 
self.assertEqual(expanded_record.field_name, "timeWithdrawnMjdTai") self.assertEqual(expanded_record.value_json, self.update_time.tai.mjd) - def test_expand_updates_full_integration(self) -> None: + def test_update_records_all(self) -> None: """Test the full expand_updates method with multiple record types.""" - update_records = self._create_test_update_records() + update_records = _create_test_update_records() expanded = UpdateRecordExpander.expand_updates(update_records) @@ -434,7 +312,7 @@ def test_expand_updates_full_integration(self) -> None: } self.assertEqual(field_names, expected_fields) - def test_expand_updates_empty_records(self) -> None: + def test_empty_records(self) -> None: """Test expand_updates with empty records list.""" empty_update_records = UpdateRecords( replica_chunk_id=self.replica_chunk_id, diff --git a/tests/test_updates_table.py b/tests/test_updates_table.py new file mode 100644 index 00000000..67372af6 --- /dev/null +++ b/tests/test_updates_table.py @@ -0,0 +1,165 @@ +# This file is part of dax_ppdb. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (http://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +import unittest +import uuid + +try: + from google.cloud import bigquery +except ImportError: + bigquery = None + +from lsst.dax.ppdb.bigquery.updates import UpdateRecordExpander, UpdatesTable +from lsst.dax.ppdb.tests._updates import _create_test_update_records + + +@unittest.skipIf(bigquery is None, "google-cloud-bigquery not available") +class TestUpdatesTable(unittest.TestCase): + """Test UpdatesTable functionality.""" + + def setUp(self) -> None: + """Set up test fixtures.""" + # Create BigQuery client + self.client = bigquery.Client() + + # Create unique dataset name for this test run + self.dataset_id = f"test_updates_{uuid.uuid4().hex[:8]}" + self.project_id = self.client.project + self.table_name = "updates" + self.table_fqn = f"{self.project_id}.{self.dataset_id}.{self.table_name}" + + # Create the test dataset + dataset = bigquery.Dataset(f"{self.project_id}.{self.dataset_id}") + # Set a short expiration for cleanup safety (1 hour) + dataset.default_table_expiration_ms = 3600000 # 1 hour + self.dataset = self.client.create_dataset(dataset) + + # Create UpdatesTable instance + self.updates_table = UpdatesTable(self.client, self.table_fqn) + + def tearDown(self) -> None: + """Clean up test fixtures.""" + # Always clean up the test dataset, whether test passed or failed + try: + self.client.delete_dataset(self.dataset_id, delete_contents=True, not_found_ok=True) + except Exception: + # If deletion fails, at least the expiration will clean it up + pass + + def test_table_fqn_property(self) -> None: + """Test the table_fqn property.""" + self.assertEqual(self.updates_table.table_fqn, self.table_fqn) + + def test_create_table(self) -> None: + """Test creating the updates table.""" + table = self.updates_table.create() + + # Verify table was created successfully + self.assertEqual(table.table_id, self.table_name) + self.assertEqual(table.dataset_id, self.dataset_id) + + # Verify schema is correct + expected_fields = { + "table_name": ("STRING", "REQUIRED"), 
+ "record_id": ("INTEGER", "REPEATED"), + "field_name": ("STRING", "REQUIRED"), + "value_json": ("JSON", "REQUIRED"), + "replica_chunk_id": ("INTEGER", "REQUIRED"), + "update_order": ("INTEGER", "NULLABLE"), + "update_time_ns": ("INTEGER", "NULLABLE"), + } + + actual_fields = {field.name: (field.field_type, field.mode) for field in table.schema} + self.assertEqual(actual_fields, expected_fields) + + def test_create_table_already_exists(self) -> None: + """Test creating a table that already exists raises an error.""" + # Create table first time - should succeed + self.updates_table.create() + + # Try to create again - should raise Conflict + with self.assertRaises(Exception) as cm: + self.updates_table.create() + + # Check that it's a conflict-type error + self.assertIn("already exists", str(cm.exception).lower()) + + def test_append_records(self) -> None: + """Test appending ExpandedUpdateRecord objects to the table.""" + # Create the table first + self.updates_table.create() + + # Get test update records and expand them + update_records = _create_test_update_records() + expanded_records = UpdateRecordExpander.expand_updates(update_records) + + # Append the records + job = self.updates_table.append(expanded_records) + + # Verify the job completed successfully + self.assertIsNone(job.errors) + + # Verify records were inserted by querying the table + query = f"SELECT COUNT(*) as count FROM `{self.table_fqn}`" + result = list(self.client.query(query).result()) + record_count = result[0].count + + # Should have 8 total expanded records based on the test data + # (1 + 2 + 1 + 1 + 2 + 1 from each update record type) + self.assertEqual(record_count, 8) + + # Verify some specific data was inserted correctly + query = f""" + SELECT table_name, record_id, field_name, replica_chunk_id + FROM `{self.table_fqn}` + """ + # WHERE table_name = 'DiaForcedSource' + results = list(self.client.query(query).result()) + + print(results) # Debug print to see what was inserted + # Should 
have one DiaForcedSource record + # self.assertEqual(len(results), 1) + # row = results[0] + # self.assertEqual(row.table_name, "DiaForcedSource") + # self.assertEqual(row.record_id, [200001, 12345, 42]) # Composite key as list + # self.assertEqual(row.field_name, "timeWithdrawnMjdTai") + # self.assertEqual(row.replica_chunk_id, self.replica_chunk_id) + + def test_append_empty_records(self) -> None: + """Test appending empty list of records.""" + # Create the table first + self.updates_table.create() + + # Append empty list + job = self.updates_table.append([]) + + # Verify the job completed successfully + self.assertIsNone(job.errors) + + # Verify no records were inserted + query = f"SELECT COUNT(*) as count FROM `{self.table_fqn}`" + result = list(self.client.query(query).result()) + record_count = result[0].count + self.assertEqual(record_count, 0) + + +if __name__ == "__main__": + unittest.main() From db06460fb149f2eba183201e85777d34ef1562cc Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 18 Feb 2026 14:23:41 -0600 Subject: [PATCH 15/49] Add preliminary implementation of update records table dedup in BQ --- .../updates/update_record_expander.py | 24 ++++------ .../ppdb/bigquery/updates/update_records.py | 1 - .../ppdb/bigquery/updates/updates_table.py | 38 +++++++++++++-- python/lsst/dax/ppdb/tests/_updates.py | 26 ++++++++++ tests/test_update_record_expander.py | 20 ++++---- tests/test_updates_table.py | 48 +++++++++++++++++-- 6 files changed, 121 insertions(+), 36 deletions(-) diff --git a/python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py b/python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py index e9b6e911..55d1ba33 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py +++ b/python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py @@ -42,12 +42,12 @@ class UpdateRecordExpander: } _RECORD_ID_FIELD_MAPPING = { - "reassign_diasource_to_diaobject": "diaSourceId", - 
"reassign_diasource_to_ssobject": "diaSourceId", - "withdraw_diasource": "diaSourceId", + "reassign_diasource_to_diaobject": ["diaSourceId"], + "reassign_diasource_to_ssobject": ["diaSourceId"], + "withdraw_diasource": ["diaSourceId"], "withdraw_diaforcedsource": ["diaObjectId", "visit", "detector"], - "close_diaobject_validity": "diaObjectId", - "update_n_dia_sources": "diaObjectId", + "close_diaobject_validity": ["diaObjectId"], + "update_n_dia_sources": ["diaObjectId"], } @classmethod @@ -75,7 +75,7 @@ def get_update_fields(cls, update_type: str) -> list[str]: return cls._UPDATE_FIELD_MAPPING[update_type] @classmethod - def get_record_id_field_names(cls, update_type: str) -> str | list[str]: + def get_record_id_fields(cls, update_type: str) -> str | list[str]: """Get the field name(s) that serve as the record ID for a given update type. @@ -105,9 +105,6 @@ def get_record_id_field(cls, update_type: str) -> str | list[str]: """Get the field name(s) that serve as the record ID for a given update type. - This method is an alias for get_record_id_field_names for backward - compatibility. - Parameters ---------- update_type : `str` @@ -115,16 +112,15 @@ def get_record_id_field(cls, update_type: str) -> str | list[str]: Returns ------- - field_name : `str` or `list` [ `str` ] - Name of the field that contains the record ID for this update type, - or list of field names for composite keys. + field_name : `list` [ `str` ] + List of the fields that contain the record ID for this update type. Raises ------ ValueError If the update_type is not recognized. """ - return cls.get_record_id_field_names(update_type) + return cls.get_record_id_fields(update_type) @classmethod def expand_single_record( @@ -192,7 +188,7 @@ def _get_record_id(cls, update_record: ApdbUpdateRecord) -> list[int]: single-element list. For composite keys, a multi-element list. 
""" update_type = update_record.update_type - id_fields = cls.get_record_id_field_names(update_type) + id_fields = cls.get_record_id_fields(update_type) # Handle both single field (string) and composite fields (list) if isinstance(id_fields, str): diff --git a/python/lsst/dax/ppdb/bigquery/updates/update_records.py b/python/lsst/dax/ppdb/bigquery/updates/update_records.py index 47c63f01..ae986fb3 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/update_records.py +++ b/python/lsst/dax/ppdb/bigquery/updates/update_records.py @@ -33,7 +33,6 @@ """Default filename for the update records JSON file.""" -# Move to `updateRecords.py` to follow camelcase convention class UpdateRecords(BaseModel): """Data model for APDB update records.""" diff --git a/python/lsst/dax/ppdb/bigquery/updates/updates_table.py b/python/lsst/dax/ppdb/bigquery/updates/updates_table.py index c39d1592..8cc51e0c 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/updates_table.py +++ b/python/lsst/dax/ppdb/bigquery/updates/updates_table.py @@ -30,11 +30,8 @@ class UpdatesTable: - """ - Manage a BigQuery updates table for `ExpandedUpdateRecord` rows. - - This class is responsible for creating the updates table with the correct - schema and appending expanded update records into it. + """Manage the table in BigQuery used for inserting and deduplicating + expanded update records which contain one update per row. """ def __init__(self, client: bigquery.Client, table_fqn: str) -> None: @@ -152,3 +149,34 @@ def append(self, records: Iterable[ExpandedUpdateRecord]) -> bigquery.LoadJob: raise RuntimeError(f"BigQuery load failed: {job.errors}") return job + + def deduplicate_to(self, target_table_fqn: str) -> bigquery.QueryJob: + """ + Deduplicate this table's records to a target table. + + Keeps the record with the latest update_time_ns for each unique + combination of (table_name, record_id, field_name). 
+ """ + query = f""" + CREATE OR REPLACE TABLE `{target_table_fqn}` + AS + SELECT * EXCEPT(row_num) + FROM ( + SELECT *, + ROW_NUMBER() OVER ( + PARTITION BY table_name, + ARRAY_TO_STRING( + ARRAY(SELECT CAST(elem AS STRING) FROM UNNEST(record_id) AS elem), + ',' + ), + field_name + ORDER BY update_time_ns DESC + ) as row_num + FROM `{self._table_fqn}` + ) + WHERE row_num = 1 + """ + + job = self._client.query(query) + job.result() + return job diff --git a/python/lsst/dax/ppdb/tests/_updates.py b/python/lsst/dax/ppdb/tests/_updates.py index 52069359..a5a62002 100644 --- a/python/lsst/dax/ppdb/tests/_updates.py +++ b/python/lsst/dax/ppdb/tests/_updates.py @@ -122,6 +122,32 @@ def _create_test_update_records() -> UpdateRecords: ) ) + # Add duplicate records for testing deduplication + # Duplicate of the first record but with later timestamp (should be kept) + records.append( + ApdbReassignDiaSourceToDiaObjectRecord( + update_time_ns=test_update_time_ns + 1000000000, # 1 second later + update_order=0, + diaSourceId=100001, + diaObjectId=400001, # Different target object + ra=45.0, + dec=-30.0, + midpointMjdTai=60000.0, + ) + ) + + # Duplicate of the nDiaSources update but with earlier timestamp (should be discarded) + records.append( + ApdbUpdateNDiaSourcesRecord( + update_time_ns=test_update_time_ns - 1000000000, # 1 second earlier + update_order=5, + diaObjectId=200002, + nDiaSources=8, # Different value but older timestamp + ra=45.0, + dec=-30.0, + ) + ) + return UpdateRecords( replica_chunk_id=test_replica_chunk_id, record_count=len(records), diff --git a/tests/test_update_record_expander.py b/tests/test_update_record_expander.py index 6da65484..8818565e 100644 --- a/tests/test_update_record_expander.py +++ b/tests/test_update_record_expander.py @@ -78,28 +78,24 @@ def test_get_update_fields(self) -> None: def test_get_record_id_field_names(self) -> None: """Test get_record_id_field class method.""" self.assertEqual( - 
UpdateRecordExpander.get_record_id_field_names("reassign_diasource_to_diaobject"), ["diaSourceId"] + UpdateRecordExpander.get_record_id_fields("reassign_diasource_to_diaobject"), ["diaSourceId"] ) self.assertEqual( - UpdateRecordExpander.get_record_id_field_names("reassign_diasource_to_ssobject"), ["diaSourceId"] + UpdateRecordExpander.get_record_id_fields("reassign_diasource_to_ssobject"), ["diaSourceId"] ) + self.assertEqual(UpdateRecordExpander.get_record_id_fields("withdraw_diasource"), ["diaSourceId"]) self.assertEqual( - UpdateRecordExpander.get_record_id_field_names("withdraw_diasource"), ["diaSourceId"] - ) - self.assertEqual( - UpdateRecordExpander.get_record_id_field_names("withdraw_diaforcedsource"), + UpdateRecordExpander.get_record_id_fields("withdraw_diaforcedsource"), ["diaObjectId", "visit", "detector"], ) self.assertEqual( - UpdateRecordExpander.get_record_id_field_names("close_diaobject_validity"), ["diaObjectId"] - ) - self.assertEqual( - UpdateRecordExpander.get_record_id_field_names("update_n_dia_sources"), ["diaObjectId"] + UpdateRecordExpander.get_record_id_fields("close_diaobject_validity"), ["diaObjectId"] ) + self.assertEqual(UpdateRecordExpander.get_record_id_fields("update_n_dia_sources"), ["diaObjectId"]) # Test unknown update type with self.assertRaises(ValueError) as cm: - UpdateRecordExpander.get_record_id_field_names("unknown_update_type") + UpdateRecordExpander.get_record_id_fields("unknown_update_type") self.assertIn("Unknown update_type: unknown_update_type", str(cm.exception)) def test_reassign_diasource_to_diaobject(self) -> None: @@ -287,7 +283,7 @@ def test_update_records_all(self) -> None: # - 1 from ApdbWithdrawDiaForcedSourceRecord # - 2 from ApdbCloseDiaObjectValidityRecord # - 1 from ApdbUpdateNDiaSourcesRecord - self.assertEqual(len(expanded), 8) + self.assertEqual(len(expanded), 10) # Verify all expanded records have correct replica_chunk_id for record in expanded: diff --git a/tests/test_updates_table.py 
b/tests/test_updates_table.py index 67372af6..9672500c 100644 --- a/tests/test_updates_table.py +++ b/tests/test_updates_table.py @@ -122,9 +122,9 @@ def test_append_records(self) -> None: result = list(self.client.query(query).result()) record_count = result[0].count - # Should have 8 total expanded records based on the test data - # (1 + 2 + 1 + 1 + 2 + 1 from each update record type) - self.assertEqual(record_count, 8) + # Should have 10 total expanded records based on the test data + # (1 + 2 + 1 + 1 + 2 + 1 from original records + 2 duplicates) + self.assertEqual(record_count, 10) # Verify some specific data was inserted correctly query = f""" @@ -139,7 +139,7 @@ def test_append_records(self) -> None: # self.assertEqual(len(results), 1) # row = results[0] # self.assertEqual(row.table_name, "DiaForcedSource") - # self.assertEqual(row.record_id, [200001, 12345, 42]) # Composite key as list + # self.assertEqual(row.record_id, [200001, 12345, 42]) # self.assertEqual(row.field_name, "timeWithdrawnMjdTai") # self.assertEqual(row.replica_chunk_id, self.replica_chunk_id) @@ -160,6 +160,46 @@ def test_append_empty_records(self) -> None: record_count = result[0].count self.assertEqual(record_count, 0) + def test_deduplicate_records(self) -> None: + """Test deduplication functionality.""" + # Create the source table + self.updates_table.create() + + # Get test records (which now include duplicates) and expand them + update_records = _create_test_update_records() + expanded_records = UpdateRecordExpander.expand_updates(update_records) + + # Append all records (including duplicates) + self.updates_table.append(expanded_records) + + # Count original records + query = f"SELECT COUNT(*) as count FROM `{self.table_fqn}`" + original_count = list(self.client.query(query).result())[0].count + + # Create deduplicated table + dedup_table_fqn = f"{self.table_fqn}_dedup" + self.updates_table.deduplicate_to(dedup_table_fqn) + + # Count deduplicated records + query = f"SELECT COUNT(*) 
as count FROM `{dedup_table_fqn}`" + dedup_count = list(self.client.query(query).result())[0].count + + # Should have fewer records after deduplication + self.assertLess(dedup_count, original_count) + + # Verify specific deduplication behavior: + # Check that the later timestamp record is kept for diaSourceId=100001 + query = f""" + SELECT value_json + FROM `{dedup_table_fqn}` + WHERE ARRAY_TO_STRING( + ARRAY(SELECT CAST(elem AS STRING) FROM UNNEST(record_id) AS elem), ',' + ) = '100001' AND field_name = 'diaObjectId' + """ + result = list(self.client.query(query).result()) + self.assertEqual(len(result), 1) + self.assertEqual(result[0].value_json, 400001) # Should be the later update + if __name__ == "__main__": unittest.main() From f3a1924b0a0163bf388a18ca0650996ccb5ae96e Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 18 Feb 2026 15:42:43 -0600 Subject: [PATCH 16/49] Use a hashed value of record ID for deduplication --- .../updates/update_record_expander.py | 21 ++++++++++++++ .../ppdb/bigquery/updates/updates_table.py | 28 +++++++++++++++---- tests/test_updates_table.py | 9 +++--- 3 files changed, 47 insertions(+), 11 deletions(-) diff --git a/python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py b/python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py index 55d1ba33..f5c95fdf 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py +++ b/python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py @@ -21,6 +21,8 @@ from __future__ import annotations +import hashlib + from lsst.dax.apdb.apdbUpdateRecord import ApdbUpdateRecord from .expanded_update_record import ExpandedUpdateRecord @@ -100,6 +102,23 @@ def get_record_id_fields(cls, update_type: str) -> str | list[str]: return cls._RECORD_ID_FIELD_MAPPING[update_type] + @classmethod + def _compute_record_id_hash(cls, record_id: list[int]) -> str: + """Compute MD5 hash of a record_id list for deduplication. 
+ + Parameters + ---------- + record_id : list[int] + The record ID as a list of integers. + + Returns + ------- + str + Full 32-character hexadecimal MD5 hash of the record_id list. + """ + record_id_str = ",".join(str(x) for x in record_id) + return hashlib.md5(record_id_str.encode()).hexdigest() + @classmethod def get_record_id_field(cls, update_type: str) -> str | list[str]: """Get the field name(s) that serve as the record ID for a given update @@ -149,6 +168,7 @@ def expand_single_record( # Get the record ID record_id = cls._get_record_id(update_record) + record_id_hash = cls._compute_record_id_hash(record_id) expanded_records = [] for field_name in field_names: @@ -162,6 +182,7 @@ expanded_record = ExpandedUpdateRecord( table_name=table_name, record_id=record_id, + record_id_hash=record_id_hash, field_name=field_name, value_json=value, replica_chunk_id=replica_chunk_id, diff --git a/python/lsst/dax/ppdb/bigquery/updates/updates_table.py b/python/lsst/dax/ppdb/bigquery/updates/updates_table.py index 8cc51e0c..5553d342 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/updates_table.py +++ b/python/lsst/dax/ppdb/bigquery/updates/updates_table.py @@ -21,6 +21,7 @@ from __future__ import annotations +import hashlib from collections.abc import Iterable from typing import Any @@ -46,6 +47,23 @@ def __init__(self, client: bigquery.Client, table_fqn: str) -> None: self._client: bigquery.Client = client self._table_fqn: str = table_fqn + @staticmethod + def _compute_record_id_hash(record_id: list[int]) -> str: + """Compute MD5 hash of a record_id list for deduplication. + + Parameters + ---------- + record_id : list[int] + The record ID as a list of integers. + + Returns + ------- + str + Full 32-character hexadecimal MD5 hash of the record_id list. 
+ """ + record_id_str = ",".join(str(x) for x in record_id) + return hashlib.md5(record_id_str.encode()).hexdigest() + @property def table_fqn(self) -> str: """ @@ -78,6 +96,7 @@ def create(self) -> bigquery.Table: - table_name: STRING (REQUIRED) - record_id: ARRAY (REQUIRED) + - record_id_hash: STRING (REQUIRED) - field_name: STRING (REQUIRED) - value_json: JSON (REQUIRED) - replica_chunk_id: INT64 (REQUIRED) @@ -87,6 +106,7 @@ def create(self) -> bigquery.Table: schema: list[bigquery.SchemaField] = [ bigquery.SchemaField("table_name", "STRING", mode="REQUIRED"), bigquery.SchemaField("record_id", "INT64", mode="REPEATED"), + bigquery.SchemaField("record_id_hash", "STRING", mode="REQUIRED"), bigquery.SchemaField("field_name", "STRING", mode="REQUIRED"), bigquery.SchemaField("value_json", "JSON", mode="REQUIRED"), bigquery.SchemaField("replica_chunk_id", "INT64", mode="REQUIRED"), @@ -125,6 +145,7 @@ def append(self, records: Iterable[ExpandedUpdateRecord]) -> bigquery.LoadJob: { "table_name": r.table_name, "record_id": r.record_id, + "record_id_hash": self._compute_record_id_hash(r.record_id), "field_name": r.field_name, "value_json": r.value_json, "replica_chunk_id": r.replica_chunk_id, @@ -164,12 +185,7 @@ def deduplicate_to(self, target_table_fqn: str) -> bigquery.QueryJob: FROM ( SELECT *, ROW_NUMBER() OVER ( - PARTITION BY table_name, - ARRAY_TO_STRING( - ARRAY(SELECT CAST(elem AS STRING) FROM UNNEST(record_id) AS elem), - ',' - ), - field_name + PARTITION BY table_name, record_id_hash, field_name ORDER BY update_time_ns DESC ) as row_num FROM `{self._table_fqn}` diff --git a/tests/test_updates_table.py b/tests/test_updates_table.py index 9672500c..8d3d821f 100644 --- a/tests/test_updates_table.py +++ b/tests/test_updates_table.py @@ -80,6 +80,7 @@ def test_create_table(self) -> None: expected_fields = { "table_name": ("STRING", "REQUIRED"), "record_id": ("INTEGER", "REPEATED"), + "record_id_hash": ("STRING", "REQUIRED"), "field_name": ("STRING", "REQUIRED"), 
"value_json": ("JSON", "REQUIRED"), "replica_chunk_id": ("INTEGER", "REQUIRED"), @@ -187,14 +188,12 @@ def test_deduplicate_records(self) -> None: # Should have fewer records after deduplication self.assertLess(dedup_count, original_count) - # Verify specific deduplication behavior: - # Check that the later timestamp record is kept for diaSourceId=100001 + # Verify specific deduplication behavior + record_id_hash = UpdatesTable._compute_record_id_hash([100001]) query = f""" SELECT value_json FROM `{dedup_table_fqn}` - WHERE ARRAY_TO_STRING( - ARRAY(SELECT CAST(elem AS STRING) FROM UNNEST(record_id) AS elem), ',' - ) = '100001' AND field_name = 'diaObjectId' + WHERE record_id_hash = '{record_id_hash}' AND field_name = 'diaObjectId' """ result = list(self.client.query(query).result()) self.assertEqual(len(result), 1) From be9175f4aba2aee8f6ffc8edfee3f9ae240f5b3b Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 18 Feb 2026 16:53:41 -0600 Subject: [PATCH 17/49] WIP on update merge implementation --- pyproject.toml | 2 +- .../dax/ppdb/bigquery/updates/__init__.py | 1 + .../dax/ppdb/bigquery/updates/sql/__init__.py | 0 .../updates/sql/merge_diaobject_updates.sql | 46 +++++++ .../ppdb/bigquery/updates/updates_merger.py | 93 +++++++++++++ tests/test_updates_merger.py | 126 ++++++++++++++++++ 6 files changed, 267 insertions(+), 1 deletion(-) create mode 100644 python/lsst/dax/ppdb/bigquery/updates/sql/__init__.py create mode 100644 python/lsst/dax/ppdb/bigquery/updates/sql/merge_diaobject_updates.sql create mode 100644 python/lsst/dax/ppdb/bigquery/updates/updates_merger.py create mode 100644 tests/test_updates_merger.py diff --git a/pyproject.toml b/pyproject.toml index 5f082778..c5860c60 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -54,7 +54,7 @@ where = ["python"] zip-safe = true [tool.setuptools.package-data] -"lsst.dax.ppdb" = ["py.typed"] +"lsst.dax.ppdb" = ["py.typed", "bigquery/updates/sql/*.sql"] [tool.setuptools.dynamic] version = { attr = 
"lsst_versions.get_lsst_version" } diff --git a/python/lsst/dax/ppdb/bigquery/updates/__init__.py b/python/lsst/dax/ppdb/bigquery/updates/__init__.py index a21b1add..342c6069 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/__init__.py +++ b/python/lsst/dax/ppdb/bigquery/updates/__init__.py @@ -23,3 +23,4 @@ from .expanded_update_record import ExpandedUpdateRecord from .update_record_expander import UpdateRecordExpander from .updates_table import UpdatesTable +from .updates_merger import UpdatesMerger, DiaObjectUpdatesMerger diff --git a/python/lsst/dax/ppdb/bigquery/updates/sql/__init__.py b/python/lsst/dax/ppdb/bigquery/updates/sql/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/lsst/dax/ppdb/bigquery/updates/sql/merge_diaobject_updates.sql b/python/lsst/dax/ppdb/bigquery/updates/sql/merge_diaobject_updates.sql new file mode 100644 index 00000000..143f86a5 --- /dev/null +++ b/python/lsst/dax/ppdb/bigquery/updates/sql/merge_diaobject_updates.sql @@ -0,0 +1,46 @@ +-- merge_diaobject_updates.sql +-- +-- Query parameters: +-- @updates_table STRING -- table FQN, e.g. "project.dataset.prod_next" +-- @target_dataset STRING -- dataset FQN, e.g. "project.dataset" +-- +-- Do NOT include backticks in parameter values. 
+ +DECLARE sql STRING; + +SET sql = """ +MERGE `{target_dataset}.DiaObject` T +USING ( + WITH patch AS ( + SELECT + record_id[OFFSET(0)] AS diaObjectId, + + ANY_VALUE( + CASE WHEN field_name = 'validityEndMjdTai' + THEN CAST(JSON_VALUE(value_json) AS FLOAT64) + END + ) AS validityEndMjdTai_value, + COUNTIF(field_name = 'validityEndMjdTai') > 0 AS validityEndMjdTai_present, + + ANY_VALUE( + CASE WHEN field_name = 'nDiaSources' + THEN CAST(JSON_VALUE(value_json) AS INT64) + END + ) AS nDiaSources_value, + COUNTIF(field_name = 'nDiaSources') > 0 AS nDiaSources_present + + FROM `{updates_table}` + WHERE table_name = 'DiaObject' + AND field_name IN ('validityEndMjdTai', 'nDiaSources') + GROUP BY diaObjectId + ) + SELECT * FROM patch +) P +ON T.diaObjectId = P.diaObjectId +WHEN MATCHED THEN +UPDATE SET + validityEndMjdTai = IF(P.validityEndMjdTai_present, P.validityEndMjdTai_value, T.validityEndMjdTai), + nDiaSources = IF(P.nDiaSources_present, P.nDiaSources_value, T.nDiaSources) +"""; + +EXECUTE IMMEDIATE sql; \ No newline at end of file diff --git a/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py b/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py new file mode 100644 index 00000000..61bd2927 --- /dev/null +++ b/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py @@ -0,0 +1,93 @@ +# This file is part of dax_ppdb +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. 
+# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from __future__ import annotations + +from abc import ABC + +from google.cloud import bigquery + +from lsst.resources import ResourcePath + +_SQL_RESOURCE_PACKAGE = "lsst.dax.ppdb.bigquery.updates.sql" + + +class UpdatesMerger(ABC): + """Abstract base class for merging expanded update records into target + tables in BigQuery. + """ + + TABLE_NAME: str + """Logical name of the target table this merger applies to + (e.g., 'DiaObject').""" + + SQL_RESOURCE_NAME: str + """Base name of the SQL file (without .sql extension) containing the MERGE + statement for this merger. The SQL file must be located in the + `lsst.dax.ppdb.bigquery.updates.sql` package.""" + + def __init__(self, client: bigquery.Client) -> None: + """ + Parameters + ---------- + client + BigQuery client. + """ + self._client: bigquery.Client = client + + def merge(self, *, updates_table_fqn: str, target_dataset_fqn: str) -> bigquery.QueryJob: + """ + Apply updates from the updates table specified by `updates_table_fqn` + to the target table in the `target_dataset_fqn` dataset. + + Parameters + ---------- + updates_table_fqn + Fully-qualified BigQuery table name containing updates. + target_dataset_fqn + Fully-qualified BigQuery dataset name containing the target table. + + Returns + ------- + google.cloud.bigquery.job.QueryJob + The completed BigQuery job. 
+ """ + try: + sql_resource_path = f"resource://{_SQL_RESOURCE_PACKAGE}/{self.SQL_RESOURCE_NAME}.sql" + print(f"Reading SQL from resource: {sql_resource_path}") + sql_text = ResourcePath(sql_resource_path).read().decode("utf-8") + except Exception as e: + raise RuntimeError(f"Failed to read SQL resource at {sql_resource_path}") from e + + sql_text = sql_text.format(updates_table=updates_table_fqn, target_dataset=target_dataset_fqn) + + job = self._client.query(sql_text) + job.result() + + return job + + +class DiaObjectUpdatesMerger(UpdatesMerger): + """Merger for DiaObject updates.""" + + TABLE_NAME = "DiaObject" + + SQL_RESOURCE_NAME = "merge_diaobject_updates" diff --git a/tests/test_updates_merger.py b/tests/test_updates_merger.py new file mode 100644 index 00000000..381b0adc --- /dev/null +++ b/tests/test_updates_merger.py @@ -0,0 +1,126 @@ +# This file is part of dax_ppdb. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (http://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +import io +import json +import unittest +import uuid + +try: + from google.cloud import bigquery +except ImportError: + bigquery = None + +from lsst.dax.ppdb.bigquery.updates import DiaObjectUpdatesMerger, UpdateRecordExpander, UpdatesTable +from lsst.dax.ppdb.tests._updates import _create_test_update_records + + +@unittest.skipIf(bigquery is None, "google-cloud-bigquery not available") +class TestUpdatesMerger(unittest.TestCase): + """Test UpdatesMerger functionality.""" + + def setUp(self): + self.client = bigquery.Client() + self.dataset_id = f"test_merger_{uuid.uuid4().hex[:8]}" + self.project_id = self.client.project + self.updates_table_fqn = f"{self.project_id}.{self.dataset_id}.updates" + self.target_dataset_fqn = f"{self.project_id}.{self.dataset_id}" + dataset = bigquery.Dataset(f"{self.project_id}.{self.dataset_id}") + dataset.default_table_expiration_ms = 3600000 + self.client.create_dataset(dataset) + + def tearDown(self): + try: + self.client.delete_dataset(self.dataset_id, delete_contents=True, not_found_ok=True) + except Exception: + pass + + def _create_target_table(self): + schema = [ + bigquery.SchemaField("diaObjectId", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("validityEndMjdTai", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("nDiaSources", "INTEGER", mode="NULLABLE"), + ] + table_fqn = f"{self.target_dataset_fqn}.DiaObject" + table = bigquery.Table(table_fqn, schema=schema) + self.client.create_table(table) + rows = [ + {"diaObjectId": 200001, "validityEndMjdTai": None, "nDiaSources": 3}, + {"diaObjectId": 200002, "validityEndMjdTai": None, "nDiaSources": 7}, + {"diaObjectId": 200003, "validityEndMjdTai": 59000.0, "nDiaSources": 2}, + ] + buf = self._json_rows_to_buf(rows) + job = self.client.load_table_from_file( + buf, + table_fqn, + job_config=bigquery.LoadJobConfig(source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON), + ) + job.result() + + def _json_rows_to_buf(self, rows): + buf = io.StringIO() + for row in rows: 
+ buf.write(json.dumps(row) + "\n") + buf.seek(0) + return buf + + def test_merge_diaobject(self): + self._create_target_table() + updates_table = UpdatesTable(self.client, self.updates_table_fqn) + updates_table.create() + update_records = _create_test_update_records() + expanded = UpdateRecordExpander.expand_updates(update_records) + updates_table.append(expanded) + dedup_fqn = f"{self.updates_table_fqn}_dedup" + updates_table.deduplicate_to(dedup_fqn) + table_fqn = f"{self.target_dataset_fqn}.DiaObject" + query = f"SELECT * FROM `{table_fqn}` ORDER BY diaObjectId" + before = {r.diaObjectId: r for r in self.client.query(query).result()} + print("Before merge:", before) + merger = DiaObjectUpdatesMerger(self.client) + merger.merge(updates_table_fqn=dedup_fqn, target_dataset_fqn=self.target_dataset_fqn) + after = {r.diaObjectId: r for r in self.client.query(query).result()} + print("After merge:", after) + self.assertEqual(after[200001].validityEndMjdTai, 59580.0) + self.assertEqual(after[200001].nDiaSources, 5) + self.assertIsNone(after[200002].validityEndMjdTai) + self.assertEqual(after[200002].nDiaSources, 10) + self.assertEqual(after[200003].validityEndMjdTai, before[200003].validityEndMjdTai) + self.assertEqual(after[200003].nDiaSources, before[200003].nDiaSources) + + def test_merge_no_updates(self): + self._create_target_table() + updates_table = UpdatesTable(self.client, self.updates_table_fqn) + updates_table.create() + dedup_fqn = f"{self.updates_table_fqn}_dedup" + updates_table.deduplicate_to(dedup_fqn) + table_fqn = f"{self.target_dataset_fqn}.DiaObject" + before = {r.diaObjectId: r for r in self.client.query(f"SELECT * FROM `{table_fqn}`").result()} + merger = DiaObjectUpdatesMerger(self.client) + merger.merge(updates_table_fqn=dedup_fqn, target_dataset_fqn=self.target_dataset_fqn) + after = {r.diaObjectId: r for r in self.client.query(f"SELECT * FROM `{table_fqn}`").result()} + for obj_id in before: + self.assertEqual(before[obj_id].validityEndMjdTai, 
after[obj_id].validityEndMjdTai) + self.assertEqual(before[obj_id].nDiaSources, after[obj_id].nDiaSources) + + +if __name__ == "__main__": + unittest.main() From 20e3b2b47fce963a9b0590220631686547611095 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Thu, 19 Feb 2026 16:17:56 -0600 Subject: [PATCH 18/49] Add google-cloud-bigquery requirement --- pyproject.toml | 1 + requirements.txt | 3 ++- 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index c5860c60..3f1c500d 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -44,6 +44,7 @@ test = [ "pytest-openfiles >= 0.5.0" ] gcp = [ + "google-cloud-bigquery", "lsst-dax-ppdbx-gcp" ] diff --git a/requirements.txt b/requirements.txt index ba5c456c..f40c2d15 100644 --- a/requirements.txt +++ b/requirements.txt @@ -7,5 +7,6 @@ lsst-dax-apdb @ git+https://github.com/lsst/dax_apdb@main lsst-utils @ git+https://github.com/lsst/utils@main lsst-resources[s3] @ git+https://github.com/lsst/resources@main lsst-felis @ git+https://github.com/lsst/felis@main -lsst-dax-ppdbx-gcp @ git+https://github.com/lsst-dm/dax_ppdbx_gcp@tickets/DM-54070 lsst-sdm-schemas @ git+https://github.com/lsst/sdm_schemas@main +lsst-dax-ppdbx-gcp @ git+https://github.com/lsst-dm/dax_ppdbx_gcp@tickets/DM-54070 +google-cloud-bigquery From ecca19b8ab6cd48851183dee30ed9c5b30a0eba4 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Fri, 20 Feb 2026 16:57:03 -0600 Subject: [PATCH 19/49] Rearrange tests to guard against missing google deps --- pyproject.toml | 2 +- .../lsst/dax/ppdb/bigquery/ppdb_bigquery.py | 4 +- python/lsst/dax/ppdb/tests/_bigquery.py | 140 ++++++ python/lsst/dax/ppdb/tests/_ppdb.py | 5 +- python/lsst/dax/ppdb/tests/config/__init__.py | 0 .../lsst/dax/ppdb/tests}/config/schema.yaml | 0 tests/test_ppdb_bigquery.py | 428 +----------------- tests/test_ppdb_sql.py | 15 +- tests/test_update_record_expander.py | 16 +- tests/test_update_records.py | 346 ++++++++++++++ tests/test_updates_merger.py | 12 
+- tests/test_updates_table.py | 14 +- 12 files changed, 534 insertions(+), 448 deletions(-) create mode 100644 python/lsst/dax/ppdb/tests/_bigquery.py create mode 100644 python/lsst/dax/ppdb/tests/config/__init__.py rename {tests => python/lsst/dax/ppdb/tests}/config/schema.yaml (100%) create mode 100644 tests/test_update_records.py diff --git a/pyproject.toml b/pyproject.toml index 3f1c500d..930ac8f0 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,7 @@ where = ["python"] zip-safe = true [tool.setuptools.package-data] -"lsst.dax.ppdb" = ["py.typed", "bigquery/updates/sql/*.sql"] +"lsst.dax.ppdb" = ["py.typed", "bigquery/updates/sql/*.sql", "tests/config/*.yaml"] [tool.setuptools.dynamic] version = { attr = "lsst_versions.get_lsst_version" } diff --git a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py index a6c72fa9..82d2205c 100644 --- a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py +++ b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py @@ -46,7 +46,6 @@ from ..sql import PpdbSqlBase, PpdbSqlBaseConfig from .manifest import Manifest, TableStats from .ppdb_replica_chunk_extended import ChunkStatus, PpdbReplicaChunkExtended -from .updates.update_records import UpdateRecords __all__ = ["ConfigValidationError", "PpdbBigQuery", "PpdbBigQueryConfig"] @@ -597,6 +596,9 @@ def _handle_updates( Serializes the ApdbUpdateRecord objects into a dictionary structure for processing. """ + # Import inlined here to avoid triggering google cloud imports + from .updates.update_records import UpdateRecords + update_records = UpdateRecords( replica_chunk_id=replica_chunk.id, records=apdb_update_records, diff --git a/python/lsst/dax/ppdb/tests/_bigquery.py b/python/lsst/dax/ppdb/tests/_bigquery.py new file mode 100644 index 00000000..1861396e --- /dev/null +++ b/python/lsst/dax/ppdb/tests/_bigquery.py @@ -0,0 +1,140 @@ +# This file is part of dax_ppdb. +# +# Developed for the LSST Data Management System. 
+# This product includes software developed by the LSST Project +# (http://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import gc +import shutil +import tempfile +from typing import Any + +from lsst.dax.apdb import ( + ApdbConfig, +) +from lsst.dax.apdb.sql import ApdbSql +from lsst.dax.ppdb import PpdbConfig +from lsst.dax.ppdb.bigquery import PpdbBigQuery + +try: + import testing.postgresql +except ImportError: + testing = None + +from lsst.dax.ppdb.tests import TEST_SCHEMA_RESOURCE_PATH + +TEST_CONFIG = { + "db_drop": True, + "validate_config": False, + "delete_existing_dirs": True, + "bucket_name": "ppdb-test", + "object_prefix": "data/test", + "dataset_id": "test_dataset", + "project_id": "test_project", +} + + +class _SqliteMixin: + """Mixin class to provide Sqlite-specific setup/teardown and instance + creation. 
+ """ + + def setUp(self) -> None: + self.tempdir = tempfile.mkdtemp() + self.apdb_url = f"sqlite:///{self.tempdir}/apdb.sqlite3" + self.ppdb_url = f"sqlite:///{self.tempdir}/ppdb.sqlite3" + + def tearDown(self) -> None: + shutil.rmtree(self.tempdir, ignore_errors=True) + + def make_instance(self, **kwargs: Any) -> PpdbConfig: + """Make config class instance used in all tests.""" + kw = { + **TEST_CONFIG, + "db_url": self.ppdb_url, + "felis_path": TEST_SCHEMA_RESOURCE_PATH, + "replication_dir": self.tempdir, + } + bq_config = PpdbBigQuery.init_bigquery( + **kw, + ) # type: ignore[arg-type] + return bq_config + + def make_apdb_instance(self, **kwargs: Any) -> ApdbConfig: + """Make APDB instance for tests.""" + kw = { + "schema_file": TEST_SCHEMA_RESOURCE_PATH, + "ss_schema_file": "", + "db_url": self.apdb_url, + "enable_replica": True, + } + kw.update(kwargs) + return ApdbSql.init_database(**kw) # type: ignore[arg-type] + + +class _PostgresMixin: + """Mixin class to provide Postgres-specific setup/teardown and instance + creation. + """ + + postgresql: Any + + @classmethod + def setUpClass(cls) -> None: + # Create the postgres test server. + cls.postgresql = testing.postgresql.PostgresqlFactory(cache_initialized_db=True) + super().setUpClass() + + @classmethod + def tearDownClass(cls) -> None: + # Clean up any lingering SQLAlchemy engines/connections + # so they're closed before we shut down the server. 
+ gc.collect() + cls.postgresql.clear_cache() + super().tearDownClass() + + def setUp(self) -> None: + self.server = self.postgresql() + self.tempdir = tempfile.mkdtemp() + + def tearDown(self) -> None: + self.server.stop() + shutil.rmtree(self.tempdir, ignore_errors=True) + + def make_instance(self, **kwargs: Any) -> PpdbConfig: + """Make config class instance used in all tests.""" + kw = { + **TEST_CONFIG, + "db_url": self.server.url(), + "db_schema": "ppdb_test", + "felis_path": TEST_SCHEMA_RESOURCE_PATH, + "replication_dir": self.tempdir, + } + bq_config = PpdbBigQuery.init_bigquery(**kw) # type: ignore[arg-type] + return bq_config + + def make_apdb_instance(self, **kwargs: Any) -> ApdbConfig: + kw = { + "schema_file": TEST_SCHEMA_RESOURCE_PATH, + "ss_schema_file": "", + "db_url": self.server.url(), + "namespace": "apdb", + "enable_replica": True, + } + kw.update(kwargs) + return ApdbSql.init_database(**kw) # type: ignore[arg-type] diff --git a/python/lsst/dax/ppdb/tests/_ppdb.py b/python/lsst/dax/ppdb/tests/_ppdb.py index 2535d187..6a2e38d8 100644 --- a/python/lsst/dax/ppdb/tests/_ppdb.py +++ b/python/lsst/dax/ppdb/tests/_ppdb.py @@ -21,7 +21,7 @@ from __future__ import annotations -__all__ = ["ApdbMixin", "PpdbTest"] +__all__ = ["TEST_SCHEMA_RESOURCE_PATH", "ApdbMixin", "PpdbTest"] import unittest from abc import ABC, abstractmethod @@ -60,6 +60,9 @@ class TestCaseMixin: """Do-nothing definition of mixin base class for regular execution.""" +TEST_SCHEMA_RESOURCE_PATH = "resource://lsst.dax.ppdb.tests.config/schema.yaml" + + def _make_region(xyz: tuple[float, float, float] = (1.0, 1.0, -1.0)) -> Region: """Make a region to use in tests""" pointing_v = UnitVector3d(*xyz) diff --git a/python/lsst/dax/ppdb/tests/config/__init__.py b/python/lsst/dax/ppdb/tests/config/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/tests/config/schema.yaml b/python/lsst/dax/ppdb/tests/config/schema.yaml similarity index 100% rename from 
tests/config/schema.yaml rename to python/lsst/dax/ppdb/tests/config/schema.yaml diff --git a/tests/test_ppdb_bigquery.py b/tests/test_ppdb_bigquery.py index 5bf253bd..b23381e5 100644 --- a/tests/test_ppdb_bigquery.py +++ b/tests/test_ppdb_bigquery.py @@ -19,139 +19,16 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -import gc -import json -import os -import posixpath -import shutil -import tempfile import unittest -import uuid -from typing import Any -import pytest - -from lsst.dax.apdb import ( - Apdb, - ApdbConfig, - ApdbReplica, - apdbUpdateRecord, -) -from lsst.dax.apdb.sql import ApdbSql -from lsst.dax.ppdb import Ppdb, PpdbConfig -from lsst.dax.ppdb.bigquery import PpdbBigQuery -from lsst.dax.ppdb.bigquery.updates import UpdateRecords -from lsst.dax.ppdb.replicator import Replicator -from lsst.dax.ppdb.tests import ApdbMixin, PpdbTest +from lsst.dax.ppdb.tests import PpdbTest +from lsst.dax.ppdb.tests._bigquery import _PostgresMixin, _SqliteMixin try: import testing.postgresql except ImportError: testing = None -TEST_SCHEMA = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config/schema.yaml") - -TEST_CONFIG = { - "db_drop": True, - "validate_config": False, - "delete_existing_dirs": True, - "bucket_name": "ppdb-test", - "object_prefix": "data/test", - "dataset_id": "test_dataset", - "project_id": "test_project", -} - - -class _SqliteMixin: - """Mixin class to provide Sqlite-specific setup/teardown and instance - creation. 
- """ - - def setUp(self) -> None: - self.tempdir = tempfile.mkdtemp() - self.apdb_url = f"sqlite:///{self.tempdir}/apdb.sqlite3" - self.ppdb_url = f"sqlite:///{self.tempdir}/ppdb.sqlite3" - - def tearDown(self) -> None: - shutil.rmtree(self.tempdir, ignore_errors=True) - - def make_instance(self, **kwargs: Any) -> PpdbConfig: - """Make config class instance used in all tests.""" - kw = { - **TEST_CONFIG, - "db_url": self.ppdb_url, - "felis_path": TEST_SCHEMA, - "replication_dir": self.tempdir, - } - bq_config = PpdbBigQuery.init_bigquery( - **kw, - ) # type: ignore[arg-type] - return bq_config - - def make_apdb_instance(self, **kwargs: Any) -> ApdbConfig: - """Make APDB instance for tests.""" - kw = { - "schema_file": TEST_SCHEMA, - "ss_schema_file": "", - "db_url": self.apdb_url, - "enable_replica": True, - } - kw.update(kwargs) - return ApdbSql.init_database(**kw) # type: ignore[arg-type] - - -class _PostgresMixin: - """Mixin class to provide Postgres-specific setup/teardown and instance - creation. - """ - - postgresql: Any - - @classmethod - def setUpClass(cls) -> None: - # Create the postgres test server. - cls.postgresql = testing.postgresql.PostgresqlFactory(cache_initialized_db=True) - super().setUpClass() - - @classmethod - def tearDownClass(cls) -> None: - # Clean up any lingering SQLAlchemy engines/connections - # so they're closed before we shut down the server. 
- gc.collect() - cls.postgresql.clear_cache() - super().tearDownClass() - - def setUp(self) -> None: - self.server = self.postgresql() - self.tempdir = tempfile.mkdtemp() - - def tearDown(self) -> None: - self.server = self.postgresql() - shutil.rmtree(self.tempdir, ignore_errors=True) - - def make_instance(self, **kwargs: Any) -> PpdbConfig: - """Make config class instance used in all tests.""" - kw = { - **TEST_CONFIG, - "db_url": self.server.url(), - "db_schema": "ppdb_test", - "felis_path": TEST_SCHEMA, - "replication_dir": self.tempdir, - } - bq_config = PpdbBigQuery.init_bigquery(**kw) # type: ignore[arg-type] - return bq_config - - def make_apdb_instance(self, **kwargs: Any) -> ApdbConfig: - kw = { - "schema_file": TEST_SCHEMA, - "ss_schema_file": "", - "db_url": self.server.url(), - "namespace": "apdb", - "enable_replica": True, - } - kw.update(kwargs) - return ApdbSql.init_database(**kw) # type: ignore[arg-type] - class SqliteTestCase(_SqliteMixin, PpdbTest, unittest.TestCase): """A test case for the PpdbBigQuery class using a SQLite backend.""" @@ -160,304 +37,3 @@ class SqliteTestCase(_SqliteMixin, PpdbTest, unittest.TestCase): @unittest.skipUnless(testing is not None, "testing.postgresql module not found") class PostgresTestCase(_PostgresMixin, PpdbTest, unittest.TestCase): """A test case for the PpdbBigQuery class using a Postgres backend.""" - - -def generate_test_bucket_name(test_prefix: str = "ppdb-test") -> str: - """Generate a unique bucket name for testing.""" - test_id = uuid.uuid4().hex[:16] - return f"{test_prefix}-{test_id}" - - -class UpdateRecordsTestCase(_PostgresMixin, ApdbMixin, unittest.TestCase): - """A test case for the handling of APDB record updates by PpdbBigQuery and - related classes including the ChunkUploader. - """ - - include_update_records = True - - def setUp(self): - super().setUp() - - # Make APDB instance and fill it with test data. 
- apdb_config = self.make_apdb_instance() - apdb = Apdb.from_config(apdb_config) - self._fill_apdb(apdb) # FIXME: Only include replica chunks with the updates - apdb_replica = ApdbReplica.from_config(apdb_config) - - # Make PPDB instance. - self.ppdb_config = self.make_instance() - self.ppdb = Ppdb.from_config(self.ppdb_config) - assert isinstance(self.ppdb, PpdbBigQuery) - - # Replicate APDB replica chunks to the PPDB. - replicator = Replicator( - apdb_replica, self.ppdb, update=False, min_wait_time=0, max_wait_time=0, check_interval=0 - ) - replicator.run(exit_on_empty=True) - - def test_json_serialization(self) -> None: - """Test that the APDB update records are correctly saved to a JSON file - in the replication output and can be read back as valid UpdateRecords - objects. - """ - update_records_path = self.ppdb.replication_path / "2021/03/01/1614600000" / "update_records.json" - self.assertTrue(update_records_path.exists(), "Update records file not found in replication output") - - update_records = UpdateRecords.from_json_file(update_records_path) - print("\n" + str(update_records)) - - self.assertEqual( - update_records.replica_chunk_id, - 1614600000, - "Unexpected replica chunk ID in deserialized update records", - ) - - self.assertEqual(update_records.record_count, 3, "Unexpected number of update records deserialized") - - self.assertEqual( - len(update_records.records), 3, "Unexpected number of update records in the deserialized object" - ) - - for record in update_records.records: - self.assertIsInstance( - record, - apdbUpdateRecord.ApdbUpdateRecord, - "Deserialized record is not an instance of ApdbUpdateRecord", - ) - - update_record = update_records.records[0] - self.assertIsInstance( - update_record, - apdbUpdateRecord.ApdbReassignDiaSourceToSSObjectRecord, - "Deserialized record is not an instance of ApdbReassignDiaSourceToSSObjectRecord", - ) - assert isinstance(update_record, apdbUpdateRecord.ApdbReassignDiaSourceToSSObjectRecord) - 
self.assertEqual( - update_record.diaSourceId, - 700, - "Unexpected diaSourceId in deserialized ApdbReassignDiaSourceToSSObjectRecord", - ) - self.assertEqual( - update_record.ssObjectId, - 1, - "Unexpected ssObjectId in deserialized ApdbReassignDiaSourceToSSObjectRecord", - ) - self.assertEqual( - update_record.update_time_ns, - 1614600037000000000, - "Unexpected update_time_ns in deserialized ApdbReassignDiaSourceToSSObjectRecord", - ) - self.assertEqual( - update_record.update_order, - 0, - "Unexpected update_order in deserialized ApdbReassignDiaSourceToSSObjectRecord", - ) - self.assertEqual( - update_record.midpointMjdTai, - 60000.0, - "Unexpected midpointMjdTai in deserialized ApdbReassignDiaSourceToSSObjectRecord", - ) - self.assertEqual( - update_record.ssObjectReassocTimeMjdTai, - 59274.50042824074, - "Unexpected ssObjectReassocTimeMjdTai in deserialized ApdbReassignDiaSourceToSSObjectRecord", - ) - self.assertNotEqual( - update_record.ra, - 0.0, - "Unexpected ra in deserialized ApdbReassignDiaSourceToSSObjectRecord, should not be 0.0", - ) - self.assertNotEqual( - update_record.dec, - 0.0, - "Unexpected dec in deserialized ApdbReassignDiaSourceToSSObjectRecord, should not be 0.0", - ) - - update_record = update_records.records[1] - self.assertIsInstance( - update_record, - apdbUpdateRecord.ApdbCloseDiaObjectValidityRecord, - "Deserialized record is not an instance of ApdbCloseDiaObjectValidityRecord", - ) - self.assertEqual( - update_record.diaObjectId, - 200, - "Unexpected diaObjectId in deserialized ApdbCloseDiaObjectValidityRecord", - ) - self.assertNotEqual( - update_record.ra, - 0.0, - "Unexpected ra in deserialized ApdbCloseDiaObjectValidityRecord, should not be 0.0", - ) - self.assertNotEqual( - update_record.dec, - 0.0, - "Unexpected dec in deserialized ApdbCloseDiaObjectValidityRecord, should not be 0.0", - ) - self.assertEqual( - update_record.update_time_ns, - 1614600037000000000, - "Unexpected update_time_ns in deserialized 
ApdbCloseDiaObjectValidityRecord", - ) - self.assertEqual( - update_record.update_order, - 1, - "Unexpected update_order in deserialized ApdbCloseDiaObjectValidityRecord", - ) - self.assertEqual( - update_record.validityEndMjdTai, - 59274.50042824074, - "Unexpected validityEndMjdTai in deserialized ApdbCloseDiaObjectValidityRecord", - ) - self.assertIsNone( - update_record.nDiaSources, - "Unexpected nDiaSources in deserialized ApdbCloseDiaObjectValidityRecord, expected None", - ) - - update_record = update_records.records[2] - self.assertIsInstance( - update_record, - apdbUpdateRecord.ApdbWithdrawDiaForcedSourceRecord, - "Deserialized record is not an instance of ApdbWithdrawDiaForcedSourceRecord", - ) - self.assertEqual( - update_record.diaObjectId, - 200, - "Unexpected diaObjectId in deserialized ApdbWithdrawDiaForcedSourceRecord", - ) - self.assertEqual( - update_record.visit, - 7, - "Unexpected visit in deserialized ApdbWithdrawDiaForcedSourceRecord", - ) - self.assertEqual( - update_record.detector, - 1, - "Unexpected detector in deserialized ApdbWithdrawDiaForcedSourceRecord", - ) - self.assertNotEqual( - update_record.ra, - 0.0, - "Unexpected ra in deserialized ApdbWithdrawDiaForcedSourceRecord, should not be 0.0", - ) - self.assertNotEqual( - update_record.dec, - 0.0, - "Unexpected dec in deserialized ApdbWithdrawDiaForcedSourceRecord, should not be 0.0", - ) - self.assertEqual( - update_record.midpointMjdTai, - 60000.0, - "Unexpected midpointMjdTai in deserialized ApdbWithdrawDiaForcedSourceRecord", - ) - self.assertEqual( - update_record.update_time_ns, - 1614600037000000000, - "Unexpected update_time_ns in deserialized ApdbWithdrawDiaForcedSourceRecord", - ) - self.assertEqual( - update_record.update_order, - 2, - "Unexpected update_order in deserialized ApdbWithdrawDiaForcedSourceRecord", - ) - self.assertEqual( - update_record.timeWithdrawnMjdTai, - 59274.50042824074, - "Unexpected timeWithdrawnMjdTai in deserialized ApdbWithdrawDiaForcedSourceRecord", 
- ) - self.assertNotEqual( - update_record.ra, - 0.0, - "Unexpected ra in deserialized ApdbWithdrawDiaForcedSourceRecord, should not be 0.0", - ) - self.assertNotEqual( - update_record.dec, - 0.0, - "Unexpected dec in deserialized ApdbWithdrawDiaForcedSourceRecord, should not be 0.0", - ) - - @pytest.mark.skipif( - pytest.importorskip("lsst.dax.ppdbx.gcp", reason="dax_ppdbx_gcp is not installed") is None, - reason="", - ) - def test_chunk_uploader(self) -> None: - """Test that the update records are correctly uploaded to Google Cloud - Storage after replication. - """ - from lsst.dax.ppdb.bigquery.chunk_uploader import ChunkUploader - from lsst.dax.ppdbx.gcp.gcs import StorageClient - - # Change the configuration to use a unique test bucket name to avoid - # conflicts - ppdb_config_copy = self.ppdb_config.model_copy() - ppdb_config_copy.bucket_name = generate_test_bucket_name("ppdb-test-gcs-upload") - - # Patch the ChunkUploader to print the message that would be published - # to the Pub/Sub topic instead of publishing, because there is no - # support for that service in the test environment. 
- class DummyChunkUploader(ChunkUploader): - def _post_to_stage_chunk_topic(self, bucket_name: str, chunk_prefix: str, chunk_id: int) -> None: - message = { - "dataset": self.dataset_id, - "chunk_id": str(chunk_id), - "folder": f"gs://{posixpath.join(bucket_name, chunk_prefix)}", - } - print(f"Dummy publish to Pub/Sub topic: {message}") - - # Create the test GCS bucket - storage_client = StorageClient(ppdb_config_copy.bucket_name) - try: - storage_client.create_bucket() - except Exception as e: - self.fail(f"Failed to create test GCS bucket: {e}") - - # Configure and run the uploader - uploader = DummyChunkUploader( - ppdb_config_copy, - wait_interval=0, - exit_on_empty=True, - exit_on_error=True, - ) - print(f"Uploader will copy files to {uploader.bucket_name}/{uploader.prefix}/") - uploader.run() - - # Retrieve the update records file - update_records_files = storage_client.list_files("**/update_records.json") - self.assertEqual( - len(update_records_files), - 1, - f"Expected exactly one update_records.json file in GCS, found " - f"{len(update_records_files)}: {update_records_files}", - ) - update_records_str = storage_client.read_as_string(update_records_files[0]) - - # Print the contents of the update records file for debugging - update_records_json = json.loads(update_records_str) - print(f"Contents of update_records.json in GCS:\n{json.dumps(update_records_json, indent=2)}") - - # Load the update records into the data model and perform a few basic - # checks (test_json_serialization already tests this in detail, so we - # just check a few key fields here). 
- update_records = UpdateRecords.model_validate(update_records_json) - self.assertEqual( - update_records.replica_chunk_id, - 1614600000, - "Unexpected replica chunk ID in update records file from GCS", - ) - self.assertEqual( - update_records.record_count, - 3, - f"Expected record_count of 3 in update records file from GCS, found " - f"{update_records.record_count}", - ) - self.assertEqual( - len(update_records.records), - 3, - f"Expected 3 update records in the file from GCS, found {len(update_records.records)}", - ) - - # Delete the test GCS bucket - try: - storage_client.delete_bucket(force=True) - except Exception as e: - self.fail(f"Failed to delete test GCS bucket: {e}") diff --git a/tests/test_ppdb_sql.py b/tests/test_ppdb_sql.py index f8675079..b6a6a1ab 100644 --- a/tests/test_ppdb_sql.py +++ b/tests/test_ppdb_sql.py @@ -20,7 +20,6 @@ # along with this program. If not, see . import gc -import os import shutil import tempfile import unittest @@ -30,15 +29,13 @@ from lsst.dax.apdb.sql import ApdbSql from lsst.dax.ppdb import PpdbConfig from lsst.dax.ppdb.sql import PpdbSql -from lsst.dax.ppdb.tests import PpdbTest +from lsst.dax.ppdb.tests import TEST_SCHEMA_RESOURCE_PATH, PpdbTest try: import testing.postgresql except ImportError: testing = None -TEST_SCHEMA = os.path.join(os.path.abspath(os.path.dirname(__file__)), "config/schema.yaml") - class ApdbSQLiteTestCase(PpdbTest, unittest.TestCase): """A test case for PpdbSql class using SQLite backend.""" @@ -55,11 +52,11 @@ def tearDown(self) -> None: def make_instance(self, **kwargs: Any) -> PpdbConfig: """Make config class instance used in all tests.""" - return PpdbSql.init_database(db_url=self.ppdb_url, schema_file=TEST_SCHEMA, **kwargs) + return PpdbSql.init_database(db_url=self.ppdb_url, schema_file=TEST_SCHEMA_RESOURCE_PATH, **kwargs) def make_apdb_instance(self, **kwargs: Any) -> ApdbConfig: kw = { - "schema_file": TEST_SCHEMA, + "schema_file": TEST_SCHEMA_RESOURCE_PATH, "ss_schema_file": "", "db_url": 
self.apdb_url, "enable_replica": True, @@ -98,11 +95,13 @@ def tearDown(self) -> None: def make_instance(self, **kwargs: Any) -> PpdbConfig: """Make config class instance used in all tests.""" - return PpdbSql.init_database(db_url=self.server.url(), schema_file=TEST_SCHEMA, **kwargs) + return PpdbSql.init_database( + db_url=self.server.url(), schema_file=TEST_SCHEMA_RESOURCE_PATH, **kwargs + ) def make_apdb_instance(self, **kwargs: Any) -> ApdbConfig: kw = { - "schema_file": TEST_SCHEMA, + "schema_file": TEST_SCHEMA_RESOURCE_PATH, "ss_schema_file": "", "db_url": self.server.url(), "namespace": "apdb", diff --git a/tests/test_update_record_expander.py b/tests/test_update_record_expander.py index 8818565e..843a025a 100644 --- a/tests/test_update_record_expander.py +++ b/tests/test_update_record_expander.py @@ -32,10 +32,16 @@ ApdbWithdrawDiaForcedSourceRecord, ApdbWithdrawDiaSourceRecord, ) -from lsst.dax.ppdb.bigquery.updates import ExpandedUpdateRecord, UpdateRecordExpander, UpdateRecords -from lsst.dax.ppdb.tests._updates import _create_test_update_records +try: + from lsst.dax.ppdb.bigquery import updates + from lsst.dax.ppdb.bigquery.updates import ExpandedUpdateRecord, UpdateRecordExpander, UpdateRecords + from lsst.dax.ppdb.tests._updates import _create_test_update_records +except ImportError: + updates = None + +@unittest.skipIf(updates is None, "Google Cloud environment not available") class UpdateRecordExpanderTestCase(unittest.TestCase): """Test UpdateRecordExpander functionality.""" @@ -50,6 +56,8 @@ def setUp(self) -> None: def test_get_update_fields(self) -> None: """Test get_update_fields class method.""" + from lsst.dax.ppdb.bigquery.updates import UpdateRecordExpander + # Test known update types self.assertEqual( UpdateRecordExpander.get_update_fields("reassign_diasource_to_diaobject"), ["diaObjectId"] @@ -77,6 +85,8 @@ def test_get_update_fields(self) -> None: def test_get_record_id_field_names(self) -> None: """Test get_record_id_field class 
method.""" + from lsst.dax.ppdb.bigquery.updates import UpdateRecordExpander + self.assertEqual( UpdateRecordExpander.get_record_id_fields("reassign_diasource_to_diaobject"), ["diaSourceId"] ) @@ -102,6 +112,8 @@ def test_reassign_diasource_to_diaobject(self) -> None: """Test expand_single_record with ApdbReassignDiaSourceToDiaObjectRecord. """ + from lsst.dax.ppdb.bigquery.updates import ExpandedUpdateRecord, UpdateRecordExpander + record = ApdbReassignDiaSourceToDiaObjectRecord( update_time_ns=self.update_time_ns, update_order=0, diff --git a/tests/test_update_records.py b/tests/test_update_records.py new file mode 100644 index 00000000..e523544a --- /dev/null +++ b/tests/test_update_records.py @@ -0,0 +1,346 @@ +# This file is part of dax_ppdb. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (http://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +import json +import posixpath +import unittest +import uuid + +import pytest + +try: + from lsst.dax.ppdb.bigquery import updates + from lsst.dax.ppdb.bigquery.updates import UpdateRecords +except ImportError: + updates = None + +from lsst.dax.apdb import ( + Apdb, + ApdbReplica, + apdbUpdateRecord, +) +from lsst.dax.ppdb import Ppdb +from lsst.dax.ppdb.bigquery import PpdbBigQuery +from lsst.dax.ppdb.replicator import Replicator +from lsst.dax.ppdb.tests import ApdbMixin +from lsst.dax.ppdb.tests._bigquery import _PostgresMixin + + +def _generate_test_bucket_name(test_prefix: str = "ppdb-test") -> str: + """Generate a unique bucket name for testing.""" + test_id = uuid.uuid4().hex[:16] + return f"{test_prefix}-{test_id}" + + +@unittest.skipIf(updates is None, "Google Cloud dependencies not available") +class UpdateRecordsTestCase(_PostgresMixin, ApdbMixin, unittest.TestCase): + """A test case for the handling of APDB record updates by PpdbBigQuery and + related classes including the ChunkUploader. + """ + + include_update_records = True + + def setUp(self): + super().setUp() + + # Make APDB instance and fill it with test data. + apdb_config = self.make_apdb_instance() + apdb = Apdb.from_config(apdb_config) + self._fill_apdb(apdb) # FIXME: Only include replica chunks with the updates + apdb_replica = ApdbReplica.from_config(apdb_config) + + # Make PPDB instance. + self.ppdb_config = self.make_instance() + self.ppdb = Ppdb.from_config(self.ppdb_config) + assert isinstance(self.ppdb, PpdbBigQuery) + + # Replicate APDB replica chunks to the PPDB. + replicator = Replicator( + apdb_replica, self.ppdb, update=False, min_wait_time=0, max_wait_time=0, check_interval=0 + ) + replicator.run(exit_on_empty=True) + + def test_json_serialization(self) -> None: + """Test that the APDB update records are correctly saved to a JSON file + in the replication output and can be read back as valid UpdateRecords + objects. 
+ """ + update_records_path = self.ppdb.replication_path / "2021/03/01/1614600000" / "update_records.json" + self.assertTrue(update_records_path.exists(), "Update records file not found in replication output") + + update_records = UpdateRecords.from_json_file(update_records_path) + print("\n" + str(update_records)) + + self.assertEqual( + update_records.replica_chunk_id, + 1614600000, + "Unexpected replica chunk ID in deserialized update records", + ) + + self.assertEqual(update_records.record_count, 3, "Unexpected number of update records deserialized") + + self.assertEqual( + len(update_records.records), 3, "Unexpected number of update records in the deserialized object" + ) + + for record in update_records.records: + self.assertIsInstance( + record, + apdbUpdateRecord.ApdbUpdateRecord, + "Deserialized record is not an instance of ApdbUpdateRecord", + ) + + update_record = update_records.records[0] + self.assertIsInstance( + update_record, + apdbUpdateRecord.ApdbReassignDiaSourceToSSObjectRecord, + "Deserialized record is not an instance of ApdbReassignDiaSourceToSSObjectRecord", + ) + assert isinstance(update_record, apdbUpdateRecord.ApdbReassignDiaSourceToSSObjectRecord) + self.assertEqual( + update_record.diaSourceId, + 700, + "Unexpected diaSourceId in deserialized ApdbReassignDiaSourceToSSObjectRecord", + ) + self.assertEqual( + update_record.ssObjectId, + 1, + "Unexpected ssObjectId in deserialized ApdbReassignDiaSourceToSSObjectRecord", + ) + self.assertEqual( + update_record.update_time_ns, + 1614600037000000000, + "Unexpected update_time_ns in deserialized ApdbReassignDiaSourceToSSObjectRecord", + ) + self.assertEqual( + update_record.update_order, + 0, + "Unexpected update_order in deserialized ApdbReassignDiaSourceToSSObjectRecord", + ) + self.assertEqual( + update_record.midpointMjdTai, + 60000.0, + "Unexpected midpointMjdTai in deserialized ApdbReassignDiaSourceToSSObjectRecord", + ) + self.assertEqual( + update_record.ssObjectReassocTimeMjdTai, + 
59274.50042824074, + "Unexpected ssObjectReassocTimeMjdTai in deserialized ApdbReassignDiaSourceToSSObjectRecord", + ) + self.assertNotEqual( + update_record.ra, + 0.0, + "Unexpected ra in deserialized ApdbReassignDiaSourceToSSObjectRecord, should not be 0.0", + ) + self.assertNotEqual( + update_record.dec, + 0.0, + "Unexpected dec in deserialized ApdbReassignDiaSourceToSSObjectRecord, should not be 0.0", + ) + + update_record = update_records.records[1] + self.assertIsInstance( + update_record, + apdbUpdateRecord.ApdbCloseDiaObjectValidityRecord, + "Deserialized record is not an instance of ApdbCloseDiaObjectValidityRecord", + ) + self.assertEqual( + update_record.diaObjectId, + 200, + "Unexpected diaObjectId in deserialized ApdbCloseDiaObjectValidityRecord", + ) + self.assertNotEqual( + update_record.ra, + 0.0, + "Unexpected ra in deserialized ApdbCloseDiaObjectValidityRecord, should not be 0.0", + ) + self.assertNotEqual( + update_record.dec, + 0.0, + "Unexpected dec in deserialized ApdbCloseDiaObjectValidityRecord, should not be 0.0", + ) + self.assertEqual( + update_record.update_time_ns, + 1614600037000000000, + "Unexpected update_time_ns in deserialized ApdbCloseDiaObjectValidityRecord", + ) + self.assertEqual( + update_record.update_order, + 1, + "Unexpected update_order in deserialized ApdbCloseDiaObjectValidityRecord", + ) + self.assertEqual( + update_record.validityEndMjdTai, + 59274.50042824074, + "Unexpected validityEndMjdTai in deserialized ApdbCloseDiaObjectValidityRecord", + ) + self.assertIsNone( + update_record.nDiaSources, + "Unexpected nDiaSources in deserialized ApdbCloseDiaObjectValidityRecord, expected None", + ) + + update_record = update_records.records[2] + self.assertIsInstance( + update_record, + apdbUpdateRecord.ApdbWithdrawDiaForcedSourceRecord, + "Deserialized record is not an instance of ApdbWithdrawDiaForcedSourceRecord", + ) + self.assertEqual( + update_record.diaObjectId, + 200, + "Unexpected diaObjectId in deserialized 
ApdbWithdrawDiaForcedSourceRecord", + ) + self.assertEqual( + update_record.visit, + 7, + "Unexpected visit in deserialized ApdbWithdrawDiaForcedSourceRecord", + ) + self.assertEqual( + update_record.detector, + 1, + "Unexpected detector in deserialized ApdbWithdrawDiaForcedSourceRecord", + ) + self.assertNotEqual( + update_record.ra, + 0.0, + "Unexpected ra in deserialized ApdbWithdrawDiaForcedSourceRecord, should not be 0.0", + ) + self.assertNotEqual( + update_record.dec, + 0.0, + "Unexpected dec in deserialized ApdbWithdrawDiaForcedSourceRecord, should not be 0.0", + ) + self.assertEqual( + update_record.midpointMjdTai, + 60000.0, + "Unexpected midpointMjdTai in deserialized ApdbWithdrawDiaForcedSourceRecord", + ) + self.assertEqual( + update_record.update_time_ns, + 1614600037000000000, + "Unexpected update_time_ns in deserialized ApdbWithdrawDiaForcedSourceRecord", + ) + self.assertEqual( + update_record.update_order, + 2, + "Unexpected update_order in deserialized ApdbWithdrawDiaForcedSourceRecord", + ) + self.assertEqual( + update_record.timeWithdrawnMjdTai, + 59274.50042824074, + "Unexpected timeWithdrawnMjdTai in deserialized ApdbWithdrawDiaForcedSourceRecord", + ) + self.assertNotEqual( + update_record.ra, + 0.0, + "Unexpected ra in deserialized ApdbWithdrawDiaForcedSourceRecord, should not be 0.0", + ) + self.assertNotEqual( + update_record.dec, + 0.0, + "Unexpected dec in deserialized ApdbWithdrawDiaForcedSourceRecord, should not be 0.0", + ) + + @pytest.mark.skipif( + pytest.importorskip("lsst.dax.ppdbx.gcp", reason="dax_ppdbx_gcp is not installed") is None, + reason="", + ) + def test_chunk_uploader(self) -> None: + """Test that the update records are correctly uploaded to Google Cloud + Storage after replication. 
+ """ + from lsst.dax.ppdb.bigquery.chunk_uploader import ChunkUploader + from lsst.dax.ppdbx.gcp.gcs import StorageClient + + # Change the configuration to use a unique test bucket name to avoid + # conflicts + ppdb_config_copy = self.ppdb_config.model_copy() + ppdb_config_copy.bucket_name = _generate_test_bucket_name("ppdb-test-gcs-upload") + + # Patch the ChunkUploader to print the message that would be published + # to the Pub/Sub topic instead of publishing, because there is no + # support for that service in the test environment. + class DummyChunkUploader(ChunkUploader): + def _post_to_stage_chunk_topic(self, bucket_name: str, chunk_prefix: str, chunk_id: int) -> None: + message = { + "dataset": self.dataset_id, + "chunk_id": str(chunk_id), + "folder": f"gs://{posixpath.join(bucket_name, chunk_prefix)}", + } + print(f"Dummy publish to Pub/Sub topic: {message}") + + # Create the test GCS bucket + storage_client = StorageClient(ppdb_config_copy.bucket_name) + try: + storage_client.create_bucket() + except Exception as e: + self.fail(f"Failed to create test GCS bucket: {e}") + + # Configure and run the uploader + uploader = DummyChunkUploader( + ppdb_config_copy, + wait_interval=0, + exit_on_empty=True, + exit_on_error=True, + ) + print(f"Uploader will copy files to {uploader.bucket_name}/{uploader.prefix}/") + uploader.run() + + # Retrieve the update records file + update_records_files = storage_client.list_files("**/update_records.json") + self.assertEqual( + len(update_records_files), + 1, + f"Expected exactly one update_records.json file in GCS, found " + f"{len(update_records_files)}: {update_records_files}", + ) + update_records_str = storage_client.read_as_string(update_records_files[0]) + + # Print the contents of the update records file for debugging + update_records_json = json.loads(update_records_str) + print(f"Contents of update_records.json in GCS:\n{json.dumps(update_records_json, indent=2)}") + + # Load the update records into the data model and 
perform a few basic + # checks (test_json_serialization already tests this in detail, so we + # just check a few key fields here). + update_records = UpdateRecords.model_validate(update_records_json) + self.assertEqual( + update_records.replica_chunk_id, + 1614600000, + "Unexpected replica chunk ID in update records file from GCS", + ) + self.assertEqual( + update_records.record_count, + 3, + f"Expected record_count of 3 in update records file from GCS, found " + f"{update_records.record_count}", + ) + self.assertEqual( + len(update_records.records), + 3, + f"Expected 3 update records in the file from GCS, found {len(update_records.records)}", + ) + + # Delete the test GCS bucket + try: + storage_client.delete_bucket(force=True) + except Exception as e: + self.fail(f"Failed to delete test GCS bucket: {e}") diff --git a/tests/test_updates_merger.py b/tests/test_updates_merger.py index 381b0adc..07705955 100644 --- a/tests/test_updates_merger.py +++ b/tests/test_updates_merger.py @@ -29,11 +29,15 @@ except ImportError: bigquery = None -from lsst.dax.ppdb.bigquery.updates import DiaObjectUpdatesMerger, UpdateRecordExpander, UpdatesTable -from lsst.dax.ppdb.tests._updates import _create_test_update_records +try: + from lsst.dax.ppdb.bigquery import updates + from lsst.dax.ppdb.bigquery.updates import DiaObjectUpdatesMerger, UpdateRecordExpander, UpdatesTable + from lsst.dax.ppdb.tests._updates import _create_test_update_records +except ImportError: + updates = None -@unittest.skipIf(bigquery is None, "google-cloud-bigquery not available") +@unittest.skipIf(bigquery is None or updates is None, "Google Cloud dependencies not available") class TestUpdatesMerger(unittest.TestCase): """Test UpdatesMerger functionality.""" @@ -115,7 +119,7 @@ def test_merge_no_updates(self): table_fqn = f"{self.target_dataset_fqn}.DiaObject" before = {r.diaObjectId: r for r in self.client.query(f"SELECT * FROM `{table_fqn}`").result()} merger = DiaObjectUpdatesMerger(self.client) - 
merger.merge(updates_table_fqn=dedup_fqn, target_table_fqn=table_fqn) + merger.merge(updates_table_fqn=dedup_fqn, target_dataset_fqn=self.target_dataset_fqn) after = {r.diaObjectId: r for r in self.client.query(f"SELECT * FROM `{table_fqn}`").result()} for obj_id in before: self.assertEqual(before[obj_id].validityEndMjdTai, after[obj_id].validityEndMjdTai) diff --git a/tests/test_updates_table.py b/tests/test_updates_table.py index 8d3d821f..aeb196b4 100644 --- a/tests/test_updates_table.py +++ b/tests/test_updates_table.py @@ -23,15 +23,19 @@ import uuid try: - from google.cloud import bigquery + from lsst.dax.ppdb.bigquery import updates + from lsst.dax.ppdb.bigquery.updates import UpdateRecordExpander, UpdatesTable + from lsst.dax.ppdb.tests._updates import _create_test_update_records except ImportError: - bigquery = None + updates = None -from lsst.dax.ppdb.bigquery.updates import UpdateRecordExpander, UpdatesTable -from lsst.dax.ppdb.tests._updates import _create_test_update_records +try: + from google.cloud import bigquery +except (ModuleNotFoundError, ImportError): + bigquery = None -@unittest.skipIf(bigquery is None, "google-cloud-bigquery not available") +@unittest.skipIf(updates is None or bigquery is None, "Google Cloud dependencies not available") class TestUpdatesTable(unittest.TestCase): """Test UpdatesTable functionality.""" From 74c4f7463e8134eb1e0950b5aff6a1bb68cc1a99 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Fri, 20 Feb 2026 17:43:58 -0600 Subject: [PATCH 20/49] Add update merging support for DiaSource and DiaForcedSource tables --- .../dax/ppdb/bigquery/updates/__init__.py | 7 +- .../sql/merge_diaforcedsource_updates.sql | 42 +++++++ .../updates/sql/merge_diasource_updates.sql | 62 +++++++++ .../ppdb/bigquery/updates/updates_merger.py | 15 ++- tests/test_updates_merger.py | 119 +++++++++++++++++- 5 files changed, 242 insertions(+), 3 deletions(-) create mode 100644 
python/lsst/dax/ppdb/bigquery/updates/sql/merge_diaforcedsource_updates.sql create mode 100644 python/lsst/dax/ppdb/bigquery/updates/sql/merge_diasource_updates.sql diff --git a/python/lsst/dax/ppdb/bigquery/updates/__init__.py b/python/lsst/dax/ppdb/bigquery/updates/__init__.py index 342c6069..7673a1ca 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/__init__.py +++ b/python/lsst/dax/ppdb/bigquery/updates/__init__.py @@ -23,4 +23,9 @@ from .expanded_update_record import ExpandedUpdateRecord from .update_record_expander import UpdateRecordExpander from .updates_table import UpdatesTable -from .updates_merger import UpdatesMerger, DiaObjectUpdatesMerger +from .updates_merger import ( + UpdatesMerger, + DiaObjectUpdatesMerger, + DiaSourceUpdatesMerger, + DiaForcedSourceUpdatesMerger, +) diff --git a/python/lsst/dax/ppdb/bigquery/updates/sql/merge_diaforcedsource_updates.sql b/python/lsst/dax/ppdb/bigquery/updates/sql/merge_diaforcedsource_updates.sql new file mode 100644 index 00000000..8c60f86c --- /dev/null +++ b/python/lsst/dax/ppdb/bigquery/updates/sql/merge_diaforcedsource_updates.sql @@ -0,0 +1,42 @@ +-- merge_diaforcedsource_updates.sql +-- +-- Query parameters: +-- @updates_table STRING -- table FQN, e.g. "project.dataset.prod_next" +-- @target_dataset STRING -- dataset FQN, e.g. "project.dataset" +-- +-- Do NOT include backticks in parameter values. 
+ +DECLARE sql STRING; + +SET sql = """ +MERGE `{target_dataset}.DiaForcedSource` T +USING ( + WITH patch AS ( + SELECT + record_id[OFFSET(0)] AS diaObjectId, + record_id[OFFSET(1)] AS visit, + record_id[OFFSET(2)] AS detector, + + ANY_VALUE( + CASE WHEN field_name = 'timeWithdrawnMjdTai' + THEN CAST(JSON_VALUE(value_json) AS FLOAT64) + END + ) AS timeWithdrawnMjdTai_value, + COUNTIF(field_name = 'timeWithdrawnMjdTai') > 0 AS timeWithdrawnMjdTai_present + + FROM `{updates_table}` + WHERE table_name = 'DiaForcedSource' + AND field_name IN ('timeWithdrawnMjdTai') + GROUP BY diaObjectId, visit, detector + ) + SELECT * FROM patch +) P +ON T.diaObjectId = P.diaObjectId + AND T.visit = P.visit + AND T.detector = P.detector +WHEN MATCHED THEN +UPDATE SET + timeWithdrawnMjdTai = IF(P.timeWithdrawnMjdTai_present, P.timeWithdrawnMjdTai_value, T.timeWithdrawnMjdTai) +"""; + +EXECUTE IMMEDIATE sql; diff --git a/python/lsst/dax/ppdb/bigquery/updates/sql/merge_diasource_updates.sql b/python/lsst/dax/ppdb/bigquery/updates/sql/merge_diasource_updates.sql new file mode 100644 index 00000000..5a39b877 --- /dev/null +++ b/python/lsst/dax/ppdb/bigquery/updates/sql/merge_diasource_updates.sql @@ -0,0 +1,62 @@ +-- merge_diasource_updates.sql +-- +-- Query parameters: +-- @updates_table STRING -- table FQN, e.g. "project.dataset.prod_next" +-- @target_dataset STRING -- dataset FQN, e.g. "project.dataset" +-- +-- Do NOT include backticks in parameter values. 
+ +DECLARE sql STRING; + +SET sql = """ +MERGE `{target_dataset}.DiaSource` T +USING ( + WITH patch AS ( + SELECT + record_id[OFFSET(0)] AS diaSourceId, + + ANY_VALUE( + CASE WHEN field_name = 'diaObjectId' + THEN CAST(JSON_VALUE(value_json) AS INT64) + END + ) AS diaObjectId_value, + COUNTIF(field_name = 'diaObjectId') > 0 AS diaObjectId_present, + + ANY_VALUE( + CASE WHEN field_name = 'ssObjectId' + THEN CAST(JSON_VALUE(value_json) AS INT64) + END + ) AS ssObjectId_value, + COUNTIF(field_name = 'ssObjectId') > 0 AS ssObjectId_present, + + ANY_VALUE( + CASE WHEN field_name = 'ssObjectReassocTimeMjdTai' + THEN CAST(JSON_VALUE(value_json) AS FLOAT64) + END + ) AS ssObjectReassocTimeMjdTai_value, + COUNTIF(field_name = 'ssObjectReassocTimeMjdTai') > 0 AS ssObjectReassocTimeMjdTai_present, + + ANY_VALUE( + CASE WHEN field_name = 'timeWithdrawnMjdTai' + THEN CAST(JSON_VALUE(value_json) AS FLOAT64) + END + ) AS timeWithdrawnMjdTai_value, + COUNTIF(field_name = 'timeWithdrawnMjdTai') > 0 AS timeWithdrawnMjdTai_present + + FROM `{updates_table}` + WHERE table_name = 'DiaSource' + AND field_name IN ('diaObjectId', 'ssObjectId', 'ssObjectReassocTimeMjdTai', 'timeWithdrawnMjdTai') + GROUP BY diaSourceId + ) + SELECT * FROM patch +) P +ON T.diaSourceId = P.diaSourceId +WHEN MATCHED THEN +UPDATE SET + diaObjectId = IF(P.diaObjectId_present, P.diaObjectId_value, T.diaObjectId), + ssObjectId = IF(P.ssObjectId_present, P.ssObjectId_value, T.ssObjectId), + ssObjectReassocTimeMjdTai = IF(P.ssObjectReassocTimeMjdTai_present, P.ssObjectReassocTimeMjdTai_value, T.ssObjectReassocTimeMjdTai), + timeWithdrawnMjdTai = IF(P.timeWithdrawnMjdTai_present, P.timeWithdrawnMjdTai_value, T.timeWithdrawnMjdTai) +"""; + +EXECUTE IMMEDIATE sql; diff --git a/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py b/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py index 61bd2927..d1ea3f5c 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py +++ 
b/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py @@ -89,5 +89,18 @@ class DiaObjectUpdatesMerger(UpdatesMerger): """Merger for DiaObject updates.""" TABLE_NAME = "DiaObject" - SQL_RESOURCE_NAME = "merge_diaobject_updates" + + +class DiaSourceUpdatesMerger(UpdatesMerger): + """Merger for DiaSource updates.""" + + TABLE_NAME = "DiaSource" + SQL_RESOURCE_NAME = "merge_diasource_updates" + + +class DiaForcedSourceUpdatesMerger(UpdatesMerger): + """Merger for DiaForcedSource updates.""" + + TABLE_NAME = "DiaForcedSource" + SQL_RESOURCE_NAME = "merge_diaforcedsource_updates" diff --git a/tests/test_updates_merger.py b/tests/test_updates_merger.py index 07705955..ed4dffa7 100644 --- a/tests/test_updates_merger.py +++ b/tests/test_updates_merger.py @@ -31,7 +31,13 @@ try: from lsst.dax.ppdb.bigquery import updates - from lsst.dax.ppdb.bigquery.updates import DiaObjectUpdatesMerger, UpdateRecordExpander, UpdatesTable + from lsst.dax.ppdb.bigquery.updates import ( + DiaForcedSourceUpdatesMerger, + DiaObjectUpdatesMerger, + DiaSourceUpdatesMerger, + UpdateRecordExpander, + UpdatesTable, + ) from lsst.dax.ppdb.tests._updates import _create_test_update_records except ImportError: updates = None @@ -110,6 +116,117 @@ def test_merge_diaobject(self): self.assertEqual(after[200003].validityEndMjdTai, before[200003].validityEndMjdTai) self.assertEqual(after[200003].nDiaSources, before[200003].nDiaSources) + def test_merge_diasource(self): + schema = [ + bigquery.SchemaField("diaSourceId", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("diaObjectId", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ssObjectId", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ssObjectReassocTimeMjdTai", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("timeWithdrawnMjdTai", "FLOAT", mode="NULLABLE"), + ] + table_fqn = f"{self.target_dataset_fqn}.DiaSource" + table = bigquery.Table(table_fqn, schema=schema) + self.client.create_table(table) + rows = [ + { + "diaSourceId": 
100001, + "diaObjectId": 200001, + "ssObjectId": None, + "ssObjectReassocTimeMjdTai": None, + "timeWithdrawnMjdTai": None, + }, + { + "diaSourceId": 100002, + "diaObjectId": 200002, + "ssObjectId": None, + "ssObjectReassocTimeMjdTai": None, + "timeWithdrawnMjdTai": None, + }, + { + "diaSourceId": 100003, + "diaObjectId": 200003, + "ssObjectId": None, + "ssObjectReassocTimeMjdTai": None, + "timeWithdrawnMjdTai": None, + }, + { + "diaSourceId": 100004, + "diaObjectId": 200004, + "ssObjectId": None, + "ssObjectReassocTimeMjdTai": None, + "timeWithdrawnMjdTai": None, + }, + ] + job = self.client.load_table_from_file( + self._json_rows_to_buf(rows), + table_fqn, + job_config=bigquery.LoadJobConfig(source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON), + ) + job.result() + + updates_table = UpdatesTable(self.client, self.updates_table_fqn) + updates_table.create() + update_records = _create_test_update_records() + expanded = UpdateRecordExpander.expand_updates(update_records) + updates_table.append(expanded) + dedup_fqn = f"{self.updates_table_fqn}_dedup" + updates_table.deduplicate_to(dedup_fqn) + + query = f"SELECT * FROM `{table_fqn}` ORDER BY diaSourceId" + before = {r.diaSourceId: r for r in self.client.query(query).result()} + merger = DiaSourceUpdatesMerger(self.client) + merger.merge(updates_table_fqn=dedup_fqn, target_dataset_fqn=self.target_dataset_fqn) + after = {r.diaSourceId: r for r in self.client.query(query).result()} + + self.assertEqual(after[100001].diaObjectId, 400001) + self.assertEqual(after[100002].ssObjectId, 2001) + self.assertEqual(after[100002].ssObjectReassocTimeMjdTai, 59580.0) + self.assertEqual(after[100003].timeWithdrawnMjdTai, 59580.0) + self.assertEqual(after[100004].diaObjectId, before[100004].diaObjectId) + self.assertEqual(after[100004].ssObjectId, before[100004].ssObjectId) + self.assertEqual(after[100004].timeWithdrawnMjdTai, before[100004].timeWithdrawnMjdTai) + + def test_merge_diaforcedsource(self): + schema = [ + 
bigquery.SchemaField("diaObjectId", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("visit", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("detector", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("timeWithdrawnMjdTai", "FLOAT", mode="NULLABLE"), + ] + table_fqn = f"{self.target_dataset_fqn}.DiaForcedSource" + table = bigquery.Table(table_fqn, schema=schema) + self.client.create_table(table) + rows = [ + {"diaObjectId": 200001, "visit": 12345, "detector": 42, "timeWithdrawnMjdTai": None}, + {"diaObjectId": 200001, "visit": 12346, "detector": 42, "timeWithdrawnMjdTai": None}, + ] + job = self.client.load_table_from_file( + self._json_rows_to_buf(rows), + table_fqn, + job_config=bigquery.LoadJobConfig(source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON), + ) + job.result() + + updates_table = UpdatesTable(self.client, self.updates_table_fqn) + updates_table.create() + update_records = _create_test_update_records() + expanded = UpdateRecordExpander.expand_updates(update_records) + updates_table.append(expanded) + dedup_fqn = f"{self.updates_table_fqn}_dedup" + updates_table.deduplicate_to(dedup_fqn) + + query = f"SELECT * FROM `{table_fqn}` ORDER BY diaObjectId, visit, detector" + before = {(r.diaObjectId, r.visit, r.detector): r for r in self.client.query(query).result()} + merger = DiaForcedSourceUpdatesMerger(self.client) + merger.merge(updates_table_fqn=dedup_fqn, target_dataset_fqn=self.target_dataset_fqn) + after = {(r.diaObjectId, r.visit, r.detector): r for r in self.client.query(query).result()} + + self.assertEqual(after[(200001, 12345, 42)].timeWithdrawnMjdTai, 59580.0) + self.assertEqual( + after[(200001, 12346, 42)].timeWithdrawnMjdTai, + before[(200001, 12346, 42)].timeWithdrawnMjdTai, + ) + def test_merge_no_updates(self): self._create_target_table() updates_table = UpdatesTable(self.client, self.updates_table_fqn) From 300a228b632ce8c1eda7f7eb996a090e17b0cfa6 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Fri, 20 Feb 2026 
18:02:04 -0600 Subject: [PATCH 21/49] Remove requirements that we don't want installed by default in testing --- requirements.txt | 2 -- 1 file changed, 2 deletions(-) diff --git a/requirements.txt b/requirements.txt index f40c2d15..a8e9fceb 100644 --- a/requirements.txt +++ b/requirements.txt @@ -8,5 +8,3 @@ lsst-utils @ git+https://github.com/lsst/utils@main lsst-resources[s3] @ git+https://github.com/lsst/resources@main lsst-felis @ git+https://github.com/lsst/felis@main lsst-sdm-schemas @ git+https://github.com/lsst/sdm_schemas@main -lsst-dax-ppdbx-gcp @ git+https://github.com/lsst-dm/dax_ppdbx_gcp@tickets/DM-54070 -google-cloud-bigquery From 183bae56f782469c6ea3fba9b22faeb33b7d1650 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Fri, 20 Feb 2026 18:03:46 -0600 Subject: [PATCH 22/49] ruff --- python/lsst/dax/ppdb/tests/_updates.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/python/lsst/dax/ppdb/tests/_updates.py b/python/lsst/dax/ppdb/tests/_updates.py index a5a62002..bc978ffe 100644 --- a/python/lsst/dax/ppdb/tests/_updates.py +++ b/python/lsst/dax/ppdb/tests/_updates.py @@ -136,7 +136,8 @@ def _create_test_update_records() -> UpdateRecords: ) ) - # Duplicate of the nDiaSources update but with earlier timestamp (should be discarded) + # Duplicate of the nDiaSources update but with earlier timestamp (should be + # discarded) records.append( ApdbUpdateNDiaSourcesRecord( update_time_ns=test_update_time_ns - 1000000000, # 1 second earlier From 61821dcff46375a869716426b955469f4e0fb857 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Mon, 23 Feb 2026 14:52:49 -0600 Subject: [PATCH 23/49] Add build tools to Dockerfile --- docker/Dockerfile.replication | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/docker/Dockerfile.replication b/docker/Dockerfile.replication index 12e8b3a5..a8e218f1 100644 --- a/docker/Dockerfile.replication +++ b/docker/Dockerfile.replication @@ -3,11 +3,14 @@ FROM 
python:3.12-slim-bookworm ENV DEBIAN_FRONTEND=noninteractive # Update and install OS dependencies -RUN apt-get -y update && \ - apt-get -y upgrade && \ - apt-get -y install --no-install-recommends git && \ - apt-get clean && \ - rm -rf /var/lib/apt/lists/* +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + build-essential \ + python3-dev \ + pkg-config \ + git \ + && apt-get clean \ + && rm -rf /var/lib/apt/lists/* # Install required python build dependencies RUN pip install --upgrade --no-cache-dir pip setuptools wheel uv From 5d1a3afddfe340727836381a359eb6e84e6fe5f2 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Tue, 24 Feb 2026 15:00:35 -0600 Subject: [PATCH 24/49] Move engine creation out of `make_database` method --- python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py | 3 ++- python/lsst/dax/ppdb/sql/_ppdb_sql.py | 3 ++- python/lsst/dax/ppdb/sql/_ppdb_sql_base.py | 3 +-- 3 files changed, 5 insertions(+), 4 deletions(-) diff --git a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py index 82d2205c..2b6beab2 100644 --- a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py +++ b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py @@ -476,7 +476,8 @@ def init_bigquery( sql_config = PpdbSqlBaseConfig( db_url=db_url, schema_name=db_schema, felis_path=felis_path, felis_schema=felis_schema ) - cls.make_database(sql_config, sa_metadata, schema_version, db_drop) + engine = cls.make_engine(sql_config) + cls.make_database(engine, sql_config, sa_metadata, schema_version, db_drop) # Build config parameters. 
bq_config = PpdbBigQueryConfig( diff --git a/python/lsst/dax/ppdb/sql/_ppdb_sql.py b/python/lsst/dax/ppdb/sql/_ppdb_sql.py index 623cf651..320f579c 100644 --- a/python/lsst/dax/ppdb/sql/_ppdb_sql.py +++ b/python/lsst/dax/ppdb/sql/_ppdb_sql.py @@ -552,5 +552,6 @@ def init_database( isolation_level=isolation_level, connection_timeout=connection_timeout, ) - cls.make_database(config, sa_metadata, schema_version, drop) + engine = cls.make_engine(config) + cls.make_database(engine, config, sa_metadata, schema_version, drop) return config diff --git a/python/lsst/dax/ppdb/sql/_ppdb_sql_base.py b/python/lsst/dax/ppdb/sql/_ppdb_sql_base.py index 836cab53..81f42dd7 100644 --- a/python/lsst/dax/ppdb/sql/_ppdb_sql_base.py +++ b/python/lsst/dax/ppdb/sql/_ppdb_sql_base.py @@ -171,6 +171,7 @@ def make_engine(cls, config: PpdbSqlBaseConfig) -> sqlalchemy.engine.Engine: @classmethod def make_database( cls, + engine: sqlalchemy.engine.Engine, config: PpdbSqlBaseConfig, sa_metadata: sqlalchemy.schema.MetaData, schema_version: VersionTuple, @@ -189,8 +190,6 @@ def make_database( drop : `bool` If `True` then drop existing tables before creating new ones. 
""" - engine = cls.make_engine(config) - if config.schema_name is not None: dialect = engine.dialect quoted_schema = dialect.preparer(dialect).quote_schema(config.schema_name) From 8d48f473471acfe73767529466d61ec75cef7fb8 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Tue, 24 Feb 2026 15:14:07 -0600 Subject: [PATCH 25/49] Move building of connect args into separate method --- python/lsst/dax/ppdb/sql/_ppdb_sql_base.py | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/python/lsst/dax/ppdb/sql/_ppdb_sql_base.py b/python/lsst/dax/ppdb/sql/_ppdb_sql_base.py index 81f42dd7..fe1652bf 100644 --- a/python/lsst/dax/ppdb/sql/_ppdb_sql_base.py +++ b/python/lsst/dax/ppdb/sql/_ppdb_sql_base.py @@ -137,14 +137,7 @@ def __init__(self, config: PpdbSqlBaseConfig) -> None: self._check_code_version() @classmethod - def make_engine(cls, config: PpdbSqlBaseConfig) -> sqlalchemy.engine.Engine: - """Make SQLALchemy engine based on configured parameters. - - Parameters - ---------- - config : `PpdbSqlBaseConfig` - Configuration object with SQL parameters. - """ + def _build_connect_args(cls, config: PpdbSqlBaseConfig) -> MutableMapping[str, Any]: kw: MutableMapping[str, Any] = {} conn_args: dict[str, Any] = {} if not config.use_connection_pool: @@ -159,7 +152,18 @@ def make_engine(cls, config: PpdbSqlBaseConfig) -> sqlalchemy.engine.Engine: conn_args.update(timeout=config.connection_timeout) elif config.db_url.startswith(("postgresql", "mysql")): conn_args.update(connect_timeout=config.connection_timeout) - kw = {"connect_args": conn_args} + return {"connect_args": conn_args} + + @classmethod + def make_engine(cls, config: PpdbSqlBaseConfig) -> sqlalchemy.engine.Engine: + """Make SQLALchemy engine based on configured parameters. + + Parameters + ---------- + config : `PpdbSqlBaseConfig` + Configuration object with SQL parameters. 
+ """ + kw = cls._build_connect_args(config) engine = sqlalchemy.create_engine(config.db_url, **kw) if engine.dialect.name == "sqlite": From cb9931bb71e4f90f4461205eb58ea1e85631a536 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Tue, 24 Feb 2026 15:21:38 -0600 Subject: [PATCH 26/49] Move listener config to separate method --- python/lsst/dax/ppdb/sql/_ppdb_sql_base.py | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/python/lsst/dax/ppdb/sql/_ppdb_sql_base.py b/python/lsst/dax/ppdb/sql/_ppdb_sql_base.py index fe1652bf..ed953310 100644 --- a/python/lsst/dax/ppdb/sql/_ppdb_sql_base.py +++ b/python/lsst/dax/ppdb/sql/_ppdb_sql_base.py @@ -154,6 +154,12 @@ def _build_connect_args(cls, config: PpdbSqlBaseConfig) -> MutableMapping[str, A conn_args.update(connect_timeout=config.connection_timeout) return {"connect_args": conn_args} + @classmethod + def _config_listeners(cls, engine: sqlalchemy.engine.Engine) -> sqlalchemy.engine.Engine: + if engine.dialect.name == "sqlite": + # Need to enable foreign keys on every new connection. + sqlalchemy.event.listen(engine, "connect", _onSqlite3Connect) + @classmethod def make_engine(cls, config: PpdbSqlBaseConfig) -> sqlalchemy.engine.Engine: """Make SQLALchemy engine based on configured parameters. @@ -165,10 +171,7 @@ def make_engine(cls, config: PpdbSqlBaseConfig) -> sqlalchemy.engine.Engine: """ kw = cls._build_connect_args(config) engine = sqlalchemy.create_engine(config.db_url, **kw) - - if engine.dialect.name == "sqlite": - # Need to enable foreign keys on every new connection. 
- sqlalchemy.event.listen(engine, "connect", _onSqlite3Connect) + cls._config_listeners(engine) return engine From 41442f82edec5d7bf15469288da69759cdc3ca58 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Tue, 24 Feb 2026 17:14:43 -0600 Subject: [PATCH 27/49] Add support for getting db password from Google Secret Manager This adds an option for getting the PPDB Postgres password from the Google Secret Manager if the `PPDB_USE_SECRET_MANAGER` environment variable is set to `true`. --- .../lsst/dax/ppdb/bigquery/ppdb_bigquery.py | 39 +++++++++++++++++++ 1 file changed, 39 insertions(+) diff --git a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py index 2b6beab2..02d33b81 100644 --- a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py +++ b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py @@ -21,6 +21,7 @@ import datetime import logging +import os import shutil from collections.abc import Collection, Iterable, Sequence from pathlib import Path @@ -137,6 +138,7 @@ def __init__(self, config: PpdbBigQueryConfig): self.parq_batch_size = config.parq_batch_size self.parq_compression = config.parq_compression self.delete_existing_dirs = config.delete_existing_dirs + self.project_id = config.project_id @property def metadata(self) -> ApdbMetadata: @@ -409,6 +411,43 @@ def filter_table_names(cls, original_table_names: Iterable[str]) -> Iterable[str # Only the metadata table is needed for the BigQuery-based PPDB.
return ["metadata"] + @classmethod + def _get_secretmanager_password(cls, project_id: str, password_name: str = "ppdb-db-password") -> str: + from google.cloud import secretmanager + + client = secretmanager.SecretManagerServiceClient() + name = f"projects/{project_id}/secrets/{password_name}/versions/latest" + response = client.access_secret_version(request={"name": name}) + return response.payload.data.decode("UTF-8") + + @classmethod + def _use_secret_manager(cls) -> bool: + return os.getenv("PPDB_USE_SECRET_MANAGER", "false").lower() == "true" + + @classmethod + def make_engine(cls, config: PpdbSqlBaseConfig) -> sqlalchemy.engine.Engine: + """Make SQLALchemy engine based on configured parameters. + + Parameters + ---------- + config : `PpdbSqlBaseConfig` + Configuration object with SQL parameters. + """ + kw = cls._build_connect_args(config) + db_url = sqlalchemy.make_url(config.db_url) + + if cls._use_secret_manager(): + _LOG.info("Using Secret Manager to retrieve database password") + if db_url.password is not None: + raise ValueError("Database URL should not include a password when using Secret Manager") + password = cls._get_secretmanager_password(config.project_id) + db_url = db_url.set(password=password) + + engine = sqlalchemy.create_engine(db_url, **kw) + cls._config_listeners(engine) + + return engine + @classmethod def init_bigquery( cls, From dfaa6560be818eb67c7907bc704d3b747ad55a8b Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 25 Feb 2026 15:11:26 -0600 Subject: [PATCH 28/49] Rearrange SQL init code --- .../lsst/dax/ppdb/bigquery/ppdb_bigquery.py | 40 ++++++++++++++----- 1 file changed, 31 insertions(+), 9 deletions(-) diff --git a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py index 02d33b81..a79c79de 100644 --- a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py +++ b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py @@ -39,6 +39,7 @@ monitor, schema_model, ) +from lsst.dax.apdb.sql import 
ApdbMetadataSql from lsst.dax.apdb.timer import Timer from .._arrow import write_parquet @@ -128,10 +129,10 @@ class PpdbBigQuery(Ppdb, PpdbSqlBase): """ def __init__(self, config: PpdbBigQueryConfig): - # Initialize the SQL interface for the PPDB. - PpdbSqlBase.__init__(self, config.sql) + # Initialize the SQL interface for the PPDB + self._init_sql(config) - # Read parameters from config. + # Read parameters from config if config.replication_dir is None: raise ValueError("Directory for chunk export is not set in configuration.") self.replication_path = config.replication_path @@ -151,6 +152,22 @@ def metadata(self) -> ApdbMetadata: """ return self._metadata + def _init_sql(self, config: PpdbBigQueryConfig) -> None: + sql_config = config.sql + self._sa_metadata, self._schema_version = self.read_schema( + sql_config.felis_path, sql_config.schema_name, sql_config.felis_schema, sql_config.db_url + ) + + self._engine = self._make_engine(config) # Includes Secret Manager support + sa_metadata = sqlalchemy.MetaData(schema=sql_config.schema_name) + + meta_table = sqlalchemy.schema.Table("metadata", sa_metadata, autoload_with=self._engine) + self._metadata = ApdbMetadataSql(self._engine, meta_table) + + # Check schema and code version compatibility. + self._check_schema_version(self._schema_version) + self._check_code_version() + def _generate_manifest( self, replica_chunk: ReplicaChunk, @@ -425,17 +442,19 @@ def _use_secret_manager(cls) -> bool: return os.getenv("PPDB_USE_SECRET_MANAGER", "false").lower() == "true" @classmethod - def make_engine(cls, config: PpdbSqlBaseConfig) -> sqlalchemy.engine.Engine: + def _make_engine(cls, config: PpdbBigQueryConfig) -> sqlalchemy.engine.Engine: """Make SQLALchemy engine based on configured parameters. Parameters ---------- - config : `PpdbSqlBaseConfig` + config : `PpdbBigQueryConfig` Configuration object with SQL parameters.
""" - kw = cls._build_connect_args(config) - db_url = sqlalchemy.make_url(config.db_url) + sql_config = config.sql + db_url = sqlalchemy.make_url(sql_config.db_url) + # If using Secret Manager, retrieve the password and update the + # database URL. if cls._use_secret_manager(): _LOG.info("Using Secret Manager to retrieve database password") if db_url.password is not None: @@ -443,7 +462,9 @@ def make_engine(cls, config: PpdbSqlBaseConfig) -> sqlalchemy.engine.Engine: password = cls._get_secretmanager_password(config.project_id) db_url = db_url.set(password=password) + kw = cls._build_connect_args(sql_config) engine = sqlalchemy.create_engine(db_url, **kw) + cls._config_listeners(engine) return engine @@ -515,8 +536,6 @@ def init_bigquery( sql_config = PpdbSqlBaseConfig( db_url=db_url, schema_name=db_schema, felis_path=felis_path, felis_schema=felis_schema ) - engine = cls.make_engine(sql_config) - cls.make_database(engine, sql_config, sa_metadata, schema_version, db_drop) # Build config parameters. bq_config = PpdbBigQueryConfig( @@ -535,6 +554,9 @@ def init_bigquery( if stage_chunk_topic is not None: bq_config.stage_chunk_topic = stage_chunk_topic + engine = cls._make_engine(bq_config) + cls.make_database(engine, bq_config.sql, sa_metadata, schema_version, db_drop) + # Validate the config if requested. if validate_config: _LOG.info("validating BigQuery configuration") From bc9aeeb5da6868b3aaded81d45420f1fb549f2b9 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 25 Feb 2026 16:26:44 -0600 Subject: [PATCH 29/49] Rename the `config` module to `ppdb_config` This follows DM naming conventions, since the module defines the class `PpdbConfig`. 
--- python/lsst/dax/ppdb/__init__.py | 2 +- python/lsst/dax/ppdb/_factory.py | 2 +- python/lsst/dax/ppdb/bigquery/chunk_uploader.py | 2 +- python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py | 2 +- python/lsst/dax/ppdb/ppdb.py | 2 +- python/lsst/dax/ppdb/{config.py => ppdb_config.py} | 0 python/lsst/dax/ppdb/tests/_ppdb.py | 2 +- 7 files changed, 6 insertions(+), 6 deletions(-) rename python/lsst/dax/ppdb/{config.py => ppdb_config.py} (100%) diff --git a/python/lsst/dax/ppdb/__init__.py b/python/lsst/dax/ppdb/__init__.py index d8aeb139..2f4dab94 100644 --- a/python/lsst/dax/ppdb/__init__.py +++ b/python/lsst/dax/ppdb/__init__.py @@ -19,7 +19,7 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . -from .config import * +from .ppdb_config import * from .ppdb import * from .replicator import * from .version import * # Generated by sconsUtils diff --git a/python/lsst/dax/ppdb/_factory.py b/python/lsst/dax/ppdb/_factory.py index aee2ee52..c3774778 100644 --- a/python/lsst/dax/ppdb/_factory.py +++ b/python/lsst/dax/ppdb/_factory.py @@ -26,8 +26,8 @@ from typing import TYPE_CHECKING if TYPE_CHECKING: - from .config import PpdbConfig from .ppdb import Ppdb + from .ppdb_config import PpdbConfig def config_type_for_name(type_name: str) -> type[PpdbConfig]: diff --git a/python/lsst/dax/ppdb/bigquery/chunk_uploader.py b/python/lsst/dax/ppdb/bigquery/chunk_uploader.py index d23a123c..6bef4383 100644 --- a/python/lsst/dax/ppdb/bigquery/chunk_uploader.py +++ b/python/lsst/dax/ppdb/bigquery/chunk_uploader.py @@ -42,7 +42,7 @@ ) from e -from ..config import PpdbConfig +from ..ppdb_config import PpdbConfig from .manifest import Manifest from .ppdb_bigquery import PpdbBigQuery, PpdbBigQueryConfig from .ppdb_replica_chunk_extended import ChunkStatus, PpdbReplicaChunkExtended diff --git a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py index a79c79de..f1cfc8fd 100644 --- 
a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py +++ b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py @@ -43,8 +43,8 @@ from lsst.dax.apdb.timer import Timer from .._arrow import write_parquet -from ..config import PpdbConfig from ..ppdb import Ppdb, PpdbReplicaChunk +from ..ppdb_config import PpdbConfig from ..sql import PpdbSqlBase, PpdbSqlBaseConfig from .manifest import Manifest, TableStats from .ppdb_replica_chunk_extended import ChunkStatus, PpdbReplicaChunkExtended diff --git a/python/lsst/dax/ppdb/ppdb.py b/python/lsst/dax/ppdb/ppdb.py index 31b6a315..e175bb7f 100644 --- a/python/lsst/dax/ppdb/ppdb.py +++ b/python/lsst/dax/ppdb/ppdb.py @@ -33,7 +33,7 @@ from lsst.resources import ResourcePathExpression from ._factory import ppdb_from_config -from .config import PpdbConfig +from .ppdb_config import PpdbConfig @dataclass(frozen=True) diff --git a/python/lsst/dax/ppdb/config.py b/python/lsst/dax/ppdb/ppdb_config.py similarity index 100% rename from python/lsst/dax/ppdb/config.py rename to python/lsst/dax/ppdb/ppdb_config.py diff --git a/python/lsst/dax/ppdb/tests/_ppdb.py b/python/lsst/dax/ppdb/tests/_ppdb.py index 6a2e38d8..ffb50e84 100644 --- a/python/lsst/dax/ppdb/tests/_ppdb.py +++ b/python/lsst/dax/ppdb/tests/_ppdb.py @@ -44,8 +44,8 @@ from lsst.dax.apdb.tests.data_factory import makeForcedSourceCatalog, makeObjectCatalog, makeSourceCatalog from lsst.sphgeom import Angle, Circle, Region, UnitVector3d -from ..config import PpdbConfig from ..ppdb import Ppdb, PpdbReplicaChunk +from ..ppdb_config import PpdbConfig from ..replicator import Replicator if TYPE_CHECKING: From 331cf4a16fc2b2f5d9784862ce33fb8c66460242 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 25 Feb 2026 16:24:24 -0600 Subject: [PATCH 30/49] Move the method for getting promotable chunks to PpdbBigQuery This functionality is moved into this repository, so that the cloud functions may access it. 
--- pyproject.toml | 2 +- .../lsst/dax/ppdb/bigquery/ppdb_bigquery.py | 35 +++++++++++++++++++ python/lsst/dax/ppdb/config/__init__.py | 0 python/lsst/dax/ppdb/config/sql/__init__.py | 0 .../config/sql/select_promotable_chunks.sql | 24 +++++++++++++ 5 files changed, 60 insertions(+), 1 deletion(-) create mode 100644 python/lsst/dax/ppdb/config/__init__.py create mode 100644 python/lsst/dax/ppdb/config/sql/__init__.py create mode 100644 python/lsst/dax/ppdb/config/sql/select_promotable_chunks.sql diff --git a/pyproject.toml b/pyproject.toml index 930ac8f0..ac527e91 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,7 @@ where = ["python"] zip-safe = true [tool.setuptools.package-data] -"lsst.dax.ppdb" = ["py.typed", "bigquery/updates/sql/*.sql", "tests/config/*.yaml"] +"lsst.dax.ppdb" = ["py.typed", "bigquery/updates/sql/*.sql", "tests/config/*.yaml", "config/sql/*.sql"] [tool.setuptools.dynamic] version = { attr = "lsst_versions.get_lsst_version" } diff --git a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py index f1cfc8fd..1fed0f8c 100644 --- a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py +++ b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py @@ -41,6 +41,7 @@ ) from lsst.dax.apdb.sql import ApdbMetadataSql from lsst.dax.apdb.timer import Timer +from lsst.resources import ResourcePath from .._arrow import write_parquet from ..ppdb import Ppdb, PpdbReplicaChunk @@ -674,3 +675,37 @@ replica_chunk.id, chunk_dir / "update_records.json", ) + + def get_promotable_chunks(self) -> list[int]: + """ + Return the first uninterrupted sequence of staged chunks such that all + prior chunks are promoted. + + Returns + ------- + chunk_ids : `list`[`int`] + A list containing the `apdb_replica_chunk` values of the + promotable chunks.
+ + Notes + ----- + This query finds the contiguous sequence of ``staged`` chunks beginning + with the earliest chunk that is not yet ``promoted``, and ending just + before the first chunk that is not ``staged``. If no such ending + exists, all `staged` chunks from that point onward are returned. If no + chunks are `staged` after the first non-`promoted` chunk, an empty list + is returned. + """ + table = self.get_table("PpdbReplicaChunk") + quoted_table_name = ( + self._engine.dialect.identifier_preparer.quote(table.schema) + + "." + + self._engine.dialect.identifier_preparer.quote(table.name) + ) + sql_resource_path = "resource://lsst.dax.ppdb.config.sql/select_promotable_chunks.sql" + sql_text = ResourcePath(sql_resource_path).read().decode("utf-8") + sql_text = sql_text.format(table_name=quoted_table_name) + with self._engine.connect() as conn: + result = conn.execute(sqlalchemy.text(sql_text)) + chunk_ids = [row[0] for row in result] + return chunk_ids diff --git a/python/lsst/dax/ppdb/config/__init__.py b/python/lsst/dax/ppdb/config/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/lsst/dax/ppdb/config/sql/__init__.py b/python/lsst/dax/ppdb/config/sql/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/python/lsst/dax/ppdb/config/sql/select_promotable_chunks.sql b/python/lsst/dax/ppdb/config/sql/select_promotable_chunks.sql new file mode 100644 index 00000000..e776a1f0 --- /dev/null +++ b/python/lsst/dax/ppdb/config/sql/select_promotable_chunks.sql @@ -0,0 +1,24 @@ +WITH start AS ( +SELECT MIN(apdb_replica_chunk) AS s +FROM {table_name} +WHERE status <> 'promoted' + AND status <> 'skipped' +), +stop AS ( +SELECT MIN(p.apdb_replica_chunk) AS e +FROM {table_name} p +JOIN start ON TRUE +WHERE start.s IS NOT NULL + AND p.apdb_replica_chunk >= start.s + AND p.status <> 'staged' + AND status <> 'skipped' +) +SELECT p.apdb_replica_chunk +FROM {table_name} p +JOIN start ON TRUE +LEFT JOIN stop ON TRUE +WHERE start.s IS 
NOT NULL +AND p.status = 'staged' +AND p.apdb_replica_chunk >= start.s +AND (stop.e IS NULL OR p.apdb_replica_chunk < stop.e) +ORDER BY p.apdb_replica_chunk; From 3df612ea6c2a1ec90139f70fc9a80d3bd6166091 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 25 Feb 2026 16:34:02 -0600 Subject: [PATCH 31/49] Move SQL files into `config/sql` dir --- pyproject.toml | 2 +- python/lsst/dax/ppdb/bigquery/updates/sql/__init__.py | 0 python/lsst/dax/ppdb/bigquery/updates/updates_merger.py | 4 ++-- .../updates => config}/sql/merge_diaforcedsource_updates.sql | 0 .../updates => config}/sql/merge_diaobject_updates.sql | 0 .../updates => config}/sql/merge_diasource_updates.sql | 0 6 files changed, 3 insertions(+), 3 deletions(-) delete mode 100644 python/lsst/dax/ppdb/bigquery/updates/sql/__init__.py rename python/lsst/dax/ppdb/{bigquery/updates => config}/sql/merge_diaforcedsource_updates.sql (100%) rename python/lsst/dax/ppdb/{bigquery/updates => config}/sql/merge_diaobject_updates.sql (100%) rename python/lsst/dax/ppdb/{bigquery/updates => config}/sql/merge_diasource_updates.sql (100%) diff --git a/pyproject.toml b/pyproject.toml index ac527e91..937bb005 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,7 @@ where = ["python"] zip-safe = true [tool.setuptools.package-data] -"lsst.dax.ppdb" = ["py.typed", "bigquery/updates/sql/*.sql", "tests/config/*.yaml", "config/sql/*.sql"] +"lsst.dax.ppdb" = ["py.typed", "tests/config/*.yaml", "config/sql/*.sql"] [tool.setuptools.dynamic] version = { attr = "lsst_versions.get_lsst_version" } diff --git a/python/lsst/dax/ppdb/bigquery/updates/sql/__init__.py b/python/lsst/dax/ppdb/bigquery/updates/sql/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py b/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py index d1ea3f5c..d1c6fdfb 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py +++ 
b/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py @@ -27,7 +27,7 @@ from lsst.resources import ResourcePath -_SQL_RESOURCE_PACKAGE = "lsst.dax.ppdb.bigquery.updates.sql" +_SQL_RESOURCE_PACKAGE = "lsst.dax.ppdb.config.sql" class UpdatesMerger(ABC): @@ -42,7 +42,7 @@ class UpdatesMerger(ABC): SQL_RESOURCE_NAME: str """Base name of the SQL file (without .sql extension) containing the MERGE statement for this merger. The SQL file must be located in the - `lsst.dax.ppdb.bigquery.updates.sql` package.""" + `lsst.dax.ppdb.config.sql` package.""" def __init__(self, client: bigquery.Client) -> None: """ diff --git a/python/lsst/dax/ppdb/bigquery/updates/sql/merge_diaforcedsource_updates.sql b/python/lsst/dax/ppdb/config/sql/merge_diaforcedsource_updates.sql similarity index 100% rename from python/lsst/dax/ppdb/bigquery/updates/sql/merge_diaforcedsource_updates.sql rename to python/lsst/dax/ppdb/config/sql/merge_diaforcedsource_updates.sql diff --git a/python/lsst/dax/ppdb/bigquery/updates/sql/merge_diaobject_updates.sql b/python/lsst/dax/ppdb/config/sql/merge_diaobject_updates.sql similarity index 100% rename from python/lsst/dax/ppdb/bigquery/updates/sql/merge_diaobject_updates.sql rename to python/lsst/dax/ppdb/config/sql/merge_diaobject_updates.sql diff --git a/python/lsst/dax/ppdb/bigquery/updates/sql/merge_diasource_updates.sql b/python/lsst/dax/ppdb/config/sql/merge_diasource_updates.sql similarity index 100% rename from python/lsst/dax/ppdb/bigquery/updates/sql/merge_diasource_updates.sql rename to python/lsst/dax/ppdb/config/sql/merge_diasource_updates.sql From 295e1dca0dc9c6a2a92446d5c777118b83156e68 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 25 Feb 2026 17:27:16 -0600 Subject: [PATCH 32/49] Add `sql_resource` module for accessing SQL files as resources --- .../lsst/dax/ppdb/bigquery/ppdb_bigquery.py | 10 ++-- python/lsst/dax/ppdb/bigquery/sql_resource.py | 57 +++++++++++++++++++ .../ppdb/bigquery/updates/updates_merger.py | 19 ++----- 3 
files changed, 68 insertions(+), 18 deletions(-) create mode 100644 python/lsst/dax/ppdb/bigquery/sql_resource.py diff --git a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py index 1fed0f8c..cf4032c3 100644 --- a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py +++ b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py @@ -41,7 +41,6 @@ ) from lsst.dax.apdb.sql import ApdbMetadataSql from lsst.dax.apdb.timer import Timer -from lsst.resources import ResourcePath from .._arrow import write_parquet from ..ppdb import Ppdb, PpdbReplicaChunk @@ -49,6 +48,7 @@ from ..sql import PpdbSqlBase, PpdbSqlBaseConfig from .manifest import Manifest, TableStats from .ppdb_replica_chunk_extended import ChunkStatus, PpdbReplicaChunkExtended +from .sql_resource import SqlResource __all__ = ["ConfigValidationError", "PpdbBigQuery", "PpdbBigQueryConfig"] @@ -702,10 +702,10 @@ def get_promotable_chunks(self) -> list[int]: + "." + self._engine.dialect.identifier_preparer.quote(table.name) ) - sql_resource_path = "resource://lsst.dax.ppdb.config.sql/select_promotable_chunks.sql" - sql_text = ResourcePath(sql_resource_path).read().decode("utf-8") - sql_text = sql_text.format(table_name=quoted_table_name) + + sql = SqlResource("select_promotable_chunks", {"table_name": quoted_table_name}).sql + with self._engine.connect() as conn: - result = conn.execute(sqlalchemy.text(sql_text)) + result = conn.execute(sqlalchemy.text(sql)) chunk_ids = [row[0] for row in result] return chunk_ids diff --git a/python/lsst/dax/ppdb/bigquery/sql_resource.py b/python/lsst/dax/ppdb/bigquery/sql_resource.py new file mode 100644 index 00000000..2548bd3c --- /dev/null +++ b/python/lsst/dax/ppdb/bigquery/sql_resource.py @@ -0,0 +1,57 @@ +# This file is part of dax_ppdb +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). 
+# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + + +from lsst.resources import ResourcePath + + +class SqlResource: + """Class for loading SQL query text from a resource file and optionally + formatting it with provided arguments. + + Parameters + ---------- + sql_resource_name : `str` + Base name of the SQL file (without .sql extension) containing the + query. + The SQL file must be located in the `lsst.dax.ppdb.config.sql` package. + format_args : `dict` [ `str`, `str` ], optional + Optional dictionary of arguments for formatting the SQL text. 
+ """ + + SQL_RESOURCE_PACKAGE = "lsst.dax.ppdb.config.sql" + + def __init__(self, sql_resource_name: str, format_args: dict[str, str] | None = None) -> None: + sql_resource_path = f"resource://{self.SQL_RESOURCE_PACKAGE}/{sql_resource_name}.sql" + sql = ResourcePath(sql_resource_path).read().decode("utf-8") + if format_args is not None: + try: + sql = sql.format(**format_args) + except Exception as e: + raise RuntimeError( + f"Failed to format SQL resource at {sql_resource_path} with arguments {format_args}" + ) from e + self._sql = sql + + @property + def sql(self) -> str: + """SQL query string (`str`).""" + return self._sql diff --git a/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py b/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py index d1c6fdfb..dfa85e02 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py +++ b/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py @@ -25,9 +25,7 @@ from google.cloud import bigquery -from lsst.resources import ResourcePath - -_SQL_RESOURCE_PACKAGE = "lsst.dax.ppdb.config.sql" +from ..sql_resource import SqlResource class UpdatesMerger(ABC): @@ -70,16 +68,11 @@ def merge(self, *, updates_table_fqn: str, target_dataset_fqn: str) -> bigquery. google.cloud.bigquery.job.QueryJob The completed BigQuery job. 
""" - try: - sql_resource_path = f"resource://{_SQL_RESOURCE_PACKAGE}/{self.SQL_RESOURCE_NAME}.sql" - print(f"Reading SQL from resource: {sql_resource_path}") - sql_text = ResourcePath(sql_resource_path).read().decode("utf-8") - except Exception as e: - raise RuntimeError(f"Failed to read SQL resource at {sql_resource_path}") from e - - sql_text = sql_text.format(updates_table=updates_table_fqn, target_dataset=target_dataset_fqn) - - job = self._client.query(sql_text) + sql = SqlResource( + self.SQL_RESOURCE_NAME, + format_args={"updates_table": updates_table_fqn, "target_dataset": target_dataset_fqn}, + ).sql + job = self._client.query(sql) job.result() return job From c9d375d801c84b4f8d78d72f0b83a52420d42423 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Thu, 26 Feb 2026 15:10:35 -0600 Subject: [PATCH 33/49] Add GCS URI to PpdbReplicaChunkExtended model and database --- .../lsst/dax/ppdb/bigquery/chunk_uploader.py | 30 +++++++++++-------- .../lsst/dax/ppdb/bigquery/ppdb_bigquery.py | 19 ++++++++---- .../bigquery/ppdb_replica_chunk_extended.py | 20 +++++++++++++ 3 files changed, 52 insertions(+), 17 deletions(-) diff --git a/python/lsst/dax/ppdb/bigquery/chunk_uploader.py b/python/lsst/dax/ppdb/bigquery/chunk_uploader.py index 6bef4383..8228ad34 100644 --- a/python/lsst/dax/ppdb/bigquery/chunk_uploader.py +++ b/python/lsst/dax/ppdb/bigquery/chunk_uploader.py @@ -270,7 +270,7 @@ def _process_chunk(self, replica_chunk: PpdbReplicaChunkExtended) -> None: ) try: - # 1) Upload the files to GCS for non-empty chunks. + # 1) Upload the files to GCS for non-empty chunks if upload_file_list: gcs_names = {path: posixpath.join(gcs_prefix, path.name) for path in upload_file_list} try: @@ -284,7 +284,7 @@ def _process_chunk(self, replica_chunk: PpdbReplicaChunkExtended) -> None: except* UploadError as eg: raise ChunkUploadError(chunk_id, f"{len(eg.exceptions)} upload(s) failed") from eg - # 2) Upload manifest, even for empty chunks. 
+ # 2) Upload manifest, even for empty chunks try: self.storage.upload_from_string( posixpath.join(gcs_prefix, replica_chunk.manifest_name), @@ -293,23 +293,29 @@ def _process_chunk(self, replica_chunk: PpdbReplicaChunkExtended) -> None: except UploadError as e: raise ChunkUploadError(chunk_id, "Manifest upload failed") from e - # 3) Update status in the database, but not for empty chunks. - # They have already been marked as skipped during export. + # Next two steps are inapplicable to empty chunks. if not is_empty: + # 3) Update status and GCS URI in the database + gcs_prefix = posixpath.join(self.bucket_name, gcs_prefix) + updated_replica_chunk = replica_chunk.with_new_status(ChunkStatus.UPLOADED).with_new_gcs_uri( + f"gs://{gcs_prefix}" + ) try: - self._bq.store_chunk(replica_chunk.with_new_status(ChunkStatus.UPLOADED), True) + self._bq.store_chunk(updated_replica_chunk, True) + _LOG.info( + "Updated replica chunk %d in database with status 'uploaded' and GCS URI: %s", + chunk_id, + gcs_prefix, + ) except Exception as e: - raise ChunkUploadError( - chunk_id, "failed to update replica chunk status in database" - ) from e + raise ChunkUploadError(chunk_id, "Failed to update replica chunk in database") from e - # 4) Publish Pub/Sub staging message to trigger BigQuery load, but - # not for empty chunks. (Empty chunks do not need to be staged.) 
- if not is_empty: + # 4) Publish Pub/Sub event to trigger staging of the chunk in + # BigQuery try: self._post_to_stage_chunk_topic(self.bucket_name, gcs_prefix, chunk_id) except Exception as e: - raise ChunkUploadError(chunk_id, "failed to publish staging message") from e + raise ChunkUploadError(chunk_id, "Failed to publish staging message") from e except ChunkUploadError as err: try: diff --git a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py index cf4032c3..6ddc9066 100644 --- a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py +++ b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py @@ -336,6 +336,7 @@ def get_replica_chunks_ext( table.columns["replica_time"], table.columns["status"], # Extended column table.columns["directory"], # Extended column + table.columns["gcs_uri"], # Extended column ).order_by(table.columns["last_update_time"]) if start_chunk_id is not None: query = query.where(table.columns["apdb_replica_chunk"] >= start_chunk_id) @@ -355,6 +356,7 @@ def get_replica_chunks_ext( replica_time=replica_time, status=row[4], directory=Path(row[5]), + gcs_uri=row[6], ) ) return ids @@ -382,6 +384,7 @@ def store_chunk(self, replica_chunk: PpdbReplicaChunkExtended, update: bool) -> "replica_time": replica_chunk.replica_time_dt_utc, "status": replica_chunk.status, "directory": str(replica_chunk.directory), + "gcs_uri": replica_chunk.gcs_uri, } if update: self.upsert(connection, table, row, "apdb_replica_chunk") @@ -419,6 +422,12 @@ def create_replica_chunk_table(cls, table_name: str | None = None) -> schema_mod datatype=felis.datamodel.DataType.string, nullable=True, # We might want to allow NULL if an error occurs when exporting. 
), + schema_model.Column( + name="gcs_uri", + id=f"#{table_name}.gcs_uri", + datatype=felis.datamodel.DataType.string, + nullable=True, + ), ] ) return replica_chunk_table @@ -684,17 +693,17 @@ def get_promotable_chunks(self) -> list[int]: Returns ------- chunk_ids : `list`[`int`] - A list of tuples containing the `apdb_replica_chunk` values of the - promotable chunks. + A list of tuples containing the ``apdb_replica_chunk`` values of + the promotable chunks. Notes ----- This query finds the contiguous sequence of ``staged`` chunks beginning with the earliest chunk that is not yet ``promoted``, and ending just before the first chunk that is not ``staged``. If no such ending - exists, all `staged` chunks from that point onward are returned. If no - chunks are `staged` after the first non-`promoted` chunk, an empty list - is returned. + exists, all ``staged`` chunks from that point onward are returned. If + no chunks are ``staged`` after the first non-``promoted`` chunk, an + empty list is returned. """ table = self.get_table("PpdbReplicaChunk") quoted_table_name = ( diff --git a/python/lsst/dax/ppdb/bigquery/ppdb_replica_chunk_extended.py b/python/lsst/dax/ppdb/bigquery/ppdb_replica_chunk_extended.py index bd8d6422..2ad8f0f7 100644 --- a/python/lsst/dax/ppdb/bigquery/ppdb_replica_chunk_extended.py +++ b/python/lsst/dax/ppdb/bigquery/ppdb_replica_chunk_extended.py @@ -59,6 +59,10 @@ class PpdbReplicaChunkExtended(PpdbReplicaChunk): directory: Path """Directory where the exported replica chunk data is stored locally.""" + gcs_uri: str | None = None + """GCS URI where the replica chunk data is stored, or `None` if not + uploaded yet.""" + @property def manifest_name(self) -> str: """Filename of the manifest file for this chunk.""" @@ -127,3 +131,19 @@ def with_new_status(self, new_status: ChunkStatus) -> PpdbReplicaChunkExtended: The new chunk with the updated status. 
""" return dataclasses.replace(self, status=new_status) + + def with_new_gcs_uri(self, new_gcs_uri: str) -> PpdbReplicaChunkExtended: + """Create a new `PpdbReplicaChunkExtended` with the same properties as + this one, but with a different GCS URI. + + Parameters + ---------- + new_gcs_uri : `str` + The new GCS URI to set. + + Returns + ------- + new_chunk : `PpdbReplicaChunkExtended` + The new chunk with the updated GCS URI. + """ + return dataclasses.replace(self, gcs_uri=new_gcs_uri) From da7d8239ce1c22fdec6adffcc7301f1c5a467f05 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Thu, 26 Feb 2026 15:51:13 -0600 Subject: [PATCH 34/49] Add `UpdatesManager` for applying updates from JSON files in GCS --- .../lsst/dax/ppdb/bigquery/ppdb_bigquery.py | 52 ++++ .../dax/ppdb/bigquery/updates/__init__.py | 7 +- .../ppdb/bigquery/updates/updates_manager.py | 107 +++++++ .../ppdb/bigquery/updates/updates_merger.py | 16 +- .../ppdb/bigquery/updates/updates_table.py | 8 +- tests/test_updates_manager.py | 260 ++++++++++++++++++ 6 files changed, 442 insertions(+), 8 deletions(-) create mode 100644 python/lsst/dax/ppdb/bigquery/updates/updates_manager.py create mode 100644 tests/test_updates_manager.py diff --git a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py index 6ddc9066..fa15c20a 100644 --- a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py +++ b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py @@ -142,6 +142,8 @@ def __init__(self, config: PpdbBigQueryConfig): self.delete_existing_dirs = config.delete_existing_dirs self.project_id = config.project_id + self._config = config + @property def metadata(self) -> ApdbMetadata: """Implement `Ppdb` interface to return APDB metadata object. @@ -361,6 +363,56 @@ def get_replica_chunks_ext( ) return ids + def get_replica_chunks_ext_by_ids(self, chunk_ids: Sequence[int]) -> Sequence[PpdbReplicaChunkExtended]: + """Find replica chunks for a list of chunk IDs. 
+ + Parameters + ---------- + chunk_ids : `~collections.abc.Sequence` [ `int` ] + Replica chunk IDs to retrieve. + + Returns + ------- + chunks : `~collections.abc.Sequence` [ `PpdbReplicaChunkExtended` ] + List of matching chunks ordered by ``last_update_time``. + """ + if not chunk_ids: + return [] + + table = self.get_table("PpdbReplicaChunk") + query = ( + sqlalchemy.sql.select( + table.columns["apdb_replica_chunk"], + table.columns["last_update_time"], + table.columns["unique_id"], + table.columns["replica_time"], + table.columns["status"], + table.columns["directory"], + table.columns["gcs_uri"], + ) + .where(table.columns["apdb_replica_chunk"].in_(chunk_ids)) + .order_by(table.columns["apdb_replica_chunk"]) + ) + + chunks: list[PpdbReplicaChunkExtended] = [] + with self._engine.connect() as conn: + result = conn.execution_options(stream_results=True, max_row_buffer=10000).execute(query) + for row in result: + last_update_time = self.to_astropy_tai(row[1]) + replica_time = self.to_astropy_tai(row[3]) + chunks.append( + PpdbReplicaChunkExtended( + id=row[0], + last_update_time=last_update_time, + unique_id=row[2], + replica_time=replica_time, + status=row[4], + directory=Path(row[5]), + gcs_uri=row[6], + ) + ) + return chunks + def store_chunk(self, replica_chunk: PpdbReplicaChunkExtended, update: bool) -> None: """Insert or replace single record in PpdbReplicaChunk table, including the status and directory of the replica chunk. diff --git a/python/lsst/dax/ppdb/bigquery/updates/__init__.py b/python/lsst/dax/ppdb/bigquery/updates/__init__.py index 7673a1ca..ef9abea9 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/__init__.py +++ b/python/lsst/dax/ppdb/bigquery/updates/__init__.py @@ -19,13 +19,14 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
-from .update_records import UpdateRecords from .expanded_update_record import ExpandedUpdateRecord -from .update_record_expander import UpdateRecordExpander -from .updates_table import UpdatesTable from .updates_merger import ( UpdatesMerger, DiaObjectUpdatesMerger, DiaSourceUpdatesMerger, DiaForcedSourceUpdatesMerger, ) +from .update_records import UpdateRecords +from .update_record_expander import UpdateRecordExpander +from .updates_table import UpdatesTable +from .updates_manager import UpdatesManager diff --git a/python/lsst/dax/ppdb/bigquery/updates/updates_manager.py b/python/lsst/dax/ppdb/bigquery/updates/updates_manager.py new file mode 100644 index 00000000..dc605f10 --- /dev/null +++ b/python/lsst/dax/ppdb/bigquery/updates/updates_manager.py @@ -0,0 +1,107 @@ +# This file is part of dax_ppdb. +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +import posixpath +import urllib +from collections.abc import Sequence + +from google.cloud import bigquery, storage + +from ..ppdb_bigquery import PpdbBigQuery +from .update_record_expander import UpdateRecordExpander +from .update_records import DEFAULT_FILENAME, UpdateRecords +from .updates_merger import ( + DiaForcedSourceUpdatesMerger, + DiaObjectUpdatesMerger, + DiaSourceUpdatesMerger, + UpdatesMerger, +) +from .updates_table import UpdatesTable + +DEFAULT_MERGERS = (DiaObjectUpdatesMerger, DiaSourceUpdatesMerger, DiaForcedSourceUpdatesMerger) + + +class UpdatesManager: + """Class responsible for managing the process of applying updates to the + PPDB database, including merging updates and inserting them into the + database. + """ + + def __init__( + self, + ppdb: PpdbBigQuery, + mergers: Sequence[type[UpdatesMerger]] = DEFAULT_MERGERS, + updates_table_name="updates", + deduplicated_updates_table_name="updates_deduplicated", + ) -> None: + self._ppdb = ppdb + self._mergers = mergers + self._deduplicated_updates_table_name = deduplicated_updates_table_name + + self._bq_client = bigquery.Client() + + self._updates_table = UpdatesTable( + self._bq_client, + f"{self._ppdb._config.project_id}.{self._ppdb._config.dataset_id}.{updates_table_name}", + ) + self._updates_table.create() + + self._gcs_client = storage.Client() + self._bucket = self._gcs_client.bucket(self._ppdb._config.bucket_name) + + def apply_updates(self, replica_chunk_ids: Sequence[int], table_name_postfix: str | None = None) -> None: + replica_chunks = self._ppdb.get_replica_chunks_ext_by_ids(replica_chunk_ids) + for replica_chunk in replica_chunks: + if replica_chunk.gcs_uri is None: + raise ValueError(f"Replica chunk {replica_chunk.id} does not have a GCS URI") + + # Parse the GCS URI to get the bucket name and object name + parsed_uri = urllib.parse.urlparse(replica_chunk.gcs_uri) + bucket_name = parsed_uri.netloc + object_name = posixpath.join(parsed_uri.path.lstrip("/"), DEFAULT_FILENAME) + 
+ # Get the blob from the bucket + bucket = self._gcs_client.bucket(bucket_name) + blob = bucket.blob(object_name) + content = blob.download_as_text() + + # Expand the update records into the appropriate format for + # inserting into the updates table + update_records = UpdateRecords.from_json_string(content) + expanded_update_records = UpdateRecordExpander.expand_updates(update_records) + self._updates_table.insert(expanded_update_records) + + # Deduplicate the update records to a new table + deduplicated_updates_table_fqn = ( + f"{self._ppdb.project_id}.{self._ppdb._config.dataset_id}.{self._deduplicated_updates_table_name}" + ) + self._updates_table.deduplicate_to(deduplicated_updates_table_fqn) + + # Merge the deduplicated updates into the target tables + for merger in self._mergers: + merger_instance = merger(self._bq_client) + if table_name_postfix: + # Apply a postfix like "_next" to the target table + merger_instance.target_table_name += f"_{table_name_postfix}" + target_dataset_fqn = f"{self._ppdb._config.project_id}.{self._ppdb._config.dataset_id}" + merger_instance.merge( + updates_table_fqn=deduplicated_updates_table_fqn, target_dataset_fqn=target_dataset_fqn + ) diff --git a/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py b/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py index dfa85e02..d03ab69a 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py +++ b/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py @@ -42,14 +42,28 @@ class UpdatesMerger(ABC): statement for this merger. The SQL file must be located in the `lsst.dax.ppdb.config.sql` package.""" - def __init__(self, client: bigquery.Client) -> None: + def __init__(self, client: bigquery.Client, target_table_name: str = None) -> None: """ Parameters ---------- client BigQuery client. + target_table_name + Optional name of the target table. If not provided, the class-level + TABLE_NAME will be used. 
""" self._client: bigquery.Client = client + self._target_table_name = target_table_name or self.TABLE_NAME + + @property + def target_table_name(self) -> str: + """Get the name of the target table this merger applies to.""" + return self._target_table_name + + @target_table_name.setter + def target_table_name(self, value: str) -> None: + """Set the name of the target table this merger applies to.""" + self._target_table_name = value def merge(self, *, updates_table_fqn: str, target_dataset_fqn: str) -> bigquery.QueryJob: """ diff --git a/python/lsst/dax/ppdb/bigquery/updates/updates_table.py b/python/lsst/dax/ppdb/bigquery/updates/updates_table.py index 5553d342..b8054bbf 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/updates_table.py +++ b/python/lsst/dax/ppdb/bigquery/updates/updates_table.py @@ -117,14 +117,14 @@ def create(self) -> bigquery.Table: table = bigquery.Table(self._table_fqn, schema=schema) return self._client.create_table(table) - def append(self, records: Iterable[ExpandedUpdateRecord]) -> bigquery.LoadJob: + def insert(self, records: Iterable[ExpandedUpdateRecord]) -> bigquery.LoadJob: """ - Append `ExpandedUpdateRecord` rows into the updates table. + Insert `ExpandedUpdateRecord` rows into the updates table. Parameters ---------- records - Iterable of update records to append. + Iterable of update records to insert. Returns ------- @@ -155,7 +155,7 @@ def append(self, records: Iterable[ExpandedUpdateRecord]) -> bigquery.LoadJob: for r in records ] - print("Appending rows to BigQuery:", rows) # Debug print to verify the data being loaded + print("Inserting rows into BigQuery:", rows) # Debug print to verify the data being loaded job = self._client.load_table_from_json( rows, diff --git a/tests/test_updates_manager.py b/tests/test_updates_manager.py new file mode 100644 index 00000000..b536612b --- /dev/null +++ b/tests/test_updates_manager.py @@ -0,0 +1,260 @@ +# This file is part of dax_ppdb. 
+# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (http://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +import io +import json +import posixpath +import unittest +import uuid +from collections.abc import Collection, Sequence + +import astropy +import felis +from google.cloud import bigquery, storage + +from lsst.dax.apdb import ( + ApdbTableData, + ReplicaChunk, +) +from lsst.dax.ppdb import Ppdb +from lsst.dax.ppdb.bigquery import PpdbBigQuery, updates +from lsst.dax.ppdb.bigquery.chunk_uploader import ChunkUploader +from lsst.dax.ppdb.tests._bigquery import _PostgresMixin +from lsst.dax.ppdb.tests._updates import _create_test_update_records + + +def _generate_test_bucket_name(test_prefix: str = "ppdb-test") -> str: + """Generate a unique bucket name for testing.""" + test_id = uuid.uuid4().hex[:16] + return f"{test_prefix}-{test_id}" + + +class UpdatesManagerTestCase(_PostgresMixin, unittest.TestCase): + """A test case for the handling of APDB record updates by PpdbBigQuery and + related classes including the ChunkUploader. 
+ """ + + def setUp(self): + super().setUp() + + # Create the PPDB config + self.ppdb_config = self.make_instance() + + # Set up BigQuery client and test dataset + self.bq_client = bigquery.Client() + self.ppdb_config.project_id = self.bq_client.project + self.ppdb_config.dataset_id = f"test_updates_manager_{uuid.uuid4().hex[:8]}" + self.target_dataset_fqn = f"{self.ppdb_config.project_id}.{self.ppdb_config.dataset_id}" + self._create_test_dataset(self.bq_client, self.ppdb_config.dataset_id) + + # Generate a unique bucket name for the test and create it + self.ppdb_config.bucket_name = _generate_test_bucket_name("ppdb-updates-manager-test") + storage_client = storage.Client() + try: + bucket = storage_client.bucket(self.ppdb_config.bucket_name) + bucket.create(location="US") + except Exception as e: + self.fail(f"Failed to create test GCS bucket: {e}") + + # Create the PPDB instance + self.ppdb = Ppdb.from_config(self.ppdb_config) + assert isinstance(self.ppdb, PpdbBigQuery) + + def tearDown(self): + # Cleanup the test dataset + try: + self.bq_client.delete_dataset( + self.ppdb_config.dataset_id, delete_contents=True, not_found_ok=True + ) + except Exception as e: + print(f"Failed to delete test dataset: {e}") + + # Cleanup the test GCS bucket + storage_client = storage.Client() + try: + bucket = storage_client.bucket(self.ppdb_config.bucket_name) + blobs = list(bucket.list_blobs()) + for blob in blobs: + blob.delete() + bucket.delete() + except Exception as e: + print(f"Failed to delete test GCS bucket: {e}") + + super().tearDown() + + def _json_rows_to_buf(self, rows): + buf = io.StringIO() + for row in rows: + buf.write(json.dumps(row) + "\n") + buf.seek(0) + return buf + + def _create_test_dataset(self, client: bigquery.Client, dataset_id: str) -> None: + dataset = bigquery.Dataset(f"{client.project}.{dataset_id}") + client.create_dataset(dataset, exists_ok=False) + + # Create DiaObject table + schema = [ + bigquery.SchemaField("diaObjectId", "INTEGER", 
mode="REQUIRED"), + bigquery.SchemaField("validityEndMjdTai", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("nDiaSources", "INTEGER", mode="NULLABLE"), + ] + table_fqn = f"{self.target_dataset_fqn}.DiaObject" + table = bigquery.Table(table_fqn, schema=schema) + client.create_table(table) + rows = [ + {"diaObjectId": 200001, "validityEndMjdTai": None, "nDiaSources": 3}, + {"diaObjectId": 200002, "validityEndMjdTai": None, "nDiaSources": 7}, + {"diaObjectId": 200003, "validityEndMjdTai": 59000.0, "nDiaSources": 2}, + ] + buf = self._json_rows_to_buf(rows) + job = client.load_table_from_file( + buf, + table_fqn, + job_config=bigquery.LoadJobConfig(source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON), + ) + job.result() + + # Create test DiaSource table + schema = [ + bigquery.SchemaField("diaSourceId", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("diaObjectId", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ssObjectId", "INTEGER", mode="NULLABLE"), + bigquery.SchemaField("ssObjectReassocTimeMjdTai", "FLOAT", mode="NULLABLE"), + bigquery.SchemaField("timeWithdrawnMjdTai", "FLOAT", mode="NULLABLE"), + ] + table_fqn = f"{self.target_dataset_fqn}.DiaSource" + table = bigquery.Table(table_fqn, schema=schema) + self.bq_client.create_table(table) + rows = [ + { + "diaSourceId": 100001, + "diaObjectId": 200001, + "ssObjectId": None, + "ssObjectReassocTimeMjdTai": None, + "timeWithdrawnMjdTai": None, + }, + { + "diaSourceId": 100002, + "diaObjectId": 200002, + "ssObjectId": None, + "ssObjectReassocTimeMjdTai": None, + "timeWithdrawnMjdTai": None, + }, + { + "diaSourceId": 100003, + "diaObjectId": 200003, + "ssObjectId": None, + "ssObjectReassocTimeMjdTai": None, + "timeWithdrawnMjdTai": None, + }, + { + "diaSourceId": 100004, + "diaObjectId": 200004, + "ssObjectId": None, + "ssObjectReassocTimeMjdTai": None, + "timeWithdrawnMjdTai": None, + }, + ] + job = client.load_table_from_file( + self._json_rows_to_buf(rows), + table_fqn, + 
job_config=bigquery.LoadJobConfig(source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON), + ) + job.result() + + # Create test DiaForcedSource table + schema = [ + bigquery.SchemaField("diaObjectId", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("visit", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("detector", "INTEGER", mode="REQUIRED"), + bigquery.SchemaField("timeWithdrawnMjdTai", "FLOAT", mode="NULLABLE"), + ] + table_fqn = f"{self.target_dataset_fqn}.DiaForcedSource" + table = bigquery.Table(table_fqn, schema=schema) + self.bq_client.create_table(table) + rows = [ + {"diaObjectId": 200001, "visit": 12345, "detector": 42, "timeWithdrawnMjdTai": None}, + {"diaObjectId": 200001, "visit": 12346, "detector": 42, "timeWithdrawnMjdTai": None}, + ] + job = self.bq_client.load_table_from_file( + self._json_rows_to_buf(rows), + table_fqn, + job_config=bigquery.LoadJobConfig(source_format=bigquery.SourceFormat.NEWLINE_DELIMITED_JSON), + ) + job.result() + + def test_apply_updates(self): + """Test that the update records are correctly uploaded to Google Cloud + Storage after replication. + """ + # Patch the ChunkUploader to print the message that would be published + # to the Pub/Sub topic instead of publishing, because there is no + # support for that service in the test environment. 
+ dataset_id = self.ppdb_config.dataset_id + + class DummyChunkUploader(ChunkUploader): + def _post_to_stage_chunk_topic(self, bucket_name: str, chunk_prefix: str, chunk_id: int) -> None: + message = { + "dataset": dataset_id, + "chunk_id": str(chunk_id), + "folder": f"gs://{posixpath.join(bucket_name, chunk_prefix)}", + } + print(f"Dummy publish to Pub/Sub topic: {message}") + + class DummyApdbTableData(ApdbTableData): + def column_names(self) -> Sequence[str]: + return [] + + def column_defs(self) -> Sequence[tuple[str, felis.datamodel.DataType]]: + return [] + + def rows(self) -> Collection[tuple]: + return [] + + # Create and store the test update records + update_records = _create_test_update_records() + self.ppdb.store( + ReplicaChunk( + id=update_records.replica_chunk_id, + last_update_time=astropy.time.Time("2021-01-01T00:01:00", format="isot", scale="tai"), + unique_id=uuid.uuid4(), + ), + objects=DummyApdbTableData(), + sources=DummyApdbTableData(), + forced_sources=DummyApdbTableData(), + update_records=update_records.records, + update=True, + ) + + # Configure and run the uploader + uploader = DummyChunkUploader( + self.ppdb_config, + wait_interval=0, + exit_on_empty=True, + exit_on_error=True, + ) + print(f"Uploader will copy files to {uploader.bucket_name}/{uploader.prefix}") + uploader.run() + + # Apply the updates to the target tables + updates_manager = updates.UpdatesManager(self.ppdb) + updates_manager.apply_updates([update_records.replica_chunk_id]) From 3264f36f9c8ce6e148ed7b57eb90255149b48c10 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Fri, 27 Feb 2026 17:36:26 -0600 Subject: [PATCH 35/49] Add .scratch to .gitignore --- .gitignore | 3 +++ 1 file changed, 3 insertions(+) diff --git a/.gitignore b/.gitignore index a83e3ca4..1487de3e 100644 --- a/.gitignore +++ b/.gitignore @@ -27,3 +27,6 @@ pytest_session.txt # VS Code .vscode + +# Scratch directory +.scratch From 73869d53ee2fa92c21599cc0016dd49912716076 Mon Sep 17 00:00:00 2001 From: 
Jeremy McCormick Date: Fri, 27 Feb 2026 17:36:54 -0600 Subject: [PATCH 36/49] Cleanup some test classes (WIP) and other minor changes --- pyproject.toml | 2 +- python/lsst/dax/ppdb/bigquery/__init__.py | 1 + python/lsst/dax/ppdb/bigquery/sql_resource.py | 5 +- python/lsst/dax/ppdb/config/__init__.py | 0 .../schemas/test_apdb_schema.yaml} | 0 python/lsst/dax/ppdb/config/sql/__init__.py | 0 python/lsst/dax/ppdb/tests/_bigquery.py | 51 +++++++++++++-- python/lsst/dax/ppdb/tests/_ppdb.py | 2 +- tests/test_ppdb_bigquery.py | 6 +- tests/test_update_records.py | 61 +++++++---------- tests/test_updates_manager.py | 65 +++++++++---------- tests/test_updates_merger.py | 6 +- tests/test_updates_table.py | 20 +++--- 13 files changed, 121 insertions(+), 98 deletions(-) delete mode 100644 python/lsst/dax/ppdb/config/__init__.py rename python/lsst/dax/ppdb/{tests/config/schema.yaml => config/schemas/test_apdb_schema.yaml} (100%) delete mode 100644 python/lsst/dax/ppdb/config/sql/__init__.py diff --git a/pyproject.toml b/pyproject.toml index 937bb005..24e1e41a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -55,7 +55,7 @@ where = ["python"] zip-safe = true [tool.setuptools.package-data] -"lsst.dax.ppdb" = ["py.typed", "tests/config/*.yaml", "config/sql/*.sql"] +"lsst.dax.ppdb" = ["py.typed", "config/schemas/*.yaml", "config/sql/*.sql"] [tool.setuptools.dynamic] version = { attr = "lsst_versions.get_lsst_version" } diff --git a/python/lsst/dax/ppdb/bigquery/__init__.py b/python/lsst/dax/ppdb/bigquery/__init__.py index 19d8c17d..e7b3071b 100644 --- a/python/lsst/dax/ppdb/bigquery/__init__.py +++ b/python/lsst/dax/ppdb/bigquery/__init__.py @@ -20,5 +20,6 @@ # along with this program. If not, see . 
from .manifest import Manifest +from .chunk_uploader import ChunkUploader from .ppdb_bigquery import PpdbBigQuery, PpdbBigQueryConfig from .ppdb_replica_chunk_extended import ChunkStatus, PpdbReplicaChunkExtended diff --git a/python/lsst/dax/ppdb/bigquery/sql_resource.py b/python/lsst/dax/ppdb/bigquery/sql_resource.py index 2548bd3c..99d4f89d 100644 --- a/python/lsst/dax/ppdb/bigquery/sql_resource.py +++ b/python/lsst/dax/ppdb/bigquery/sql_resource.py @@ -37,10 +37,9 @@ class SqlResource: Optional dictionary of arguments for formatting the SQL text. """ - SQL_RESOURCE_PACKAGE = "lsst.dax.ppdb.config.sql" - def __init__(self, sql_resource_name: str, format_args: dict[str, str] | None = None) -> None: - sql_resource_path = f"resource://{self.SQL_RESOURCE_PACKAGE}/{sql_resource_name}.sql" + # FIXME: Move the config dir into a resources dir (similar to obs_lsst) + sql_resource_path = f"resource://lsst.dax.ppdb/config/sql/{sql_resource_name}.sql" sql = ResourcePath(sql_resource_path).read().decode("utf-8") if format_args is not None: try: diff --git a/python/lsst/dax/ppdb/config/__init__.py b/python/lsst/dax/ppdb/config/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/python/lsst/dax/ppdb/tests/config/schema.yaml b/python/lsst/dax/ppdb/config/schemas/test_apdb_schema.yaml similarity index 100% rename from python/lsst/dax/ppdb/tests/config/schema.yaml rename to python/lsst/dax/ppdb/config/schemas/test_apdb_schema.yaml diff --git a/python/lsst/dax/ppdb/config/sql/__init__.py b/python/lsst/dax/ppdb/config/sql/__init__.py deleted file mode 100644 index e69de29b..00000000 diff --git a/python/lsst/dax/ppdb/tests/_bigquery.py b/python/lsst/dax/ppdb/tests/_bigquery.py index 1861396e..551c1e2e 100644 --- a/python/lsst/dax/ppdb/tests/_bigquery.py +++ b/python/lsst/dax/ppdb/tests/_bigquery.py @@ -20,23 +20,27 @@ # along with this program. If not, see . 
import gc +import posixpath import shutil import tempfile +import uuid from typing import Any +from google.cloud import storage + from lsst.dax.apdb import ( ApdbConfig, ) from lsst.dax.apdb.sql import ApdbSql from lsst.dax.ppdb import PpdbConfig -from lsst.dax.ppdb.bigquery import PpdbBigQuery +from lsst.dax.ppdb.bigquery import ChunkUploader, PpdbBigQuery +from lsst.dax.ppdb.tests._ppdb import TEST_SCHEMA_RESOURCE_PATH try: import testing.postgresql except ImportError: testing = None -from lsst.dax.ppdb.tests import TEST_SCHEMA_RESOURCE_PATH TEST_CONFIG = { "db_drop": True, @@ -49,7 +53,42 @@ } -class _SqliteMixin: +def generate_test_bucket_name(test_prefix: str = "ppdb-test") -> str: + """Generate a unique bucket name for testing.""" + test_id = uuid.uuid4().hex[:16] + return f"{test_prefix}-{test_id}" + + +def delete_test_bucket(bucket_or_bucket_name: str | storage.Bucket) -> None: + storage_client = storage.Client() + try: + if isinstance(bucket_or_bucket_name, str): + bucket = storage_client.bucket(bucket_or_bucket_name) + else: + bucket = bucket_or_bucket_name + blobs = list(bucket.list_blobs()) + for blob in blobs: + blob.delete() + bucket.delete() + except Exception as e: + print(f"Failed to delete test GCS bucket: {e}") + + +class ChunkUploaderWithoutPubSub(ChunkUploader): + """A dummy implementation of the ChunkUploader that does not actually + post messages to Pub/Sub. + """ + + def _post_to_stage_chunk_topic(self, bucket_name: str, chunk_prefix: str, chunk_id: int) -> None: + message = { + "dataset": None, + "chunk_id": str(chunk_id), + "folder": f"gs://{posixpath.join(bucket_name, chunk_prefix)}", + } + print(f"Dummy publish to Pub/Sub topic: {message}") + + +class SqliteMixin: """Mixin class to provide Sqlite-specific setup/teardown and instance creation. 
""" @@ -87,7 +126,7 @@ def make_apdb_instance(self, **kwargs: Any) -> ApdbConfig: return ApdbSql.init_database(**kw) # type: ignore[arg-type] -class _PostgresMixin: +class PostgresMixin: """Mixin class to provide Postgres-specific setup/teardown and instance creation. """ @@ -116,10 +155,10 @@ def tearDown(self) -> None: self.server = self.postgresql() shutil.rmtree(self.tempdir, ignore_errors=True) - def make_instance(self, **kwargs: Any) -> PpdbConfig: + def make_instance(self, config_dict: dict[str, Any] = TEST_CONFIG, **kwargs: Any) -> PpdbConfig: """Make config class instance used in all tests.""" kw = { - **TEST_CONFIG, + **config_dict, "db_url": self.server.url(), "db_schema": "ppdb_test", "felis_path": TEST_SCHEMA_RESOURCE_PATH, diff --git a/python/lsst/dax/ppdb/tests/_ppdb.py b/python/lsst/dax/ppdb/tests/_ppdb.py index ffb50e84..245639ea 100644 --- a/python/lsst/dax/ppdb/tests/_ppdb.py +++ b/python/lsst/dax/ppdb/tests/_ppdb.py @@ -60,7 +60,7 @@ class TestCaseMixin: """Do-nothing definition of mixin base class for regular execution.""" -TEST_SCHEMA_RESOURCE_PATH = "resource://lsst.dax.ppdb.tests.config/schema.yaml" +TEST_SCHEMA_RESOURCE_PATH = "resource://lsst.dax.ppdb/config/schemas/test_apdb_schema.yaml" def _make_region(xyz: tuple[float, float, float] = (1.0, 1.0, -1.0)) -> Region: diff --git a/tests/test_ppdb_bigquery.py b/tests/test_ppdb_bigquery.py index b23381e5..43a870d9 100644 --- a/tests/test_ppdb_bigquery.py +++ b/tests/test_ppdb_bigquery.py @@ -22,7 +22,7 @@ import unittest from lsst.dax.ppdb.tests import PpdbTest -from lsst.dax.ppdb.tests._bigquery import _PostgresMixin, _SqliteMixin +from lsst.dax.ppdb.tests._bigquery import PostgresMixin, SqliteMixin try: import testing.postgresql @@ -30,10 +30,10 @@ testing = None -class SqliteTestCase(_SqliteMixin, PpdbTest, unittest.TestCase): +class SqliteTestCase(SqliteMixin, PpdbTest, unittest.TestCase): """A test case for the PpdbBigQuery class using a SQLite backend.""" @unittest.skipUnless(testing is 
not None, "testing.postgresql module not found") -class PostgresTestCase(_PostgresMixin, PpdbTest, unittest.TestCase): +class PostgresTestCase(PostgresMixin, PpdbTest, unittest.TestCase): """A test case for the PpdbBigQuery class using a Postgres backend.""" diff --git a/tests/test_update_records.py b/tests/test_update_records.py index e523544a..c6a1dd8e 100644 --- a/tests/test_update_records.py +++ b/tests/test_update_records.py @@ -20,11 +20,10 @@ # along with this program. If not, see . import json -import posixpath import unittest -import uuid import pytest +from google.cloud import storage try: from lsst.dax.ppdb.bigquery import updates @@ -41,17 +40,16 @@ from lsst.dax.ppdb.bigquery import PpdbBigQuery from lsst.dax.ppdb.replicator import Replicator from lsst.dax.ppdb.tests import ApdbMixin -from lsst.dax.ppdb.tests._bigquery import _PostgresMixin - - -def _generate_test_bucket_name(test_prefix: str = "ppdb-test") -> str: - """Generate a unique bucket name for testing.""" - test_id = uuid.uuid4().hex[:16] - return f"{test_prefix}-{test_id}" +from lsst.dax.ppdb.tests._bigquery import ( + ChunkUploaderWithoutPubSub, + PostgresMixin, + delete_test_bucket, + generate_test_bucket_name, +) @unittest.skipIf(updates is None, "Google Cloud dependencies not available") -class UpdateRecordsTestCase(_PostgresMixin, ApdbMixin, unittest.TestCase): +class UpdateRecordsTestCase(PostgresMixin, ApdbMixin, unittest.TestCase): """A test case for the handling of APDB record updates by PpdbBigQuery and related classes including the ChunkUploader. """ @@ -259,6 +257,8 @@ def test_json_serialization(self) -> None: "Unexpected dec in deserialized ApdbWithdrawDiaForcedSourceRecord, should not be 0.0", ) + # FIXME: This should be in a separate test case and probably a separate + # module as well. 
@pytest.mark.skipif( pytest.importorskip("lsst.dax.ppdbx.gcp", reason="dax_ppdbx_gcp is not installed") is None, reason="", @@ -267,36 +267,21 @@ def test_chunk_uploader(self) -> None: """Test that the update records are correctly uploaded to Google Cloud Storage after replication. """ - from lsst.dax.ppdb.bigquery.chunk_uploader import ChunkUploader - from lsst.dax.ppdbx.gcp.gcs import StorageClient - # Change the configuration to use a unique test bucket name to avoid # conflicts - ppdb_config_copy = self.ppdb_config.model_copy() - ppdb_config_copy.bucket_name = _generate_test_bucket_name("ppdb-test-gcs-upload") - - # Patch the ChunkUploader to print the message that would be published - # to the Pub/Sub topic instead of publishing, because there is no - # support for that service in the test environment. - class DummyChunkUploader(ChunkUploader): - def _post_to_stage_chunk_topic(self, bucket_name: str, chunk_prefix: str, chunk_id: int) -> None: - message = { - "dataset": self.dataset_id, - "chunk_id": str(chunk_id), - "folder": f"gs://{posixpath.join(bucket_name, chunk_prefix)}", - } - print(f"Dummy publish to Pub/Sub topic: {message}") + self.ppdb_config.bucket_name = generate_test_bucket_name("ppdb-test-gcs-upload") # Create the test GCS bucket - storage_client = StorageClient(ppdb_config_copy.bucket_name) + storage_client = storage.Client() try: - storage_client.create_bucket() + bucket = storage_client.bucket(self.ppdb_config.bucket_name) + bucket.create(location="US") except Exception as e: self.fail(f"Failed to create test GCS bucket: {e}") # Configure and run the uploader - uploader = DummyChunkUploader( - ppdb_config_copy, + uploader = ChunkUploaderWithoutPubSub( + self.ppdb_config, wait_interval=0, exit_on_empty=True, exit_on_error=True, @@ -304,15 +289,18 @@ def _post_to_stage_chunk_topic(self, bucket_name: str, chunk_prefix: str, chunk_ print(f"Uploader will copy files to {uploader.bucket_name}/{uploader.prefix}/") uploader.run() - # Retrieve the 
update records file - update_records_files = storage_client.list_files("**/update_records.json") + # Retrieve the update records file[] + blobs = list(bucket.list_blobs(match_glob="**/update_records.json")) + update_records_files = [b.name for b in blobs] self.assertEqual( len(update_records_files), 1, f"Expected exactly one update_records.json file in GCS, found " f"{len(update_records_files)}: {update_records_files}", ) - update_records_str = storage_client.read_as_string(update_records_files[0]) + + # Download the contents of the update records file as a string + update_records_str = blobs[0].download_as_text() # Print the contents of the update records file for debugging update_records_json = json.loads(update_records_str) @@ -339,8 +327,9 @@ def _post_to_stage_chunk_topic(self, bucket_name: str, chunk_prefix: str, chunk_ f"Expected 3 update records in the file from GCS, found {len(update_records.records)}", ) + # FIXME: This should be in a tearDown() method. # Delete the test GCS bucket try: - storage_client.delete_bucket(force=True) + delete_test_bucket(bucket) except Exception as e: - self.fail(f"Failed to delete test GCS bucket: {e}") + raise RuntimeError(f"Failed to delete test GCS bucket: {e}") diff --git a/tests/test_updates_manager.py b/tests/test_updates_manager.py index b536612b..74d36d8b 100644 --- a/tests/test_updates_manager.py +++ b/tests/test_updates_manager.py @@ -21,7 +21,6 @@ import io import json -import posixpath import unittest import uuid from collections.abc import Collection, Sequence @@ -36,18 +35,15 @@ ) from lsst.dax.ppdb import Ppdb from lsst.dax.ppdb.bigquery import PpdbBigQuery, updates -from lsst.dax.ppdb.bigquery.chunk_uploader import ChunkUploader -from lsst.dax.ppdb.tests._bigquery import _PostgresMixin +from lsst.dax.ppdb.tests._bigquery import ( + ChunkUploaderWithoutPubSub, + PostgresMixin, + generate_test_bucket_name, +) from lsst.dax.ppdb.tests._updates import _create_test_update_records -def 
_generate_test_bucket_name(test_prefix: str = "ppdb-test") -> str: - """Generate a unique bucket name for testing.""" - test_id = uuid.uuid4().hex[:16] - return f"{test_prefix}-{test_id}" - - -class UpdatesManagerTestCase(_PostgresMixin, unittest.TestCase): +class UpdatesManagerTestCase(PostgresMixin, unittest.TestCase): """A test case for the handling of APDB record updates by PpdbBigQuery and related classes including the ChunkUploader. """ @@ -55,18 +51,30 @@ class UpdatesManagerTestCase(_PostgresMixin, unittest.TestCase): def setUp(self): super().setUp() - # Create the PPDB config - self.ppdb_config = self.make_instance() - # Set up BigQuery client and test dataset self.bq_client = bigquery.Client() - self.ppdb_config.project_id = self.bq_client.project - self.ppdb_config.dataset_id = f"test_updates_manager_{uuid.uuid4().hex[:8]}" - self.target_dataset_fqn = f"{self.ppdb_config.project_id}.{self.ppdb_config.dataset_id}" - self._create_test_dataset(self.bq_client, self.ppdb_config.dataset_id) - # Generate a unique bucket name for the test and create it - self.ppdb_config.bucket_name = _generate_test_bucket_name("ppdb-updates-manager-test") + bucket_name = generate_test_bucket_name("ppdb-updates-manager-test") + dataset_id = f"test_updates_manager_{uuid.uuid4().hex[:8]}" + project_id = self.bq_client.project + config = { + "db_drop": True, + "validate_config": False, + "delete_existing_dirs": True, + "bucket_name": bucket_name, + "object_prefix": "data/test", + "dataset_id": dataset_id, + "project_id": project_id, + } + + # Setup the Postgres database and create the config instance + self.ppdb_config = self.make_instance(config) + + # Create the test dataset and tables in BigQuery + self.target_dataset_fqn = f"{project_id}.{dataset_id}" + self._create_test_dataset(self.bq_client, dataset_id) + + # Create the test GCS bucket storage_client = storage.Client() try: bucket = storage_client.bucket(self.ppdb_config.bucket_name) @@ -79,7 +87,7 @@ def setUp(self): assert 
isinstance(self.ppdb, PpdbBigQuery) def tearDown(self): - # Cleanup the test dataset + # Delete the test dataset try: self.bq_client.delete_dataset( self.ppdb_config.dataset_id, delete_contents=True, not_found_ok=True @@ -87,7 +95,7 @@ def tearDown(self): except Exception as e: print(f"Failed to delete test dataset: {e}") - # Cleanup the test GCS bucket + # Delete the test GCS bucket storage_client = storage.Client() try: bucket = storage_client.bucket(self.ppdb_config.bucket_name) @@ -206,19 +214,6 @@ def test_apply_updates(self): """Test that the update records are correctly uploaded to Google Cloud Storage after replication. """ - # Patch the ChunkUploader to print the message that would be published - # to the Pub/Sub topic instead of publishing, because there is no - # support for that service in the test environment. - dataset_id = self.ppdb_config.dataset_id - - class DummyChunkUploader(ChunkUploader): - def _post_to_stage_chunk_topic(self, bucket_name: str, chunk_prefix: str, chunk_id: int) -> None: - message = { - "dataset": dataset_id, - "chunk_id": str(chunk_id), - "folder": f"gs://{posixpath.join(bucket_name, chunk_prefix)}", - } - print(f"Dummy publish to Pub/Sub topic: {message}") class DummyApdbTableData(ApdbTableData): def column_names(self) -> Sequence[str]: @@ -246,7 +241,7 @@ def rows(self) -> Collection[tuple]: ) # Configure and run the uploader - uploader = DummyChunkUploader( + uploader = ChunkUploaderWithoutPubSub( self.ppdb_config, wait_interval=0, exit_on_empty=True, diff --git a/tests/test_updates_merger.py b/tests/test_updates_merger.py index ed4dffa7..5fdca1bb 100644 --- a/tests/test_updates_merger.py +++ b/tests/test_updates_merger.py @@ -98,7 +98,7 @@ def test_merge_diaobject(self): updates_table.create() update_records = _create_test_update_records() expanded = UpdateRecordExpander.expand_updates(update_records) - updates_table.append(expanded) + updates_table.insert(expanded) dedup_fqn = f"{self.updates_table_fqn}_dedup" 
updates_table.deduplicate_to(dedup_fqn) table_fqn = f"{self.target_dataset_fqn}.DiaObject" @@ -168,7 +168,7 @@ def test_merge_diasource(self): updates_table.create() update_records = _create_test_update_records() expanded = UpdateRecordExpander.expand_updates(update_records) - updates_table.append(expanded) + updates_table.insert(expanded) dedup_fqn = f"{self.updates_table_fqn}_dedup" updates_table.deduplicate_to(dedup_fqn) @@ -211,7 +211,7 @@ def test_merge_diaforcedsource(self): updates_table.create() update_records = _create_test_update_records() expanded = UpdateRecordExpander.expand_updates(update_records) - updates_table.append(expanded) + updates_table.insert(expanded) dedup_fqn = f"{self.updates_table_fqn}_dedup" updates_table.deduplicate_to(dedup_fqn) diff --git a/tests/test_updates_table.py b/tests/test_updates_table.py index aeb196b4..bddbbd7b 100644 --- a/tests/test_updates_table.py +++ b/tests/test_updates_table.py @@ -107,8 +107,8 @@ def test_create_table_already_exists(self) -> None: # Check that it's a conflict-type error self.assertIn("already exists", str(cm.exception).lower()) - def test_append_records(self) -> None: - """Test appending ExpandedUpdateRecord objects to the table.""" + def test_insert_records(self) -> None: + """Test insertion of expanded records into the table.""" # Create the table first self.updates_table.create() @@ -116,8 +116,8 @@ def test_append_records(self) -> None: update_records = _create_test_update_records() expanded_records = UpdateRecordExpander.expand_updates(update_records) - # Append the records - job = self.updates_table.append(expanded_records) + # Insert the records + job = self.updates_table.insert(expanded_records) # Verify the job completed successfully self.assertIsNone(job.errors) @@ -148,13 +148,13 @@ def test_append_records(self) -> None: # self.assertEqual(row.field_name, "timeWithdrawnMjdTai") # self.assertEqual(row.replica_chunk_id, self.replica_chunk_id) - def test_append_empty_records(self) -> None: 
- """Test appending empty list of records.""" + def test_insert_empty_records(self) -> None: + """Test insertion of empty record list.""" # Create the table first self.updates_table.create() - # Append empty list - job = self.updates_table.append([]) + # Insert empty list + job = self.updates_table.insert([]) # Verify the job completed successfully self.assertIsNone(job.errors) @@ -174,8 +174,8 @@ def test_deduplicate_records(self) -> None: update_records = _create_test_update_records() expanded_records = UpdateRecordExpander.expand_updates(update_records) - # Append all records (including duplicates) - self.updates_table.append(expanded_records) + # Insert all records (including duplicates) + self.updates_table.insert(expanded_records) # Count original records query = f"SELECT COUNT(*) as count FROM `{self.table_fqn}`" From b34a1383e1bfa18de38a018a7c54c4bce8e8cc32 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Fri, 27 Feb 2026 17:37:18 -0600 Subject: [PATCH 37/49] Add BigQuery classes from dax_ppdbx_gcp --- python/lsst/dax/ppdb/bigquery/query_runner.py | 161 +++++++++++ .../ppdb/bigquery/replica_chunk_promoter.py | 258 ++++++++++++++++++ 2 files changed, 419 insertions(+) create mode 100644 python/lsst/dax/ppdb/bigquery/query_runner.py create mode 100644 python/lsst/dax/ppdb/bigquery/replica_chunk_promoter.py diff --git a/python/lsst/dax/ppdb/bigquery/query_runner.py b/python/lsst/dax/ppdb/bigquery/query_runner.py new file mode 100644 index 00000000..e4e57166 --- /dev/null +++ b/python/lsst/dax/ppdb/bigquery/query_runner.py @@ -0,0 +1,161 @@ +# This file is part of dax_ppdbx_gcp +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. 
+# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . + +from __future__ import annotations + +import os +from typing import TypeAlias + +__all__ = [ + "QueryRunner", +] + +import logging + +from google.cloud import bigquery + +AnyBigQueryJob: TypeAlias = ( + bigquery.job.QueryJob + | bigquery.job.LoadJob + | bigquery.job.CopyJob + | bigquery.job.ExtractJob + | bigquery.job.UnknownJob +) + + +class QueryRunner: + """Class to run BigQuery queries with logging. + + Parameters + ---------- + project_id : `str` + Google Cloud project ID. + dataset_id : `str` + BigQuery dataset ID. + """ + + def __init__(self, project_id: str, dataset_id: str): + self._project_id = project_id + self._dataset_id = dataset_id + self._bq_client = bigquery.Client(project=project_id) + self._dataset = self._bq_client.get_dataset(f"{project_id}.{dataset_id}") + self._location = self._dataset.location + + @classmethod + def from_env(cls) -> QueryRunner: + """Create a QueryRunner instance using environment variables. + + Returns + ------- + query_runner: `QueryRunner` + An instance of QueryRunner initialized with project and dataset IDs + from environment variables. 
+ """ + project_id = os.environ.get("PROJECT_ID") + if project_id is None: + raise OSError("Environment variable 'PROJECT_ID' is not set") + + dataset_id = os.environ.get("DATASET_ID") + if dataset_id is None: + raise OSError("Environment variable 'DATASET_ID' is not set") + + return cls(project_id, dataset_id) + + @property + def project_id(self) -> str: + """Google Cloud project ID (`str`, read-only).""" + return self._project_id + + @property + def dataset(self) -> bigquery.Dataset: + """Dataset reference (`bigquery.Dataset`, read-only).""" + return self._dataset + + @property + def dataset_id(self) -> str: + """Dataset ID (`str`, read-only).""" + return self._dataset_id + + @property + def location(self) -> str: + """Dataset location, typically the region where it is hosted (`str`, + read-only). + """ + return self._location + + @classmethod + def log_job(cls, job: AnyBigQueryJob, label: str, level: int = logging.DEBUG) -> None: + """Log details of a BigQuery job. + + Parameters + ---------- + job : `bigquery.job.QueryJob` + The BigQuery job to log. + label : `str` + A label for the job, typically indicating the type of operation + (e.g., "insert", "delete", "copy"). + level : `int`, optional + The logging level to use for the log message. Defaults to + `logging.DEBUG`. + """ + logging.log( + level, + "BQ %s: job_id=%s location=%s state=%s bytes_processed=%s bytes_billed=%s slot_millis=%s " + "dml_rows=%s reference_tables=%s", + label, + job.job_id, + job.location, + job.state, + getattr(job, "total_bytes_processed", None), + getattr(job, "total_bytes_billed", None), + getattr(job, "slot_millis", None), + getattr(job, "num_dml_affected_rows", None), + getattr(job, "referenced_tables", None), + ) + + def run_job( + self, label: str, sql: str, job_config: bigquery.QueryJobConfig | None = None + ) -> bigquery.job.QueryJob: + """Run a BigQuery job with the given SQL and configuration. 
+ + Parameters + ---------- + label : `str` + A label for the job, typically indicating the type of operation + (e.g., "insert", "delete", "copy"). + sql : `str` + The SQL query to execute. + job_config : `bigquery.QueryJobConfig`, optional + Configuration for the job, such as query parameters or write + dispositions. If not provided, a default configuration will be + used. + + Returns + ------- + job: `bigquery.job.QueryJob` + The BigQuery job object representing the executed query. This can + be used to check the status of the job, retrieve results, or log + additional details. + """ + job = self._bq_client.query(sql, job_config=job_config, location=self.dataset.location) + job.result() # Wait for the job to complete + self.log_job(job, label) + return job diff --git a/python/lsst/dax/ppdb/bigquery/replica_chunk_promoter.py b/python/lsst/dax/ppdb/bigquery/replica_chunk_promoter.py new file mode 100644 index 00000000..9a3fde92 --- /dev/null +++ b/python/lsst/dax/ppdb/bigquery/replica_chunk_promoter.py @@ -0,0 +1,258 @@ +# This file is part of dax_ppdbx_gcp +# +# Developed for the LSST Data Management System. +# This product includes software developed by the LSST Project +# (https://www.lsst.org). +# See the COPYRIGHT file at the top-level directory of this distribution +# for details of code ownership. +# +# This program is free software: you can redistribute it and/or modify +# it under the terms of the GNU General Public License as published by +# the Free Software Foundation, either version 3 of the License, or +# (at your option) any later version. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the +# GNU General Public License for more details. +# +# You should have received a copy of the GNU General Public License +# along with this program. If not, see . 
+ +from __future__ import annotations + +__all__ = [ + "NoPromotableChunksError", + "ReplicaChunkPromoter", +] + +import logging +from collections.abc import Callable + +from google.api_core.exceptions import NotFound +from google.cloud import bigquery + +from .query_runner import QueryRunner + + +class NoPromotableChunksError(Exception): + """Exception raised when there are no promotable chunks available.""" + + pass + + +class ReplicaChunkPromoter: + """Class to promote replica chunks in BigQuery. + + Parameters + ---------- + promotable_chunks: `list`[`int`] + Sequence of tuples containing the APDB replica chunk IDs to promote. + runner : `QueryRunner`, optional + An instance of `QueryRunner` to execute queries. If not provided, a new + instance will be created using environment variables. + table_names : `list`[`str`], optional + List of table names to promote with standard default. + """ + + def __init__( + self, + promotable_chunks: list[int], + runner: QueryRunner | None = None, + table_names: list[str] | None = None, + ): + self._promotable_chunks = promotable_chunks + self._runner = runner or QueryRunner.from_env() + # DM-52326: Hard-coded table names; these should be passed in from + # config. 
+ self._table_names = table_names or ["DiaObject", "DiaSource", "DiaForcedSource"] + self._bq_client = bigquery.Client(project=self._runner.project_id) + self._phases = { + "build_tmp": self._copy_to_promoted_tmp, + "promote_prod": self._promote_tmp_to_prod, + "delete_staged_chunks": self._delete_staged_chunks, + "cleanup": self._cleanup_promoted_tmp, + } + + @property + def project_id(self) -> str: + """Google Cloud project ID (`str`, read-only).""" + return self._runner.project_id + + @property + def dataset_id(self) -> str: + """Dataset ID (`str`, read-only).""" + return self._runner.dataset_id + + @property + def table_names(self) -> list[str]: + """List of table names to promote (`list`[`str`], read-only).""" + return self._table_names + + @property + def promotable_chunks(self) -> list[int]: + """List of promotable chunks (`list[`int`], + read-only). + """ + return self._promotable_chunks + + @promotable_chunks.setter + def promotable_chunks(self, chunks: list[int]) -> None: + if not chunks: + raise NoPromotableChunksError("No promotable chunks provided") + self._promotable_chunks = chunks + + @property + def runner(self) -> QueryRunner: + """Runner for executing BigQuery jobs (`QueryRunner`, read-only).""" + return self._runner + + @property + def bq_client(self) -> bigquery.Client: + """Client for interacting with BigQuery (`bigquery.Client`, + read-only). + """ + return self._bq_client + + @property + def phases(self) -> dict[str, Callable]: + """Phases of the promotion process as a dictionary mapping phase names + to their corresponding class methods (`dict`[`str`, `Callable`], + read-only). + """ + return self._phases + + @property + def table_prod_refs(self) -> list[str]: + """Fully-qualified production table references (`list`[`str`], + read-only). 
+ """ + return [f"{self.project_id}.{self.dataset_id}.{table_name}" for table_name in self.table_names] + + @property + def table_staging_refs(self) -> list[str]: + """Fully-qualified staging table references (`list`[`str`], + read-only). + """ + return [ + f"{self.project_id}.{self.dataset_id}._{table_name}_staging" for table_name in self.table_names + ] + + @property + def table_promoted_tmp_refs(self) -> list[str]: + """Fully-qualified promoted temporary table references (`list`[`str`], + read-only). + """ + return [ + f"{self.project_id}.{self.dataset_id}._{table_name}_promoted_tmp" + for table_name in self.table_names + ] + + def _execute_phase(self, phase: str) -> None: + """Execute a specific promotion phase. + + Parameters + ---------- + phase : `str` + The name of the promotion phase to execute. This should be one of + the keys in the `phases` property. + """ + if phase not in self.phases: + raise ValueError(f"Unknown promotion phase: {phase}") + logging.debug("Executing promotion phase: %s", phase) + self._phases[phase]() + + def _copy_to_promoted_tmp(self) -> None: + """ + Build ``_{table_name}_promoted_tmp`` efficiently by cloning prod and + inserting only staged rows for the given replica chunk IDs. 
+ """ + job_cfg = bigquery.QueryJobConfig( + query_parameters=[bigquery.ArrayQueryParameter("ids", "INT64", self.promotable_chunks)] + ) + + for prod_ref, tmp_ref, stage_ref in zip( + self.table_prod_refs, self.table_promoted_tmp_refs, self.table_staging_refs, strict=False + ): + # Drop any existing tmp table (should not exist but just to be + # safe) + self.runner.run_job("drop_tmp", f"DROP TABLE IF EXISTS `{tmp_ref}`") + + # Clone prod table structure and data (zero-copy) + self.runner.run_job("clone_prod", f"CREATE TABLE `{tmp_ref}` CLONE `{prod_ref}`") + + # Build ordered target list from the cloned tmp schema + tmp_schema = self.bq_client.get_table(tmp_ref).schema + target_names = [f.name for f in tmp_schema if f.name != "apdb_replica_chunk"] + target_list_sql = ", ".join(f"`{n}`" for n in target_names) + + # Build source list, handling geo_point conversion + source_list_sql = ", ".join( + "ST_GEOGPOINT(s.`ra`, s.`dec`)" if n == "geo_point" else f"s.`{n}`" for n in target_names + ) + + # Insert staged rows into tmp, excluding apdb_replica_chunk column + sql = f""" + INSERT INTO `{tmp_ref}` ({target_list_sql}) + SELECT {source_list_sql} + FROM `{stage_ref}` AS s + WHERE s.apdb_replica_chunk IN UNNEST(@ids) + """ + logging.debug("SQL for inserting staged rows into %s: %s", tmp_ref, sql) + self.runner.run_job("insert_staged_to_tmp", sql, job_config=job_cfg) + + def _promote_tmp_to_prod(self) -> None: + """ + Swap each prod table with its corresponding *_promoted_tmp by replacing + prod contents in a single atomic copy job. This preserves schema, + partitioning, and clustering with zero-copy when in the same dataset. 
+ """ + for prod_ref, tmp_ref in zip(self.table_prod_refs, self.table_promoted_tmp_refs, strict=False): + # Ensure tmp exists + try: + self.bq_client.get_table(tmp_ref) + except NotFound as e: + raise RuntimeError(f"Missing tmp table for promotion: {tmp_ref}") from e + + # Atomic zero-copy replacement of prod with tmp + copy_cfg = bigquery.CopyJobConfig(write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE) + job = self.bq_client.copy_table( + tmp_ref, prod_ref, job_config=copy_cfg, location=self._runner.location + ) + job.result() + QueryRunner.log_job(job, "promote_tmp_to_prod") + + def _cleanup_promoted_tmp(self) -> None: + """Drop the promotion temporary tables.""" + for tmp_ref in self.table_promoted_tmp_refs: + self.bq_client.delete_table(tmp_ref, not_found_ok=True) + logging.debug("Dropped %s (if it existed)", tmp_ref) + + def _delete_staged_chunks(self) -> None: + """Delete only rows for the promoted replica chunk IDs from each + staging table. + """ + job_config = bigquery.QueryJobConfig( + query_parameters=[bigquery.ArrayQueryParameter("ids", "INT64", self.promotable_chunks)] + ) + + for staging_ref in self.table_staging_refs: + try: + sql = f"DELETE FROM `{staging_ref}` WHERE apdb_replica_chunk IN UNNEST(@ids)" + self.runner.run_job("delete_staged_chunks", sql, job_config=job_config) + logging.debug( + "Deleted %d chunk(s) from staging table %s", len(self.promotable_chunks), staging_ref + ) + except NotFound: + logging.warning("Staging table %s does not exist, skipping delete", staging_ref) + + def promote_chunks(self) -> None: + """Promote APDB replica chunks into production.""" + try: + for phase in ("build_tmp", "promote_prod", "delete_staged_chunks"): + self._execute_phase(phase) + finally: + try: + self._execute_phase("cleanup") + except Exception: + logging.exception("Cleanup of temporary tables failed") From 380a0a61e9909c678306119685c5ddea64920b7f Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Wed, 18 Mar 2026 17:50:09 -0500 Subject: 
[PATCH 38/49] WIP: Integrate application of updates into promotion process --- pyproject.toml | 6 +- .../lsst/dax/ppdb/bigquery/ppdb_bigquery.py | 23 +++++-- python/lsst/dax/ppdb/bigquery/query_runner.py | 21 ------ .../ppdb/bigquery/replica_chunk_promoter.py | 64 ++++++++++++++----- .../ppdb/bigquery/updates/updates_manager.py | 19 ++++-- .../ppdb/bigquery/updates/updates_merger.py | 6 +- .../sql/merge_diaforcedsource_updates.sql | 18 +----- .../config/sql/merge_diaobject_updates.sql | 18 +----- .../config/sql/merge_diasource_updates.sql | 16 +---- requirements.txt | 7 +- 10 files changed, 95 insertions(+), 103 deletions(-) diff --git a/pyproject.toml b/pyproject.toml index 24e1e41a..de2ff161 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -23,10 +23,12 @@ classifiers = [ keywords = ["lsst"] dependencies = [ "astropy", + "google-cloud-bigquery", "pyarrow", "pydantic >=2,<3", "pyyaml >= 5.1", "sqlalchemy", + "lsst-dax-ppdbx-gcp", "lsst-felis", "lsst-sdm-schemas", "lsst-utils", @@ -43,10 +45,6 @@ test = [ "pytest >= 3.2", "pytest-openfiles >= 0.5.0" ] -gcp = [ - "google-cloud-bigquery", - "lsst-dax-ppdbx-gcp" -] [tool.setuptools.packages.find] where = ["python"] diff --git a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py index fa15c20a..ab34bedf 100644 --- a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py +++ b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py @@ -48,6 +48,7 @@ from ..sql import PpdbSqlBase, PpdbSqlBaseConfig from .manifest import Manifest, TableStats from .ppdb_replica_chunk_extended import ChunkStatus, PpdbReplicaChunkExtended +from .query_runner import QueryRunner from .sql_resource import SqlResource __all__ = ["ConfigValidationError", "PpdbBigQuery", "PpdbBigQueryConfig"] @@ -144,16 +145,26 @@ def __init__(self, config: PpdbBigQueryConfig): self._config = config + self._query_runner: QueryRunner | None = None + @property def metadata(self) -> ApdbMetadata: - """Implement `Ppdb` interface to 
return APDB metadata object. + """APDB metadata object from `Ppdb` interface (`ApdbMetadata`).""" + return self._metadata - Returns - ------- - metadata : `ApdbMetadata` - APDB metadata object. + @property + def config(self) -> PpdbBigQueryConfig: + """PPDB config associated with this instance.""" + return self._config + + @property + def query_runner(self) -> QueryRunner: + """Query runner for executing SQL in BigQuery + (`~lsst.dax.ppdb.bigquery.QueryRunner`). """ - return self._metadata + if not self._query_runner: + self._query_runner = QueryRunner(self.config.project_id, self.config.dataset_id) + return self._query_runner def _init_sql(self, config: PpdbBigQueryConfig) -> None: sql_config = config.sql diff --git a/python/lsst/dax/ppdb/bigquery/query_runner.py b/python/lsst/dax/ppdb/bigquery/query_runner.py index e4e57166..40eaf023 100644 --- a/python/lsst/dax/ppdb/bigquery/query_runner.py +++ b/python/lsst/dax/ppdb/bigquery/query_runner.py @@ -21,7 +21,6 @@ from __future__ import annotations -import os from typing import TypeAlias __all__ = [ @@ -59,26 +58,6 @@ def __init__(self, project_id: str, dataset_id: str): self._dataset = self._bq_client.get_dataset(f"{project_id}.{dataset_id}") self._location = self._dataset.location - @classmethod - def from_env(cls) -> QueryRunner: - """Create a QueryRunner instance using environment variables. - - Returns - ------- - query_runner: `QueryRunner` - An instance of QueryRunner initialized with project and dataset IDs - from environment variables. 
- """ - project_id = os.environ.get("PROJECT_ID") - if project_id is None: - raise OSError("Environment variable 'PROJECT_ID' is not set") - - dataset_id = os.environ.get("DATASET_ID") - if dataset_id is None: - raise OSError("Environment variable 'DATASET_ID' is not set") - - return cls(project_id, dataset_id) - @property def project_id(self) -> str: """Google Cloud project ID (`str`, read-only).""" diff --git a/python/lsst/dax/ppdb/bigquery/replica_chunk_promoter.py b/python/lsst/dax/ppdb/bigquery/replica_chunk_promoter.py index 9a3fde92..2e39b270 100644 --- a/python/lsst/dax/ppdb/bigquery/replica_chunk_promoter.py +++ b/python/lsst/dax/ppdb/bigquery/replica_chunk_promoter.py @@ -32,7 +32,9 @@ from google.api_core.exceptions import NotFound from google.cloud import bigquery +from .ppdb_bigquery import PpdbBigQuery from .query_runner import QueryRunner +from .updates import UpdatesManager class NoPromotableChunksError(Exception): @@ -46,34 +48,39 @@ class ReplicaChunkPromoter: Parameters ---------- - promotable_chunks: `list`[`int`] - Sequence of tuples containing the APDB replica chunk IDs to promote. - runner : `QueryRunner`, optional - An instance of `QueryRunner` to execute queries. If not provided, a new - instance will be created using environment variables. + ppdb : `PpdbBigQuery` + Interface to the PPDB in BigQuery. table_names : `list`[`str`], optional - List of table names to promote with standard default. + List of table names to promote or if None a default list will be used. """ def __init__( self, - promotable_chunks: list[int], - runner: QueryRunner | None = None, + ppdb: PpdbBigQuery, table_names: list[str] | None = None, ): - self._promotable_chunks = promotable_chunks - self._runner = runner or QueryRunner.from_env() + self._ppdb = ppdb + self._runner = ppdb.query_runner # DM-52326: Hard-coded table names; these should be passed in from # config. 
self._table_names = table_names or ["DiaObject", "DiaSource", "DiaForcedSource"] self._bq_client = bigquery.Client(project=self._runner.project_id) self._phases = { + "get_promotable_chunks": self._get_promotable_chunks, "build_tmp": self._copy_to_promoted_tmp, + "apply_record_updates": self._apply_record_updates, "promote_prod": self._promote_tmp_to_prod, "delete_staged_chunks": self._delete_staged_chunks, - "cleanup": self._cleanup_promoted_tmp, + "mark_promoted": self._mark_chunks_promoted, } + self._promotable_chunks: list[int] = [] + + @property + def ppdb(self) -> PpdbBigQuery: + """PPDB interface to BigQuery.""" + return self._ppdb + @property def project_id(self) -> str: """Google Cloud project ID (`str`, read-only).""" @@ -96,6 +103,11 @@ def promotable_chunks(self) -> list[int]: """ return self._promotable_chunks + @property + def promotable_chunk_count(self) -> int: + """Count of promotable chunks that were found in the database.""" + return len(self.promotable_chunks) + @promotable_chunks.setter def promotable_chunks(self, chunks: list[int]) -> None: if not chunks: @@ -162,6 +174,11 @@ def _execute_phase(self, phase: str) -> None: logging.debug("Executing promotion phase: %s", phase) self._phases[phase]() + def _get_promotable_chunks(self) -> None: + """Get list of promotable chunks from the database.""" + self.promotable_chunks = self.ppdb.get_promotable_chunks() + logging.info("Promotable chunk count: %s", len(self.promotable_chunks)) + def _copy_to_promoted_tmp(self) -> None: """ Build ``_{table_name}_promoted_tmp`` efficiently by cloning prod and @@ -217,12 +234,12 @@ def _promote_tmp_to_prod(self) -> None: # Atomic zero-copy replacement of prod with tmp copy_cfg = bigquery.CopyJobConfig(write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE) job = self.bq_client.copy_table( - tmp_ref, prod_ref, job_config=copy_cfg, location=self._runner.location + tmp_ref, prod_ref, job_config=copy_cfg, location=self.runner.location ) job.result() 
QueryRunner.log_job(job, "promote_tmp_to_prod") - def _cleanup_promoted_tmp(self) -> None: + def _cleanup(self) -> None: """Drop the promotion temporary tables.""" for tmp_ref in self.table_promoted_tmp_refs: self.bq_client.delete_table(tmp_ref, not_found_ok=True) @@ -246,13 +263,26 @@ def _delete_staged_chunks(self) -> None: except NotFound: logging.warning("Staging table %s does not exist, skipping delete", staging_ref) + def _apply_record_updates(self) -> None: + """Apply record updates to the promoted temporary tables.""" + updates_manager = UpdatesManager(self.ppdb, table_name_postfix="_promoted_tmp") + updates_manager.apply_updates(self._promotable_chunks) + + def _mark_chunks_promoted(self) -> None: + """Mark the replica chunks as promoted in the database.""" + self.ppdb.mark_chunks_promoted(self._promotable_chunks) + def promote_chunks(self) -> None: - """Promote APDB replica chunks into production.""" + """Promote APDB replica chunks into production by executing a series of + phases. + """ try: - for phase in ("build_tmp", "promote_prod", "delete_staged_chunks"): + for phase in self._phases.keys(): self._execute_phase(phase) finally: try: - self._execute_phase("cleanup") + # Cleanup is always executed separately, not as an ordered + # phase. 
+ self._cleanup() except Exception: - logging.exception("Cleanup of temporary tables failed") + logging.exception("Cleanup of chunk promotion failed") diff --git a/python/lsst/dax/ppdb/bigquery/updates/updates_manager.py b/python/lsst/dax/ppdb/bigquery/updates/updates_manager.py index dc605f10..fd9374e1 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/updates_manager.py +++ b/python/lsst/dax/ppdb/bigquery/updates/updates_manager.py @@ -36,7 +36,11 @@ ) from .updates_table import UpdatesTable -DEFAULT_MERGERS = (DiaObjectUpdatesMerger, DiaSourceUpdatesMerger, DiaForcedSourceUpdatesMerger) +DEFAULT_MERGERS = ( + DiaObjectUpdatesMerger, + DiaSourceUpdatesMerger, + DiaForcedSourceUpdatesMerger, +) class UpdatesManager: @@ -51,6 +55,7 @@ def __init__( mergers: Sequence[type[UpdatesMerger]] = DEFAULT_MERGERS, updates_table_name="updates", deduplicated_updates_table_name="updates_deduplicated", + table_name_postfix: str | None = None, ) -> None: self._ppdb = ppdb self._mergers = mergers @@ -62,12 +67,16 @@ def __init__( self._bq_client, f"{self._ppdb._config.project_id}.{self._ppdb._config.dataset_id}.{updates_table_name}", ) + + # TODO: Catch error if already exists self._updates_table.create() self._gcs_client = storage.Client() self._bucket = self._gcs_client.bucket(self._ppdb._config.bucket_name) - def apply_updates(self, replica_chunk_ids: Sequence[int], table_name_postfix: str | None = None) -> None: + self._table_name_postfix = table_name_postfix + + def apply_updates(self, replica_chunk_ids: Sequence[int]) -> None: replica_chunks = self._ppdb.get_replica_chunks_ext_by_ids(replica_chunk_ids) for replica_chunk in replica_chunks: if replica_chunk.gcs_uri is None: @@ -98,9 +107,9 @@ def apply_updates(self, replica_chunk_ids: Sequence[int], table_name_postfix: st # Merge the deduplicated updates into the target tables for merger in self._mergers: merger_instance = merger(self._bq_client) - if table_name_postfix: - # Apply a postfix like "_next" to the target table - 
merger_instance.target_table_name += f"_{table_name_postfix}" + if self._table_name_postfix: + # Apply a postfix to the canonical target table name + merger_instance.target_table_name += f"{self.table_name_postfix}" target_dataset_fqn = f"{self._ppdb._config.project_id}.{self._ppdb._config.dataset_id}" merger_instance.merge( updates_table_fqn=deduplicated_updates_table_fqn, target_dataset_fqn=target_dataset_fqn diff --git a/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py b/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py index d03ab69a..f55aecb6 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py +++ b/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py @@ -84,7 +84,11 @@ def merge(self, *, updates_table_fqn: str, target_dataset_fqn: str) -> bigquery. """ sql = SqlResource( self.SQL_RESOURCE_NAME, - format_args={"updates_table": updates_table_fqn, "target_dataset": target_dataset_fqn}, + format_args={ + "updates_table": updates_table_fqn, + "target_dataset": target_dataset_fqn, + "target_table": self.target_table_name, + }, ).sql job = self._client.query(sql) job.result() diff --git a/python/lsst/dax/ppdb/config/sql/merge_diaforcedsource_updates.sql b/python/lsst/dax/ppdb/config/sql/merge_diaforcedsource_updates.sql index 8c60f86c..8eef46b7 100644 --- a/python/lsst/dax/ppdb/config/sql/merge_diaforcedsource_updates.sql +++ b/python/lsst/dax/ppdb/config/sql/merge_diaforcedsource_updates.sql @@ -1,15 +1,4 @@ --- merge_diaforcedsource_updates.sql --- --- Query parameters: --- @updates_table STRING -- table FQN, e.g. "project.dataset.prod_next" --- @target_dataset STRING -- dataset FQN, e.g. "project.dataset" --- --- Do NOT include backticks in parameter values. 
- -DECLARE sql STRING; - -SET sql = """ -MERGE `{target_dataset}.DiaForcedSource` T +MERGE `{target_dataset}.{target_table}` T USING ( WITH patch AS ( SELECT @@ -36,7 +25,4 @@ ON T.diaObjectId = P.diaObjectId AND T.detector = P.detector WHEN MATCHED THEN UPDATE SET - timeWithdrawnMjdTai = IF(P.timeWithdrawnMjdTai_present, P.timeWithdrawnMjdTai_value, T.timeWithdrawnMjdTai) -"""; - -EXECUTE IMMEDIATE sql; + timeWithdrawnMjdTai = IF(P.timeWithdrawnMjdTai_present, P.timeWithdrawnMjdTai_value, T.timeWithdrawnMjdTai); diff --git a/python/lsst/dax/ppdb/config/sql/merge_diaobject_updates.sql b/python/lsst/dax/ppdb/config/sql/merge_diaobject_updates.sql index 143f86a5..9c6c1827 100644 --- a/python/lsst/dax/ppdb/config/sql/merge_diaobject_updates.sql +++ b/python/lsst/dax/ppdb/config/sql/merge_diaobject_updates.sql @@ -1,15 +1,4 @@ --- merge_diaobject_updates.sql --- --- Query parameters: --- @updates_table STRING -- table FQN, e.g. "project.dataset.prod_next" --- @target_dataset STRING -- dataset FQN, e.g. "project.dataset" --- --- Do NOT include backticks in parameter values. 
- -DECLARE sql STRING; - -SET sql = """ -MERGE `{target_dataset}.DiaObject` T +MERGE `{target_dataset}.{target_table}` T USING ( WITH patch AS ( SELECT @@ -40,7 +29,4 @@ ON T.diaObjectId = P.diaObjectId WHEN MATCHED THEN UPDATE SET validityEndMjdTai = IF(P.validityEndMjdTai_present, P.validityEndMjdTai_value, T.validityEndMjdTai), - nDiaSources = IF(P.nDiaSources_present, P.nDiaSources_value, T.nDiaSources) -"""; - -EXECUTE IMMEDIATE sql; \ No newline at end of file + nDiaSources = IF(P.nDiaSources_present, P.nDiaSources_value, T.nDiaSources); diff --git a/python/lsst/dax/ppdb/config/sql/merge_diasource_updates.sql b/python/lsst/dax/ppdb/config/sql/merge_diasource_updates.sql index 5a39b877..5a2d5307 100644 --- a/python/lsst/dax/ppdb/config/sql/merge_diasource_updates.sql +++ b/python/lsst/dax/ppdb/config/sql/merge_diasource_updates.sql @@ -1,15 +1,4 @@ --- merge_diasource_updates.sql --- --- Query parameters: --- @updates_table STRING -- table FQN, e.g. "project.dataset.prod_next" --- @target_dataset STRING -- dataset FQN, e.g. "project.dataset" --- --- Do NOT include backticks in parameter values. 
- -DECLARE sql STRING; - -SET sql = """ -MERGE `{target_dataset}.DiaSource` T +MERGE `{target_dataset}.{target_table}` T USING ( WITH patch AS ( SELECT @@ -57,6 +46,3 @@ UPDATE SET ssObjectId = IF(P.ssObjectId_present, P.ssObjectId_value, T.ssObjectId), ssObjectReassocTimeMjdTai = IF(P.ssObjectReassocTimeMjdTai_present, P.ssObjectReassocTimeMjdTai_value, T.ssObjectReassocTimeMjdTai), timeWithdrawnMjdTai = IF(P.timeWithdrawnMjdTai_present, P.timeWithdrawnMjdTai_value, T.timeWithdrawnMjdTai) -"""; - -EXECUTE IMMEDIATE sql; diff --git a/requirements.txt b/requirements.txt index a8e9fceb..c0cc7069 100644 --- a/requirements.txt +++ b/requirements.txt @@ -1,10 +1,13 @@ astropy +google-cloud-bigquery pyarrow pydantic >=2,<3 pyyaml >= 5.1 sqlalchemy + lsst-dax-apdb @ git+https://github.com/lsst/dax_apdb@main -lsst-utils @ git+https://github.com/lsst/utils@main -lsst-resources[s3] @ git+https://github.com/lsst/resources@main +lsst-dax-ppdbx-gcp @ git+https://github.com/lsst-dm/dax_ppdbx_gcp@main lsst-felis @ git+https://github.com/lsst/felis@main lsst-sdm-schemas @ git+https://github.com/lsst/sdm_schemas@main +lsst-utils @ git+https://github.com/lsst/utils@main +lsst-resources[s3] @ git+https://github.com/lsst/resources@main From c9f65105a82a6f52759dd40b5fd78302dbe2adb1 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Thu, 19 Mar 2026 15:02:03 -0500 Subject: [PATCH 39/49] Add check in tests to skip if there are no valid Google credentials Some test modules require that there are valid Google credentials available. This adds a check so that if these are not present, the tests will be skipped, e.g., in GitHub CI where they should not run but failures should be avoided. 
--- python/lsst/dax/ppdb/tests/_bigquery.py | 30 +++++++++++++++++++++++++ tests/test_update_record_expander.py | 10 +++------ tests/test_update_records.py | 12 ++++------ tests/test_updates_manager.py | 2 ++ tests/test_updates_merger.py | 27 ++++++++++------------ tests/test_updates_table.py | 16 +++++-------- 6 files changed, 56 insertions(+), 41 deletions(-) diff --git a/python/lsst/dax/ppdb/tests/_bigquery.py b/python/lsst/dax/ppdb/tests/_bigquery.py index 551c1e2e..7b65d4af 100644 --- a/python/lsst/dax/ppdb/tests/_bigquery.py +++ b/python/lsst/dax/ppdb/tests/_bigquery.py @@ -26,6 +26,9 @@ import uuid from typing import Any +import google.auth +from google.auth.exceptions import DefaultCredentialsError +from google.auth.transport.requests import Request from google.cloud import storage from lsst.dax.apdb import ( @@ -177,3 +180,30 @@ def make_apdb_instance(self, **kwargs: Any) -> ApdbConfig: } kw.update(kwargs) return ApdbSql.init_database(**kw) # type: ignore[arg-type] + + +def have_valid_google_credentials() -> bool: + """Check that valid Google credentials are available for testing. + + Returns + ------- + credentials_valid: `bool` + True if valid Google credentials are available, False if not. + + Raises + ------ + google.auth.exceptions.RefreshError + Raised if the credentials cannot be refreshed. + Exception + Raised for other transport or configuration failures. + """ + try: + credentials, _ = google.auth.default() + except DefaultCredentialsError: + return False + + # This will validate the default credentials that were found in the + # environment. 
+ credentials.refresh(Request()) + + return True diff --git a/tests/test_update_record_expander.py b/tests/test_update_record_expander.py index 843a025a..596acfed 100644 --- a/tests/test_update_record_expander.py +++ b/tests/test_update_record_expander.py @@ -32,13 +32,9 @@ ApdbWithdrawDiaForcedSourceRecord, ApdbWithdrawDiaSourceRecord, ) - -try: - from lsst.dax.ppdb.bigquery import updates - from lsst.dax.ppdb.bigquery.updates import ExpandedUpdateRecord, UpdateRecordExpander, UpdateRecords - from lsst.dax.ppdb.tests._updates import _create_test_update_records -except ImportError: - updates = None +from lsst.dax.ppdb.bigquery import updates +from lsst.dax.ppdb.bigquery.updates import ExpandedUpdateRecord, UpdateRecordExpander, UpdateRecords +from lsst.dax.ppdb.tests._updates import _create_test_update_records @unittest.skipIf(updates is None, "Google Cloud environment not available") diff --git a/tests/test_update_records.py b/tests/test_update_records.py index c6a1dd8e..3b56857c 100644 --- a/tests/test_update_records.py +++ b/tests/test_update_records.py @@ -25,12 +25,6 @@ import pytest from google.cloud import storage -try: - from lsst.dax.ppdb.bigquery import updates - from lsst.dax.ppdb.bigquery.updates import UpdateRecords -except ImportError: - updates = None - from lsst.dax.apdb import ( Apdb, ApdbReplica, @@ -38,6 +32,7 @@ ) from lsst.dax.ppdb import Ppdb from lsst.dax.ppdb.bigquery import PpdbBigQuery +from lsst.dax.ppdb.bigquery.updates import UpdateRecords from lsst.dax.ppdb.replicator import Replicator from lsst.dax.ppdb.tests import ApdbMixin from lsst.dax.ppdb.tests._bigquery import ( @@ -45,10 +40,11 @@ PostgresMixin, delete_test_bucket, generate_test_bucket_name, + have_valid_google_credentials, ) -@unittest.skipIf(updates is None, "Google Cloud dependencies not available") +@unittest.skipIf(not have_valid_google_credentials(), "Missing valid Google credentials") class UpdateRecordsTestCase(PostgresMixin, ApdbMixin, unittest.TestCase): """A test 
case for the handling of APDB record updates by PpdbBigQuery and related classes including the ChunkUploader. @@ -332,4 +328,4 @@ def test_chunk_uploader(self) -> None: try: delete_test_bucket(bucket) except Exception as e: - raise RuntimeError(f"Failed to delete test GCS bucket: {e}") + raise RuntimeError(f"Failed to delete test GCS bucket: {e}") from e diff --git a/tests/test_updates_manager.py b/tests/test_updates_manager.py index 74d36d8b..77c02729 100644 --- a/tests/test_updates_manager.py +++ b/tests/test_updates_manager.py @@ -39,10 +39,12 @@ ChunkUploaderWithoutPubSub, PostgresMixin, generate_test_bucket_name, + have_valid_google_credentials, ) from lsst.dax.ppdb.tests._updates import _create_test_update_records +@unittest.skipIf(not have_valid_google_credentials(), "Missing valid Google credentials") class UpdatesManagerTestCase(PostgresMixin, unittest.TestCase): """A test case for the handling of APDB record updates by PpdbBigQuery and related classes including the ChunkUploader. 
diff --git a/tests/test_updates_merger.py b/tests/test_updates_merger.py index 5fdca1bb..ead829c5 100644 --- a/tests/test_updates_merger.py +++ b/tests/test_updates_merger.py @@ -29,21 +29,18 @@ except ImportError: bigquery = None -try: - from lsst.dax.ppdb.bigquery import updates - from lsst.dax.ppdb.bigquery.updates import ( - DiaForcedSourceUpdatesMerger, - DiaObjectUpdatesMerger, - DiaSourceUpdatesMerger, - UpdateRecordExpander, - UpdatesTable, - ) - from lsst.dax.ppdb.tests._updates import _create_test_update_records -except ImportError: - updates = None - - -@unittest.skipIf(bigquery is None or updates is None, "Google Cloud dependencies not available") +from lsst.dax.ppdb.bigquery.updates import ( + DiaForcedSourceUpdatesMerger, + DiaObjectUpdatesMerger, + DiaSourceUpdatesMerger, + UpdateRecordExpander, + UpdatesTable, +) +from lsst.dax.ppdb.tests._bigquery import have_valid_google_credentials +from lsst.dax.ppdb.tests._updates import _create_test_update_records + + +@unittest.skipIf(not have_valid_google_credentials(), "Missing valid Google credentials") class TestUpdatesMerger(unittest.TestCase): """Test UpdatesMerger functionality.""" diff --git a/tests/test_updates_table.py b/tests/test_updates_table.py index bddbbd7b..07fc1202 100644 --- a/tests/test_updates_table.py +++ b/tests/test_updates_table.py @@ -22,20 +22,14 @@ import unittest import uuid -try: - from lsst.dax.ppdb.bigquery import updates - from lsst.dax.ppdb.bigquery.updates import UpdateRecordExpander, UpdatesTable - from lsst.dax.ppdb.tests._updates import _create_test_update_records -except ImportError: - updates = None +from google.cloud import bigquery -try: - from google.cloud import bigquery -except (ModuleNotFoundError, ImportError): - bigquery = None +from lsst.dax.ppdb.bigquery.updates import UpdateRecordExpander, UpdatesTable +from lsst.dax.ppdb.tests._bigquery import have_valid_google_credentials +from lsst.dax.ppdb.tests._updates import _create_test_update_records 
-@unittest.skipIf(updates is None or bigquery is None, "Google Cloud dependencies not available") +@unittest.skipIf(not have_valid_google_credentials(), "Missing valid Google credentials") class TestUpdatesTable(unittest.TestCase): """Test UpdatesTable functionality.""" From 4072a8d8ef795af041ac6afd632a1fa264e0ada8 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Thu, 19 Mar 2026 15:56:24 -0500 Subject: [PATCH 40/49] FIXUP --- python/lsst/dax/ppdb/bigquery/updates/updates_merger.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py b/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py index f55aecb6..778a6be0 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py +++ b/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py @@ -88,7 +88,7 @@ def merge(self, *, updates_table_fqn: str, target_dataset_fqn: str) -> bigquery. "updates_table": updates_table_fqn, "target_dataset": target_dataset_fqn, "target_table": self.target_table_name, - }, + }, ).sql job = self._client.query(sql) job.result() From 7484bca50ba4d83fdbffa391b99f277c52566718 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Thu, 19 Mar 2026 16:09:55 -0500 Subject: [PATCH 41/49] Fix type alias issue reported by ruff --- python/lsst/dax/ppdb/bigquery/query_runner.py | 21 +++++++++---------- 1 file changed, 10 insertions(+), 11 deletions(-) diff --git a/python/lsst/dax/ppdb/bigquery/query_runner.py b/python/lsst/dax/ppdb/bigquery/query_runner.py index 40eaf023..e9901658 100644 --- a/python/lsst/dax/ppdb/bigquery/query_runner.py +++ b/python/lsst/dax/ppdb/bigquery/query_runner.py @@ -21,8 +21,6 @@ from __future__ import annotations -from typing import TypeAlias - __all__ = [ "QueryRunner", ] @@ -31,14 +29,6 @@ from google.cloud import bigquery -AnyBigQueryJob: TypeAlias = ( - bigquery.job.QueryJob - | bigquery.job.LoadJob - | bigquery.job.CopyJob - | bigquery.job.ExtractJob - | bigquery.job.UnknownJob -) - class 
QueryRunner: """Class to run BigQuery queries with logging. @@ -81,7 +71,16 @@ def location(self) -> str: return self._location @classmethod - def log_job(cls, job: AnyBigQueryJob, label: str, level: int = logging.DEBUG) -> None: + def log_job( + cls, + job: bigquery.job.QueryJob + | bigquery.job.LoadJob + | bigquery.job.CopyJob + | bigquery.job.ExtractJob + | bigquery.job.UnknownJob, + label: str, + level: int = logging.DEBUG, + ) -> None: """Log details of a BigQuery job. Parameters From d52dfb4d206bac446dc328c74a46d41e86b6c248 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Thu, 19 Mar 2026 16:30:05 -0500 Subject: [PATCH 42/49] Add missing docstring --- python/lsst/dax/ppdb/tests/_bigquery.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/python/lsst/dax/ppdb/tests/_bigquery.py b/python/lsst/dax/ppdb/tests/_bigquery.py index 7b65d4af..2726b70d 100644 --- a/python/lsst/dax/ppdb/tests/_bigquery.py +++ b/python/lsst/dax/ppdb/tests/_bigquery.py @@ -63,6 +63,13 @@ def generate_test_bucket_name(test_prefix: str = "ppdb-test") -> str: def delete_test_bucket(bucket_or_bucket_name: str | storage.Bucket) -> None: + """Delete a cloud storage bucket that was created for testing. + + Parameters + ---------- + bucket_or_bucket_name: `str` or `storage.Bucket` + The name of the bucket or the actual bucket to delete. 
+ """ storage_client = storage.Client() try: if isinstance(bucket_or_bucket_name, str): From f7cd7917b445f4931f443815c632f3793cc7cd41 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Thu, 19 Mar 2026 17:31:55 -0500 Subject: [PATCH 43/49] Add `mark_chunks_promoted` method --- .../lsst/dax/ppdb/bigquery/ppdb_bigquery.py | 28 +++++++++++++++++++ 1 file changed, 28 insertions(+) diff --git a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py index ab34bedf..a3c6c0d2 100644 --- a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py +++ b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py @@ -781,3 +781,31 @@ def get_promotable_chunks(self) -> list[int]: result = conn.execute(sqlalchemy.text(sql)) chunk_ids = [row[0] for row in result] return chunk_ids + + def mark_chunks_promoted(self, promotable_chunks: list[int]) -> int: + """Set status='promoted' for the given chunk IDs. Returns number + updated. + + Parameters + ---------- + promotable_chunks : `list`[`int`] + List of integers containing the ``apdb_replica_chunk`` values of + the promotable chunks. + + Returns + ------- + count: `int` + The number of rows updated in the database, which should be equal + to the number of promotable chunks provided, if they were all found + and updated successfully. 
+ """ + table = self.get_table("PpdbReplicaChunk") + stmt = ( + sqlalchemy.update(table) + .where(table.c.apdb_replica_chunk.in_(promotable_chunks), table.c.status != "promoted") + .values(status="promoted") + ) + + with self._engine.begin() as conn: + result: sqlalchemy.engine.CursorResult = conn.execute(stmt) + return result.rowcount or 0 From 0519130e7564f673db353d624dd907109cb309cb Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Thu, 19 Mar 2026 17:45:45 -0500 Subject: [PATCH 44/49] Fix mypy errors --- .../lsst/dax/ppdb/bigquery/ppdb_bigquery.py | 4 ++- .../ppdb/bigquery/replica_chunk_promoter.py | 16 +++++----- .../updates/update_record_expander.py | 2 -- .../ppdb/bigquery/updates/update_records.py | 31 +++++++++++-------- .../ppdb/bigquery/updates/updates_manager.py | 6 ++-- .../ppdb/bigquery/updates/updates_merger.py | 2 +- python/lsst/dax/ppdb/sql/_ppdb_sql_base.py | 2 +- python/lsst/dax/ppdb/tests/_bigquery.py | 10 ++---- python/lsst/dax/ppdb/tests/_ppdb.py | 2 +- python/lsst/dax/ppdb/tests/_updates.py | 5 ++- 10 files changed, 39 insertions(+), 41 deletions(-) diff --git a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py index a3c6c0d2..22095b4d 100644 --- a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py +++ b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py @@ -736,7 +736,7 @@ def _handle_updates( update_records = UpdateRecords( replica_chunk_id=replica_chunk.id, - records=apdb_update_records, + records=list(apdb_update_records), record_count=len(apdb_update_records), ) update_records.write_json_file(chunk_dir / "update_records.json") @@ -769,6 +769,8 @@ def get_promotable_chunks(self) -> list[int]: empty list is returned. """ table = self.get_table("PpdbReplicaChunk") + if not table.schema: + raise ValueError("Table schema is not set, cannot construct query") quoted_table_name = ( self._engine.dialect.identifier_preparer.quote(table.schema) + "." 
diff --git a/python/lsst/dax/ppdb/bigquery/replica_chunk_promoter.py b/python/lsst/dax/ppdb/bigquery/replica_chunk_promoter.py index 2e39b270..4c67a2e4 100644 --- a/python/lsst/dax/ppdb/bigquery/replica_chunk_promoter.py +++ b/python/lsst/dax/ppdb/bigquery/replica_chunk_promoter.py @@ -98,22 +98,20 @@ def table_names(self) -> list[str]: @property def promotable_chunks(self) -> list[int]: - """List of promotable chunks (`list[`int`], - read-only). - """ + """List of promotable chunks (`list` [ `int` ], read-only).""" return self._promotable_chunks - @property - def promotable_chunk_count(self) -> int: - """Count of promotable chunks that were found in the database.""" - return len(self.promotable_chunks) - @promotable_chunks.setter def promotable_chunks(self, chunks: list[int]) -> None: if not chunks: raise NoPromotableChunksError("No promotable chunks provided") self._promotable_chunks = chunks + @property + def promotable_chunk_count(self) -> int: + """Count of promotable chunks that were found in the database.""" + return len(self.promotable_chunks) + @property def runner(self) -> QueryRunner: """Runner for executing BigQuery jobs (`QueryRunner`, read-only).""" @@ -176,7 +174,7 @@ def _execute_phase(self, phase: str) -> None: def _get_promotable_chunks(self) -> None: """Get list of promotable chunks from the database.""" - self.promotable_chunks = self.ppdb.get_promotable_chunks() + self._promotable_chunks = self.ppdb.get_promotable_chunks() logging.info("Promotable chunk count: %s", len(self.promotable_chunks)) def _copy_to_promoted_tmp(self) -> None: diff --git a/python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py b/python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py index f5c95fdf..b0fe2fb8 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py +++ b/python/lsst/dax/ppdb/bigquery/updates/update_record_expander.py @@ -168,7 +168,6 @@ def expand_single_record( # Get the record ID record_id = 
cls._get_record_id(update_record) - record_id_hash = cls._compute_record_id_hash(record_id) expanded_records = [] for field_name in field_names: @@ -182,7 +181,6 @@ def expand_single_record( expanded_record = ExpandedUpdateRecord( table_name=table_name, record_id=record_id, - record_id_hash=record_id_hash, field_name=field_name, value_json=value, replica_chunk_id=replica_chunk_id, diff --git a/python/lsst/dax/ppdb/bigquery/updates/update_records.py b/python/lsst/dax/ppdb/bigquery/updates/update_records.py index ae986fb3..a3107c85 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/update_records.py +++ b/python/lsst/dax/ppdb/bigquery/updates/update_records.py @@ -23,7 +23,7 @@ import json from pathlib import Path -from typing import Any +from typing import Any, cast from pydantic import BaseModel, field_serializer, field_validator @@ -90,19 +90,24 @@ def deserialize_records( The list of APDB update records. """ if records and isinstance(records[0], ApdbUpdateRecord): - return records + return cast(list[ApdbUpdateRecord], records) deserialized_records: list[ApdbUpdateRecord] = [] - for record_dict in records: - record_copy = record_dict.copy() - update_time_ns = record_copy.pop("update_time_ns") - update_order = record_copy.pop("update_order") - json_str = json.dumps(record_copy) - update_record = ApdbUpdateRecord.from_json( - update_time_ns, - update_order, - json_str, - ) - deserialized_records.append(update_record) + for record in records: + if isinstance(record, dict): + record_copy = record.copy() + update_time_ns = record_copy.pop("update_time_ns") + update_order = record_copy.pop("update_order") + json_str = json.dumps(record_copy) + update_record = ApdbUpdateRecord.from_json( + update_time_ns, + update_order, + json_str, + ) + deserialized_records.append(update_record) + elif isinstance(record, ApdbUpdateRecord): + deserialized_records.append(record) + else: + raise TypeError("Each record must be a dict or ApdbUpdateRecord") return deserialized_records def 
write_json_file(self, path: Path) -> None: diff --git a/python/lsst/dax/ppdb/bigquery/updates/updates_manager.py b/python/lsst/dax/ppdb/bigquery/updates/updates_manager.py index fd9374e1..a0eaaff5 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/updates_manager.py +++ b/python/lsst/dax/ppdb/bigquery/updates/updates_manager.py @@ -53,8 +53,8 @@ def __init__( self, ppdb: PpdbBigQuery, mergers: Sequence[type[UpdatesMerger]] = DEFAULT_MERGERS, - updates_table_name="updates", - deduplicated_updates_table_name="updates_deduplicated", + updates_table_name: str = "updates", + deduplicated_updates_table_name: str = "updates_deduplicated", table_name_postfix: str | None = None, ) -> None: self._ppdb = ppdb @@ -109,7 +109,7 @@ def apply_updates(self, replica_chunk_ids: Sequence[int]) -> None: merger_instance = merger(self._bq_client) if self._table_name_postfix: # Apply a postfix to the canonical target table name - merger_instance.target_table_name += f"{self.table_name_postfix}" + merger_instance.target_table_name += f"{self._table_name_postfix}" target_dataset_fqn = f"{self._ppdb._config.project_id}.{self._ppdb._config.dataset_id}" merger_instance.merge( updates_table_fqn=deduplicated_updates_table_fqn, target_dataset_fqn=target_dataset_fqn diff --git a/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py b/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py index 778a6be0..e74b68d6 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py +++ b/python/lsst/dax/ppdb/bigquery/updates/updates_merger.py @@ -42,7 +42,7 @@ class UpdatesMerger(ABC): statement for this merger. 
The SQL file must be located in the `lsst.dax.ppdb.config.sql` package.""" - def __init__(self, client: bigquery.Client, target_table_name: str = None) -> None: + def __init__(self, client: bigquery.Client, target_table_name: str | None = None) -> None: """ Parameters ---------- diff --git a/python/lsst/dax/ppdb/sql/_ppdb_sql_base.py b/python/lsst/dax/ppdb/sql/_ppdb_sql_base.py index ed953310..5db51d5d 100644 --- a/python/lsst/dax/ppdb/sql/_ppdb_sql_base.py +++ b/python/lsst/dax/ppdb/sql/_ppdb_sql_base.py @@ -155,7 +155,7 @@ def _build_connect_args(cls, config: PpdbSqlBaseConfig) -> MutableMapping[str, A return {"connect_args": conn_args} @classmethod - def _config_listeners(cls, engine: sqlalchemy.engine.Engine) -> sqlalchemy.engine.Engine: + def _config_listeners(cls, engine: sqlalchemy.engine.Engine) -> None: if engine.dialect.name == "sqlite": # Need to enable foreign keys on every new connection. sqlalchemy.event.listen(engine, "connect", _onSqlite3Connect) diff --git a/python/lsst/dax/ppdb/tests/_bigquery.py b/python/lsst/dax/ppdb/tests/_bigquery.py index 2726b70d..e8be8ad6 100644 --- a/python/lsst/dax/ppdb/tests/_bigquery.py +++ b/python/lsst/dax/ppdb/tests/_bigquery.py @@ -119,9 +119,7 @@ def make_instance(self, **kwargs: Any) -> PpdbConfig: "felis_path": TEST_SCHEMA_RESOURCE_PATH, "replication_dir": self.tempdir, } - bq_config = PpdbBigQuery.init_bigquery( - **kw, - ) # type: ignore[arg-type] + bq_config = PpdbBigQuery.init_bigquery(**kw) # type: ignore[arg-type] return bq_config def make_apdb_instance(self, **kwargs: Any) -> ApdbConfig: @@ -147,7 +145,6 @@ class PostgresMixin: def setUpClass(cls) -> None: # Create the postgres test server. cls.postgresql = testing.postgresql.PostgresqlFactory(cache_initialized_db=True) - super().setUpClass() @classmethod def tearDownClass(cls) -> None: @@ -155,7 +152,6 @@ def tearDownClass(cls) -> None: # so they're closed before we shut down the server. 
gc.collect() cls.postgresql.clear_cache() - super().tearDownClass() def setUp(self) -> None: self.server = self.postgresql() @@ -174,7 +170,7 @@ def make_instance(self, config_dict: dict[str, Any] = TEST_CONFIG, **kwargs: Any "felis_path": TEST_SCHEMA_RESOURCE_PATH, "replication_dir": self.tempdir, } - bq_config = PpdbBigQuery.init_bigquery(**kw) # type: ignore[arg-type] + bq_config = PpdbBigQuery.init_bigquery(**kw) return bq_config def make_apdb_instance(self, **kwargs: Any) -> ApdbConfig: @@ -186,7 +182,7 @@ def make_apdb_instance(self, **kwargs: Any) -> ApdbConfig: "enable_replica": True, } kw.update(kwargs) - return ApdbSql.init_database(**kw) # type: ignore[arg-type] + return ApdbSql.init_database(**kw) def have_valid_google_credentials() -> bool: diff --git a/python/lsst/dax/ppdb/tests/_ppdb.py b/python/lsst/dax/ppdb/tests/_ppdb.py index 245639ea..e2c90c47 100644 --- a/python/lsst/dax/ppdb/tests/_ppdb.py +++ b/python/lsst/dax/ppdb/tests/_ppdb.py @@ -71,7 +71,7 @@ def _make_region(xyz: tuple[float, float, float] = (1.0, 1.0, -1.0)) -> Region: return region -class ApdbMixin: +class ApdbMixin(unittest.TestCase): """Mixin class containing APDB setuup and record generation for PPDB testing. """ diff --git a/python/lsst/dax/ppdb/tests/_updates.py b/python/lsst/dax/ppdb/tests/_updates.py index bc978ffe..45d3fb1b 100644 --- a/python/lsst/dax/ppdb/tests/_updates.py +++ b/python/lsst/dax/ppdb/tests/_updates.py @@ -19,13 +19,13 @@ # You should have received a copy of the GNU General Public License # along with this program. If not, see . 
-import datetime from lsst.dax.apdb import ( ApdbCloseDiaObjectValidityRecord, ApdbReassignDiaSourceToDiaObjectRecord, ApdbReassignDiaSourceToSSObjectRecord, ApdbUpdateNDiaSourcesRecord, + ApdbUpdateRecord, ApdbWithdrawDiaForcedSourceRecord, ApdbWithdrawDiaSourceRecord, ) @@ -35,7 +35,7 @@ def _create_test_update_records() -> UpdateRecords: """Create test UpdateRecords with sample ApdbUpdateRecord instances.""" - records = [] + records: list[ApdbUpdateRecord] = [] # Hardcoded test values test_update_time_ns = 1640995200000000000 # 2022-01-01 00:00:00 UTC in nanoseconds @@ -153,5 +153,4 @@ def _create_test_update_records() -> UpdateRecords: replica_chunk_id=test_replica_chunk_id, record_count=len(records), records=records, - file_created_at=datetime.datetime.now(datetime.UTC), ) From bb01e3ac7382d9c404a463f3008bcad9e3130d0d Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Fri, 20 Mar 2026 11:33:16 -0500 Subject: [PATCH 45/49] WIP: Introduce class for handling SQL passwords --- .../lsst/dax/ppdb/bigquery/ppdb_bigquery.py | 106 +++++++----------- python/lsst/dax/ppdb/sql/__init__.py | 2 +- python/lsst/dax/ppdb/sql/_ppdb_sql_base.py | 61 ++++++++-- 3 files changed, 91 insertions(+), 78 deletions(-) diff --git a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py index 22095b4d..0f697eaf 100644 --- a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py +++ b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py @@ -28,6 +28,7 @@ import felis import sqlalchemy +from google.cloud import secretmanager from lsst.dax.apdb import ( ApdbMetadata, @@ -39,17 +40,17 @@ monitor, schema_model, ) -from lsst.dax.apdb.sql import ApdbMetadataSql from lsst.dax.apdb.timer import Timer from .._arrow import write_parquet from ..ppdb import Ppdb, PpdbReplicaChunk from ..ppdb_config import PpdbConfig -from ..sql import PpdbSqlBase, PpdbSqlBaseConfig +from ..sql import PasswordProvider, PpdbSqlBase, PpdbSqlBaseConfig from .manifest import Manifest, 
TableStats from .ppdb_replica_chunk_extended import ChunkStatus, PpdbReplicaChunkExtended from .query_runner import QueryRunner from .sql_resource import SqlResource +from .updates.update_records import UpdateRecords __all__ = ["ConfigValidationError", "PpdbBigQuery", "PpdbBigQueryConfig"] @@ -117,6 +118,30 @@ def fq_dataset_id(self) -> str: return f"{self.project_id}:{self.dataset_id}" +class _SecretManagerPasswordProvider(PasswordProvider): + """Retrieves a database password from Google Cloud Secret Manager. + + Parameters + ---------- + project_id : `str` + GCP project that owns the secret. + secret_name : `str`, optional + Name of the secret. Defaults to ``"ppdb-db-password"``. + """ + + def __init__(self, project_id: str, secret_name: str = "ppdb-db-password") -> None: + self._project_id = project_id + self._secret_name = secret_name + + def get_password(self) -> str: + """Return the password fetched from Secret Manager.""" + client = secretmanager.SecretManagerServiceClient() + name = f"projects/{self._project_id}/secrets/{self._secret_name}/versions/latest" + _LOG.info("Retrieving database password from Secret Manager: %s", name) + response = client.access_secret_version(request={"name": name}) + return response.payload.data.decode("UTF-8") + + class ConfigValidationError(Exception): """Indicates an error validating the configuration.""" @@ -131,8 +156,15 @@ class PpdbBigQuery(Ppdb, PpdbSqlBase): """ def __init__(self, config: PpdbBigQueryConfig): - # Initialize the SQL interface for the PPDB - self._init_sql(config) + # Build an optional password provider for GCP Secret Manager. 
+ password_provider: PasswordProvider | None = None + if os.getenv("PPDB_USE_SECRET_MANAGER", "false").lower() == "true": + _LOG.info("Using Secret Manager to retrieve database password") + password_provider = _SecretManagerPasswordProvider(config.project_id) + + # Delegate SQL initialisation (schema load, engine, metadata, version + # checks) to the base class, passing the optional password provider. + PpdbSqlBase.__init__(self, config.sql, password_provider=password_provider) # Read parameters from config if config.replication_dir is None: @@ -166,22 +198,6 @@ def query_runner(self) -> QueryRunner: self._query_runner = QueryRunner(self.config.project_id, self.config.dataset_id) return self._query_runner - def _init_sql(self, config: PpdbBigQueryConfig) -> None: - sql_config = config.sql - self._sa_metadata, self._schema_version = self.read_schema( - sql_config.felis_path, sql_config.schema_name, sql_config.felis_schema, sql_config.db_url - ) - - self._engine = self._make_engine(config) # Includes Secrets Manager support - sa_metadata = sqlalchemy.MetaData(schema=sql_config.schema_name) - - meta_table = sqlalchemy.schema.Table("metadata", sa_metadata, autoload_with=self._engine) - self._metadata = ApdbMetadataSql(self._engine, meta_table) - - # Check schema amd code version compatibility. - self._check_schema_version(self._schema_version) - self._check_code_version() - def _generate_manifest( self, replica_chunk: ReplicaChunk, @@ -501,47 +517,6 @@ def filter_table_names(cls, original_table_names: Iterable[str]) -> Iterable[str # Only the metadata table is needed for the BigQuery-based PPDB. 
return ["metadata"] - @classmethod - def _get_secretmanager_password(cls, project_id: str, password_name: str = "ppdb-db-password") -> str: - from google.cloud import secretmanager - - client = secretmanager.SecretManagerServiceClient() - name = f"projects/{project_id}/secrets/{password_name}/versions/latest" - response = client.access_secret_version(request={"name": name}) - return response.payload.data.decode("UTF-8") - - @classmethod - def _use_secret_manager(cls) -> bool: - return os.getenv("PPDB_USE_SECRET_MANAGER", "false").lower() == "true" - - @classmethod - def _make_engine(cls, config: PpdbBigQueryConfig) -> sqlalchemy.engine.Engine: - """Make SQLALchemy engine based on configured parameters. - - Parameters - ---------- - config : `PpdbBigQueryConfig` - Configuration object with SQL parameters. - """ - sql_config = config.sql - db_url = sqlalchemy.make_url(sql_config.db_url) - - # If using Secret Manager, retrieve the password and update the - # database URL. - if cls._use_secret_manager(): - _LOG.info("Using Secret Manager to retrieve database password") - if db_url.password is not None: - raise ValueError("Database URL should not include a password when using Secret Manager") - password = cls._get_secretmanager_password(config.project_id) - db_url = db_url.set(password=password) - - kw = cls._build_connect_args(sql_config) - engine = sqlalchemy.create_engine(db_url, **kw) - - cls._config_listeners(engine) - - return engine - @classmethod def init_bigquery( cls, @@ -627,7 +602,11 @@ def init_bigquery( if stage_chunk_topic is not None: bq_config.stage_chunk_topic = stage_chunk_topic - engine = cls._make_engine(bq_config) + password_provider: PasswordProvider | None = None + if os.getenv("PPDB_USE_SECRET_MANAGER", "false").lower() == "true": + _LOG.info("Using Secret Manager to retrieve database password") + password_provider = _SecretManagerPasswordProvider(bq_config.project_id) + engine = cls.make_engine(bq_config.sql, 
password_provider=password_provider) cls.make_database(engine, bq_config.sql, sa_metadata, schema_version, db_drop) # Validate the config if requested. @@ -731,9 +710,6 @@ def _handle_updates( Serializes the ApdbUpdateRecord objects into a dictionary structure for processing. """ - # Import inlined here to avoid triggering google cloud imports - from .updates.update_records import UpdateRecords - update_records = UpdateRecords( replica_chunk_id=replica_chunk.id, records=list(apdb_update_records), diff --git a/python/lsst/dax/ppdb/sql/__init__.py b/python/lsst/dax/ppdb/sql/__init__.py index 92e21081..566853c3 100644 --- a/python/lsst/dax/ppdb/sql/__init__.py +++ b/python/lsst/dax/ppdb/sql/__init__.py @@ -20,4 +20,4 @@ # along with this program. If not, see . from ._ppdb_sql import PpdbSql, PpdbSqlConfig -from ._ppdb_sql_base import PpdbSqlBase, PpdbSqlBaseConfig +from ._ppdb_sql_base import PasswordProvider, PpdbSqlBase, PpdbSqlBaseConfig diff --git a/python/lsst/dax/ppdb/sql/_ppdb_sql_base.py b/python/lsst/dax/ppdb/sql/_ppdb_sql_base.py index 5db51d5d..542029e5 100644 --- a/python/lsst/dax/ppdb/sql/_ppdb_sql_base.py +++ b/python/lsst/dax/ppdb/sql/_ppdb_sql_base.py @@ -21,11 +21,12 @@ from __future__ import annotations -__all__ = ["PpdbSqlBase"] +__all__ = ["PasswordProvider", "PpdbSqlBase"] import logging import os import sqlite3 +from abc import ABC, abstractmethod from collections.abc import Iterable, MutableMapping from contextlib import closing from typing import Any @@ -49,6 +50,25 @@ _LOG = logging.getLogger(__name__) +class PasswordProvider(ABC): + """Abstract base class for objects that supply a database password. + + Implementations are free to retrieve the password from any source + (e.g. environment variables, a secrets manager, a local file) without + `PpdbSqlBase` needing to know about the mechanism. + """ + + @abstractmethod + def get_password(self) -> str: + """Return the database password. 
+ + Returns + ------- + password : `str` + Plain-text password to embed in the database connection URL. + """ + + class MissingSchemaVersionError(RuntimeError): """Exception raised when schema version is not defined in the schema. @@ -121,12 +141,12 @@ class PpdbSqlBase: meta_schema_version_key = "version:schema" """Name of the metadata key to store Felis schema version number.""" - def __init__(self, config: PpdbSqlBaseConfig) -> None: + def __init__(self, config: PpdbSqlBaseConfig, password_provider: PasswordProvider | None = None) -> None: self._sa_metadata, self._schema_version = self.read_schema( config.felis_path, config.schema_name, config.felis_schema, config.db_url ) - self._engine = self.make_engine(config) + self._engine = self.make_engine(config, password_provider=password_provider) sa_metadata = sqlalchemy.MetaData(schema=config.schema_name) meta_table = sqlalchemy.schema.Table("metadata", sa_metadata, autoload_with=self._engine) @@ -155,23 +175,40 @@ def _build_connect_args(cls, config: PpdbSqlBaseConfig) -> MutableMapping[str, A return {"connect_args": conn_args} @classmethod - def _config_listeners(cls, engine: sqlalchemy.engine.Engine) -> None: - if engine.dialect.name == "sqlite": - # Need to enable foreign keys on every new connection. - sqlalchemy.event.listen(engine, "connect", _onSqlite3Connect) - - @classmethod - def make_engine(cls, config: PpdbSqlBaseConfig) -> sqlalchemy.engine.Engine: + def make_engine( + cls, + config: PpdbSqlBaseConfig, + *, + password_provider: PasswordProvider | None = None, + ) -> sqlalchemy.engine.Engine: """Make SQLALchemy engine based on configured parameters. Parameters ---------- config : `PpdbSqlBaseConfig` Configuration object with SQL parameters. + password_provider : `PasswordProvider`, optional + If provided, the password returned by + ``password_provider.get_password()`` is injected into the + database URL. The URL must not already contain a password when + this argument is given. 
+ + Raises + ------ + ValueError + Raised if ``password_provider`` is given but the URL already + contains a password. """ + db_url = sqlalchemy.make_url(config.db_url) + if password_provider is not None: + if db_url.password is not None: + raise ValueError("Database URL must not contain a password when password_provider is used.") + db_url = db_url.set(password=password_provider.get_password()) kw = cls._build_connect_args(config) - engine = sqlalchemy.create_engine(config.db_url, **kw) - cls._config_listeners(engine) + engine = sqlalchemy.create_engine(db_url, **kw) + if engine.dialect.name == "sqlite": + # Need to enable foreign keys on every new connection. + sqlalchemy.event.listen(engine, "connect", _onSqlite3Connect) return engine From 855c826c4dd47d9cc60b47bd54362335d2eff7b1 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Fri, 20 Mar 2026 11:51:26 -0500 Subject: [PATCH 46/49] Fix circular reference in imports --- python/lsst/dax/ppdb/bigquery/updates/__init__.py | 1 - tests/test_updates_manager.py | 5 +++-- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/python/lsst/dax/ppdb/bigquery/updates/__init__.py b/python/lsst/dax/ppdb/bigquery/updates/__init__.py index ef9abea9..1536958d 100644 --- a/python/lsst/dax/ppdb/bigquery/updates/__init__.py +++ b/python/lsst/dax/ppdb/bigquery/updates/__init__.py @@ -29,4 +29,3 @@ from .update_records import UpdateRecords from .update_record_expander import UpdateRecordExpander from .updates_table import UpdatesTable -from .updates_manager import UpdatesManager diff --git a/tests/test_updates_manager.py b/tests/test_updates_manager.py index 77c02729..22ae556c 100644 --- a/tests/test_updates_manager.py +++ b/tests/test_updates_manager.py @@ -34,7 +34,8 @@ ReplicaChunk, ) from lsst.dax.ppdb import Ppdb -from lsst.dax.ppdb.bigquery import PpdbBigQuery, updates +from lsst.dax.ppdb.bigquery import PpdbBigQuery +from lsst.dax.ppdb.bigquery.updates.updates_manager import UpdatesManager from 
lsst.dax.ppdb.tests._bigquery import ( ChunkUploaderWithoutPubSub, PostgresMixin, @@ -253,5 +254,5 @@ def rows(self) -> Collection[tuple]: uploader.run() # Apply the updates to the target tables - updates_manager = updates.UpdatesManager(self.ppdb) + updates_manager = UpdatesManager(self.ppdb) updates_manager.apply_updates([update_records.replica_chunk_id]) From cdfa69f30c1ce235e43ebc31ccf99447e5607dca Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Fri, 20 Mar 2026 11:51:52 -0500 Subject: [PATCH 47/49] Remove unnecessary property functions --- .../ppdb/bigquery/replica_chunk_promoter.py | 86 +++++-------------- 1 file changed, 21 insertions(+), 65 deletions(-) diff --git a/python/lsst/dax/ppdb/bigquery/replica_chunk_promoter.py b/python/lsst/dax/ppdb/bigquery/replica_chunk_promoter.py index 4c67a2e4..424b8a27 100644 --- a/python/lsst/dax/ppdb/bigquery/replica_chunk_promoter.py +++ b/python/lsst/dax/ppdb/bigquery/replica_chunk_promoter.py @@ -27,14 +27,13 @@ ] import logging -from collections.abc import Callable from google.api_core.exceptions import NotFound from google.cloud import bigquery from .ppdb_bigquery import PpdbBigQuery from .query_runner import QueryRunner -from .updates import UpdatesManager +from .updates.updates_manager import UpdatesManager class NoPromotableChunksError(Exception): @@ -60,6 +59,8 @@ def __init__( table_names: list[str] | None = None, ): self._ppdb = ppdb + self._project_id = self._ppdb._config.project_id + self._dataset_id = self._ppdb._config.dataset_id self._runner = ppdb.query_runner # DM-52326: Hard-coded table names; these should be passed in from # config. 
@@ -76,26 +77,6 @@ def __init__( self._promotable_chunks: list[int] = [] - @property - def ppdb(self) -> PpdbBigQuery: - """PPDB interface to BigQuery.""" - return self._ppdb - - @property - def project_id(self) -> str: - """Google Cloud project ID (`str`, read-only).""" - return self._runner.project_id - - @property - def dataset_id(self) -> str: - """Dataset ID (`str`, read-only).""" - return self._runner.dataset_id - - @property - def table_names(self) -> list[str]: - """List of table names to promote (`list`[`str`], read-only).""" - return self._table_names - @property def promotable_chunks(self) -> list[int]: """List of promotable chunks (`list` [ `int` ], read-only).""" @@ -107,37 +88,12 @@ def promotable_chunks(self, chunks: list[int]) -> None: raise NoPromotableChunksError("No promotable chunks provided") self._promotable_chunks = chunks - @property - def promotable_chunk_count(self) -> int: - """Count of promotable chunks that were found in the database.""" - return len(self.promotable_chunks) - - @property - def runner(self) -> QueryRunner: - """Runner for executing BigQuery jobs (`QueryRunner`, read-only).""" - return self._runner - - @property - def bq_client(self) -> bigquery.Client: - """Client for interacting with BigQuery (`bigquery.Client`, - read-only). - """ - return self._bq_client - - @property - def phases(self) -> dict[str, Callable]: - """Phases of the promotion process as a dictionary mapping phase names - to their corresponding class methods (`dict`[`str`, `Callable`], - read-only). - """ - return self._phases - @property def table_prod_refs(self) -> list[str]: """Fully-qualified production table references (`list`[`str`], read-only). 
""" - return [f"{self.project_id}.{self.dataset_id}.{table_name}" for table_name in self.table_names] + return [f"{self._project_id}.{self._dataset_id}.{table_name}" for table_name in self._table_names] @property def table_staging_refs(self) -> list[str]: @@ -145,7 +101,7 @@ def table_staging_refs(self) -> list[str]: read-only). """ return [ - f"{self.project_id}.{self.dataset_id}._{table_name}_staging" for table_name in self.table_names + f"{self._project_id}.{self._dataset_id}._{table_name}_staging" for table_name in self._table_names ] @property @@ -154,8 +110,8 @@ def table_promoted_tmp_refs(self) -> list[str]: read-only). """ return [ - f"{self.project_id}.{self.dataset_id}._{table_name}_promoted_tmp" - for table_name in self.table_names + f"{self._project_id}.{self._dataset_id}._{table_name}_promoted_tmp" + for table_name in self._table_names ] def _execute_phase(self, phase: str) -> None: @@ -167,14 +123,14 @@ def _execute_phase(self, phase: str) -> None: The name of the promotion phase to execute. This should be one of the keys in the `phases` property. 
""" - if phase not in self.phases: + if phase not in self._phases: raise ValueError(f"Unknown promotion phase: {phase}") logging.debug("Executing promotion phase: %s", phase) self._phases[phase]() def _get_promotable_chunks(self) -> None: """Get list of promotable chunks from the database.""" - self._promotable_chunks = self.ppdb.get_promotable_chunks() + self._promotable_chunks = self._ppdb.get_promotable_chunks() logging.info("Promotable chunk count: %s", len(self.promotable_chunks)) def _copy_to_promoted_tmp(self) -> None: @@ -191,13 +147,13 @@ def _copy_to_promoted_tmp(self) -> None: ): # Drop any existing tmp table (should not exist but just to be # safe) - self.runner.run_job("drop_tmp", f"DROP TABLE IF EXISTS `{tmp_ref}`") + self._runner.run_job("drop_tmp", f"DROP TABLE IF EXISTS `{tmp_ref}`") # Clone prod table structure and data (zero-copy) - self.runner.run_job("clone_prod", f"CREATE TABLE `{tmp_ref}` CLONE `{prod_ref}`") + self._runner.run_job("clone_prod", f"CREATE TABLE `{tmp_ref}` CLONE `{prod_ref}`") # Build ordered target list from the cloned tmp schema - tmp_schema = self.bq_client.get_table(tmp_ref).schema + tmp_schema = self._bq_client.get_table(tmp_ref).schema target_names = [f.name for f in tmp_schema if f.name != "apdb_replica_chunk"] target_list_sql = ", ".join(f"`{n}`" for n in target_names) @@ -214,7 +170,7 @@ def _copy_to_promoted_tmp(self) -> None: WHERE s.apdb_replica_chunk IN UNNEST(@ids) """ logging.debug("SQL for inserting staged rows into %s: %s", tmp_ref, sql) - self.runner.run_job("insert_staged_to_tmp", sql, job_config=job_cfg) + self._runner.run_job("insert_staged_to_tmp", sql, job_config=job_cfg) def _promote_tmp_to_prod(self) -> None: """ @@ -225,14 +181,14 @@ def _promote_tmp_to_prod(self) -> None: for prod_ref, tmp_ref in zip(self.table_prod_refs, self.table_promoted_tmp_refs, strict=False): # Ensure tmp exists try: - self.bq_client.get_table(tmp_ref) + self._bq_client.get_table(tmp_ref) except NotFound as e: raise 
RuntimeError(f"Missing tmp table for promotion: {tmp_ref}") from e # Atomic zero-copy replacement of prod with tmp copy_cfg = bigquery.CopyJobConfig(write_disposition=bigquery.WriteDisposition.WRITE_TRUNCATE) - job = self.bq_client.copy_table( - tmp_ref, prod_ref, job_config=copy_cfg, location=self.runner.location + job = self._bq_client.copy_table( + tmp_ref, prod_ref, job_config=copy_cfg, location=self._runner.location ) job.result() QueryRunner.log_job(job, "promote_tmp_to_prod") @@ -240,7 +196,7 @@ def _promote_tmp_to_prod(self) -> None: def _cleanup(self) -> None: """Drop the promotion temporary tables.""" for tmp_ref in self.table_promoted_tmp_refs: - self.bq_client.delete_table(tmp_ref, not_found_ok=True) + self._bq_client.delete_table(tmp_ref, not_found_ok=True) logging.debug("Dropped %s (if it existed)", tmp_ref) def _delete_staged_chunks(self) -> None: @@ -254,7 +210,7 @@ def _delete_staged_chunks(self) -> None: for staging_ref in self.table_staging_refs: try: sql = f"DELETE FROM `{staging_ref}` WHERE apdb_replica_chunk IN UNNEST(@ids)" - self.runner.run_job("delete_staged_chunks", sql, job_config=job_config) + self._runner.run_job("delete_staged_chunks", sql, job_config=job_config) logging.debug( "Deleted %d chunk(s) from staging table %s", len(self.promotable_chunks), staging_ref ) @@ -263,12 +219,12 @@ def _delete_staged_chunks(self) -> None: def _apply_record_updates(self) -> None: """Apply record updates to the promoted temporary tables.""" - updates_manager = UpdatesManager(self.ppdb, table_name_postfix="_promoted_tmp") - updates_manager.apply_updates(self._promotable_chunks) + updates_manager = UpdatesManager(self._ppdb, table_name_postfix="_promoted_tmp") + updates_manager.apply_updates(self.promotable_chunks) def _mark_chunks_promoted(self) -> None: """Mark the replica chunks as promoted in the database.""" - self.ppdb.mark_chunks_promoted(self._promotable_chunks) + self._ppdb.mark_chunks_promoted(self.promotable_chunks) def promote_chunks(self) 
-> None: """Promote APDB replica chunks into production by executing a series of From 32f0fea9f77e250ca3ee20127be184b958c49802 Mon Sep 17 00:00:00 2001 From: Jeremy McCormick Date: Fri, 20 Mar 2026 11:54:15 -0500 Subject: [PATCH 48/49] Remove no longer necessary check for test execution --- tests/test_update_record_expander.py | 11 ----------- 1 file changed, 11 deletions(-) diff --git a/tests/test_update_record_expander.py b/tests/test_update_record_expander.py index 596acfed..5038bfb5 100644 --- a/tests/test_update_record_expander.py +++ b/tests/test_update_record_expander.py @@ -32,12 +32,10 @@ ApdbWithdrawDiaForcedSourceRecord, ApdbWithdrawDiaSourceRecord, ) -from lsst.dax.ppdb.bigquery import updates from lsst.dax.ppdb.bigquery.updates import ExpandedUpdateRecord, UpdateRecordExpander, UpdateRecords from lsst.dax.ppdb.tests._updates import _create_test_update_records -@unittest.skipIf(updates is None, "Google Cloud environment not available") class UpdateRecordExpanderTestCase(unittest.TestCase): """Test UpdateRecordExpander functionality.""" @@ -52,8 +50,6 @@ def setUp(self) -> None: def test_get_update_fields(self) -> None: """Test get_update_fields class method.""" - from lsst.dax.ppdb.bigquery.updates import UpdateRecordExpander - # Test known update types self.assertEqual( UpdateRecordExpander.get_update_fields("reassign_diasource_to_diaobject"), ["diaObjectId"] @@ -284,13 +280,6 @@ def test_update_records_all(self) -> None: expanded = UpdateRecordExpander.expand_updates(update_records) - # Should have 8 total expanded records: - # - 1 from ApdbReassignDiaSourceToDiaObjectRecord - # - 2 from ApdbReassignDiaSourceToSSObjectRecord - # - 1 from ApdbWithdrawDiaSourceRecord - # - 1 from ApdbWithdrawDiaForcedSourceRecord - # - 2 from ApdbCloseDiaObjectValidityRecord - # - 1 from ApdbUpdateNDiaSourcesRecord self.assertEqual(len(expanded), 10) # Verify all expanded records have correct replica_chunk_id From 13694171409bbb0496b7560b9687fb198a19b06d Mon Sep 17 
00:00:00 2001 From: Jeremy McCormick Date: Fri, 20 Mar 2026 18:04:20 -0500 Subject: [PATCH 49/49] Add update method --- .../lsst/dax/ppdb/bigquery/ppdb_bigquery.py | 41 +++++++++++++++++++ 1 file changed, 41 insertions(+) diff --git a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py index 0f697eaf..dd0a23ed 100644 --- a/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py +++ b/python/lsst/dax/ppdb/bigquery/ppdb_bigquery.py @@ -25,6 +25,7 @@ import shutil from collections.abc import Collection, Iterable, Sequence from pathlib import Path +from typing import Any import felis import sqlalchemy @@ -787,3 +788,43 @@ def mark_chunks_promoted(self, promotable_chunks: list[int]) -> int: with self._engine.begin() as conn: result: sqlalchemy.engine.CursorResult = conn.execute(stmt) return result.rowcount or 0 + + def update(self, chunk_id: int, values: dict[str, Any]) -> int: + """Update an existing replica chunk in the database. + + Parameters + ---------- + chunk_id : `int` + The ID of the replica chunk to update. + values : `dict`[`str`, `Any`] + A dictionary of column names and their new values to update. + + Returns + ------- + count : `int` + The number of rows updated. This should be 1 if the update is + successful, or 0 if no rows were updated (e.g., if the chunk ID + does not exist or the status is already set to the new value). 
+ """ + logging.info("Preparing to update replica chunk %d with values: %s", chunk_id, values) + table = self.get_table("PpdbReplicaChunk") + stmt = sqlalchemy.update(table).where(table.c.apdb_replica_chunk == chunk_id).values(values) + with self._engine.begin() as conn: + result = conn.execute(stmt) + affected_rows = result.rowcount + + new_status = values.get("status") + if affected_rows == 0: + logging.warning( + "No rows updated for replica chunk %s with status '%s'", + chunk_id, + new_status, + ) + else: + logging.info( + "Successfully updated %d row(s) for replica chunk %s to status '%s'", + affected_rows, + chunk_id, + new_status, + ) + return affected_rows