From 4b60c7074843794ef5e84f9dc26c0bd4732ec610 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Mon, 29 Sep 2025 12:29:53 -0700 Subject: [PATCH 01/35] introducing branching --- .../OpenHouseInternalTableOperations.java | 141 ++- .../spark/catalogtest/BranchTestSpark3_5.java | 878 ++++++++++++++++++ 2 files changed, 990 insertions(+), 29 deletions(-) create mode 100644 integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java index d9fa34257..793167e47 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java @@ -24,6 +24,7 @@ import java.time.Clock; import java.time.Instant; import java.util.ArrayList; +import java.util.Collections; import java.util.HashMap; import java.util.List; import java.util.Map; @@ -287,6 +288,9 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { serializedSnapshotRefs == null ? new HashMap<>() : SnapshotsUtil.parseSnapshotRefs(serializedSnapshotRefs); + + // Multi-branch support is now enabled with snapshot ID matching + updatedMetadata = maybeAppendSnapshots(updatedMetadata, appendedSnapshots, snapshotRefs, true); updatedMetadata = maybeDeleteSnapshots(updatedMetadata, deletedSnapshots); @@ -554,6 +558,67 @@ public TableMetadata maybeDeleteSnapshots( return result; } + /** + * Determines the target branch for a snapshot commit based on the provided snapshotRefs. + * + * @param snapshotRefs map of branch names to snapshot references + * @param defaultBranch default branch to use if no specific branch can be determined + * @return target branch name for the snapshot commit + */ + private String determineTargetBranch( + Map snapshotRefs, String defaultBranch) { + return determineTargetBranch(snapshotRefs, Collections.emptyList(), defaultBranch); + } + + /** + * Determines the target branch for snapshot commits by matching snapshot IDs. When multiple + * branches are present, finds which branch should receive the new snapshots. + */ + private String determineTargetBranch( + Map snapshotRefs, List newSnapshots, String defaultBranch) { + if (MapUtils.isEmpty(snapshotRefs)) { + return defaultBranch; + } + + // If there's only one branch in the refs, use that as the target + if (snapshotRefs.size() == 1) { + return snapshotRefs.keySet().iterator().next(); + } + + // CRITICAL FIX: For multi-branch scenarios, find which branch should get the new snapshots + if (!newSnapshots.isEmpty()) { + // Get the latest snapshot ID from new snapshots + long latestSnapshotId = newSnapshots.get(newSnapshots.size() - 1).snapshotId(); + + // Find which branch in snapshotRefs should point to this snapshot + for (Map.Entry entry : snapshotRefs.entrySet()) { + String branchName = entry.getKey(); + long branchSnapshotId = entry.getValue().snapshotId(); + + if (branchSnapshotId == latestSnapshotId) { + log.debug( + "Determined target branch '{}' by snapshot ID match: {}", + branchName, + latestSnapshotId); + return branchName; + } + } + } + + // Fallback: if we can't match by snapshot ID, prefer non-main branches for branch operations + for (String branchName : snapshotRefs.keySet()) { + if (!branchName.equals(SnapshotRef.MAIN_BRANCH)) { + log.debug( + "Multiple branches, no snapshot match, preferring non-main branch: {}", branchName); + return branchName; + } + } + + // Final fallback to main + log.debug("Multiple branches, falling back to main branch"); + return SnapshotRef.MAIN_BRANCH; + } + public TableMetadata maybeAppendSnapshots( TableMetadata metadata, List snapshotsToAppend, @@ -563,62 +628,80 @@ public TableMetadata maybeAppendSnapshots( List appendedSnapshots = new ArrayList<>(); List stagedSnapshots = new ArrayList<>(); List cherryPickedSnapshots = new ArrayList<>(); - // Throw an exception if client sent request that included non-main branches in the - // snapshotRefs. - for (Map.Entry entry : snapshotRefs.entrySet()) { - if (!entry.getKey().equals(SnapshotRef.MAIN_BRANCH)) { - throw new UnsupportedOperationException("OpenHouse supports only MAIN branch"); - } - } + /** * First check if there are new snapshots to be appended to current TableMetadata. If yes, * following are the cases to be handled: * - *

[1] A regular (non-wap) snapshot is being added to the MAIN branch. + *

[1] A regular (non-wap) snapshot is being added to any branch. * *

[2] A staged (wap) snapshot is being created on top of current snapshot as its base. - * Recognized by STAGED_WAP_ID_PROP. + * Recognized by STAGED_WAP_ID_PROP. These are stage-only and not committed to any branch. * - *

[3] A staged (wap) snapshot is being cherry picked to the MAIN branch wherein current - * snapshot in the MAIN branch is not the same as the base snapshot the staged (wap) snapshot - * was created on. Recognized by SOURCE_SNAPSHOT_ID_PROP. This case is called non-fast forward + *

[3] A staged (wap) snapshot is being cherry picked to any branch wherein current snapshot + * in the target branch is not the same as the base snapshot the staged (wap) snapshot was + * created on. Recognized by SOURCE_SNAPSHOT_ID_PROP. This case is called non-fast forward * cherry pick. * *

In case no new snapshots are to be appended to current TableMetadata, there could be a - * cherrypick of a staged (wap) snapshot on top of the current snapshot in the MAIN branch which - * is the same as the base snapshot the staged (wap) snapshot was created on. This case is - * called fast forward cherry pick. + * cherrypick of a staged (wap) snapshot on top of the current snapshot in any branch which is + * the same as the base snapshot the staged (wap) snapshot was created on. This case is called + * fast forward cherry pick. */ if (CollectionUtils.isNotEmpty(snapshotsToAppend)) { for (Snapshot snapshot : snapshotsToAppend) { snapshotInspector.validateSnapshot(snapshot); if (snapshot.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) { - // a stage only snapshot using wap.id + // a stage only snapshot using wap.id - not committed to any branch metadataBuilder.addSnapshot(snapshot); stagedSnapshots.add(String.valueOf(snapshot.snapshotId())); } else if (snapshot.summary().containsKey(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP)) { // a snapshot created on a non fast-forward cherry-pick snapshot - metadataBuilder.setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH); + // Determine target branch from snapshotRefs or default to MAIN_BRANCH + String targetBranch = + determineTargetBranch(snapshotRefs, snapshotsToAppend, SnapshotRef.MAIN_BRANCH); + metadataBuilder.setBranchSnapshot(snapshot, targetBranch); appendedSnapshots.add(String.valueOf(snapshot.snapshotId())); cherryPickedSnapshots.add( String.valueOf(snapshot.summary().get(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP))); } else { - // a regular snapshot - metadataBuilder.setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH); + // a regular snapshot - assign to appropriate branch using snapshotRefs context + if (MapUtils.isNotEmpty(snapshotRefs)) { + // We have explicit branch information, use it to assign snapshot + String targetBranch = + determineTargetBranch(snapshotRefs, snapshotsToAppend, SnapshotRef.MAIN_BRANCH); + metadataBuilder.setBranchSnapshot(snapshot, targetBranch); + } else { + // No explicit branch refs - treat as staged snapshot + // This maintains isolation until refs are explicitly updated + metadataBuilder.addSnapshot(snapshot); + } appendedSnapshots.add(String.valueOf(snapshot.snapshotId())); } } } else if (MapUtils.isNotEmpty(snapshotRefs)) { - // Updated ref in the main branch with no new snapshot means this is a - // fast-forward cherry-pick or rollback operation. - long newSnapshotId = snapshotRefs.get(SnapshotRef.MAIN_BRANCH).snapshotId(); - // Either the current snapshot is null or the current snapshot is not equal - // to the new snapshot indicates an update. The first case happens when the - // stage/wap snapshot being cherry-picked is the first snapshot. - if (MapUtils.isEmpty(metadata.refs()) - || metadata.refs().get(SnapshotRef.MAIN_BRANCH).snapshotId() != newSnapshotId) { - metadataBuilder.setBranchSnapshot(newSnapshotId, SnapshotRef.MAIN_BRANCH); - cherryPickedSnapshots.add(String.valueOf(newSnapshotId)); + // Handle ref updates for all branches (fast-forward cherry-pick or rollback operations) + for (Map.Entry entry : snapshotRefs.entrySet()) { + String branchName = entry.getKey(); + long newSnapshotId = entry.getValue().snapshotId(); + + // Check if this is an actual update for this branch + boolean isUpdate = false; + if (MapUtils.isEmpty(metadata.refs())) { + // No refs exist yet, this is a new branch + isUpdate = true; + } else { + SnapshotRef currentRef = metadata.refs().get(branchName); + if (currentRef == null || currentRef.snapshotId() != newSnapshotId) { + // Branch doesn't exist or snapshot is different + isUpdate = true; + } + } + + if (isUpdate) { + metadataBuilder.setBranchSnapshot(newSnapshotId, branchName); + cherryPickedSnapshots.add(String.valueOf(newSnapshotId)); + } } } if (recordAction) { diff --git a/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java b/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java new file mode 100644 index 000000000..c5b239a4e --- /dev/null +++ b/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java @@ -0,0 +1,878 @@ +package com.linkedin.openhouse.spark.catalogtest; + +import static org.junit.jupiter.api.Assertions.*; + +import com.linkedin.openhouse.tablestest.OpenHouseSparkITest; +import java.util.List; +import java.util.Set; +import java.util.stream.Collectors; +import org.apache.spark.sql.Row; +import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.Test; + +/** + * Comprehensive tests for multi-branch WAP operations in Spark 3.5. Tests validate the enhanced + * maybeAppendSnapshots functionality that supports: - Non-main branch operations (add/expire + * snapshots from any branch) - WAP.id staging with multi-branch support - Cherry picking between + * any branches - Fast forward merges for all branches - Backward compatibility with main-only + * workflows - Forward compatibility for future wap.branch features + */ +public class BranchTestSpark3_5 extends OpenHouseSparkITest { + + // ===== BASIC BRANCH OPERATIONS ===== + + @Test + public void testBasicBranchOperations() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("DROP TABLE IF EXISTS " + tableName); + spark.sql("CREATE TABLE " + tableName + " (name string)"); + + // Add initial data to main + spark.sql("INSERT INTO " + tableName + " VALUES ('main.initial')"); + + // Create feature branch + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + + // Write to feature branch + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature-a.data1')"); + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature-a.data2')"); + + // Verify branch isolation + assertEquals( + 1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main has 1 row + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); // feature-a has 3 rows + + // Verify refs exist for both branches + List refs = + spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); + assertEquals(2, refs.size()); + assertEquals("feature_a", refs.get(0).getString(0)); + assertEquals("main", refs.get(1).getString(0)); + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + } + } + + // ===== WAP STAGING WITH MULTI-BRANCH SUPPORT ===== + + @Test + public void testWapStagingWithBranches() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + spark.sql("CREATE TABLE " + tableName + " (name string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Setup main and feature branches + spark.sql("INSERT INTO " + tableName + " VALUES ('main.data')"); + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature-a.data')"); + + // Stage WAP snapshot (should not affect any branch) + spark.conf().set("spark.wap.id", "multi-branch-wap"); + spark.sql("INSERT INTO " + tableName + " VALUES ('wap.staged.data')"); + spark.conf().unset("spark.wap.id"); + + // Verify WAP staging doesn't affect branch visibility + assertEquals( + 1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main unchanged + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); // feature-a unchanged + + // Verify WAP snapshot exists but no new refs + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + ".snapshots") + .collectAsList() + .size()); // 1 main + 1 feature + 1 wap + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + ".refs") + .collectAsList() + .size()); // main + feature-a only + + // Verify WAP snapshot has correct properties + List wapSnapshots = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'multi-branch-wap'") + .collectAsList(); + assertEquals(1, wapSnapshots.size()); + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + } + } + + // ===== CHERRY PICKING BETWEEN BRANCHES ===== + + @Test + public void testCherryPickToMainWithFeatureBranch() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + spark.sql("CREATE TABLE " + tableName + " (name string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Setup branches + spark.sql("INSERT INTO " + tableName + " VALUES ('main.base')"); + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + + // Create WAP snapshot + spark.conf().set("spark.wap.id", "feature-target-wap"); + spark.sql("INSERT INTO " + tableName + " VALUES ('wap.for.feature')"); + String wapSnapshotId = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'feature-target-wap'") + .first() + .mkString(); + spark.conf().unset("spark.wap.id"); + + // CRITICAL: Advance main branch to force non-fast-forward cherry-pick + spark.sql("INSERT INTO " + tableName + " VALUES ('main.advance')"); + + // Cherry-pick WAP to main branch (this tests our enhanced maybeAppendSnapshots) + // Main should have 2 rows now (main.base + main.advance) + assertEquals(2, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); + spark.sql( + String.format( + "CALL openhouse.system.cherrypick_snapshot('" + + tableName.replace("openhouse.", "") + + "', %s)", + wapSnapshotId)); + + // Verify cherry-pick worked - 3 rows of data should appear in main (main.base + main.advance + // + wap.for.feature) + assertEquals(3, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); + assertEquals( + 1, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); + + // Verify published WAP snapshot properties + List publishedSnapshots = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['published-wap-id'] = 'feature-target-wap'") + .collectAsList(); + assertTrue( + publishedSnapshots.size() >= 1, + "Should find at least one snapshot with published-wap-id"); + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + } + } + + // ===== FAST FORWARD MERGES ===== + + @Test + public void testFastForwardMergeToMain() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + spark.sql("CREATE TABLE " + tableName + " (name string)"); + + // Setup base data + spark.sql("INSERT INTO " + tableName + " VALUES ('base.data')"); + + // Create feature branch from main + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + + // Advance feature branch + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.data1')"); + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.data2')"); + + // Verify initial state + assertEquals( + 1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main has 1 row + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); // feature has 3 rows + + // Fast-forward main to feature_a + spark.sql("CALL openhouse.system.fast_forward('" + tableName + "', 'main', 'feature_a')"); + + // Verify fast-forward worked - main should now have same data as feature_a + assertEquals(3, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); + + // Verify both branches point to same snapshot + String mainSnapshot = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") + .first() + .mkString(); + String featureSnapshot = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_a'") + .first() + .mkString(); + assertEquals(mainSnapshot, featureSnapshot); + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + } + } + + @Test + public void testFastForwardMergeToFeature() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + spark.sql("CREATE TABLE " + tableName + " (name string)"); + + // Setup base data + spark.sql("INSERT INTO " + tableName + " VALUES ('base.data')"); + + // Create feature branch from main + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + + // Advance main branch (feature_a stays at base) + spark.sql("INSERT INTO " + tableName + " VALUES ('main.data1')"); + spark.sql("INSERT INTO " + tableName + " VALUES ('main.data2')"); + + // Verify initial state + assertEquals( + 3, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main has 3 rows + assertEquals( + 1, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); // feature has 1 row + + // Fast-forward feature_a to main + spark.sql("CALL openhouse.system.fast_forward('" + tableName + "', 'feature_a', 'main')"); + + // Verify fast-forward worked - feature_a should now have same data as main + assertEquals(3, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); + + // Verify both branches point to same snapshot + String mainSnapshot = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") + .first() + .mkString(); + String featureSnapshot = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_a'") + .first() + .mkString(); + assertEquals(mainSnapshot, featureSnapshot); + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + } + } + + @Test + public void testFastForwardMergeWithWapId() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + spark.sql("CREATE TABLE " + tableName + " (name string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Setup base data + spark.sql("INSERT INTO " + tableName + " VALUES ('base.data')"); + + // Create feature branch + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + + // Create WAP snapshot + spark.conf().set("spark.wap.id", "test-wap"); + spark.sql("INSERT INTO " + tableName + " VALUES ('wap.data')"); + String wapSnapshotId = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'test-wap'") + .first() + .mkString(); + spark.conf().unset("spark.wap.id"); + + // Advance feature branch normally (not using WAP) + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.data')"); + + // Verify WAP snapshot doesn't interfere with fast-forward + assertEquals( + 1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main unchanged + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); // feature advanced + + // Fast-forward main to feature_a should work despite WAP presence + spark.sql("CALL openhouse.system.fast_forward('" + tableName + "', 'main', 'feature_a')"); + + // Verify fast-forward worked + assertEquals(2, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); + + // Verify WAP snapshot is still available for cherry-pick + List wapSnapshots = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'test-wap'") + .collectAsList(); + assertEquals(1, wapSnapshots.size()); + assertEquals(wapSnapshotId, wapSnapshots.get(0).mkString()); + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + } + } + + @Test + public void testFastForwardMergeBetweenTwoFeatureBranches() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + spark.sql("CREATE TABLE " + tableName + " (name string)"); + + // Setup base data + spark.sql("INSERT INTO " + tableName + " VALUES ('base.data')"); + + // Create two feature branches from main + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_b"); + + // Advance feature_a + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature_a.data1')"); + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature_a.data2')"); + + // Verify initial state + assertEquals( + 1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main has 1 row + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); // feature_a has 3 rows + assertEquals( + 1, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_b'") + .collectAsList() + .size()); // feature_b has 1 row + + // Fast-forward feature_b to feature_a + spark.sql( + "CALL openhouse.system.fast_forward('" + tableName + "', 'feature_b', 'feature_a')"); + + // Verify fast-forward worked + assertEquals( + 1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main unchanged + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); // feature_a unchanged + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_b'") + .collectAsList() + .size()); // feature_b now matches feature_a + + // Verify feature_a and feature_b point to same snapshot + String featureASnapshot = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_a'") + .first() + .mkString(); + String featureBSnapshot = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_b'") + .first() + .mkString(); + assertEquals(featureASnapshot, featureBSnapshot); + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + } + } + + @Test + public void testFastForwardMergeIncompatibleLineage() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + spark.sql("CREATE TABLE " + tableName + " (name string)"); + + // Setup base data + spark.sql("INSERT INTO " + tableName + " VALUES ('base.data')"); + + // Create feature branch + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + + // Advance both branches independently (creating divergent history) + spark.sql("INSERT INTO " + tableName + " VALUES ('main.divergent')"); + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.divergent')"); + + // Verify divergent state + assertEquals( + 2, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main has 2 rows + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); // feature_a has 2 rows (different) + + // Attempt fast-forward should fail due to incompatible lineage + assertThrows( + Exception.class, + () -> + spark.sql( + "CALL openhouse.system.fast_forward('" + tableName + "', 'main', 'feature_a')"), + "Fast-forward should fail when branches have divergent history"); + + // Verify branches remain unchanged after failed fast-forward + assertEquals(2, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); + + // Verify snapshots are still different + String mainSnapshot = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") + .first() + .mkString(); + String featureSnapshot = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_a'") + .first() + .mkString(); + assertNotEquals(mainSnapshot, featureSnapshot); + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + } + } + + // ===== SNAPSHOT EXPIRATION FROM NON-MAIN BRANCHES ===== + + @Test + public void testSnapshotExpirationFromFeatureBranch() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + spark.sql("CREATE TABLE " + tableName + " (name string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Setup: Create multiple snapshots to have some that can be expired + + // 1. Create initial main data + spark.sql("INSERT INTO " + tableName + " VALUES ('main.initial')"); + + // 2. Create feature branch from main + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + + // 3. Add multiple snapshots to feature branch + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.data1')"); + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.data2')"); + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.data3')"); + + // 4. Query metadata tables to find snapshots that are NOT current branch heads + + // Get all snapshots + List allSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".snapshots ORDER BY committed_at") + .collectAsList(); + assertTrue(allSnapshots.size() >= 4, "Should have at least 4 snapshots"); + + // Get current branch head snapshots from refs table + List branchHeads = + spark.sql("SELECT snapshot_id FROM " + tableName + ".refs").collectAsList(); + Set referencedSnapshots = + branchHeads.stream().map(row -> row.mkString()).collect(Collectors.toSet()); + + System.out.println( + "DEBUG: All snapshots: " + + allSnapshots.stream().map(Row::mkString).collect(Collectors.toList())); + System.out.println("DEBUG: Referenced snapshots (branch heads): " + referencedSnapshots); + + // Find snapshots that are NOT referenced by any branch head + List unreferencedSnapshots = + allSnapshots.stream() + .map(Row::mkString) + .filter(snapshotId -> !referencedSnapshots.contains(snapshotId)) + .collect(Collectors.toList()); + + System.out.println("DEBUG: Unreferenced snapshots: " + unreferencedSnapshots); + + // We should have at least one unreferenced snapshot (intermediate feature snapshots) + assertFalse( + unreferencedSnapshots.isEmpty(), + "Should have at least one unreferenced snapshot to expire"); + + // Select the first unreferenced snapshot to expire + String snapshotToExpire = unreferencedSnapshots.get(0); + + // Verify this snapshot exists in the snapshots table + List beforeExpiration = + spark.sql("SELECT snapshot_id FROM " + tableName + ".snapshots").collectAsList(); + assertTrue( + beforeExpiration.stream().anyMatch(row -> row.mkString().equals(snapshotToExpire)), + "Snapshot to expire should exist before expiration"); + + // Expire the unreferenced snapshot + spark.sql( + String.format( + "CALL openhouse.system.expire_snapshots(table => '" + + tableName.replace("openhouse.", "") + + "', snapshot_ids => Array(%s))", + snapshotToExpire)); + + // Verify snapshot is gone + List afterExpiration = + spark.sql("SELECT snapshot_id FROM " + tableName + ".snapshots").collectAsList(); + assertFalse( + afterExpiration.stream().anyMatch(row -> row.mkString().equals(snapshotToExpire)), + "Expired snapshot should no longer exist"); + + // Verify branches are still intact after expiration + // Main should have: main.initial = 1 row + assertEquals(1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); + + // Feature_a should have: main.initial + feature.data1 + feature.data2 + feature.data3 = 4 + // rows + assertEquals( + 4, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + } + } + + @Test + public void testWapSnapshotExpirationWithMultipleBranches() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + spark.sql("CREATE TABLE " + tableName + " (name string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Setup multi-branch environment + spark.sql("INSERT INTO " + tableName + " VALUES ('main.base')"); + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.base')"); + + // Create multiple WAP snapshots + spark.conf().set("spark.wap.id", "wap-to-keep"); + spark.sql("INSERT INTO " + tableName + " VALUES ('wap.keep.data')"); + + spark.conf().set("spark.wap.id", "wap-to-expire"); + spark.sql("INSERT INTO " + tableName + " VALUES ('wap.expire.data')"); + String expireWapId = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'wap-to-expire'") + .first() + .mkString(); + spark.conf().unset("spark.wap.id"); + + // Expire specific WAP snapshot + spark.sql( + String.format( + "CALL openhouse.system.expire_snapshots(table => '" + + tableName.replace("openhouse.", "") + + "', snapshot_ids => Array(%s))", + expireWapId)); + + // Verify selective WAP expiration + List remainingWaps = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'wap-to-keep'") + .collectAsList(); + assertEquals(1, remainingWaps.size()); + + List expiredWaps = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'wap-to-expire'") + .collectAsList(); + assertEquals(0, expiredWaps.size()); + + // Verify branches unchanged + assertEquals(1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + } + } + + // ===== BACKWARD COMPATIBILITY ===== + + @Test + public void testBackwardCompatibilityMainBranchOnly() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + spark.sql("CREATE TABLE " + tableName + " (name string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Traditional main-only workflow (should work exactly as before) + spark.sql("INSERT INTO " + tableName + " VALUES ('main.1')"); + spark.sql("INSERT INTO " + tableName + " VALUES ('main.2')"); + + // WAP staging (traditional) + spark.conf().set("spark.wap.id", "compat-test-wap"); + spark.sql("INSERT INTO " + tableName + " VALUES ('compat.wap.data')"); + String wapSnapshotId = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'compat-test-wap'") + .first() + .mkString(); + spark.conf().unset("spark.wap.id"); + + // Traditional cherry-pick to main + spark.sql( + String.format( + "CALL openhouse.system.cherrypick_snapshot('" + + tableName.replace("openhouse.", "") + + "', %s)", + wapSnapshotId)); + + // Verify traditional behavior preserved + assertEquals(3, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); + List refs = spark.sql("SELECT name FROM " + tableName + ".refs").collectAsList(); + assertEquals(1, refs.size()); + assertEquals("main", refs.get(0).getString(0)); + + // Traditional snapshot queries should work + assertTrue( + spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList().size() >= 3); + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + } + } + + // ===== ERROR SCENARIOS ===== + + @Test + public void testErrorInsertToNonExistentBranch() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + spark.sql("CREATE TABLE " + tableName + " (name string)"); + + // Setup base data + spark.sql("INSERT INTO " + tableName + " VALUES ('base.data')"); + + // Create one valid branch + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + + // Verify valid branch works + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('valid.data')"); + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); + + // Attempt to insert into non-existent branch should fail + assertThrows( + Exception.class, + () -> + spark.sql("INSERT INTO " + tableName + ".branch_nonexistent VALUES ('invalid.data')"), + "Insert to non-existent branch should fail"); + + // Verify table state unchanged after failed insert + assertEquals( + 1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main unchanged + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); // feature_a unchanged + + // Verify only valid branches exist + List refs = + spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); + assertEquals(2, refs.size()); + assertEquals("feature_a", refs.get(0).getString(0)); + assertEquals("main", refs.get(1).getString(0)); + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + } + } + + @Test + public void testErrorCherryPickNonExistentWapId() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + spark.sql("CREATE TABLE " + tableName + " (name string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Setup base data and branch + spark.sql("INSERT INTO " + tableName + " VALUES ('base.data')"); + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + + // Create a valid WAP snapshot + spark.conf().set("spark.wap.id", "valid-wap"); + spark.sql("INSERT INTO " + tableName + " VALUES ('valid.wap.data')"); + String validWapId = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'valid-wap'") + .first() + .mkString(); + spark.conf().unset("spark.wap.id"); + + // Verify valid WAP cherry-pick works + spark.sql( + String.format( + "CALL openhouse.system.cherrypick_snapshot('" + + tableName.replace("openhouse.", "") + + "', %s)", + validWapId)); + assertEquals(2, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); + + // Attempt to cherry-pick non-existent snapshot ID should fail + long nonExistentSnapshotId = 999999999L; + assertThrows( + Exception.class, + () -> + spark.sql( + String.format( + "CALL openhouse.system.cherrypick_snapshot('" + + tableName.replace("openhouse.", "") + + "', %s)", + nonExistentSnapshotId)), + "Cherry-pick of non-existent snapshot should fail"); + + // Attempt to cherry-pick with malformed snapshot ID should fail + assertThrows( + Exception.class, + () -> + spark.sql( + String.format( + "CALL openhouse.system.cherrypick_snapshot('" + + tableName.replace("openhouse.", "") + + "', %s)", + "invalid-id")), + "Cherry-pick with invalid snapshot ID should fail"); + + // Verify table state unchanged after failed cherry-picks + assertEquals( + 2, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main unchanged + assertEquals( + 1, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") + .collectAsList() + .size()); // feature_a unchanged + + // Verify valid WAP snapshot still exists + List validWaps = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'valid-wap'") + .collectAsList(); + assertEquals(1, validWaps.size()); + + spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + } + } +} From 8546d4323d49b8a97bcbc5fedbba545487f88b9b Mon Sep 17 00:00:00 2001 From: cbb330 Date: Tue, 30 Sep 2025 14:46:58 -0700 Subject: [PATCH 02/35] wap branch green tests --- .../spark/catalogtest/BranchTestSpark3_5.java | 467 ++++++++++++++++-- 1 file changed, 420 insertions(+), 47 deletions(-) diff --git a/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java b/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java index c5b239a4e..942f5e89a 100644 --- a/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java +++ b/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java @@ -8,7 +8,12 @@ import java.util.stream.Collectors; import org.apache.spark.sql.Row; import org.apache.spark.sql.SparkSession; +import org.junit.jupiter.api.AfterEach; +import org.junit.jupiter.api.MethodOrderer; import org.junit.jupiter.api.Test; +import org.junit.jupiter.api.TestMethodOrder; +import org.junit.jupiter.api.parallel.Execution; +import org.junit.jupiter.api.parallel.ExecutionMode; /** * Comprehensive tests for multi-branch WAP operations in Spark 3.5. Tests validate the enhanced @@ -17,8 +22,53 @@ * any branches - Fast forward merges for all branches - Backward compatibility with main-only * workflows - Forward compatibility for future wap.branch features */ +@TestMethodOrder(MethodOrderer.MethodName.class) +@Execution(ExecutionMode.SAME_THREAD) public class BranchTestSpark3_5 extends OpenHouseSparkITest { + /** + * Comprehensive cleanup method to prevent configuration and table bleed-over between tests. This + * ensures WAP configurations are properly reset and all test tables are dropped. + */ + @AfterEach + public void cleanupAfterTest() { + try (SparkSession spark = getSparkSession()) { + // Clear WAP configurations to prevent bleed-over between tests + spark.conf().unset("spark.wap.id"); + spark.conf().unset("spark.wap.branch"); + + // Drop all test tables to ensure clean state for next test + // Get all tables in the d1 database that start with branch_test_ or similar patterns + try { + List tables = spark.sql("SHOW TABLES IN openhouse.d1").collectAsList(); + for (Row table : tables) { + String tableName = table.getString(1); // table name is in second column + if (tableName.startsWith("branch_test_") || tableName.startsWith("test_")) { + String fullTableName = "openhouse.d1." + tableName; + spark.sql("DROP TABLE IF EXISTS " + fullTableName); + } + } + } catch (Exception e) { + // If SHOW TABLES fails, try to drop common test table patterns + // This is a fallback in case the database doesn't exist yet + for (String pattern : new String[] {"branch_test_", "test_"}) { + for (int i = 0; i < 10; i++) { // Try a few recent timestamps + long timestamp = System.currentTimeMillis() - (i * 1000); + String tableName = "openhouse.d1." + pattern + timestamp; + try { + spark.sql("DROP TABLE IF EXISTS " + tableName); + } catch (Exception ignored) { + // Ignore failures for non-existent tables + } + } + } + } + } catch (Exception e) { + // Log but don't fail the test for cleanup issues + System.err.println("Warning: Failed to cleanup after test: " + e.getMessage()); + } + } + // ===== BASIC BRANCH OPERATIONS ===== @Test @@ -27,7 +77,6 @@ public void testBasicBranchOperations() throws Exception { String tableId = "branch_test_" + System.currentTimeMillis(); String tableName = "openhouse.d1." + tableId; - spark.sql("DROP TABLE IF EXISTS " + tableName); spark.sql("CREATE TABLE " + tableName + " (name string)"); // Add initial data to main @@ -56,8 +105,6 @@ public void testBasicBranchOperations() throws Exception { assertEquals(2, refs.size()); assertEquals("feature_a", refs.get(0).getString(0)); assertEquals("main", refs.get(1).getString(0)); - - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); } } @@ -69,7 +116,6 @@ public void testWapStagingWithBranches() throws Exception { String tableId = "branch_test_" + System.currentTimeMillis(); String tableName = "openhouse.d1." + tableId; - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); spark.sql("CREATE TABLE " + tableName + " (name string)"); spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); @@ -81,7 +127,6 @@ public void testWapStagingWithBranches() throws Exception { // Stage WAP snapshot (should not affect any branch) spark.conf().set("spark.wap.id", "multi-branch-wap"); spark.sql("INSERT INTO " + tableName + " VALUES ('wap.staged.data')"); - spark.conf().unset("spark.wap.id"); // Verify WAP staging doesn't affect branch visibility assertEquals( @@ -116,8 +161,6 @@ public void testWapStagingWithBranches() throws Exception { + ".snapshots WHERE summary['wap.id'] = 'multi-branch-wap'") .collectAsList(); assertEquals(1, wapSnapshots.size()); - - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); } } @@ -129,7 +172,6 @@ public void testCherryPickToMainWithFeatureBranch() throws Exception { String tableId = "branch_test_" + System.currentTimeMillis(); String tableName = "openhouse.d1." + tableId; - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); spark.sql("CREATE TABLE " + tableName + " (name string)"); spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); @@ -148,9 +190,9 @@ public void testCherryPickToMainWithFeatureBranch() throws Exception { + ".snapshots WHERE summary['wap.id'] = 'feature-target-wap'") .first() .mkString(); - spark.conf().unset("spark.wap.id"); - // CRITICAL: Advance main branch to force non-fast-forward cherry-pick + // CRITICAL: Unset WAP ID before advancing main branch to force non-fast-forward cherry-pick + // spark.conf().unset("spark.wap.id"); spark.sql("INSERT INTO " + tableName + " VALUES ('main.advance')"); // Cherry-pick WAP to main branch (this tests our enhanced maybeAppendSnapshots) @@ -184,8 +226,6 @@ public void testCherryPickToMainWithFeatureBranch() throws Exception { assertTrue( publishedSnapshots.size() >= 1, "Should find at least one snapshot with published-wap-id"); - - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); } } @@ -197,7 +237,6 @@ public void testFastForwardMergeToMain() throws Exception { String tableId = "branch_test_" + System.currentTimeMillis(); String tableName = "openhouse.d1." + tableId; - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); spark.sql("CREATE TABLE " + tableName + " (name string)"); // Setup base data @@ -244,8 +283,6 @@ public void testFastForwardMergeToMain() throws Exception { .first() .mkString(); assertEquals(mainSnapshot, featureSnapshot); - - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); } } @@ -255,7 +292,6 @@ public void testFastForwardMergeToFeature() throws Exception { String tableId = "branch_test_" + System.currentTimeMillis(); String tableName = "openhouse.d1." + tableId; - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); spark.sql("CREATE TABLE " + tableName + " (name string)"); // Setup base data @@ -303,18 +339,15 @@ public void testFastForwardMergeToFeature() throws Exception { .first() .mkString(); assertEquals(mainSnapshot, featureSnapshot); - - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); } } @Test - public void testFastForwardMergeWithWapId() throws Exception { + public void testFastForwardFeatureToMainAndWapId() throws Exception { try (SparkSession spark = getSparkSession()) { String tableId = "branch_test_" + System.currentTimeMillis(); String tableName = "openhouse.d1." + tableId; - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); spark.sql("CREATE TABLE " + tableName + " (name string)"); spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); @@ -335,9 +368,10 @@ public void testFastForwardMergeWithWapId() throws Exception { + ".snapshots WHERE summary['wap.id'] = 'test-wap'") .first() .mkString(); - spark.conf().unset("spark.wap.id"); - // Advance feature branch normally (not using WAP) + // Unset WAP ID before advancing feature branch normally (not using WAP - else WAP staged + // snapshot will apply to feature branch) + spark.conf().unset("spark.wap.id"); spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.data')"); // Verify WAP snapshot doesn't interfere with fast-forward @@ -372,8 +406,6 @@ public void testFastForwardMergeWithWapId() throws Exception { .collectAsList(); assertEquals(1, wapSnapshots.size()); assertEquals(wapSnapshotId, wapSnapshots.get(0).mkString()); - - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); } } @@ -383,7 +415,6 @@ public void testFastForwardMergeBetweenTwoFeatureBranches() throws Exception { String tableId = "branch_test_" + System.currentTimeMillis(); String tableName = "openhouse.d1." + tableId; - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); spark.sql("CREATE TABLE " + tableName + " (name string)"); // Setup base data @@ -445,8 +476,6 @@ public void testFastForwardMergeBetweenTwoFeatureBranches() throws Exception { .first() .mkString(); assertEquals(featureASnapshot, featureBSnapshot); - - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); } } @@ -456,7 +485,6 @@ public void testFastForwardMergeIncompatibleLineage() throws Exception { String tableId = "branch_test_" + System.currentTimeMillis(); String tableName = "openhouse.d1." + tableId; - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); spark.sql("CREATE TABLE " + tableName + " (name string)"); // Setup base data @@ -509,8 +537,6 @@ public void testFastForwardMergeIncompatibleLineage() throws Exception { .first() .mkString(); assertNotEquals(mainSnapshot, featureSnapshot); - - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); } } @@ -522,7 +548,6 @@ public void testSnapshotExpirationFromFeatureBranch() throws Exception { String tableId = "branch_test_" + System.currentTimeMillis(); String tableName = "openhouse.d1." + tableId; - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); spark.sql("CREATE TABLE " + tableName + " (name string)"); spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); @@ -610,8 +635,6 @@ public void testSnapshotExpirationFromFeatureBranch() throws Exception { .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") .collectAsList() .size()); - - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); } } @@ -621,7 +644,6 @@ public void testWapSnapshotExpirationWithMultipleBranches() throws Exception { String tableId = "branch_test_" + System.currentTimeMillis(); String tableName = "openhouse.d1." + tableId; - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); spark.sql("CREATE TABLE " + tableName + " (name string)"); spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); @@ -644,7 +666,6 @@ public void testWapSnapshotExpirationWithMultipleBranches() throws Exception { + ".snapshots WHERE summary['wap.id'] = 'wap-to-expire'") .first() .mkString(); - spark.conf().unset("spark.wap.id"); // Expire specific WAP snapshot spark.sql( @@ -681,20 +702,161 @@ public void testWapSnapshotExpirationWithMultipleBranches() throws Exception { .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") .collectAsList() .size()); - - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); } } // ===== BACKWARD COMPATIBILITY ===== + @Test + public void testWapIdOnFeatureBranchAndMainBranch() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("CREATE TABLE " + tableName + " (id int, data string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Setup base data in main branch + spark.sql("INSERT INTO " + tableName + " VALUES (0, 'main_base')"); + + // Create feature branch and add base data to it + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES (10, 'feature_base')"); + + // Verify initial state - main has 1 row, feature has 2 rows + assertEquals(1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); + assertEquals( + 2, spark.sql("SELECT * FROM " + tableName + ".branch_feature_a").collectAsList().size()); + + // Create WAP staged snapshot (invisible to normal reads) + spark.conf().set("spark.wap.id", "shared-wap-snapshot"); + spark.sql("INSERT INTO " + tableName + " VALUES (99, 'wap_staged_data')"); + + // Get the WAP snapshot ID + String wapSnapshotId = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'shared-wap-snapshot'") + .first() + .mkString(); + + // Verify WAP staging doesn't affect normal reads (principle 2: invisible until published) + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main should not see WAP staged data"); + assertEquals( + 2, + spark.sql("SELECT * FROM " + tableName + ".branch_feature_a").collectAsList().size(), + "Feature should not see WAP staged data"); + + // Clear WAP ID to avoid contamination + spark.conf().unset("spark.wap.id"); + + // Cherry-pick the same WAP snapshot to MAIN branch + spark.sql( + String.format( + "CALL openhouse.system.cherrypick_snapshot('" + + tableName.replace("openhouse.", "") + + "', %s)", + wapSnapshotId)); + + // Verify cherry-pick to main worked - main should now have the WAP data + List mainAfterCherryPick = spark.sql("SELECT * FROM " + tableName + "").collectAsList(); + assertEquals(2, mainAfterCherryPick.size(), "Main should have base + cherry-picked WAP data"); + boolean mainHasWapData = + mainAfterCherryPick.stream().anyMatch(row -> "wap_staged_data".equals(row.getString(1))); + assertTrue(mainHasWapData, "Main should contain cherry-picked WAP data"); + + // Verify feature branch is still unaffected + assertEquals( + 2, + spark.sql("SELECT * FROM " + tableName + ".branch_feature_a").collectAsList().size(), + "Feature branch should be unchanged"); + + // Demonstrate that WAP snapshots work independently on different branches by + // creating a separate WAP snapshot while on the feature branch context + + // Create another WAP snapshot that could be applied to feature branch + spark.conf().set("spark.wap.id", "feature-specific-wap"); + spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES (50, 'feature_wap_data')"); + + String featureWapSnapshotId = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'feature-specific-wap'") + .first() + .mkString(); + + // Clear WAP ID again + spark.conf().unset("spark.wap.id"); + + // Verify that both WAP snapshots exist but are invisible to normal reads + assertEquals( + 2, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main should still only show cherry-picked data"); + assertEquals( + 2, + spark.sql("SELECT * FROM " + tableName + ".branch_feature_a").collectAsList().size(), + "Feature should not show new WAP data yet"); + + // Show that we can cherry-pick the feature WAP to main as well (demonstrating cross-branch + // capability) + spark.sql( + String.format( + "CALL openhouse.system.cherrypick_snapshot('" + + tableName.replace("openhouse.", "") + + "', %s)", + featureWapSnapshotId)); + + // Verify main now has both cherry-picked WAP snapshots + List finalMain = spark.sql("SELECT * FROM " + tableName + "").collectAsList(); + assertEquals(3, finalMain.size(), "Main should have base + first WAP + second WAP data"); + + boolean hasOriginalWap = + finalMain.stream().anyMatch(row -> "wap_staged_data".equals(row.getString(1))); + boolean hasFeatureWap = + finalMain.stream().anyMatch(row -> "feature_wap_data".equals(row.getString(1))); + assertTrue(hasOriginalWap, "Main should contain first cherry-picked WAP data"); + assertTrue(hasFeatureWap, "Main should contain second cherry-picked WAP data"); + + // Verify feature branch is still independent and unchanged by main's cherry-picks + List finalFeature = + spark.sql("SELECT * FROM " + tableName + ".branch_feature_a").collectAsList(); + assertEquals( + 2, finalFeature.size(), "Feature should still only have base + feature_base data"); + + // Verify that both original WAP snapshots are still available in metadata + List originalWapSnapshots = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'shared-wap-snapshot'") + .collectAsList(); + List featureWapSnapshots = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots WHERE summary['wap.id'] = 'feature-specific-wap'") + .collectAsList(); + assertEquals(1, originalWapSnapshots.size(), "Original WAP snapshot should still exist"); + assertEquals(1, featureWapSnapshots.size(), "Feature WAP snapshot should still exist"); + } + } + @Test public void testBackwardCompatibilityMainBranchOnly() throws Exception { try (SparkSession spark = getSparkSession()) { String tableId = "branch_test_" + System.currentTimeMillis(); String tableName = "openhouse.d1." + tableId; - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); spark.sql("CREATE TABLE " + tableName + " (name string)"); spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); @@ -713,7 +875,6 @@ public void testBackwardCompatibilityMainBranchOnly() throws Exception { + ".snapshots WHERE summary['wap.id'] = 'compat-test-wap'") .first() .mkString(); - spark.conf().unset("spark.wap.id"); // Traditional cherry-pick to main spark.sql( @@ -732,8 +893,227 @@ public void testBackwardCompatibilityMainBranchOnly() throws Exception { // Traditional snapshot queries should work assertTrue( spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList().size() >= 3); + } + } + + // ===== WAP BRANCH TESTING ===== + // These tests validate the intended WAP branch functionality. + // WAP branch should stage writes to a specific branch without affecting main. + + @Test + public void testStagedChangesVisibleViaConf() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "wap_branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("CREATE TABLE " + tableName + " (id int, data string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Setup base data + spark.sql("INSERT INTO " + tableName + " VALUES (1, 'base_data')"); + + // Create WAP branch and insert staged data + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH wap_branch"); + spark.conf().set("spark.wap.branch", "wap_branch"); + spark.sql("INSERT INTO " + tableName + " VALUES (2, 'staged_data')"); + + // When spark.wap.branch is set, SELECT should see WAP branch data (2 rows) + List wapVisible = spark.sql("SELECT * FROM " + tableName).collectAsList(); + assertEquals( + 2, wapVisible.size(), "Should see both base and staged data when wap.branch is set"); + + // When spark.wap.branch is unset, SELECT should see only main data (1 row) + spark.conf().unset("spark.wap.branch"); + List mainOnly = spark.sql("SELECT * FROM " + tableName).collectAsList(); + assertEquals(1, mainOnly.size(), "Should see only base data when wap.branch is unset"); + } + } + + @Test + public void testStagedChangesHidden() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "wap_branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("CREATE TABLE " + tableName + " (id int, data string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Setup base data + spark.sql("INSERT INTO " + tableName + " VALUES (0, 'base')"); + + // Create WAP branch for staged operations + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH wap"); + + // Set WAP branch for staged testing + spark.conf().set("spark.wap.branch", "wap"); + + // INSERT INTO table -> inserts to the WAP branch + spark.sql("INSERT INTO " + tableName + " VALUES (1, 'staged_data')"); + + // When spark.wap.branch is set: + // ✅ SELECT * FROM table → reads from the WAP branch + List tableData = spark.sql("SELECT * FROM " + tableName + "").collectAsList(); + assertEquals( + 2, + tableData.size(), + "SELECT * FROM table should read from WAP branch when spark.wap.branch is set"); + boolean hasBase = tableData.stream().anyMatch(row -> "base".equals(row.getString(1))); + boolean hasStaged = + tableData.stream().anyMatch(row -> "staged_data".equals(row.getString(1))); + assertTrue(hasBase, "WAP branch should contain base data"); + assertTrue(hasStaged, "WAP branch should contain staged data"); + + // ✅ SELECT * FROM table.branch_wap → explicitly reads from WAP branch + List wapBranchData = + spark.sql("SELECT * FROM " + tableName + ".branch_wap").collectAsList(); + assertEquals(2, wapBranchData.size(), "Explicit WAP branch select should show staged data"); + + // ✅ SELECT * FROM table.branch_main → explicitly reads from main branch + List mainBranchData = + spark.sql("SELECT * FROM " + tableName + ".branch_main").collectAsList(); + assertEquals( + 1, mainBranchData.size(), "Explicit main branch select should only show base data"); + assertEquals( + "base", mainBranchData.get(0).getString(1), "Main branch should only contain base data"); + + // Now unset spark.wap.branch and ensure main branch is the referenced data + spark.conf().unset("spark.wap.branch"); + + // When spark.wap.branch is unset, SELECT * FROM table should read from main branch + List afterUnsetData = spark.sql("SELECT * FROM " + tableName + "").collectAsList(); + assertEquals( + 1, + afterUnsetData.size(), + "SELECT * FROM table should read from main branch when spark.wap.branch is unset"); + assertEquals( + "base", + afterUnsetData.get(0).getString(1), + "After unsetting wap.branch, should read from main"); + + // INSERT INTO table should go to main branch when spark.wap.branch is unset + spark.sql("INSERT INTO " + tableName + " VALUES (2, 'main_data')"); + List finalMainData = spark.sql("SELECT * FROM " + tableName + "").collectAsList(); + assertEquals( + 2, finalMainData.size(), "Main branch should now have 2 rows after unsetting wap.branch"); + boolean hasMainData = + finalMainData.stream().anyMatch(row -> "main_data".equals(row.getString(1))); + assertTrue(hasMainData, "Main branch should contain the newly inserted data"); + + // WAP branch should remain unchanged + List finalWapData = + spark.sql("SELECT * FROM " + tableName + ".branch_wap").collectAsList(); + assertEquals( + 2, finalWapData.size(), "WAP branch should remain unchanged with base + staged data"); + } + } + + @Test + public void testPublishWapBranch() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "wap_branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("CREATE TABLE " + tableName + " (id int, data string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Setup base data + spark.sql("INSERT INTO " + tableName + " VALUES (0, 'base')"); + + // Create staging branch + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH staging"); + + // Stage changes to WAP branch + spark.conf().set("spark.wap.branch", "staging"); + spark.sql("INSERT INTO " + tableName + " VALUES (1, 'staged_for_publish')"); + + // When spark.wap.branch is set, SELECT * FROM table should read from WAP branch + assertEquals( + 2, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "SELECT * FROM table should read from WAP branch when spark.wap.branch is set"); + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'staging'") + .collectAsList() + .size(), + "Staging should have staged data"); + + // Verify main branch still only has base data + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + ".branch_main").collectAsList().size(), + "Main branch should not have staged data"); - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); + // Fast-forward main branch to staging branch to publish the staged changes + spark.sql("CALL openhouse.system.fast_forward('" + tableName + "', 'main', 'staging')"); + + // Verify data is now published to main branch (need to explicitly check main branch) + List publishedData = + spark.sql("SELECT * FROM " + tableName + ".branch_main").collectAsList(); + assertEquals(2, publishedData.size(), "Main branch should now have published data"); + + boolean hasPublished = + publishedData.stream().anyMatch(row -> "staged_for_publish".equals(row.getString(1))); + assertTrue(hasPublished, "Main branch should contain the published staged data"); + + // Verify that with wap.branch still set, SELECT * FROM table still reads from WAP branch + List wapData = spark.sql("SELECT * FROM " + tableName + "").collectAsList(); + assertEquals(2, wapData.size(), "SELECT * FROM table should still read from WAP branch"); + } + } + + @Test + public void testWapIdAndWapBranchIncompatible() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "wap_branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("CREATE TABLE " + tableName + " (id int, data string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Setup base data + spark.sql("INSERT INTO " + tableName + " VALUES (0, 'base')"); + + // Create staging branch + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH staging"); + + // Set both WAP ID and WAP branch - this should be invalid + spark.conf().set("spark.wap.id", "test-wap-id"); + spark.conf().set("spark.wap.branch", "staging"); + + // Attempt to write with both configurations should fail + assertThrows( + Exception.class, + () -> spark.sql("INSERT INTO " + tableName + " VALUES (1, 'invalid')"), + "Cannot use both wap.id and wap.branch simultaneously"); + } + } + + @Test + public void testCannotWriteToBothBranches() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "wap_branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + spark.sql("CREATE TABLE " + tableName + " (id int, data string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Setup base data + spark.sql("INSERT INTO " + tableName + " VALUES (0, 'base')"); + + // Create branches + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature"); + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH staging"); + + // Set WAP branch + spark.conf().set("spark.wap.branch", "staging"); + + // ❌ INVALID: Cannot write to both normal branch and WAP branch + assertThrows( + Exception.class, + () -> spark.sql("INSERT INTO " + tableName + ".branch_feature VALUES (1, 'invalid')"), + "Cannot write to explicit branch when wap.branch is set"); } } @@ -745,7 +1125,6 @@ public void testErrorInsertToNonExistentBranch() throws Exception { String tableId = "branch_test_" + System.currentTimeMillis(); String tableName = "openhouse.d1." + tableId; - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); spark.sql("CREATE TABLE " + tableName + " (name string)"); // Setup base data @@ -786,8 +1165,6 @@ public void testErrorInsertToNonExistentBranch() throws Exception { assertEquals(2, refs.size()); assertEquals("feature_a", refs.get(0).getString(0)); assertEquals("main", refs.get(1).getString(0)); - - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); } } @@ -797,7 +1174,6 @@ public void testErrorCherryPickNonExistentWapId() throws Exception { String tableId = "branch_test_" + System.currentTimeMillis(); String tableName = "openhouse.d1." + tableId; - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); spark.sql("CREATE TABLE " + tableName + " (name string)"); spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); @@ -816,7 +1192,6 @@ public void testErrorCherryPickNonExistentWapId() throws Exception { + ".snapshots WHERE summary['wap.id'] = 'valid-wap'") .first() .mkString(); - spark.conf().unset("spark.wap.id"); // Verify valid WAP cherry-pick works spark.sql( @@ -871,8 +1246,6 @@ public void testErrorCherryPickNonExistentWapId() throws Exception { + ".snapshots WHERE summary['wap.id'] = 'valid-wap'") .collectAsList(); assertEquals(1, validWaps.size()); - - spark.sql("DROP TABLE IF EXISTS " + tableName + ""); } } } From e20688889df85d9d8b0c08795490a1d9f64dcdaa Mon Sep 17 00:00:00 2001 From: cbb330 Date: Tue, 30 Sep 2025 14:59:40 -0700 Subject: [PATCH 03/35] accidentally commented line --- .../openhouse/spark/catalogtest/BranchTestSpark3_5.java | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java b/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java index 942f5e89a..30c9be7a0 100644 --- a/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java +++ b/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java @@ -192,7 +192,7 @@ public void testCherryPickToMainWithFeatureBranch() throws Exception { .mkString(); // CRITICAL: Unset WAP ID before advancing main branch to force non-fast-forward cherry-pick - // spark.conf().unset("spark.wap.id"); + spark.conf().unset("spark.wap.id"); spark.sql("INSERT INTO " + tableName + " VALUES ('main.advance')"); // Cherry-pick WAP to main branch (this tests our enhanced maybeAppendSnapshots) From ef1e5b412700bdf4a461456f203254315132ca01 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Tue, 30 Sep 2025 17:17:32 -0700 Subject: [PATCH 04/35] remove test with old behavior --- .../OpenHouseInternalTableOperationsTest.java | 23 ------------------- 1 file changed, 23 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java index f484b60ae..bcec8377b 100644 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java @@ -587,29 +587,6 @@ void testDoCommitAppendStageOnlySnapshotsExistingVersion() throws IOException { } } - @Test - void testDoCommitAppendSnapshotsToNonMainBranch() throws IOException { - List testSnapshots = IcebergTestUtil.getSnapshots(); - Map properties = new HashMap<>(BASE_TABLE_METADATA.properties()); - try (MockedStatic ignoreWriteMock = - Mockito.mockStatic(TableMetadataParser.class)) { - properties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, - SnapshotsUtil.serializedSnapshots(testSnapshots.subList(0, 1))); - properties.put( - CatalogConstants.SNAPSHOTS_REFS_KEY, - SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot(testSnapshots.get(0), "branch"))); - properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); - - TableMetadata metadata = BASE_TABLE_METADATA.replaceProperties(properties); - // verify throw an error when committing to non-main branch. - Assertions.assertThrows( - CommitStateUnknownException.class, - () -> openHouseInternalTableOperations.doCommit(BASE_TABLE_METADATA, metadata)); - } - } - @Test void testAppendSnapshotsWithOldSnapshots() throws IOException { TableMetadata metadata = From d0de1da63e9d360199b32e9dd485207435d632ea Mon Sep 17 00:00:00 2001 From: cbb330 Date: Wed, 1 Oct 2025 17:50:13 -0700 Subject: [PATCH 05/35] fixing multi-branch commits and ambiguous references --- .../OpenHouseInternalTableOperations.java | 118 +- .../spark/catalogtest/BranchTestSpark3_5.java | 1116 +++++++++++++++++ 2 files changed, 1209 insertions(+), 25 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java index 793167e47..5ed27c62b 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java @@ -571,8 +571,9 @@ private String determineTargetBranch( } /** - * Determines the target branch for snapshot commits by matching snapshot IDs. When multiple - * branches are present, finds which branch should receive the new snapshots. + * Determines the target branch for snapshot commits using explicit branch targeting information. + * The snapshotRefs parameter contains the explicit branch targeting from the client commit + * operation. */ private String determineTargetBranch( Map snapshotRefs, List newSnapshots, String defaultBranch) { @@ -581,42 +582,102 @@ private String determineTargetBranch( } // If there's only one branch in the refs, use that as the target + // This is the most common case - client explicitly specified which branch to commit to if (snapshotRefs.size() == 1) { - return snapshotRefs.keySet().iterator().next(); + String targetBranch = snapshotRefs.keySet().iterator().next(); + log.debug("Using explicit target branch from commit context: {}", targetBranch); + return targetBranch; } - // CRITICAL FIX: For multi-branch scenarios, find which branch should get the new snapshots + // Multiple branches specified in commit - need to determine which one based on snapshot + // relationships + log.info( + "Multiple branches in snapshotRefs ({}), analyzing snapshot relationships", + snapshotRefs.size()); if (!newSnapshots.isEmpty()) { - // Get the latest snapshot ID from new snapshots - long latestSnapshotId = newSnapshots.get(newSnapshots.size() - 1).snapshotId(); + Snapshot latestSnapshot = newSnapshots.get(newSnapshots.size() - 1); + long latestSnapshotId = latestSnapshot.snapshotId(); + log.info("Latest snapshot ID: {}", latestSnapshotId); - // Find which branch in snapshotRefs should point to this snapshot + // First try: exact snapshot ID match within the explicitly targeted branches + List exactMatches = new ArrayList<>(); for (Map.Entry entry : snapshotRefs.entrySet()) { String branchName = entry.getKey(); long branchSnapshotId = entry.getValue().snapshotId(); if (branchSnapshotId == latestSnapshotId) { - log.debug( - "Determined target branch '{}' by snapshot ID match: {}", - branchName, - latestSnapshotId); - return branchName; + exactMatches.add(branchName); } } - } - // Fallback: if we can't match by snapshot ID, prefer non-main branches for branch operations - for (String branchName : snapshotRefs.keySet()) { - if (!branchName.equals(SnapshotRef.MAIN_BRANCH)) { - log.debug( - "Multiple branches, no snapshot match, preferring non-main branch: {}", branchName); - return branchName; + if (exactMatches.size() == 1) { + String targetBranch = exactMatches.get(0); + log.info( + "Determined target branch '{}' by exact snapshot ID match within commit context: {}", + targetBranch, + latestSnapshotId); + return targetBranch; + } else if (exactMatches.size() > 1) { + log.error( + "Multiple branches point to same snapshot {}: {}", latestSnapshotId, exactMatches); + throw new IllegalStateException( + String.format( + "Multiple explicitly targeted branches point to the same snapshot %s: %s. " + + "This indicates an invalid commit state.", + latestSnapshotId, exactMatches)); + } + + // Second try: parent-child relationship match within the explicitly targeted branches + Long parentSnapshotId = latestSnapshot.parentId(); + log.info("Parent snapshot ID: {}", parentSnapshotId); + if (parentSnapshotId != null) { + List parentMatches = new ArrayList<>(); + for (Map.Entry entry : snapshotRefs.entrySet()) { + String branchName = entry.getKey(); + long branchSnapshotId = entry.getValue().snapshotId(); + + if (branchSnapshotId == parentSnapshotId) { + parentMatches.add(branchName); + log.info("Branch '{}' matches parent snapshot {}", branchName, parentSnapshotId); + } + } + + if (parentMatches.size() == 1) { + String targetBranch = parentMatches.get(0); + log.info( + "Determined target branch '{}' by parent-child relationship within commit context: new snapshot {} is child of branch snapshot {}", + targetBranch, + latestSnapshotId, + parentSnapshotId); + return targetBranch; + } else if (parentMatches.size() > 1) { + log.error( + "Multiple branches point to parent snapshot {}: {}", parentSnapshotId, parentMatches); + throw new IllegalStateException( + String.format( + "Multiple explicitly targeted branches point to parent snapshot %s: %s. " + + "Cannot determine which branch should receive child snapshot %s. " + + "This indicates ambiguous commit targeting - the client should specify a single target branch.", + parentSnapshotId, parentMatches, latestSnapshotId)); + } + // If parentMatches.size() == 0, none of the explicitly targeted branches are parents + // This could happen in cherry-pick or other non-linear operations } } - // Final fallback to main - log.debug("Multiple branches, falling back to main branch"); - return SnapshotRef.MAIN_BRANCH; + // If we reach here, we have multiple explicitly targeted branches but couldn't determine + // the target based on snapshot relationships. This suggests the commit operation itself + // is ambiguous or invalid. + log.error( + "Cannot determine target branch from explicitly targeted branches: {}", + snapshotRefs.keySet()); + throw new IllegalStateException( + String.format( + "Cannot determine target branch from explicitly targeted branches: %s. " + + "The commit specifies multiple target branches but snapshot relationships " + + "don't clearly indicate which branch should receive the new snapshots. " + + "This suggests an invalid or ambiguous commit operation.", + snapshotRefs.keySet())); } public TableMetadata maybeAppendSnapshots( @@ -658,8 +719,10 @@ public TableMetadata maybeAppendSnapshots( } else if (snapshot.summary().containsKey(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP)) { // a snapshot created on a non fast-forward cherry-pick snapshot // Determine target branch from snapshotRefs or default to MAIN_BRANCH + // Pass only the current snapshot being processed, not the entire list String targetBranch = - determineTargetBranch(snapshotRefs, snapshotsToAppend, SnapshotRef.MAIN_BRANCH); + determineTargetBranch( + snapshotRefs, Collections.singletonList(snapshot), SnapshotRef.MAIN_BRANCH); metadataBuilder.setBranchSnapshot(snapshot, targetBranch); appendedSnapshots.add(String.valueOf(snapshot.snapshotId())); cherryPickedSnapshots.add( @@ -668,8 +731,10 @@ public TableMetadata maybeAppendSnapshots( // a regular snapshot - assign to appropriate branch using snapshotRefs context if (MapUtils.isNotEmpty(snapshotRefs)) { // We have explicit branch information, use it to assign snapshot + // Pass only the current snapshot being processed, not the entire list String targetBranch = - determineTargetBranch(snapshotRefs, snapshotsToAppend, SnapshotRef.MAIN_BRANCH); + determineTargetBranch( + snapshotRefs, Collections.singletonList(snapshot), SnapshotRef.MAIN_BRANCH); metadataBuilder.setBranchSnapshot(snapshot, targetBranch); } else { // No explicit branch refs - treat as staged snapshot @@ -679,7 +744,10 @@ public TableMetadata maybeAppendSnapshots( appendedSnapshots.add(String.valueOf(snapshot.snapshotId())); } } - } else if (MapUtils.isNotEmpty(snapshotRefs)) { + } + + // Handle ref updates (this can happen independently of snapshot append operations) + if (MapUtils.isNotEmpty(snapshotRefs)) { // Handle ref updates for all branches (fast-forward cherry-pick or rollback operations) for (Map.Entry entry : snapshotRefs.entrySet()) { String branchName = entry.getKey(); diff --git a/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java b/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java index 30c9be7a0..c8a0f3e03 100644 --- a/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java +++ b/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java @@ -164,6 +164,1122 @@ public void testWapStagingWithBranches() throws Exception { } } + @Test + public void testWapIdAfterCreateTable() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "wap_id_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + // Create table without any data (no snapshots exist) + spark.sql("CREATE TABLE " + tableName + " (name string)"); + + // Enable WAP on the table + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Verify no snapshots exist yet + List initialSnapshots = + spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList(); + assertEquals(0, initialSnapshots.size(), "Newly created table should have no snapshots"); + + // Verify no branches exist yet (empty table has no branches) + List initialRefs = spark.sql("SELECT name FROM " + tableName + ".refs").collectAsList(); + assertEquals(0, initialRefs.size(), "Empty table should have no branches initially"); + + // ===== WAP STAGING ON EMPTY TABLE ===== + + // 1. Create WAP staged data on empty table (should create staging snapshot) + spark.conf().set("spark.wap.id", "wap-stage-1"); + spark.sql("INSERT INTO " + tableName + " VALUES ('wap_staged_data_1')"); + spark.conf().unset("spark.wap.id"); + + // Verify WAP snapshot was created + List wapSnapshots = + spark + .sql( + "SELECT snapshot_id, summary FROM " + + tableName + + ".snapshots " + + "WHERE summary['wap.id'] = 'wap-stage-1'") + .collectAsList(); + assertEquals(1, wapSnapshots.size(), "Should have 1 WAP staged snapshot"); + + // Verify no branches exist yet (WAP staging doesn't create branches) + List refsAfterWapStaging = + spark.sql("SELECT name FROM " + tableName + ".refs").collectAsList(); + assertEquals(0, refsAfterWapStaging.size(), "WAP staging should not create branches"); + + // Verify WAP data is not visible in main queries (no branch exists) + assertEquals( + 0, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Should see 0 rows - no branches exist, WAP data is staged"); + + // ===== WAP PUBLISHING TO CREATE MAIN BRANCH ===== + + // 2. Publish WAP data to create main branch + String wapSnapshotId = String.valueOf(wapSnapshots.get(0).getLong(0)); + spark.sql( + "CALL openhouse.system.cherrypick_snapshot('" + + tableName.replace("openhouse.", "") + + "', " + + wapSnapshotId + + ")"); + + // Verify main branch now exists + List refsAfterPublishing = + spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); + assertEquals( + 1, refsAfterPublishing.size(), "Should have main branch after publishing WAP data"); + assertEquals("main", refsAfterPublishing.get(0).getString(0), "Should have main branch"); + + // Verify WAP data is now visible in main branch + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should have 1 row after WAP publishing"); + + List mainData = spark.sql("SELECT name FROM " + tableName + "").collectAsList(); + assertEquals( + "wap_staged_data_1", mainData.get(0).getString(0), "Should see published WAP data"); + + // ===== MULTI-WAP OPERATIONS ===== + + // 3. Create multiple WAP staged data sets + spark.conf().set("spark.wap.id", "wap-stage-2"); + spark.sql("INSERT INTO " + tableName + " VALUES ('wap_staged_data_2')"); + spark.conf().unset("spark.wap.id"); + + spark.conf().set("spark.wap.id", "wap-stage-3"); + spark.sql("INSERT INTO " + tableName + " VALUES ('wap_staged_data_3')"); + spark.conf().unset("spark.wap.id"); + + // Verify multiple WAP snapshots exist + List allWapSnapshots = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots " + + "WHERE summary['wap.id'] IS NOT NULL") + .collectAsList(); + assertEquals(3, allWapSnapshots.size(), "Should have 3 WAP staged snapshots"); + + // Verify main branch is unchanged (WAP data is staged) + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should still have 1 row (staged WAP not visible)"); + + // ===== SELECTIVE WAP PUBLISHING ===== + + // 4. Publish second WAP data set only + List wap2Snapshots = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots " + + "WHERE summary['wap.id'] = 'wap-stage-2'") + .collectAsList(); + String wap2SnapshotId = String.valueOf(wap2Snapshots.get(0).getLong(0)); + spark.sql( + "CALL openhouse.system.cherrypick_snapshot('" + + tableName.replace("openhouse.", "") + + "', " + + wap2SnapshotId + + ")"); + + // Verify main branch now has both published datasets + assertEquals( + 2, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should have 2 rows after second WAP publishing"); + + List publishedData = + spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); + assertEquals( + "wap_staged_data_1", + publishedData.get(0).getString(0), + "First row should be first WAP data"); + assertEquals( + "wap_staged_data_2", + publishedData.get(1).getString(0), + "Second row should be second WAP data"); + + // ===== UNPUBLISHED WAP DATA VERIFICATION ===== + + // 5. Verify third WAP data remains unpublished + List wap3Snapshots = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots " + + "WHERE summary['wap.id'] = 'wap-stage-3'") + .collectAsList(); + assertEquals(1, wap3Snapshots.size(), "Third WAP snapshot should still exist"); + + // Verify unpublished WAP data is not visible + List currentData = + spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); + assertFalse( + currentData.stream().anyMatch(row -> "wap_staged_data_3".equals(row.getString(0))), + "Unpublished WAP data should not be visible in main branch"); + + // ===== REGULAR DATA VS WAP DATA ===== + + // 6. Add regular (non-WAP) data to main branch + spark.sql("INSERT INTO " + tableName + " VALUES ('regular_data')"); + + // Verify main branch now has mixed data + assertEquals( + 3, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should have 3 rows (2 published WAP + 1 regular)"); + + List finalData = + spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); + assertEquals("regular_data", finalData.get(0).getString(0), "Should contain regular data"); + assertEquals( + "wap_staged_data_1", finalData.get(1).getString(0), "Should contain first WAP data"); + assertEquals( + "wap_staged_data_2", finalData.get(2).getString(0), "Should contain second WAP data"); + + // ===== SNAPSHOT HISTORY VERIFICATION ===== + + // 7. Verify snapshot counts and types + List totalSnapshots = + spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList(); + assertTrue( + totalSnapshots.size() >= 4, "Should have at least 4 snapshots (3 WAP + 1 regular)"); + + // Verify WAP snapshots still exist in metadata + List remainingWapSnapshots = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".snapshots " + + "WHERE summary['wap.id'] IS NOT NULL") + .collectAsList(); + assertEquals( + 3, remainingWapSnapshots.size(), "All 3 WAP snapshots should still exist in metadata"); + + // Verify main branch has the latest published snapshot (points to regular INSERT snapshot) + List mainSnapshotRef = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") + .collectAsList(); + assertEquals(1, mainSnapshotRef.size(), "Main branch should exist and point to a snapshot"); + } + } + + @Test + public void testBranchAfterCreateTable() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + // Create table without any data (no snapshots exist) + spark.sql("CREATE TABLE " + tableName + " (name string)"); + + // Verify no snapshots exist yet + List initialSnapshots = + spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList(); + assertEquals(0, initialSnapshots.size(), "Newly created table should have no snapshots"); + + // Create branch on table with no existing snapshots + // According to Iceberg specification, this should succeed and create an empty snapshot + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_on_empty"); + + // Verify that an empty snapshot was created for the branch + List snapshotsAfterBranchCreation = + spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList(); + assertEquals( + 1, + snapshotsAfterBranchCreation.size(), + "Should have 1 empty snapshot after branch creation"); + + // Verify the empty snapshot properties + Row emptySnapshot = snapshotsAfterBranchCreation.get(0); + // The parent_id should be null for the empty snapshot + assertNull( + emptySnapshot.get(emptySnapshot.fieldIndex("parent_id")), + "Empty snapshot should have no parent"); + + // Verify the branch was created successfully + List refsAfterBranchCreation = + spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); + assertEquals( + 1, + refsAfterBranchCreation.size(), + "Should have feature_on_empty branch (main doesn't exist yet)"); + assertEquals( + "feature_on_empty", + refsAfterBranchCreation.get(0).getString(0), + "Should have feature_on_empty branch"); + + // Verify that main branch still doesn't exist (as expected) + boolean hasMainBranch = + refsAfterBranchCreation.stream().anyMatch(row -> "main".equals(row.getString(0))); + assertFalse(hasMainBranch, "Main branch should not exist on empty table"); + + // Now insert data to create a data snapshot + spark.sql("INSERT INTO " + tableName + " VALUES ('initial.data')"); + + // Verify we now have 2 snapshots (empty + data) + List snapshotsAfterInsert = + spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList(); + assertEquals( + 2, snapshotsAfterInsert.size(), "Should have 2 snapshots after insert (empty + data)"); + + // Now we should have main branch as well + List refsAfterInsert = + spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); + assertEquals(2, refsAfterInsert.size(), "Should have feature_on_empty and main branches"); + + // Create another branch after data exists - this should also succeed + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_after_snapshot"); + + // Verify we now have 3 branches (feature_on_empty, main, feature_after_snapshot) + List refs = + spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); + assertEquals(3, refs.size(), "Should have 3 branches total"); + + // Verify all expected branches exist + Set branchNames = + refs.stream().map(row -> row.getString(0)).collect(Collectors.toSet()); + assertTrue(branchNames.contains("feature_on_empty"), "feature_on_empty branch should exist"); + assertTrue(branchNames.contains("main"), "main branch should exist"); + assertTrue( + branchNames.contains("feature_after_snapshot"), + "feature_after_snapshot branch should exist"); + + // ===== BRANCH ISOLATION TESTING ===== + + // 1. Test initial state: main and feature_after_snapshot should have the same data + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should have 1 row"); + assertEquals( + 1, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_after_snapshot'") + .collectAsList() + .size(), + "feature_after_snapshot branch should have 1 row"); + + // 2. Test feature_on_empty branch should be empty (points to empty snapshot) + assertEquals( + 0, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_on_empty'") + .collectAsList() + .size(), + "feature_on_empty branch should have 0 rows (points to empty snapshot)"); + + // 3. Add data to feature_on_empty branch only + spark.sql( + "INSERT INTO " + tableName + ".branch_feature_on_empty VALUES ('empty_branch_data')"); + + // Verify isolation: feature_on_empty now has data, others unchanged + assertEquals( + 1, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_on_empty'") + .collectAsList() + .size(), + "feature_on_empty branch should now have 1 row"); + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should still have 1 row (unchanged)"); + assertEquals( + 1, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_after_snapshot'") + .collectAsList() + .size(), + "feature_after_snapshot branch should still have 1 row (unchanged)"); + + // 4. Add different data to feature_after_snapshot branch + spark.sql( + "INSERT INTO " + + tableName + + ".branch_feature_after_snapshot VALUES ('snapshot_branch_data')"); + + // Verify isolation: each branch has its own data + assertEquals( + 1, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_on_empty'") + .collectAsList() + .size(), + "feature_on_empty branch should still have 1 row"); + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should still have 1 row (unchanged)"); + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_after_snapshot'") + .collectAsList() + .size(), + "feature_after_snapshot branch should now have 2 rows"); + + // 5. Add data to main branch + spark.sql("INSERT INTO " + tableName + " VALUES ('main_branch_data')"); + + // Verify complete isolation: each branch maintains its own data + assertEquals( + 1, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_on_empty'") + .collectAsList() + .size(), + "feature_on_empty branch should still have 1 row"); + assertEquals( + 2, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should now have 2 rows"); + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_after_snapshot'") + .collectAsList() + .size(), + "feature_after_snapshot branch should still have 2 rows (unchanged)"); + + // 6. Verify data content isolation + List featureOnEmptyData = + spark + .sql( + "SELECT name FROM " + + tableName + + " VERSION AS OF 'feature_on_empty' ORDER BY name") + .collectAsList(); + assertEquals( + "empty_branch_data", + featureOnEmptyData.get(0).getString(0), + "feature_on_empty should contain its specific data"); + + List mainData = + spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); + assertEquals( + "initial.data", mainData.get(0).getString(0), "main should contain initial data"); + assertEquals( + "main_branch_data", + mainData.get(1).getString(0), + "main should contain its specific data"); + + List featureAfterSnapshotData = + spark + .sql( + "SELECT name FROM " + + tableName + + " VERSION AS OF 'feature_after_snapshot' ORDER BY name") + .collectAsList(); + assertEquals( + "initial.data", + featureAfterSnapshotData.get(0).getString(0), + "feature_after_snapshot should contain initial data"); + assertEquals( + "snapshot_branch_data", + featureAfterSnapshotData.get(1).getString(0), + "feature_after_snapshot should contain its specific data"); + + // 7. Verify snapshot isolation: each branch should have different snapshot histories + List mainSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") + .collectAsList(); + List featureOnEmptySnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_on_empty'") + .collectAsList(); + List featureAfterSnapshotSnapshots = + spark + .sql( + "SELECT snapshot_id FROM " + + tableName + + ".refs WHERE name = 'feature_after_snapshot'") + .collectAsList(); + + assertNotEquals( + mainSnapshots.get(0).getLong(0), + featureOnEmptySnapshots.get(0).getLong(0), + "main and feature_on_empty should point to different snapshots"); + assertNotEquals( + mainSnapshots.get(0).getLong(0), + featureAfterSnapshotSnapshots.get(0).getLong(0), + "main and feature_after_snapshot should point to different snapshots"); + assertNotEquals( + featureOnEmptySnapshots.get(0).getLong(0), + featureAfterSnapshotSnapshots.get(0).getLong(0), + "feature_on_empty and feature_after_snapshot should point to different snapshots"); + } + } + + @Test + public void testWapBranchAfterCreateTable() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "wap_branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + // Create table without any data (no snapshots exist) + spark.sql("CREATE TABLE " + tableName + " (name string)"); + + // Enable WAP on the table + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Verify no snapshots exist yet + List initialSnapshots = + spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList(); + assertEquals(0, initialSnapshots.size(), "Newly created table should have no snapshots"); + + // Create branch on table with no existing snapshots + // According to Iceberg specification, this should succeed and create an empty snapshot + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_empty"); + + // Verify that an empty snapshot was created for the branch + List snapshotsAfterBranchCreation = + spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList(); + assertEquals( + 1, + snapshotsAfterBranchCreation.size(), + "Should have 1 empty snapshot after branch creation"); + + // Verify the branch was created successfully + List refsAfterBranchCreation = + spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); + assertEquals( + 1, + refsAfterBranchCreation.size(), + "Should have feature_empty branch (main doesn't exist yet)"); + assertEquals( + "feature_empty", + refsAfterBranchCreation.get(0).getString(0), + "Should have feature_empty branch"); + + // ===== WAP BRANCH TESTING ===== + + // 1. Set WAP branch and insert data - should go to the feature_empty branch + spark.conf().set("spark.wap.branch", "feature_empty"); + spark.sql("INSERT INTO " + tableName + " VALUES ('wap_branch_data_1')"); + + // Verify WAP branch data is visible when spark.wap.branch is set + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Should see 1 row when spark.wap.branch=feature_empty"); + + List wapBranchData = spark.sql("SELECT name FROM " + tableName + "").collectAsList(); + assertEquals( + "wap_branch_data_1", wapBranchData.get(0).getString(0), "Should see WAP branch data"); + + // Verify feature_empty branch directly + assertEquals( + 1, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_empty'") + .collectAsList() + .size(), + "feature_empty branch should have 1 row"); + + // Unset WAP branch - queries should now see main branch (which doesn't exist yet, so empty) + spark.conf().unset("spark.wap.branch"); + assertEquals( + 0, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Should see 0 rows when spark.wap.branch is unset (main doesn't exist)"); + + // ===== MULTI-BRANCH WAP TESTING ===== + + // 2. Create main branch with regular data + spark.sql("INSERT INTO " + tableName + " VALUES ('main_data')"); + + // Now we should have main branch + List refs = + spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); + assertEquals(2, refs.size(), "Should have feature_empty and main branches"); + + // Verify main branch data when spark.wap.branch is unset + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should have 1 row"); + List mainData = spark.sql("SELECT name FROM " + tableName + "").collectAsList(); + assertEquals("main_data", mainData.get(0).getString(0), "Should see main branch data"); + + // 3. Create another branch and test WAP branch functionality + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_wap_test"); + + // Set WAP branch to feature_wap_test and add data + spark.conf().set("spark.wap.branch", "feature_wap_test"); + spark.sql("INSERT INTO " + tableName + " VALUES ('wap_branch_data_2')"); + + // Verify WAP branch data is visible when spark.wap.branch=feature_wap_test + assertEquals( + 2, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Should see 2 rows when spark.wap.branch=feature_wap_test (main_data + wap_branch_data_2)"); + + // ===== COMPREHENSIVE WAP BRANCH ISOLATION VERIFICATION ===== + + // Verify each branch has independent data + spark.conf().unset("spark.wap.branch"); + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should have 1 row when WAP branch is unset"); + + assertEquals( + 1, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_empty'") + .collectAsList() + .size(), + "feature_empty branch should have 1 row"); + + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_wap_test'") + .collectAsList() + .size(), + "feature_wap_test branch should have 2 rows"); + + // Verify data content isolation + List finalMainData = + spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); + assertEquals("main_data", finalMainData.get(0).getString(0), "main should contain main_data"); + + List finalFeatureEmptyData = + spark + .sql("SELECT name FROM " + tableName + " VERSION AS OF 'feature_empty' ORDER BY name") + .collectAsList(); + assertEquals( + "wap_branch_data_1", + finalFeatureEmptyData.get(0).getString(0), + "feature_empty should contain wap_branch_data_1"); + + List finalFeatureWapTestData = + spark + .sql( + "SELECT name FROM " + + tableName + + " VERSION AS OF 'feature_wap_test' ORDER BY name") + .collectAsList(); + assertEquals( + "main_data", + finalFeatureWapTestData.get(0).getString(0), + "feature_wap_test should contain main_data"); + assertEquals( + "wap_branch_data_2", + finalFeatureWapTestData.get(1).getString(0), + "feature_wap_test should contain wap_branch_data_2"); + + // ===== WAP BRANCH SWITCHING BEHAVIOR ===== + + // 4. Test switching between WAP branches + spark.conf().set("spark.wap.branch", "feature_empty"); + List switchToFeatureEmpty = + spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); + assertEquals( + "wap_branch_data_1", + switchToFeatureEmpty.get(0).getString(0), + "Should see feature_empty data when switched"); + + spark.conf().set("spark.wap.branch", "feature_wap_test"); + List switchToFeatureWapTest = + spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); + assertEquals( + 2, switchToFeatureWapTest.size(), "Should see 2 rows when switched to feature_wap_test"); + assertEquals( + "main_data", switchToFeatureWapTest.get(0).getString(0), "First row should be main_data"); + assertEquals( + "wap_branch_data_2", + switchToFeatureWapTest.get(1).getString(0), + "Second row should be wap_branch_data_2"); + + // 5. Test INSERT behavior with WAP branch set + spark.conf().set("spark.wap.branch", "feature_empty"); + spark.sql("INSERT INTO " + tableName + " VALUES ('additional_wap_data')"); + + // Verify the insert went to the WAP branch + assertEquals( + 2, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Should see 2 rows in feature_empty after additional insert"); + + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_empty'") + .collectAsList() + .size(), + "feature_empty branch should have 2 rows after additional insert"); + + // Verify other branches are unchanged + spark.conf().unset("spark.wap.branch"); + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should still have 1 row (unchanged)"); + + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_wap_test'") + .collectAsList() + .size(), + "feature_wap_test branch should still have 2 rows (unchanged)"); + + // ===== SNAPSHOT HISTORY VERIFICATION ===== + + // 6. Verify that each branch points to different snapshots + List finalMainSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") + .collectAsList(); + List finalFeatureEmptySnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_empty'") + .collectAsList(); + List finalFeatureWapTestSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_wap_test'") + .collectAsList(); + + assertNotEquals( + finalMainSnapshots.get(0).getLong(0), + finalFeatureEmptySnapshots.get(0).getLong(0), + "main and feature_empty should point to different snapshots"); + assertNotEquals( + finalMainSnapshots.get(0).getLong(0), + finalFeatureWapTestSnapshots.get(0).getLong(0), + "main and feature_wap_test should point to different snapshots"); + assertNotEquals( + finalFeatureEmptySnapshots.get(0).getLong(0), + finalFeatureWapTestSnapshots.get(0).getLong(0), + "feature_empty and feature_wap_test should point to different snapshots"); + + // Clean up WAP branch configuration + spark.conf().unset("spark.wap.branch"); + } + } + + @Test + public void testWapBranchCommitWithMultipleBranches() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "wap_multi_branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + // Create table and enable WAP + spark.sql("CREATE TABLE " + tableName + " (name string)"); + spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); + + // Step 1: Start with main at snapshotX + spark.sql("INSERT INTO " + tableName + " VALUES ('main_data')"); + + // Verify main branch exists and get its snapshot + List mainSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") + .collectAsList(); + assertEquals(1, mainSnapshots.size(), "Main branch should exist"); + long snapshotX = mainSnapshots.get(0).getLong(0); + System.out.println("SnapshotX (main): " + snapshotX); + + // Step 2: Create branchA from main → branchA also points to snapshotX + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH branchA"); + + // Verify branchA points to same snapshot as main + List branchASnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchA'") + .collectAsList(); + assertEquals(1, branchASnapshots.size(), "BranchA should exist"); + long branchASnapshotAfterCreation = branchASnapshots.get(0).getLong(0); + assertEquals( + snapshotX, branchASnapshotAfterCreation, "BranchA should point to same snapshot as main"); + + // Step 3: Set branchA as the WAP branch and commit data + spark.conf().set("spark.wap.branch", "branchA"); + spark.sql("INSERT INTO " + tableName + " VALUES ('branchA_data')"); + + // Step 4: Verify branchA now points to snapshotY (child of snapshotX) + List branchASnapshotsAfterCommit = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchA'") + .collectAsList(); + long snapshotY = branchASnapshotsAfterCommit.get(0).getLong(0); + assertNotEquals( + snapshotX, snapshotY, "BranchA should now point to a new snapshot (snapshotY)"); + System.out.println("SnapshotY (branchA after commit): " + snapshotY); + + // Verify branchA has both main_data and branchA_data + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchA'") + .collectAsList() + .size(), + "BranchA should have 2 rows after commit"); + + // Verify main still points to snapshotX and has only main_data + spark.conf().unset("spark.wap.branch"); + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should still have 1 row"); + + // Step 5: Create branchB from branchA → branchB points to snapshotY + // First create the branch, then set it to point to the same snapshot as branchA + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH branchB"); + spark.sql("CALL openhouse.system.fast_forward('" + tableName + "', 'branchB', 'branchA')"); + + // Verify branchB points to snapshotY + List branchBSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchB'") + .collectAsList(); + long branchBSnapshotAfterCreation = branchBSnapshots.get(0).getLong(0); + assertEquals( + snapshotY, + branchBSnapshotAfterCreation, + "BranchB should point to snapshotY (same as branchA)"); + + // Step 6: Make a commit on branchB → branchB now points to snapshotZ (child of snapshotY) + // Use direct branch syntax to target branchB specifically + spark.sql("INSERT INTO " + tableName + ".branch_branchB VALUES ('branchB_data')"); + + // Verify branchB now points to snapshotZ + List branchBSnapshotsAfterCommit = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchB'") + .collectAsList(); + long snapshotZ = branchBSnapshotsAfterCommit.get(0).getLong(0); + assertNotEquals( + snapshotY, snapshotZ, "BranchB should now point to a new snapshot (snapshotZ)"); + System.out.println("SnapshotZ (branchB after commit): " + snapshotZ); + + // ===== VERIFICATION OF FINAL STATE ===== + + // Verify all three branches exist and point to different snapshots + List allRefs = + spark + .sql("SELECT name, snapshot_id FROM " + tableName + ".refs ORDER BY name") + .collectAsList(); + assertEquals(3, allRefs.size(), "Should have 3 branches: main, branchA, branchB"); + + // Verify snapshot relationships + List mainFinalSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") + .collectAsList(); + List branchAFinalSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchA'") + .collectAsList(); + List branchBFinalSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchB'") + .collectAsList(); + + long finalSnapshotX = mainFinalSnapshots.get(0).getLong(0); + long finalSnapshotY = branchAFinalSnapshots.get(0).getLong(0); + long finalSnapshotZ = branchBFinalSnapshots.get(0).getLong(0); + + assertEquals(snapshotX, finalSnapshotX, "Main should still point to snapshotX"); + assertEquals(snapshotY, finalSnapshotY, "BranchA should still point to snapshotY"); + assertEquals(snapshotZ, finalSnapshotZ, "BranchB should point to snapshotZ"); + + // Verify data isolation between branches + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should have 1 row"); + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchA'") + .collectAsList() + .size(), + "BranchA should have 2 rows"); + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchB'") + .collectAsList() + .size(), + "BranchB should have 3 rows"); + + // Verify content + List mainData = + spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); + assertEquals("main_data", mainData.get(0).getString(0), "Main should contain main_data"); + + List branchAData = + spark + .sql("SELECT name FROM " + tableName + " VERSION AS OF 'branchA' ORDER BY name") + .collectAsList(); + assertEquals( + "branchA_data", branchAData.get(0).getString(0), "BranchA should contain branchA_data"); + assertEquals( + "main_data", branchAData.get(1).getString(0), "BranchA should contain main_data"); + + List branchBData = + spark + .sql("SELECT name FROM " + tableName + " VERSION AS OF 'branchB' ORDER BY name") + .collectAsList(); + assertEquals( + "branchA_data", branchBData.get(0).getString(0), "BranchB should contain branchA_data"); + assertEquals( + "branchB_data", branchBData.get(1).getString(0), "BranchB should contain branchB_data"); + assertEquals( + "main_data", branchBData.get(2).getString(0), "BranchB should contain main_data"); + + // Verify parent-child relationships in snapshot metadata + List allSnapshots = + spark + .sql( + "SELECT snapshot_id, parent_id FROM " + + tableName + + ".snapshots ORDER BY committed_at") + .collectAsList(); + assertTrue(allSnapshots.size() >= 3, "Should have at least 3 snapshots"); + + // Clean up WAP configuration + spark.conf().unset("spark.wap.branch"); + } + } + + @Test + public void testRegularCommitWithMultipleBranches() throws Exception { + try (SparkSession spark = getSparkSession()) { + String tableId = "regular_multi_branch_test_" + System.currentTimeMillis(); + String tableName = "openhouse.d1." + tableId; + + // Create table (no WAP needed for this test) + spark.sql("CREATE TABLE " + tableName + " (name string)"); + + // Step 1: Start with main at snapshotX + spark.sql("INSERT INTO " + tableName + " VALUES ('main_data')"); + + // Verify main branch exists and get its snapshot + List mainSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") + .collectAsList(); + assertEquals(1, mainSnapshots.size(), "Main branch should exist"); + long snapshotX = mainSnapshots.get(0).getLong(0); + System.out.println("SnapshotX (main): " + snapshotX); + + // Step 2: Create branchA from main → branchA also points to snapshotX + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH branchA"); + + // Verify branchA points to same snapshot as main + List branchASnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchA'") + .collectAsList(); + assertEquals(1, branchASnapshots.size(), "BranchA should exist"); + long branchASnapshotAfterCreation = branchASnapshots.get(0).getLong(0); + assertEquals( + snapshotX, branchASnapshotAfterCreation, "BranchA should point to same snapshot as main"); + + // Step 3: Commit some data on branchA → branchA now points to snapshotY (child of snapshotX) + spark.sql("INSERT INTO " + tableName + ".branch_branchA VALUES ('branchA_data')"); + + // Verify branchA now points to snapshotY (child of snapshotX) + List branchASnapshotsAfterCommit = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchA'") + .collectAsList(); + long snapshotY = branchASnapshotsAfterCommit.get(0).getLong(0); + assertNotEquals( + snapshotX, snapshotY, "BranchA should now point to a new snapshot (snapshotY)"); + System.out.println("SnapshotY (branchA after commit): " + snapshotY); + + // Verify branchA has both main_data and branchA_data + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchA'") + .collectAsList() + .size(), + "BranchA should have 2 rows after commit"); + + // Verify main still points to snapshotX and has only main_data + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should still have 1 row"); + + // Step 4: Create branchB from branchA → branchB points to snapshotY + // First create the branch, then set it to point to the same snapshot as branchA + spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH branchB"); + spark.sql("CALL openhouse.system.fast_forward('" + tableName + "', 'branchB', 'branchA')"); + + // Verify branchB points to snapshotY + List branchBSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchB'") + .collectAsList(); + long branchBSnapshotAfterCreation = branchBSnapshots.get(0).getLong(0); + assertEquals( + snapshotY, + branchBSnapshotAfterCreation, + "BranchB should point to snapshotY (same as branchA)"); + + // Step 5: Make a commit on branchB → branchB now points to snapshotZ (child of snapshotY) + spark.sql("INSERT INTO " + tableName + ".branch_branchB VALUES ('branchB_data')"); + + // Verify branchB now points to snapshotZ + List branchBSnapshotsAfterCommit = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchB'") + .collectAsList(); + long snapshotZ = branchBSnapshotsAfterCommit.get(0).getLong(0); + assertNotEquals( + snapshotY, snapshotZ, "BranchB should now point to a new snapshot (snapshotZ)"); + System.out.println("SnapshotZ (branchB after commit): " + snapshotZ); + + // ===== VERIFICATION OF FINAL STATE ===== + + // Verify all three branches exist and point to different snapshots + List allRefs = + spark + .sql("SELECT name, snapshot_id FROM " + tableName + ".refs ORDER BY name") + .collectAsList(); + assertEquals(3, allRefs.size(), "Should have 3 branches: main, branchA, branchB"); + + // Verify snapshot relationships + List mainFinalSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") + .collectAsList(); + List branchAFinalSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchA'") + .collectAsList(); + List branchBFinalSnapshots = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchB'") + .collectAsList(); + + long finalSnapshotX = mainFinalSnapshots.get(0).getLong(0); + long finalSnapshotY = branchAFinalSnapshots.get(0).getLong(0); + long finalSnapshotZ = branchBFinalSnapshots.get(0).getLong(0); + + assertEquals(snapshotX, finalSnapshotX, "Main should still point to snapshotX"); + assertEquals(snapshotY, finalSnapshotY, "BranchA should still point to snapshotY"); + assertEquals(snapshotZ, finalSnapshotZ, "BranchB should point to snapshotZ"); + + // Verify all snapshots are different + assertNotEquals( + finalSnapshotX, finalSnapshotY, "SnapshotX and snapshotY should be different"); + assertNotEquals( + finalSnapshotY, finalSnapshotZ, "SnapshotY and snapshotZ should be different"); + assertNotEquals( + finalSnapshotX, finalSnapshotZ, "SnapshotX and snapshotZ should be different"); + + // Verify data isolation between branches + assertEquals( + 1, + spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), + "Main branch should have 1 row"); + assertEquals( + 2, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchA'") + .collectAsList() + .size(), + "BranchA should have 2 rows"); + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchB'") + .collectAsList() + .size(), + "BranchB should have 3 rows"); + + // Verify content + List mainData = + spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); + assertEquals("main_data", mainData.get(0).getString(0), "Main should contain main_data"); + + List branchAData = + spark + .sql("SELECT name FROM " + tableName + " VERSION AS OF 'branchA' ORDER BY name") + .collectAsList(); + assertEquals( + "branchA_data", branchAData.get(0).getString(0), "BranchA should contain branchA_data"); + assertEquals( + "main_data", branchAData.get(1).getString(0), "BranchA should contain main_data"); + + List branchBData = + spark + .sql("SELECT name FROM " + tableName + " VERSION AS OF 'branchB' ORDER BY name") + .collectAsList(); + assertEquals( + "branchA_data", branchBData.get(0).getString(0), "BranchB should contain branchA_data"); + assertEquals( + "branchB_data", branchBData.get(1).getString(0), "BranchB should contain branchB_data"); + assertEquals( + "main_data", branchBData.get(2).getString(0), "BranchB should contain main_data"); + + // ===== TEST THE SPECIFIC SCENARIO THAT WOULD HAVE BEEN AMBIGUOUS ===== + + // At this point, we have: + // - main points to snapshotX + // - branchA points to snapshotY + // - branchB points to snapshotZ + // + // If we were to commit a new snapshot as child of snapshotY, our fixed logic should work + // because only the explicitly targeted branch (via branch-specific insert syntax) should be + // considered + + // Verify that we can still commit to branchA even though multiple branches exist + spark.sql("INSERT INTO " + tableName + ".branch_branchA VALUES ('additional_branchA_data')"); + + // Verify branchA advanced but branchB didn't + List branchAFinalSnapshots2 = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchA'") + .collectAsList(); + List branchBFinalSnapshots2 = + spark + .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchB'") + .collectAsList(); + + long finalSnapshotY2 = branchAFinalSnapshots2.get(0).getLong(0); + long finalSnapshotZ2 = branchBFinalSnapshots2.get(0).getLong(0); + + assertNotEquals(snapshotY, finalSnapshotY2, "BranchA should have advanced to a new snapshot"); + assertEquals(snapshotZ, finalSnapshotZ2, "BranchB should remain at the same snapshot"); + + // Verify data counts + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchA'") + .collectAsList() + .size(), + "BranchA should now have 3 rows"); + assertEquals( + 3, + spark + .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchB'") + .collectAsList() + .size(), + "BranchB should still have 3 rows (unchanged)"); + } + } + // ===== CHERRY PICKING BETWEEN BRANCHES ===== @Test From 554a3c3ddb33ccf3ee74fe7cf2baba37683d8497 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Wed, 1 Oct 2025 18:34:48 -0700 Subject: [PATCH 06/35] refactoring for readability --- .../OpenHouseInternalTableOperations.java | 472 +++++++++++------- .../OpenHouseInternalTableOperationsTest.java | 4 +- .../spark/catalogtest/BranchTestSpark3_5.java | 4 +- 3 files changed, 307 insertions(+), 173 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java index 5ed27c62b..d486d1b63 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java @@ -292,7 +292,7 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { // Multi-branch support is now enabled with snapshot ID matching updatedMetadata = - maybeAppendSnapshots(updatedMetadata, appendedSnapshots, snapshotRefs, true); + applySnapshotOperations(updatedMetadata, appendedSnapshots, snapshotRefs, true); updatedMetadata = maybeDeleteSnapshots(updatedMetadata, deletedSnapshots); } @@ -571,97 +571,120 @@ private String determineTargetBranch( } /** - * Determines the target branch for snapshot commits using explicit branch targeting information. - * The snapshotRefs parameter contains the explicit branch targeting from the client commit - * operation. + * Returns the single target branch when only one branch is explicitly specified. This is the most + * common case - client explicitly specified which branch to commit to. */ - private String determineTargetBranch( - Map snapshotRefs, List newSnapshots, String defaultBranch) { - if (MapUtils.isEmpty(snapshotRefs)) { - return defaultBranch; + private String getSingleTargetBranch(Map snapshotRefs) { + String targetBranch = snapshotRefs.keySet().iterator().next(); + log.debug("Using explicit target branch from commit context: {}", targetBranch); + return targetBranch; + } + + /** + * Finds branches that exactly match the given snapshot ID. Returns the single matching branch, or + * null if there are zero or multiple matches. + */ + private String findExactSnapshotMatch(Map snapshotRefs, long snapshotId) { + List exactMatches = new ArrayList<>(); + for (Map.Entry entry : snapshotRefs.entrySet()) { + String branchName = entry.getKey(); + long branchSnapshotId = entry.getValue().snapshotId(); + + if (branchSnapshotId == snapshotId) { + exactMatches.add(branchName); + } } - // If there's only one branch in the refs, use that as the target - // This is the most common case - client explicitly specified which branch to commit to - if (snapshotRefs.size() == 1) { - String targetBranch = snapshotRefs.keySet().iterator().next(); - log.debug("Using explicit target branch from commit context: {}", targetBranch); + if (exactMatches.size() == 1) { + String targetBranch = exactMatches.get(0); + log.info( + "Determined target branch '{}' by exact snapshot ID match within commit context: {}", + targetBranch, + snapshotId); + return targetBranch; + } else if (exactMatches.size() > 1) { + log.error("Multiple branches point to same snapshot {}: {}", snapshotId, exactMatches); + throw new IllegalStateException( + String.format( + "Multiple explicitly targeted branches point to the same snapshot %s: %s. " + + "This indicates an invalid commit state.", + snapshotId, exactMatches)); + } + + // No exact match or zero matches + return null; + } + + /** + * Finds branches that match parent-child relationship with the given snapshot. Returns the single + * matching branch, or null if there are zero or multiple matches. + */ + private String findParentChildMatch( + Map snapshotRefs, long parentSnapshotId, long childSnapshotId) { + List parentMatches = new ArrayList<>(); + for (Map.Entry entry : snapshotRefs.entrySet()) { + String branchName = entry.getKey(); + long branchSnapshotId = entry.getValue().snapshotId(); + + if (branchSnapshotId == parentSnapshotId) { + parentMatches.add(branchName); + log.info("Branch '{}' matches parent snapshot {}", branchName, parentSnapshotId); + } + } + + if (parentMatches.size() == 1) { + String targetBranch = parentMatches.get(0); + log.info( + "Determined target branch '{}' by parent-child relationship within commit context: new snapshot {} is child of branch snapshot {}", + targetBranch, + childSnapshotId, + parentSnapshotId); return targetBranch; + } else if (parentMatches.size() > 1) { + log.error( + "Multiple branches point to parent snapshot {}: {}", parentSnapshotId, parentMatches); + throw new IllegalStateException( + String.format( + "Multiple explicitly targeted branches point to parent snapshot %s: %s. " + + "Cannot determine which branch should receive child snapshot %s. " + + "This indicates ambiguous commit targeting - the client should specify a single target branch.", + parentSnapshotId, parentMatches, childSnapshotId)); } - // Multiple branches specified in commit - need to determine which one based on snapshot - // relationships + // No parent match or zero matches - could happen in cherry-pick or other non-linear operations + return null; + } + + /** + * Determines target branch when multiple branches are specified by analyzing snapshot + * relationships. + */ + private String determineTargetFromMultipleBranches( + Map snapshotRefs, List newSnapshots) { + log.info( "Multiple branches in snapshotRefs ({}), analyzing snapshot relationships", snapshotRefs.size()); + if (!newSnapshots.isEmpty()) { Snapshot latestSnapshot = newSnapshots.get(newSnapshots.size() - 1); long latestSnapshotId = latestSnapshot.snapshotId(); log.info("Latest snapshot ID: {}", latestSnapshotId); - // First try: exact snapshot ID match within the explicitly targeted branches - List exactMatches = new ArrayList<>(); - for (Map.Entry entry : snapshotRefs.entrySet()) { - String branchName = entry.getKey(); - long branchSnapshotId = entry.getValue().snapshotId(); - - if (branchSnapshotId == latestSnapshotId) { - exactMatches.add(branchName); - } + // First try: exact snapshot ID match + String exactMatch = findExactSnapshotMatch(snapshotRefs, latestSnapshotId); + if (exactMatch != null) { + return exactMatch; } - if (exactMatches.size() == 1) { - String targetBranch = exactMatches.get(0); - log.info( - "Determined target branch '{}' by exact snapshot ID match within commit context: {}", - targetBranch, - latestSnapshotId); - return targetBranch; - } else if (exactMatches.size() > 1) { - log.error( - "Multiple branches point to same snapshot {}: {}", latestSnapshotId, exactMatches); - throw new IllegalStateException( - String.format( - "Multiple explicitly targeted branches point to the same snapshot %s: %s. " - + "This indicates an invalid commit state.", - latestSnapshotId, exactMatches)); - } - - // Second try: parent-child relationship match within the explicitly targeted branches + // Second try: parent-child relationship match Long parentSnapshotId = latestSnapshot.parentId(); log.info("Parent snapshot ID: {}", parentSnapshotId); if (parentSnapshotId != null) { - List parentMatches = new ArrayList<>(); - for (Map.Entry entry : snapshotRefs.entrySet()) { - String branchName = entry.getKey(); - long branchSnapshotId = entry.getValue().snapshotId(); - - if (branchSnapshotId == parentSnapshotId) { - parentMatches.add(branchName); - log.info("Branch '{}' matches parent snapshot {}", branchName, parentSnapshotId); - } - } - - if (parentMatches.size() == 1) { - String targetBranch = parentMatches.get(0); - log.info( - "Determined target branch '{}' by parent-child relationship within commit context: new snapshot {} is child of branch snapshot {}", - targetBranch, - latestSnapshotId, - parentSnapshotId); - return targetBranch; - } else if (parentMatches.size() > 1) { - log.error( - "Multiple branches point to parent snapshot {}: {}", parentSnapshotId, parentMatches); - throw new IllegalStateException( - String.format( - "Multiple explicitly targeted branches point to parent snapshot %s: %s. " - + "Cannot determine which branch should receive child snapshot %s. " - + "This indicates ambiguous commit targeting - the client should specify a single target branch.", - parentSnapshotId, parentMatches, latestSnapshotId)); + String parentMatch = findParentChildMatch(snapshotRefs, parentSnapshotId, latestSnapshotId); + if (parentMatch != null) { + return parentMatch; } - // If parentMatches.size() == 0, none of the explicitly targeted branches are parents - // This could happen in cherry-pick or other non-linear operations } } @@ -680,19 +703,197 @@ private String determineTargetBranch( snapshotRefs.keySet())); } - public TableMetadata maybeAppendSnapshots( - TableMetadata metadata, - List snapshotsToAppend, + /** + * Determines the target branch for snapshot commits using explicit branch targeting information. + * The snapshotRefs parameter contains the explicit branch targeting from the client commit + * operation. + */ + private String determineTargetBranch( + Map snapshotRefs, List newSnapshots, String defaultBranch) { + + // Handle simple case: no explicit branch targeting + if (MapUtils.isEmpty(snapshotRefs)) { + return defaultBranch; + } + + // Handle simple case: single branch explicitly specified + if (snapshotRefs.size() == 1) { + return getSingleTargetBranch(snapshotRefs); + } + + // Handle complex case: multiple branches with snapshot relationship analysis + return determineTargetFromMultipleBranches(snapshotRefs, newSnapshots); + } + + /** + * Applies a regular (non-WAP, non-cherry-picked) snapshot by assigning it to a branch or staging + * it. + */ + private void applyRegularSnapshot( + Snapshot snapshot, Map snapshotRefs, - boolean recordAction) { - TableMetadata.Builder metadataBuilder = TableMetadata.buildFrom(metadata); + TableMetadata.Builder metadataBuilder) { + + if (MapUtils.isNotEmpty(snapshotRefs)) { + // We have explicit branch information, use it to assign snapshot + String targetBranch = + determineTargetBranch( + snapshotRefs, Collections.singletonList(snapshot), SnapshotRef.MAIN_BRANCH); + metadataBuilder.setBranchSnapshot(snapshot, targetBranch); + } else { + // No branch information provided - add snapshot without assigning to any branch + // The snapshot will exist in metadata but won't be the HEAD of any branch + // Branch refs can be updated later via separate calls to applySnapshotOperations with + // snapshotRefs + metadataBuilder.addSnapshot(snapshot); + } + } + + /** Applies a WAP staged snapshot - not committed to any branch. */ + private void applyStagedSnapshot(Snapshot snapshot, TableMetadata.Builder metadataBuilder) { + metadataBuilder.addSnapshot(snapshot); + } + + /** Applies a cherry-picked snapshot - non fast-forward cherry pick. */ + private void applyCherryPickedSnapshot( + Snapshot snapshot, + Map snapshotRefs, + TableMetadata.Builder metadataBuilder) { + String targetBranch = + determineTargetBranch( + snapshotRefs, Collections.singletonList(snapshot), SnapshotRef.MAIN_BRANCH); + metadataBuilder.setBranchSnapshot(snapshot, targetBranch); + } + + /** Result of categorizing and applying snapshots. */ + private static class SnapshotOperationResult { + final List appendedSnapshots; + final List stagedSnapshots; + final List cherryPickedSnapshots; + + SnapshotOperationResult( + List appendedSnapshots, + List stagedSnapshots, + List cherryPickedSnapshots) { + this.appendedSnapshots = new ArrayList<>(appendedSnapshots); + this.stagedSnapshots = new ArrayList<>(stagedSnapshots); + this.cherryPickedSnapshots = new ArrayList<>(cherryPickedSnapshots); + } + } + + /** Categorizes snapshots by type and applies them to the metadata builder. */ + private SnapshotOperationResult categorizeAndApplySnapshots( + List snapshots, + Map snapshotRefs, + TableMetadata.Builder metadataBuilder) { + List appendedSnapshots = new ArrayList<>(); List stagedSnapshots = new ArrayList<>(); List cherryPickedSnapshots = new ArrayList<>(); + for (Snapshot snapshot : snapshots) { + snapshotInspector.validateSnapshot(snapshot); + + if (snapshot.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) { + applyStagedSnapshot(snapshot, metadataBuilder); + stagedSnapshots.add(String.valueOf(snapshot.snapshotId())); + + } else if (snapshot.summary().containsKey(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP)) { + applyCherryPickedSnapshot(snapshot, snapshotRefs, metadataBuilder); + appendedSnapshots.add(String.valueOf(snapshot.snapshotId())); + cherryPickedSnapshots.add( + String.valueOf(snapshot.summary().get(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP))); + + } else { + applyRegularSnapshot(snapshot, snapshotRefs, metadataBuilder); + appendedSnapshots.add(String.valueOf(snapshot.snapshotId())); + } + } + + return new SnapshotOperationResult(appendedSnapshots, stagedSnapshots, cherryPickedSnapshots); + } + + /** + * Updates branch references for fast-forward cherry-pick or rollback operations. Returns list of + * cherry-picked snapshot IDs. + */ + private List updateBranchReferences( + TableMetadata metadata, + Map snapshotRefs, + TableMetadata.Builder metadataBuilder) { + + List cherryPickedSnapshots = new ArrayList<>(); + + for (Map.Entry entry : snapshotRefs.entrySet()) { + String branchName = entry.getKey(); + long newSnapshotId = entry.getValue().snapshotId(); + + if (needsBranchUpdate(metadata, branchName, newSnapshotId)) { + metadataBuilder.setBranchSnapshot(newSnapshotId, branchName); + cherryPickedSnapshots.add(String.valueOf(newSnapshotId)); + } + } + + return cherryPickedSnapshots; + } + + /** Checks if a branch needs to be updated based on current refs and new snapshot ID. */ + private boolean needsBranchUpdate(TableMetadata metadata, String branchName, long newSnapshotId) { + if (MapUtils.isEmpty(metadata.refs())) { + // No refs exist yet, this is a new branch + return true; + } + + SnapshotRef currentRef = metadata.refs().get(branchName); + return currentRef == null || currentRef.snapshotId() != newSnapshotId; + } + + /** Records snapshot actions in table properties and reports metrics. */ + private void recordSnapshotActions( + TableMetadata metadata, + TableMetadata.Builder metadataBuilder, + List appendedSnapshots, + List stagedSnapshots, + List cherryPickedSnapshots) { + + Map updatedProperties = new HashMap<>(metadata.properties()); + + if (CollectionUtils.isNotEmpty(appendedSnapshots)) { + updatedProperties.put( + getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), + appendedSnapshots.stream().collect(Collectors.joining(","))); + metricsReporter.count( + InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR, appendedSnapshots.size()); + } + + if (CollectionUtils.isNotEmpty(stagedSnapshots)) { + updatedProperties.put( + getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS), + stagedSnapshots.stream().collect(Collectors.joining(","))); + metricsReporter.count( + InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR, stagedSnapshots.size()); + } + + if (CollectionUtils.isNotEmpty(cherryPickedSnapshots)) { + updatedProperties.put( + getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), + cherryPickedSnapshots.stream().collect(Collectors.joining(","))); + metricsReporter.count( + InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR, cherryPickedSnapshots.size()); + } + + metadataBuilder.setProperties(updatedProperties); + } + + public TableMetadata applySnapshotOperations( + TableMetadata metadata, + List snapshots, + Map snapshotRefs, + boolean recordAction) { + TableMetadata.Builder metadataBuilder = TableMetadata.buildFrom(metadata); + /** - * First check if there are new snapshots to be appended to current TableMetadata. If yes, - * following are the cases to be handled: + * Apply snapshots to current TableMetadata. The following cases are handled: * *

[1] A regular (non-wap) snapshot is being added to any branch. * @@ -704,99 +905,32 @@ public TableMetadata maybeAppendSnapshots( * created on. Recognized by SOURCE_SNAPSHOT_ID_PROP. This case is called non-fast forward * cherry pick. * - *

In case no new snapshots are to be appended to current TableMetadata, there could be a - * cherrypick of a staged (wap) snapshot on top of the current snapshot in any branch which is - * the same as the base snapshot the staged (wap) snapshot was created on. This case is called - * fast forward cherry pick. + *

Additionally, branch ref updates can occur independently for fast-forward cherry-pick or + * rollback operations where existing snapshots are assigned to branches. */ - if (CollectionUtils.isNotEmpty(snapshotsToAppend)) { - for (Snapshot snapshot : snapshotsToAppend) { - snapshotInspector.validateSnapshot(snapshot); - if (snapshot.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) { - // a stage only snapshot using wap.id - not committed to any branch - metadataBuilder.addSnapshot(snapshot); - stagedSnapshots.add(String.valueOf(snapshot.snapshotId())); - } else if (snapshot.summary().containsKey(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP)) { - // a snapshot created on a non fast-forward cherry-pick snapshot - // Determine target branch from snapshotRefs or default to MAIN_BRANCH - // Pass only the current snapshot being processed, not the entire list - String targetBranch = - determineTargetBranch( - snapshotRefs, Collections.singletonList(snapshot), SnapshotRef.MAIN_BRANCH); - metadataBuilder.setBranchSnapshot(snapshot, targetBranch); - appendedSnapshots.add(String.valueOf(snapshot.snapshotId())); - cherryPickedSnapshots.add( - String.valueOf(snapshot.summary().get(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP))); - } else { - // a regular snapshot - assign to appropriate branch using snapshotRefs context - if (MapUtils.isNotEmpty(snapshotRefs)) { - // We have explicit branch information, use it to assign snapshot - // Pass only the current snapshot being processed, not the entire list - String targetBranch = - determineTargetBranch( - snapshotRefs, Collections.singletonList(snapshot), SnapshotRef.MAIN_BRANCH); - metadataBuilder.setBranchSnapshot(snapshot, targetBranch); - } else { - // No explicit branch refs - treat as staged snapshot - // This maintains isolation until refs are explicitly updated - metadataBuilder.addSnapshot(snapshot); - } - appendedSnapshots.add(String.valueOf(snapshot.snapshotId())); - } - } - } + SnapshotOperationResult snapshotResult = + CollectionUtils.isNotEmpty(snapshots) + ? categorizeAndApplySnapshots(snapshots, snapshotRefs, metadataBuilder) + : new SnapshotOperationResult( + Collections.emptyList(), Collections.emptyList(), Collections.emptyList()); + + // Handle ref updates (this can happen independently of snapshot processing operations) + List refUpdateResults = + MapUtils.isNotEmpty(snapshotRefs) + ? updateBranchReferences(metadata, snapshotRefs, metadataBuilder) + : Collections.emptyList(); - // Handle ref updates (this can happen independently of snapshot append operations) - if (MapUtils.isNotEmpty(snapshotRefs)) { - // Handle ref updates for all branches (fast-forward cherry-pick or rollback operations) - for (Map.Entry entry : snapshotRefs.entrySet()) { - String branchName = entry.getKey(); - long newSnapshotId = entry.getValue().snapshotId(); - - // Check if this is an actual update for this branch - boolean isUpdate = false; - if (MapUtils.isEmpty(metadata.refs())) { - // No refs exist yet, this is a new branch - isUpdate = true; - } else { - SnapshotRef currentRef = metadata.refs().get(branchName); - if (currentRef == null || currentRef.snapshotId() != newSnapshotId) { - // Branch doesn't exist or snapshot is different - isUpdate = true; - } - } - - if (isUpdate) { - metadataBuilder.setBranchSnapshot(newSnapshotId, branchName); - cherryPickedSnapshots.add(String.valueOf(newSnapshotId)); - } - } - } if (recordAction) { - Map updatedProperties = new HashMap<>(metadata.properties()); - if (CollectionUtils.isNotEmpty(appendedSnapshots)) { - updatedProperties.put( - getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), - appendedSnapshots.stream().collect(Collectors.joining(","))); - metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR, appendedSnapshots.size()); - } - if (CollectionUtils.isNotEmpty(stagedSnapshots)) { - updatedProperties.put( - getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS), - stagedSnapshots.stream().collect(Collectors.joining(","))); - metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR, stagedSnapshots.size()); - } - if (CollectionUtils.isNotEmpty(cherryPickedSnapshots)) { - updatedProperties.put( - getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), - cherryPickedSnapshots.stream().collect(Collectors.joining(","))); - metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR, - cherryPickedSnapshots.size()); - } - metadataBuilder.setProperties(updatedProperties); + // Combine cherry-picked snapshots from both operations + List allCherryPickedSnapshots = new ArrayList<>(snapshotResult.cherryPickedSnapshots); + allCherryPickedSnapshots.addAll(refUpdateResults); + + recordSnapshotActions( + metadata, + metadataBuilder, + snapshotResult.appendedSnapshots, + snapshotResult.stagedSnapshots, + allCherryPickedSnapshots); } return metadataBuilder.build(); } diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java index bcec8377b..125966bf5 100644 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java @@ -599,12 +599,12 @@ void testAppendSnapshotsWithOldSnapshots() throws IOException { Assertions.assertThrows( IllegalArgumentException.class, () -> - openHouseInternalTableOperations.maybeAppendSnapshots( + openHouseInternalTableOperations.applySnapshotOperations( metadata, snapshots, ImmutableMap.of(), false)); // the latest snapshots have larger timestamp than the previous metadata timestamp, so it should // pass the validation snapshots.addAll(IcebergTestUtil.getFutureSnapshots()); - openHouseInternalTableOperations.maybeAppendSnapshots( + openHouseInternalTableOperations.applySnapshotOperations( metadata, snapshots, ImmutableMap.of(), false); } diff --git a/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java b/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java index c8a0f3e03..488750620 100644 --- a/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java +++ b/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java @@ -17,7 +17,7 @@ /** * Comprehensive tests for multi-branch WAP operations in Spark 3.5. Tests validate the enhanced - * maybeAppendSnapshots functionality that supports: - Non-main branch operations (add/expire + * applySnapshotOperations functionality that supports: - Non-main branch operations (add/expire * snapshots from any branch) - WAP.id staging with multi-branch support - Cherry picking between * any branches - Fast forward merges for all branches - Backward compatibility with main-only * workflows - Forward compatibility for future wap.branch features @@ -1311,7 +1311,7 @@ public void testCherryPickToMainWithFeatureBranch() throws Exception { spark.conf().unset("spark.wap.id"); spark.sql("INSERT INTO " + tableName + " VALUES ('main.advance')"); - // Cherry-pick WAP to main branch (this tests our enhanced maybeAppendSnapshots) + // Cherry-pick WAP to main branch (this tests our enhanced applySnapshotOperations) // Main should have 2 rows now (main.base + main.advance) assertEquals(2, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); spark.sql( From ea5ff0ee93f66525dd40bb5e61606ebc1eea78d7 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Wed, 1 Oct 2025 18:50:06 -0700 Subject: [PATCH 07/35] fixed edge case --- .../OpenHouseInternalTableOperations.java | 95 +++++++++++++------ 1 file changed, 65 insertions(+), 30 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java index d486d1b63..4f4f6871c 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java @@ -779,6 +779,11 @@ private static class SnapshotOperationResult { this.stagedSnapshots = new ArrayList<>(stagedSnapshots); this.cherryPickedSnapshots = new ArrayList<>(cherryPickedSnapshots); } + + static SnapshotOperationResult empty() { + return new SnapshotOperationResult( + Collections.emptyList(), Collections.emptyList(), Collections.emptyList()); + } } /** Categorizes snapshots by type and applies them to the metadata builder. */ @@ -814,29 +819,53 @@ private SnapshotOperationResult categorizeAndApplySnapshots( } /** - * Updates branch references for fast-forward cherry-pick or rollback operations. Returns list of - * cherry-picked snapshot IDs. + * Updates branch references to point to specific snapshots. + * + *

This handles two scenarios: + * + *

    + *
  • Standalone ref operations: Moving branches to existing snapshots (fast-forward/rollback) + *
  • Guided snapshot assignment: Using refs to guide where new snapshots should be assigned + *
+ * + * @param recordAsCherryPicks whether to record ref updates as cherry-pick operations + * @return list of snapshot IDs that were cherry-picked (only when recordAsCherryPicks is true) */ private List updateBranchReferences( TableMetadata metadata, Map snapshotRefs, - TableMetadata.Builder metadataBuilder) { + TableMetadata.Builder metadataBuilder, + boolean recordAsCherryPicks) { List cherryPickedSnapshots = new ArrayList<>(); for (Map.Entry entry : snapshotRefs.entrySet()) { String branchName = entry.getKey(); - long newSnapshotId = entry.getValue().snapshotId(); + long targetSnapshotId = entry.getValue().snapshotId(); + + if (needsBranchUpdate(metadata, branchName, targetSnapshotId)) { + metadataBuilder.setBranchSnapshot(targetSnapshotId, branchName); - if (needsBranchUpdate(metadata, branchName, newSnapshotId)) { - metadataBuilder.setBranchSnapshot(newSnapshotId, branchName); - cherryPickedSnapshots.add(String.valueOf(newSnapshotId)); + if (recordAsCherryPicks) { + cherryPickedSnapshots.add(String.valueOf(targetSnapshotId)); + } } } return cherryPickedSnapshots; } + /** + * Combines cherry-picked snapshot IDs from both snapshot processing and standalone ref + * operations. + */ + private List combineCherryPickedSnapshots( + List fromSnapshotProcessing, List fromStandaloneRefUpdates) { + List allCherryPicks = new ArrayList<>(fromSnapshotProcessing); + allCherryPicks.addAll(fromStandaloneRefUpdates); + return allCherryPicks; + } + /** Checks if a branch needs to be updated based on current refs and new snapshot ID. */ private boolean needsBranchUpdate(TableMetadata metadata, String branchName, long newSnapshotId) { if (MapUtils.isEmpty(metadata.refs())) { @@ -893,43 +922,49 @@ public TableMetadata applySnapshotOperations( TableMetadata.Builder metadataBuilder = TableMetadata.buildFrom(metadata); /** - * Apply snapshots to current TableMetadata. The following cases are handled: + * Process snapshots and branch reference updates. Two main operation types: * - *

[1] A regular (non-wap) snapshot is being added to any branch. + *

Snapshot Processing: When snapshots list is non-empty: * - *

[2] A staged (wap) snapshot is being created on top of current snapshot as its base. - * Recognized by STAGED_WAP_ID_PROP. These are stage-only and not committed to any branch. + *

    + *
  • [1] Regular snapshots - committed to branches (if snapshotRefs provided) or staged + *
  • [2] WAP staged snapshots (STAGED_WAP_ID_PROP) - staged but not committed to branches + *
  • [3] Cherry-picked snapshots (SOURCE_SNAPSHOT_ID_PROP) - committed to target branches + *
* - *

[3] A staged (wap) snapshot is being cherry picked to any branch wherein current snapshot - * in the target branch is not the same as the base snapshot the staged (wap) snapshot was - * created on. Recognized by SOURCE_SNAPSHOT_ID_PROP. This case is called non-fast forward - * cherry pick. + *

Branch Reference Updates: When snapshotRefs is non-empty: * - *

Additionally, branch ref updates can occur independently for fast-forward cherry-pick or - * rollback operations where existing snapshots are assigned to branches. + *

    + *
  • If snapshots are also provided: snapshotRefs guides branch assignment during processing + *
  • If only snapshotRefs provided: standalone fast-forward/rollback operations on existing + * snapshots + *
*/ - SnapshotOperationResult snapshotResult = + SnapshotOperationResult snapshotProcessingResults = CollectionUtils.isNotEmpty(snapshots) ? categorizeAndApplySnapshots(snapshots, snapshotRefs, metadataBuilder) - : new SnapshotOperationResult( - Collections.emptyList(), Collections.emptyList(), Collections.emptyList()); + : SnapshotOperationResult.empty(); - // Handle ref updates (this can happen independently of snapshot processing operations) - List refUpdateResults = - MapUtils.isNotEmpty(snapshotRefs) - ? updateBranchReferences(metadata, snapshotRefs, metadataBuilder) - : Collections.emptyList(); + // Update branch references (for standalone fast-forward/rollback operations) + List standaloneRefCherryPicks = Collections.emptyList(); + if (MapUtils.isNotEmpty(snapshotRefs)) { + boolean recordRefUpdatesAsCherryPicks = CollectionUtils.isEmpty(snapshots); + standaloneRefCherryPicks = + updateBranchReferences( + metadata, snapshotRefs, metadataBuilder, recordRefUpdatesAsCherryPicks); + } if (recordAction) { - // Combine cherry-picked snapshots from both operations - List allCherryPickedSnapshots = new ArrayList<>(snapshotResult.cherryPickedSnapshots); - allCherryPickedSnapshots.addAll(refUpdateResults); + // Combine cherry-picked snapshots from both snapshot processing and standalone ref updates + List allCherryPickedSnapshots = + combineCherryPickedSnapshots( + snapshotProcessingResults.cherryPickedSnapshots, standaloneRefCherryPicks); recordSnapshotActions( metadata, metadataBuilder, - snapshotResult.appendedSnapshots, - snapshotResult.stagedSnapshots, + snapshotProcessingResults.appendedSnapshots, + snapshotProcessingResults.stagedSnapshots, allCherryPickedSnapshots); } return metadataBuilder.build(); From 4fc379215db713fc370865c4c9d6bee6cc2e71f1 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Thu, 2 Oct 2025 12:49:52 -0700 Subject: [PATCH 08/35] refactoring to make more simple --- .../OpenHouseInternalTableOperations.java | 571 +++++++----------- 1 file changed, 229 insertions(+), 342 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java index 4f4f6871c..19dee8435 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java @@ -558,301 +558,22 @@ public TableMetadata maybeDeleteSnapshots( return result; } - /** - * Determines the target branch for a snapshot commit based on the provided snapshotRefs. - * - * @param snapshotRefs map of branch names to snapshot references - * @param defaultBranch default branch to use if no specific branch can be determined - * @return target branch name for the snapshot commit - */ - private String determineTargetBranch( - Map snapshotRefs, String defaultBranch) { - return determineTargetBranch(snapshotRefs, Collections.emptyList(), defaultBranch); - } - - /** - * Returns the single target branch when only one branch is explicitly specified. This is the most - * common case - client explicitly specified which branch to commit to. - */ - private String getSingleTargetBranch(Map snapshotRefs) { - String targetBranch = snapshotRefs.keySet().iterator().next(); - log.debug("Using explicit target branch from commit context: {}", targetBranch); - return targetBranch; - } - - /** - * Finds branches that exactly match the given snapshot ID. Returns the single matching branch, or - * null if there are zero or multiple matches. - */ - private String findExactSnapshotMatch(Map snapshotRefs, long snapshotId) { - List exactMatches = new ArrayList<>(); - for (Map.Entry entry : snapshotRefs.entrySet()) { - String branchName = entry.getKey(); - long branchSnapshotId = entry.getValue().snapshotId(); - - if (branchSnapshotId == snapshotId) { - exactMatches.add(branchName); - } - } - - if (exactMatches.size() == 1) { - String targetBranch = exactMatches.get(0); - log.info( - "Determined target branch '{}' by exact snapshot ID match within commit context: {}", - targetBranch, - snapshotId); - return targetBranch; - } else if (exactMatches.size() > 1) { - log.error("Multiple branches point to same snapshot {}: {}", snapshotId, exactMatches); - throw new IllegalStateException( - String.format( - "Multiple explicitly targeted branches point to the same snapshot %s: %s. " - + "This indicates an invalid commit state.", - snapshotId, exactMatches)); - } - - // No exact match or zero matches - return null; - } - - /** - * Finds branches that match parent-child relationship with the given snapshot. Returns the single - * matching branch, or null if there are zero or multiple matches. - */ - private String findParentChildMatch( - Map snapshotRefs, long parentSnapshotId, long childSnapshotId) { - List parentMatches = new ArrayList<>(); - for (Map.Entry entry : snapshotRefs.entrySet()) { - String branchName = entry.getKey(); - long branchSnapshotId = entry.getValue().snapshotId(); - - if (branchSnapshotId == parentSnapshotId) { - parentMatches.add(branchName); - log.info("Branch '{}' matches parent snapshot {}", branchName, parentSnapshotId); - } - } - - if (parentMatches.size() == 1) { - String targetBranch = parentMatches.get(0); - log.info( - "Determined target branch '{}' by parent-child relationship within commit context: new snapshot {} is child of branch snapshot {}", - targetBranch, - childSnapshotId, - parentSnapshotId); - return targetBranch; - } else if (parentMatches.size() > 1) { - log.error( - "Multiple branches point to parent snapshot {}: {}", parentSnapshotId, parentMatches); - throw new IllegalStateException( - String.format( - "Multiple explicitly targeted branches point to parent snapshot %s: %s. " - + "Cannot determine which branch should receive child snapshot %s. " - + "This indicates ambiguous commit targeting - the client should specify a single target branch.", - parentSnapshotId, parentMatches, childSnapshotId)); - } + /** Represents the semantic difference between current server state and client-desired state. */ + private static class StateDiff { + final List newSnapshots; + final Map branchUpdates; // branch -> snapshotId + final Map snapshotLookup; // snapshotId -> Snapshot for efficiency - // No parent match or zero matches - could happen in cherry-pick or other non-linear operations - return null; - } - - /** - * Determines target branch when multiple branches are specified by analyzing snapshot - * relationships. - */ - private String determineTargetFromMultipleBranches( - Map snapshotRefs, List newSnapshots) { - - log.info( - "Multiple branches in snapshotRefs ({}), analyzing snapshot relationships", - snapshotRefs.size()); - - if (!newSnapshots.isEmpty()) { - Snapshot latestSnapshot = newSnapshots.get(newSnapshots.size() - 1); - long latestSnapshotId = latestSnapshot.snapshotId(); - log.info("Latest snapshot ID: {}", latestSnapshotId); - - // First try: exact snapshot ID match - String exactMatch = findExactSnapshotMatch(snapshotRefs, latestSnapshotId); - if (exactMatch != null) { - return exactMatch; - } - - // Second try: parent-child relationship match - Long parentSnapshotId = latestSnapshot.parentId(); - log.info("Parent snapshot ID: {}", parentSnapshotId); - if (parentSnapshotId != null) { - String parentMatch = findParentChildMatch(snapshotRefs, parentSnapshotId, latestSnapshotId); - if (parentMatch != null) { - return parentMatch; - } - } + StateDiff( + List newSnapshots, + Map branchUpdates, + List allClientSnapshots) { + this.newSnapshots = List.copyOf(newSnapshots); + this.branchUpdates = Map.copyOf(branchUpdates); + this.snapshotLookup = + allClientSnapshots.stream() + .collect(Collectors.toMap(s -> String.valueOf(s.snapshotId()), s -> s)); } - - // If we reach here, we have multiple explicitly targeted branches but couldn't determine - // the target based on snapshot relationships. This suggests the commit operation itself - // is ambiguous or invalid. - log.error( - "Cannot determine target branch from explicitly targeted branches: {}", - snapshotRefs.keySet()); - throw new IllegalStateException( - String.format( - "Cannot determine target branch from explicitly targeted branches: %s. " - + "The commit specifies multiple target branches but snapshot relationships " - + "don't clearly indicate which branch should receive the new snapshots. " - + "This suggests an invalid or ambiguous commit operation.", - snapshotRefs.keySet())); - } - - /** - * Determines the target branch for snapshot commits using explicit branch targeting information. - * The snapshotRefs parameter contains the explicit branch targeting from the client commit - * operation. - */ - private String determineTargetBranch( - Map snapshotRefs, List newSnapshots, String defaultBranch) { - - // Handle simple case: no explicit branch targeting - if (MapUtils.isEmpty(snapshotRefs)) { - return defaultBranch; - } - - // Handle simple case: single branch explicitly specified - if (snapshotRefs.size() == 1) { - return getSingleTargetBranch(snapshotRefs); - } - - // Handle complex case: multiple branches with snapshot relationship analysis - return determineTargetFromMultipleBranches(snapshotRefs, newSnapshots); - } - - /** - * Applies a regular (non-WAP, non-cherry-picked) snapshot by assigning it to a branch or staging - * it. - */ - private void applyRegularSnapshot( - Snapshot snapshot, - Map snapshotRefs, - TableMetadata.Builder metadataBuilder) { - - if (MapUtils.isNotEmpty(snapshotRefs)) { - // We have explicit branch information, use it to assign snapshot - String targetBranch = - determineTargetBranch( - snapshotRefs, Collections.singletonList(snapshot), SnapshotRef.MAIN_BRANCH); - metadataBuilder.setBranchSnapshot(snapshot, targetBranch); - } else { - // No branch information provided - add snapshot without assigning to any branch - // The snapshot will exist in metadata but won't be the HEAD of any branch - // Branch refs can be updated later via separate calls to applySnapshotOperations with - // snapshotRefs - metadataBuilder.addSnapshot(snapshot); - } - } - - /** Applies a WAP staged snapshot - not committed to any branch. */ - private void applyStagedSnapshot(Snapshot snapshot, TableMetadata.Builder metadataBuilder) { - metadataBuilder.addSnapshot(snapshot); - } - - /** Applies a cherry-picked snapshot - non fast-forward cherry pick. */ - private void applyCherryPickedSnapshot( - Snapshot snapshot, - Map snapshotRefs, - TableMetadata.Builder metadataBuilder) { - String targetBranch = - determineTargetBranch( - snapshotRefs, Collections.singletonList(snapshot), SnapshotRef.MAIN_BRANCH); - metadataBuilder.setBranchSnapshot(snapshot, targetBranch); - } - - /** Result of categorizing and applying snapshots. */ - private static class SnapshotOperationResult { - final List appendedSnapshots; - final List stagedSnapshots; - final List cherryPickedSnapshots; - - SnapshotOperationResult( - List appendedSnapshots, - List stagedSnapshots, - List cherryPickedSnapshots) { - this.appendedSnapshots = new ArrayList<>(appendedSnapshots); - this.stagedSnapshots = new ArrayList<>(stagedSnapshots); - this.cherryPickedSnapshots = new ArrayList<>(cherryPickedSnapshots); - } - - static SnapshotOperationResult empty() { - return new SnapshotOperationResult( - Collections.emptyList(), Collections.emptyList(), Collections.emptyList()); - } - } - - /** Categorizes snapshots by type and applies them to the metadata builder. */ - private SnapshotOperationResult categorizeAndApplySnapshots( - List snapshots, - Map snapshotRefs, - TableMetadata.Builder metadataBuilder) { - - List appendedSnapshots = new ArrayList<>(); - List stagedSnapshots = new ArrayList<>(); - List cherryPickedSnapshots = new ArrayList<>(); - - for (Snapshot snapshot : snapshots) { - snapshotInspector.validateSnapshot(snapshot); - - if (snapshot.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) { - applyStagedSnapshot(snapshot, metadataBuilder); - stagedSnapshots.add(String.valueOf(snapshot.snapshotId())); - - } else if (snapshot.summary().containsKey(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP)) { - applyCherryPickedSnapshot(snapshot, snapshotRefs, metadataBuilder); - appendedSnapshots.add(String.valueOf(snapshot.snapshotId())); - cherryPickedSnapshots.add( - String.valueOf(snapshot.summary().get(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP))); - - } else { - applyRegularSnapshot(snapshot, snapshotRefs, metadataBuilder); - appendedSnapshots.add(String.valueOf(snapshot.snapshotId())); - } - } - - return new SnapshotOperationResult(appendedSnapshots, stagedSnapshots, cherryPickedSnapshots); - } - - /** - * Updates branch references to point to specific snapshots. - * - *

This handles two scenarios: - * - *

    - *
  • Standalone ref operations: Moving branches to existing snapshots (fast-forward/rollback) - *
  • Guided snapshot assignment: Using refs to guide where new snapshots should be assigned - *
- * - * @param recordAsCherryPicks whether to record ref updates as cherry-pick operations - * @return list of snapshot IDs that were cherry-picked (only when recordAsCherryPicks is true) - */ - private List updateBranchReferences( - TableMetadata metadata, - Map snapshotRefs, - TableMetadata.Builder metadataBuilder, - boolean recordAsCherryPicks) { - - List cherryPickedSnapshots = new ArrayList<>(); - - for (Map.Entry entry : snapshotRefs.entrySet()) { - String branchName = entry.getKey(); - long targetSnapshotId = entry.getValue().snapshotId(); - - if (needsBranchUpdate(metadata, branchName, targetSnapshotId)) { - metadataBuilder.setBranchSnapshot(targetSnapshotId, branchName); - - if (recordAsCherryPicks) { - cherryPickedSnapshots.add(String.valueOf(targetSnapshotId)); - } - } - } - - return cherryPickedSnapshots; } /** @@ -914,60 +635,226 @@ private void recordSnapshotActions( metadataBuilder.setProperties(updatedProperties); } + /** + * Applies client-requested changes to server state using functional approach. + * + *

Contract: currentState + clientDesiredState -> newState + metrics + * + *

Client sends desired final state, server computes semantic diff and applies it. + */ public TableMetadata applySnapshotOperations( - TableMetadata metadata, - List snapshots, - Map snapshotRefs, + TableMetadata currentMetadata, + List clientSnapshots, + Map clientRefs, boolean recordAction) { - TableMetadata.Builder metadataBuilder = TableMetadata.buildFrom(metadata); - /** - * Process snapshots and branch reference updates. Two main operation types: - * - *

Snapshot Processing: When snapshots list is non-empty: - * - *

    - *
  • [1] Regular snapshots - committed to branches (if snapshotRefs provided) or staged - *
  • [2] WAP staged snapshots (STAGED_WAP_ID_PROP) - staged but not committed to branches - *
  • [3] Cherry-picked snapshots (SOURCE_SNAPSHOT_ID_PROP) - committed to target branches - *
- * - *

Branch Reference Updates: When snapshotRefs is non-empty: - * - *

    - *
  • If snapshots are also provided: snapshotRefs guides branch assignment during processing - *
  • If only snapshotRefs provided: standalone fast-forward/rollback operations on existing - * snapshots - *
- */ - SnapshotOperationResult snapshotProcessingResults = - CollectionUtils.isNotEmpty(snapshots) - ? categorizeAndApplySnapshots(snapshots, snapshotRefs, metadataBuilder) - : SnapshotOperationResult.empty(); - - // Update branch references (for standalone fast-forward/rollback operations) - List standaloneRefCherryPicks = Collections.emptyList(); - if (MapUtils.isNotEmpty(snapshotRefs)) { - boolean recordRefUpdatesAsCherryPicks = CollectionUtils.isEmpty(snapshots); - standaloneRefCherryPicks = - updateBranchReferences( - metadata, snapshotRefs, metadataBuilder, recordRefUpdatesAsCherryPicks); - } + return computeStateDiff(currentMetadata, clientSnapshots, clientRefs) + .map( + diff -> { + TableMetadata newMetadata = applyStateDiff(currentMetadata, diff); + return recordAction + ? recordTransition(currentMetadata, newMetadata, diff) + : newMetadata; + }) + .orElse(currentMetadata); + } + + /** Computes semantic difference between current server state and client-desired state. */ + private Optional computeStateDiff( + TableMetadata currentMetadata, + List clientSnapshots, + Map clientRefs) { + + if (CollectionUtils.isEmpty(clientSnapshots) && MapUtils.isEmpty(clientRefs)) { + return Optional.empty(); // No changes requested + } + + Set currentSnapshotIds = + currentMetadata.snapshots().stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + + // Find truly new snapshots (not in current metadata) + List newSnapshots = + Optional.ofNullable(clientSnapshots).orElse(Collections.emptyList()).stream() + .filter(s -> !currentSnapshotIds.contains(s.snapshotId())) + .collect(Collectors.toList()); + + // Find branch updates needed + Map branchUpdates = + Optional.ofNullable(clientRefs).orElse(Collections.emptyMap()).entrySet().stream() + .filter( + entry -> + needsBranchUpdate( + currentMetadata, entry.getKey(), entry.getValue().snapshotId())) + .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().snapshotId())); + + return Optional.of( + new StateDiff( + newSnapshots, + branchUpdates, + Optional.ofNullable(clientSnapshots).orElse(Collections.emptyList()))); + } + + /** Applies the computed state diff to create new metadata. */ + private TableMetadata applyStateDiff(TableMetadata currentMetadata, StateDiff diff) { + TableMetadata.Builder builder = TableMetadata.buildFrom(currentMetadata); + + // Add new snapshots (respecting Iceberg semantics) + diff.newSnapshots.forEach( + snapshot -> { + snapshotInspector.validateSnapshot(snapshot); + + if (isWapStaged(snapshot)) { + // WAP snapshots are always staged (never assigned to branches initially) + builder.addSnapshot(snapshot); + } else { + // All other snapshots: assign to branch if specified, otherwise stage + findTargetBranchForSnapshot(snapshot, diff.branchUpdates) + .ifPresentOrElse( + targetBranch -> builder.setBranchSnapshot(snapshot, targetBranch), + () -> builder.addSnapshot(snapshot)); + } + }); + + // Update branch pointers to existing snapshots + diff.branchUpdates.entrySet().stream() + .filter(entry -> !isNewSnapshot(entry.getValue(), diff.newSnapshots)) + .forEach(entry -> builder.setBranchSnapshot(entry.getValue(), entry.getKey())); + + return builder.build(); + } + + /** Checks if snapshot is WAP staged (should not be assigned to any branch). */ + private boolean isWapStaged(Snapshot snapshot) { + return snapshot.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP); + } + + /** Checks if snapshot is cherry-picked (should go directly to target branch). */ + private boolean isCherryPicked(Snapshot snapshot) { + return snapshot.summary().containsKey(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP); + } - if (recordAction) { - // Combine cherry-picked snapshots from both snapshot processing and standalone ref updates - List allCherryPickedSnapshots = - combineCherryPickedSnapshots( - snapshotProcessingResults.cherryPickedSnapshots, standaloneRefCherryPicks); - - recordSnapshotActions( - metadata, - metadataBuilder, - snapshotProcessingResults.appendedSnapshots, - snapshotProcessingResults.stagedSnapshots, - allCherryPickedSnapshots); + /** + * Finds which branch this snapshot should be assigned to based on branch updates. Fails fast if + * multiple branches want the same snapshot (ambiguous commit). + */ + private Optional findTargetBranchForSnapshot( + Snapshot snapshot, Map branchUpdates) { + List matchingBranches = + branchUpdates.entrySet().stream() + .filter(entry -> entry.getValue() == snapshot.snapshotId()) + .map(Map.Entry::getKey) + .toList(); + + if (matchingBranches.size() > 1) { + throw new IllegalStateException( + "Multiple branches (%s) specify the same target snapshot %d. " + + "This indicates an ambiguous commit operation - each snapshot can only be assigned to one branch." + .formatted(matchingBranches, snapshot.snapshotId())); + } + + return matchingBranches.stream().findFirst(); + } + + /** Checks if this snapshot ID is in the list of new snapshots being added. */ + private boolean isNewSnapshot(Long snapshotId, List newSnapshots) { + return newSnapshots.stream().anyMatch(s -> s.snapshotId() == snapshotId); + } + + /** Records metrics and properties about the state transition that occurred. */ + private TableMetadata recordTransition( + TableMetadata originalMetadata, TableMetadata newMetadata, StateDiff diff) { + + Map properties = new HashMap<>(newMetadata.properties()); + + // Categorize new snapshots by their semantic type for metrics + Map> snapshotsByType = + diff.newSnapshots.stream() + .collect( + Collectors.groupingBy( + this::getSnapshotCategory, + Collectors.mapping(s -> String.valueOf(s.snapshotId()), Collectors.toList()))); + + // Record snapshot metrics by type + recordIfPresent( + properties, + snapshotsByType, + "appended", + CatalogConstants.APPENDED_SNAPSHOTS, + InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR); + recordIfPresent( + properties, + snapshotsByType, + "staged", + CatalogConstants.STAGED_SNAPSHOTS, + InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR); + + // For cherry-picked snapshots, record the SOURCE snapshot IDs that were cherry-picked + List cherryPickSourceIds = + diff.newSnapshots.stream() + .filter(this::isCherryPicked) + .map(this::getCherryPickSourceId) + .filter(Optional::isPresent) + .map(Optional::get) + .collect(Collectors.toList()); + + if (!cherryPickSourceIds.isEmpty()) { + properties.put( + getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), + String.join(",", cherryPickSourceIds)); + metricsReporter.count( + InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR, cherryPickSourceIds.size()); + } + + // Record branch updates that don't involve new snapshots (pure ref moves) + List refOnlyCherryPicks = + diff.branchUpdates.entrySet().stream() + .filter(entry -> !isNewSnapshot(entry.getValue(), diff.newSnapshots)) + .map(entry -> String.valueOf(entry.getValue())) + .collect(Collectors.toList()); + + if (!refOnlyCherryPicks.isEmpty()) { + String existing = + properties.get(getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS)); + String combined = + existing != null + ? existing + "," + String.join(",", refOnlyCherryPicks) + : String.join(",", refOnlyCherryPicks); + properties.put(getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), combined); + metricsReporter.count( + InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR, refOnlyCherryPicks.size()); } - return metadataBuilder.build(); + + return TableMetadata.buildFrom(newMetadata).setProperties(properties).build(); + } + + /** Categorizes snapshot for metrics based on its semantic type. */ + private String getSnapshotCategory(Snapshot snapshot) { + if (isWapStaged(snapshot)) return "staged"; + if (isCherryPicked(snapshot)) + return "appended"; // Cherry-picked snapshots are NEW, so they're "appended" + return "appended"; + } + + /** Extracts the source snapshot ID for cherry-picked snapshots. */ + private Optional getCherryPickSourceId(Snapshot snapshot) { + return Optional.ofNullable(snapshot.summary().get(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP)); + } + + /** Records snapshot category in properties if snapshots exist. */ + private void recordIfPresent( + Map properties, + Map> categorized, + String category, + String propertyKey, + String metricKey) { + + Optional.ofNullable(categorized.get(category)) + .filter(CollectionUtils::isNotEmpty) + .ifPresent( + snapshots -> { + properties.put(getCanonicalFieldName(propertyKey), String.join(",", snapshots)); + metricsReporter.count(metricKey, snapshots.size()); + }); } /** Helper function to dump contents for map in debugging mode. */ From 9d6aec0278527bc3d62546b4819a67c563c1f49f Mon Sep 17 00:00:00 2001 From: cbb330 Date: Thu, 2 Oct 2025 14:40:32 -0700 Subject: [PATCH 09/35] removing unused function --- .../catalog/OpenHouseInternalTableOperations.java | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java index 19dee8435..16577bdbd 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java @@ -23,7 +23,6 @@ import java.io.IOException; import java.time.Clock; import java.time.Instant; -import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -576,17 +575,6 @@ private static class StateDiff { } } - /** - * Combines cherry-picked snapshot IDs from both snapshot processing and standalone ref - * operations. - */ - private List combineCherryPickedSnapshots( - List fromSnapshotProcessing, List fromStandaloneRefUpdates) { - List allCherryPicks = new ArrayList<>(fromSnapshotProcessing); - allCherryPicks.addAll(fromStandaloneRefUpdates); - return allCherryPicks; - } - /** Checks if a branch needs to be updated based on current refs and new snapshot ID. */ private boolean needsBranchUpdate(TableMetadata metadata, String branchName, long newSnapshotId) { if (MapUtils.isEmpty(metadata.refs())) { From bf5a474725ad6639d42126af44ccaba1eb11984f Mon Sep 17 00:00:00 2001 From: cbb330 Date: Mon, 6 Oct 2025 18:01:55 -0700 Subject: [PATCH 10/35] workign tests for ambiguous commits --- .../OpenHouseInternalTableOperations.java | 62 +- .../OpenHouseInternalTableOperationsTest.java | 567 +++++++++++++++++- 2 files changed, 622 insertions(+), 7 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java index 16577bdbd..dc9ab7ead 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java @@ -23,6 +23,7 @@ import java.io.IOException; import java.time.Clock; import java.time.Instant; +import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -538,6 +539,9 @@ public TableMetadata maybeDeleteSnapshots( TableMetadata metadata, List snapshotsToDelete) { TableMetadata result = metadata; if (CollectionUtils.isNotEmpty(snapshotsToDelete)) { + // Validate that snapshots to delete are not referenced by any branches or tags + validateSnapshotsNotReferenced(metadata, snapshotsToDelete); + Set snapshotIds = snapshotsToDelete.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); Map updatedProperties = new HashMap<>(result.properties()); @@ -552,7 +556,7 @@ public TableMetadata maybeDeleteSnapshots( .build() .removeSnapshotsIf(s -> snapshotIds.contains(s.snapshotId())); metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR, snapshotsToDelete.size()); + InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR, (double) snapshotsToDelete.size()); } return result; } @@ -586,6 +590,59 @@ private boolean needsBranchUpdate(TableMetadata metadata, String branchName, lon return currentRef == null || currentRef.snapshotId() != newSnapshotId; } + /** Validates that no two branches are trying to point to the same snapshot (ambiguous commit). */ + private void validateNoBranchConflicts(Map branchUpdates) { + // Group branches by target snapshot ID + Map> snapshotToBranches = new HashMap<>(); + for (Map.Entry entry : branchUpdates.entrySet()) { + snapshotToBranches + .computeIfAbsent(entry.getValue(), k -> new ArrayList<>()) + .add(entry.getKey()); + } + + // Check for conflicts (multiple branches pointing to same snapshot) + for (Map.Entry> entry : snapshotToBranches.entrySet()) { + List branches = entry.getValue(); + if (branches.size() > 1) { + throw new IllegalStateException( + String.format( + "Multiple branches (%s) specify the same target snapshot %d. " + + "This indicates an ambiguous commit operation - each snapshot can only be assigned to one branch.", + branches, entry.getKey())); + } + } + } + + /** Validates that snapshots to be deleted are not referenced by any branches or tags. */ + private void validateSnapshotsNotReferenced( + TableMetadata metadata, List snapshotsToDelete) { + if (MapUtils.isEmpty(metadata.refs()) || CollectionUtils.isEmpty(snapshotsToDelete)) { + return; // No refs to check or no snapshots to delete + } + + Set snapshotIdsToDelete = + snapshotsToDelete.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + + // Check if any snapshot to delete is referenced by branches or tags + for (Map.Entry refEntry : metadata.refs().entrySet()) { + String refName = refEntry.getKey(); + SnapshotRef ref = refEntry.getValue(); + + if (snapshotIdsToDelete.contains(ref.snapshotId())) { + List referencingRefs = + metadata.refs().entrySet().stream() + .filter(entry -> snapshotIdsToDelete.contains(entry.getValue().snapshotId())) + .map(Map.Entry::getKey) + .collect(Collectors.toList()); + + throw new IllegalArgumentException( + String.format( + "Cannot expire %d. Still referenced by refs: %s", + ref.snapshotId(), referencingRefs)); + } + } + } + /** Records snapshot actions in table properties and reports metrics. */ private void recordSnapshotActions( TableMetadata metadata, @@ -675,6 +732,9 @@ private Optional computeStateDiff( currentMetadata, entry.getKey(), entry.getValue().snapshotId())) .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().snapshotId())); + // Check for ambiguous commits: multiple branches trying to point to the same snapshot + validateNoBranchConflicts(branchUpdates); + return Optional.of( new StateDiff( newSnapshots, diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java index 125966bf5..e09e45d0d 100644 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java @@ -29,6 +29,7 @@ import java.util.UUID; import java.util.function.Consumer; import java.util.stream.Collectors; +import java.util.stream.IntStream; import lombok.SneakyThrows; import org.apache.commons.compress.utils.Lists; import org.apache.hadoop.conf.Configuration; @@ -480,35 +481,95 @@ void testDoCommitExceptionHandling() { } @Test - void testDoCommitSnapshotsValidationExceptionHandling() throws IOException { + void testDoCommitWithValidSnapshotDeletion() throws IOException { TableMetadata metadata = BASE_TABLE_METADATA.replaceProperties(ImmutableMap.of("random", "value")); List testSnapshots = IcebergTestUtil.getSnapshots(); Map properties = new HashMap<>(metadata.properties()); + + // The key insight: SNAPSHOTS_JSON_KEY determines what snapshots SHOULD exist after commit + // Only include snapshot 2 - this means snapshots 0 and 1 should be deleted properties.put( CatalogConstants.SNAPSHOTS_JSON_KEY, - SnapshotsUtil.serializedSnapshots(testSnapshots.subList(1, 3))); + SnapshotsUtil.serializedSnapshots(testSnapshots.subList(2, 3))); // Only snapshot 2 properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( IcebergTestUtil.obtainSnapshotRefsFromSnapshot( - testSnapshots.get(testSnapshots.size() - 1)))); + testSnapshots.get(2)))); // snapshot 2 -> main properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); metadata = metadata.replaceProperties(properties); + + // Create initial metadata with snapshots 0, 1, 2 where only snapshot 2 is referenced TableMetadata metadataWithSnapshots = TableMetadata.buildFrom(metadata) - .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) - .setBranchSnapshot(testSnapshots.get(1), SnapshotRef.MAIN_BRANCH) + .addSnapshot(testSnapshots.get(0)) // Unreferenced - will be deleted + .addSnapshot(testSnapshots.get(1)) // Unreferenced - will be deleted + .setBranchSnapshot( + testSnapshots.get(2), SnapshotRef.MAIN_BRANCH) // Referenced - will be kept .build(); + + // Target metadata: same branch setup but snapshots 0,1 removed via SNAPSHOTS_JSON_KEY TableMetadata metadataWithSnapshotsDeleted = TableMetadata.buildFrom(metadata) - .setBranchSnapshot(testSnapshots.get(3), SnapshotRef.MAIN_BRANCH) + .setBranchSnapshot( + testSnapshots.get(2), SnapshotRef.MAIN_BRANCH) // Only snapshot 2 remains .build(); + // This should succeed because snapshots 0 and 1 are unreferenced and can be safely deleted Assertions.assertDoesNotThrow( () -> openHouseInternalTableOperations.doCommit( metadataWithSnapshots, metadataWithSnapshotsDeleted)); + + // ideally we also verify that snapshots 0 and 1 are deleted, but doCommit doesn't return the + // metadata with the deleted snapshots + } + + @Test + void testDoCommitSnapshotsValidationThrowsException() throws IOException { + TableMetadata metadata = + BASE_TABLE_METADATA.replaceProperties(ImmutableMap.of("random", "value")); + List testSnapshots = IcebergTestUtil.getSnapshots(); + Map properties = new HashMap<>(metadata.properties()); + + // The key issue: SNAPSHOTS_JSON_KEY says to keep only snapshot 2, but snapshot 1 is referenced + // by main + // This creates a conflict - we're trying to delete snapshot 1 but it's still referenced + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, + SnapshotsUtil.serializedSnapshots( + testSnapshots.subList(2, 3))); // Only snapshot 2 should remain + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.obtainSnapshotRefsFromSnapshot( + testSnapshots.get(1)))); // But main refs snapshot 1 + properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); + metadata = metadata.replaceProperties(properties); + + // Create initial metadata with snapshots 1 and 2, where snapshot 1 is referenced by main + TableMetadata metadataWithSnapshots = + TableMetadata.buildFrom(metadata) + .setBranchSnapshot(testSnapshots.get(1), SnapshotRef.MAIN_BRANCH) // snapshot 1 -> main + .addSnapshot(testSnapshots.get(2)) // snapshot 2 exists but unreferenced initially + .build(); + + // Target metadata tries to delete snapshot 1 (not in SNAPSHOTS_JSON_KEY) but main still refs it + TableMetadata metadataWithSnapshotsDeleted = + TableMetadata.buildFrom(metadata) + .setBranchSnapshot( + testSnapshots.get(1), SnapshotRef.MAIN_BRANCH) // main still points to snapshot 1 + .build(); + + // This should throw exception because snapshot 1 is marked for deletion but still referenced by + // main + Assertions.assertThrows( + CommitStateUnknownException.class, + () -> + openHouseInternalTableOperations.doCommit( + metadataWithSnapshots, metadataWithSnapshotsDeleted), + "Should throw exception when trying to delete referenced snapshots"); } @Test @@ -1202,4 +1263,498 @@ private void verifyMetricHistogramBuckets( Assertions.assertFalse(Double.isNaN(totalTime), "Timer total time should not be NaN"); Assertions.assertFalse(Double.isNaN(maxTime), "Timer max time should not be NaN"); } + + // ===== SNAPSHOT DELETION SAFETY TESTS ===== + + @Test + void testDeleteSnapshotWithMainReference() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata with multiple snapshots + TableMetadata baseMetadata = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .addSnapshot(testSnapshots.get(0)) // Unreferenced - can be deleted + .addSnapshot(testSnapshots.get(1)) // Unreferenced - can be deleted + .addSnapshot(testSnapshots.get(2)) // Unreferenced - can be deleted + .setBranchSnapshot( + testSnapshots.get(3), SnapshotRef.MAIN_BRANCH) // Referenced - cannot be deleted + .build(); + + // Get the current head snapshot that is referenced by main branch + Snapshot referencedSnapshot = testSnapshots.get(testSnapshots.size() - 1); + + // Attempt to delete a snapshot that is currently referenced by a branch + List snapshotsToDelete = List.of(referencedSnapshot); + + // Capture final variables for lambda + final TableMetadata finalBase = baseMetadata; + final List finalSnapshotsToDelete = snapshotsToDelete; + + // This MUST throw IllegalArgumentException for referenced snapshots + IllegalArgumentException exception = + Assertions.assertThrows( + IllegalArgumentException.class, + () -> + openHouseInternalTableOperations.maybeDeleteSnapshots( + finalBase, finalSnapshotsToDelete), + "Should throw IllegalArgumentException when trying to delete referenced snapshot"); + + // Verify error message mentions the reference + String expectedMessage = + "Cannot expire " + referencedSnapshot.snapshotId() + ". Still referenced by refs:"; + Assertions.assertTrue( + exception.getMessage().contains(expectedMessage) + || exception.getMessage().contains("Still referenced by") + || exception.getMessage().contains("referenced"), + "Error message should indicate snapshot is still referenced: " + exception.getMessage()); + } + + @Test + void testDeleteSnapshotWithNoReference() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata with multiple snapshots + TableMetadata base = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .addSnapshot(testSnapshots.get(0)) // Unreferenced - can be deleted + .addSnapshot(testSnapshots.get(1)) // Unreferenced - can be deleted + .addSnapshot(testSnapshots.get(2)) // Unreferenced - can be deleted + .setBranchSnapshot( + testSnapshots.get(3), SnapshotRef.MAIN_BRANCH) // Referenced - cannot be deleted + .build(); + + // Delete unreferenced snapshots (first two snapshots) + List unreferencedSnapshots = testSnapshots.subList(0, 2); + + TableMetadata result = + openHouseInternalTableOperations.maybeDeleteSnapshots(base, unreferencedSnapshots); + + // Verify unreferenced snapshots were removed + for (Snapshot unreferenced : unreferencedSnapshots) { + boolean snapshotExists = + result.snapshots().stream().anyMatch(s -> s.snapshotId() == unreferenced.snapshotId()); + Assertions.assertFalse( + snapshotExists, + "Unreferenced snapshot " + unreferenced.snapshotId() + " should be deleted"); + } + + // Verify referenced snapshot still exists + Snapshot referencedSnapshot = testSnapshots.get(3); + boolean referencedExists = + result.snapshots().stream() + .anyMatch(s -> s.snapshotId() == referencedSnapshot.snapshotId()); + Assertions.assertTrue(referencedExists, "Referenced snapshot should still exist"); + + // Verify deletion tracking + Map properties = result.properties(); + String deletedSnapshots = + properties.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS)); + Assertions.assertNotNull(deletedSnapshots); + + for (Snapshot unreferenced : unreferencedSnapshots) { + Assertions.assertTrue( + deletedSnapshots.contains(Long.toString(unreferenced.snapshotId())), + "Unreferenced snapshot should be tracked as deleted"); + } + } + + @Test + void testDeleteSnapshotWithMultipleReference() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create metadata with snapshot referenced by multiple branches + // Reference the same snapshot from multiple branches + Snapshot sharedSnapshot = testSnapshots.get(1); + TableMetadata baseMetadata = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .addSnapshot(sharedSnapshot) // Add snapshot first + .setRef( + SnapshotRef.MAIN_BRANCH, + SnapshotRef.branchBuilder(sharedSnapshot.snapshotId()).build()) + .setRef( + "feature_branch", SnapshotRef.branchBuilder(sharedSnapshot.snapshotId()).build()) + .build(); + // Add other snapshots to the metadata (skip index 1 - shared snapshot already added) + List snapshotsToAdd = + IntStream.range(0, testSnapshots.size()) + .filter(i -> i != 1) + .mapToObj(testSnapshots::get) + .collect(Collectors.toList()); + + for (Snapshot snapshot : snapshotsToAdd) { + baseMetadata = TableMetadata.buildFrom(baseMetadata).addSnapshot(snapshot).build(); + } + + // Attempt to delete the shared snapshot + List snapshotsToDelete = List.of(sharedSnapshot); + + // Capture final variables for lambda + final TableMetadata finalBase = baseMetadata; + final List finalSnapshotsToDelete = snapshotsToDelete; + + // This MUST throw IllegalArgumentException for snapshots referenced by multiple branches + IllegalArgumentException exception = + Assertions.assertThrows( + IllegalArgumentException.class, + () -> + openHouseInternalTableOperations.maybeDeleteSnapshots( + finalBase, finalSnapshotsToDelete), + "Should throw IllegalArgumentException when trying to delete snapshot referenced by multiple branches"); + + // Verify error message mentions multiple references + String exceptionMessage = exception.getMessage(); + Assertions.assertTrue( + exceptionMessage.contains("Still referenced by refs"), + "Error message should indicate snapshot is still referenced by branches: " + + exceptionMessage); + } + + @Test + void testDeleteSnapshotWithBranchReference() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata with snapshots - add the tagged snapshot first + Snapshot taggedSnapshot = testSnapshots.get(0); + TableMetadata baseMetadata = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .addSnapshot(taggedSnapshot) // Add the snapshot first so it exists + .setBranchSnapshot(testSnapshots.get(testSnapshots.size() - 1), SnapshotRef.MAIN_BRANCH) + .setRef( + "feature_branch", + SnapshotRef.tagBuilder(taggedSnapshot.snapshotId()).build()) // Now create the tag + .build(); + // Add remaining snapshots + for (int i = 1; i < testSnapshots.size() - 1; i++) { + baseMetadata = + TableMetadata.buildFrom(baseMetadata).addSnapshot(testSnapshots.get(i)).build(); + } + + // Attempt to delete snapshot that has a tag reference + List snapshotsToDelete = List.of(taggedSnapshot); + + // Capture final variables for lambda + final TableMetadata finalBase = baseMetadata; + final List finalSnapshotsToDelete = snapshotsToDelete; + + // This MUST throw IllegalArgumentException for snapshots referenced by tags + IllegalArgumentException exception = + Assertions.assertThrows( + IllegalArgumentException.class, + () -> + openHouseInternalTableOperations.maybeDeleteSnapshots( + finalBase, finalSnapshotsToDelete), + "Should throw IllegalArgumentException when trying to delete snapshot referenced by tag"); + + // Verify error message mentions tag reference + String exceptionMessage = exception.getMessage(); + Assertions.assertTrue( + exceptionMessage.contains("Still referenced by refs"), + "Error message should indicate snapshot is still referenced by branches: " + + exceptionMessage); + } + + @Test + void testDeleteEmptySnapshotList() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata + TableMetadata base = BASE_TABLE_METADATA; + for (Snapshot snapshot : testSnapshots) { + base = + TableMetadata.buildFrom(base) + .setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH) + .build(); + } + + // Delete empty list + List emptyList = List.of(); + + TableMetadata result = openHouseInternalTableOperations.maybeDeleteSnapshots(base, emptyList); + + // Verify no changes were made + Assertions.assertEquals( + base.snapshots().size(), + result.snapshots().size(), + "No snapshots should be deleted when list is empty"); + + // Verify no deletion tracking properties were added + Map properties = result.properties(); + String deletedSnapshots = + properties.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS)); + Assertions.assertNull(deletedSnapshots, "No deleted snapshots property should be set"); + } + + @Test + void testDeleteNullSnapshotList() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata + TableMetadata base = BASE_TABLE_METADATA; + for (Snapshot snapshot : testSnapshots) { + base = + TableMetadata.buildFrom(base) + .setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH) + .build(); + } + + // Delete null list + TableMetadata result = openHouseInternalTableOperations.maybeDeleteSnapshots(base, null); + + // Verify no changes were made + Assertions.assertEquals( + base.snapshots().size(), + result.snapshots().size(), + "No snapshots should be deleted when list is null"); + + // Verify no deletion tracking properties were added + Map properties = result.properties(); + String deletedSnapshots = + properties.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS)); + Assertions.assertNull(deletedSnapshots, "No deleted snapshots property should be set"); + } + + @Test + void testDeleteNonExistentSnapshot() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata + TableMetadata base = BASE_TABLE_METADATA; + for (Snapshot snapshot : testSnapshots) { + base = + TableMetadata.buildFrom(base) + .setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH) + .build(); + } + + // Create a snapshot that doesn't exist in the metadata + List extraSnapshots = IcebergTestUtil.getExtraSnapshots(); + Snapshot nonExistentSnapshot = extraSnapshots.get(0); + + List snapshotsToDelete = List.of(nonExistentSnapshot); + + TableMetadata result = + openHouseInternalTableOperations.maybeDeleteSnapshots(base, snapshotsToDelete); + + // Verify original snapshots are unchanged + Assertions.assertEquals( + base.snapshots().size(), + result.snapshots().size(), + "Snapshot count should be unchanged when deleting non-existent snapshot"); + + // Verify deletion is still tracked (documenting current behavior) + Map properties = result.properties(); + String deletedSnapshots = + properties.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS)); + Assertions.assertNotNull(deletedSnapshots); + Assertions.assertTrue( + deletedSnapshots.contains(Long.toString(nonExistentSnapshot.snapshotId())), + "Non-existent snapshot should still be tracked as deleted"); + } + + @Test + void testDeleteSnapshotMetricsRecorded() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata + TableMetadata base = BASE_TABLE_METADATA; + for (Snapshot snapshot : testSnapshots) { + base = TableMetadata.buildFrom(base).addSnapshot(snapshot).build(); + } + + // Delete some snapshots + List snapshotsToDelete = testSnapshots.subList(0, 2); + + // Use the operations instance with mock metrics reporter + openHouseInternalTableOperationsWithMockMetrics.maybeDeleteSnapshots(base, snapshotsToDelete); + + // Verify metrics were recorded + Mockito.verify(mockMetricsReporter) + .count( + eq(InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR), + eq((double) snapshotsToDelete.size())); + } + + @Test + void testDeleteSnapshotMetricsRecordedBranch() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata with snapshots that have branch references + TableMetadata base = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .addSnapshot(testSnapshots.get(0)) // Unreferenced - can be deleted + .addSnapshot(testSnapshots.get(1)) // Unreferenced - can be deleted + .setBranchSnapshot( + testSnapshots.get(2), SnapshotRef.MAIN_BRANCH) // Referenced - cannot be deleted + .build(); + + // Delete unreferenced snapshots (emits metrics for basic deletion) + List snapshotsToDelete = testSnapshots.subList(0, 2); + + // Use the operations instance with mock metrics reporter + openHouseInternalTableOperationsWithMockMetrics.maybeDeleteSnapshots(base, snapshotsToDelete); + + // Verify metrics were recorded for the basic deletion + Mockito.verify(mockMetricsReporter) + .count( + eq(InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR), + eq((double) snapshotsToDelete.size())); + } + + @Test + void testDeleteSnapshotMetricsRecordedNonExistent() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata + TableMetadata base = BASE_TABLE_METADATA; + for (Snapshot snapshot : testSnapshots) { + base = + TableMetadata.buildFrom(base) + .setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH) + .build(); + } + + // Create a snapshot that doesn't exist in the metadata + List extraSnapshots = IcebergTestUtil.getExtraSnapshots(); + Snapshot nonExistentSnapshot = extraSnapshots.get(0); + List snapshotsToDelete = List.of(nonExistentSnapshot); + + // Use the operations instance with mock metrics reporter + openHouseInternalTableOperationsWithMockMetrics.maybeDeleteSnapshots(base, snapshotsToDelete); + + // Verify metrics are still recorded even for non-existent snapshots + Mockito.verify(mockMetricsReporter) + .count( + eq(InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR), + eq((double) snapshotsToDelete.size())); + } + + @Test + void testDeleteAllSnapshotsFailsWhenMainBranchReferenced() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata with all snapshots, where the last one is referenced by main branch + TableMetadata base = + testSnapshots.subList(0, testSnapshots.size() - 1).stream() + .reduce( + BASE_TABLE_METADATA, + (metadata, snapshot) -> + TableMetadata.buildFrom(metadata).addSnapshot(snapshot).build(), + (m1, m2) -> m2); + base = + TableMetadata.buildFrom(base) + .setBranchSnapshot(testSnapshots.get(testSnapshots.size() - 1), SnapshotRef.MAIN_BRANCH) + .build(); + + // Attempt to delete ALL snapshots (including the one referenced by main) + List allSnapshots = new ArrayList<>(testSnapshots); + + // This should fail because we cannot delete the snapshot referenced by main branch + IllegalArgumentException exception = + Assertions.assertThrows( + IllegalArgumentException.class, + () -> openHouseInternalTableOperations.maybeDeleteSnapshots(base, allSnapshots), + "Should throw IllegalArgumentException when trying to delete all snapshots including main branch reference"); + + // Verify error message indicates the snapshot is still referenced + String exceptionMessage = exception.getMessage(); + Assertions.assertTrue( + exceptionMessage.contains("Still referenced by refs") + || exceptionMessage.contains("referenced") + || exceptionMessage.contains("Cannot expire"), + "Error message should indicate snapshot is still referenced: " + exceptionMessage); + } + + @Test + void testDeleteAllUnreferencedSnapshotsSucceeds() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata with unreferenced snapshots only (no main branch or other refs) + TableMetadata base = BASE_TABLE_METADATA; + for (Snapshot snapshot : testSnapshots) { + base = TableMetadata.buildFrom(base).addSnapshot(snapshot).build(); + } + // Note: No setBranchSnapshot or setRef calls - all snapshots are unreferenced + + // Attempt to delete all unreferenced snapshots + List allSnapshots = new ArrayList<>(testSnapshots); + + // This should succeed since no snapshots are referenced by any branch/tag + TableMetadata result = + Assertions.assertDoesNotThrow( + () -> openHouseInternalTableOperations.maybeDeleteSnapshots(base, allSnapshots), + "Should succeed when deleting all unreferenced snapshots"); + + // Verify all snapshots were removed from the metadata + Assertions.assertEquals( + 0, + result.snapshots().size(), + "All unreferenced snapshots should be deleted, resulting in empty snapshots list"); + + // Verify deletion tracking shows all snapshots were deleted + Map properties = result.properties(); + String deletedSnapshots = + properties.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS)); + Assertions.assertNotNull(deletedSnapshots, "Deleted snapshots should be tracked"); + + for (Snapshot snapshot : allSnapshots) { + Assertions.assertTrue( + deletedSnapshots.contains(Long.toString(snapshot.snapshotId())), + "Snapshot " + snapshot.snapshotId() + " should be tracked as deleted"); + } + } + + @Test + void testValidMultipleBranchesWithDifferentSnapshots() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + // Create base metadata + TableMetadata base = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) + .build(); + + // Add multiple new snapshots + List newSnapshots = testSnapshots.subList(1, 4); // snapshots 1, 2, 3 + + // Create snapshotRefs where each branch points to a DIFFERENT snapshot (valid scenario) + Map validRefs = new HashMap<>(); + validRefs.put("branch_a", SnapshotRef.branchBuilder(testSnapshots.get(1).snapshotId()).build()); + validRefs.put("branch_b", SnapshotRef.branchBuilder(testSnapshots.get(2).snapshotId()).build()); + validRefs.put("branch_c", SnapshotRef.branchBuilder(testSnapshots.get(3).snapshotId()).build()); + + // This should NOT throw an exception + Assertions.assertDoesNotThrow( + () -> + openHouseInternalTableOperations.applySnapshotOperations( + base, newSnapshots, validRefs, false), + "Should NOT throw exception when branches target different snapshots"); + } + + @Test + void testStandardWAPScenario() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + List wapSnapshots = IcebergTestUtil.getWapSnapshots(); + + // Create base with existing snapshots and a WAP snapshot + TableMetadata base = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) + .addSnapshot(wapSnapshots.get(0)) // WAP snapshot (not referenced by any branch) + .build(); + + // Standard WAP scenario: pull the WAP snapshot into main branch + Snapshot wapSnapshot = wapSnapshots.get(0); + List newSnapshots = List.of(); // No new snapshots, just referencing the existing WAP + + // Create refs to pull WAP snapshot into main branch + Map refs = new HashMap<>(); + refs.put(SnapshotRef.MAIN_BRANCH, SnapshotRef.branchBuilder(wapSnapshot.snapshotId()).build()); + + // Should succeed - standard WAP workflow where WAP snapshot becomes the new main + Assertions.assertDoesNotThrow( + () -> + openHouseInternalTableOperations.applySnapshotOperations( + base, newSnapshots, refs, false), + "Should successfully pull WAP snapshot into main branch"); + } } From 4d9dae02bc1fcd838e5c666be6238257b458fad0 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Tue, 7 Oct 2025 13:15:48 -0700 Subject: [PATCH 11/35] tests for the replication use case --- .../OpenHouseInternalTableOperationsTest.java | 345 +++++++++++++++++- 1 file changed, 340 insertions(+), 5 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java index e09e45d0d..7bb945c44 100644 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java @@ -26,6 +26,7 @@ import java.util.List; import java.util.Map; import java.util.Optional; +import java.util.Set; import java.util.UUID; import java.util.function.Consumer; import java.util.stream.Collectors; @@ -42,6 +43,7 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.Snapshot; import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.SnapshotRefParser; import org.apache.iceberg.SortDirection; import org.apache.iceberg.SortOrder; import org.apache.iceberg.TableMetadata; @@ -1633,15 +1635,15 @@ void testDeleteAllSnapshotsFailsWhenMainBranchReferenced() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); // Create base metadata with all snapshots, where the last one is referenced by main branch - TableMetadata base = + TableMetadata tempBase = testSnapshots.subList(0, testSnapshots.size() - 1).stream() .reduce( BASE_TABLE_METADATA, (metadata, snapshot) -> TableMetadata.buildFrom(metadata).addSnapshot(snapshot).build(), (m1, m2) -> m2); - base = - TableMetadata.buildFrom(base) + final TableMetadata base = + TableMetadata.buildFrom(tempBase) .setBranchSnapshot(testSnapshots.get(testSnapshots.size() - 1), SnapshotRef.MAIN_BRANCH) .build(); @@ -1669,10 +1671,11 @@ void testDeleteAllUnreferencedSnapshotsSucceeds() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); // Create base metadata with unreferenced snapshots only (no main branch or other refs) - TableMetadata base = BASE_TABLE_METADATA; + TableMetadata tempBase = BASE_TABLE_METADATA; for (Snapshot snapshot : testSnapshots) { - base = TableMetadata.buildFrom(base).addSnapshot(snapshot).build(); + tempBase = TableMetadata.buildFrom(tempBase).addSnapshot(snapshot).build(); } + final TableMetadata base = tempBase; // Note: No setBranchSnapshot or setRef calls - all snapshots are unreferenced // Attempt to delete all unreferenced snapshots @@ -1757,4 +1760,336 @@ void testStandardWAPScenario() throws IOException { base, newSnapshots, refs, false), "Should successfully pull WAP snapshot into main branch"); } + + /** + * Integration test that verifies committing with base and metadata that are at least two commits + * divergent. This simulates scenarios where: + * + *
    + *
  • Base metadata is at version N + *
  • New metadata represents state at version N+2 or later (skipping intermediate versions) + *
  • The commit should still succeed and write complete metadata + *
+ * + *

This test validates that Iceberg can handle "jump" commits where the metadata being + * committed has evolved significantly from the base. + */ + @Test + void testMultipleDiffCommit() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + try (MockedStatic ignoreWriteMock = + Mockito.mockStatic(TableMetadataParser.class)) { + + // ========== Create base at N with 1 snapshot ========== + TableMetadata baseAtN = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) + .build(); + + // ========== Create divergent metadata at N+3 with 4 snapshots ========== + // Simulate evolving through N+1 and N+2 without committing + TableMetadata intermediate1 = + TableMetadata.buildFrom(baseAtN) + .setBranchSnapshot(testSnapshots.get(1), SnapshotRef.MAIN_BRANCH) + .build(); + + TableMetadata intermediate2 = + TableMetadata.buildFrom(intermediate1) + .setBranchSnapshot(testSnapshots.get(2), SnapshotRef.MAIN_BRANCH) + .build(); + + TableMetadata metadataAtNPlus3 = + TableMetadata.buildFrom(intermediate2) + .setBranchSnapshot(testSnapshots.get(3), SnapshotRef.MAIN_BRANCH) + .build(); + + // Add custom properties for commit + Map divergentProperties = new HashMap<>(metadataAtNPlus3.properties()); + List snapshots4 = testSnapshots.subList(0, 4); + divergentProperties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots4)); + divergentProperties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.obtainSnapshotRefsFromSnapshot(snapshots4.get(3)))); + + TableMetadata finalDivergentMetadata = + metadataAtNPlus3.replaceProperties(divergentProperties); + + // ========== COMMIT: Base at N, Metadata at N+3 (divergent by 3 commits) ========== + openHouseInternalTableOperations.doCommit(baseAtN, finalDivergentMetadata); + Mockito.verify(mockHouseTableMapper).toHouseTable(tblMetadataCaptor.capture(), Mockito.any()); + + TableMetadata capturedMetadata = tblMetadataCaptor.getValue(); + + // Verify the divergent commit contains all 4 snapshots + Assertions.assertEquals( + 4, + capturedMetadata.snapshots().size(), + "Divergent commit should contain all 4 snapshots despite jumping from base with 1 snapshot"); + + Set expectedSnapshotIds = + snapshots4.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + Set actualSnapshotIds = + capturedMetadata.snapshots().stream() + .map(Snapshot::snapshotId) + .collect(Collectors.toSet()); + Assertions.assertEquals( + expectedSnapshotIds, + actualSnapshotIds, + "All snapshot IDs should be present after divergent commit"); + + // Verify main ref points to the expected snapshot (the 4th snapshot) + SnapshotRef mainRef = capturedMetadata.ref(SnapshotRef.MAIN_BRANCH); + Assertions.assertNotNull(mainRef, "Main branch ref should exist"); + Assertions.assertEquals( + testSnapshots.get(3).snapshotId(), + mainRef.snapshotId(), + "Main branch should point to the 4th snapshot after divergent commit"); + } + } + + /** + * Test committing with divergent metadata and multiple valid branches. Base is at N with MAIN, + * metadata is at N+3 with both MAIN and feature_a branches pointing to different snapshots. + */ + @Test + void testMultipleDiffCommitWithValidBranch() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + try (MockedStatic ignoreWriteMock = + Mockito.mockStatic(TableMetadataParser.class)) { + + // ========== Create base at N with 1 snapshot ========== + TableMetadata baseAtN = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) + .build(); + + // ========== Create divergent metadata at N+3 with 4 snapshots and 2 branches ========== + TableMetadata intermediate1 = + TableMetadata.buildFrom(baseAtN) + .setBranchSnapshot(testSnapshots.get(1), SnapshotRef.MAIN_BRANCH) + .build(); + + TableMetadata intermediate2 = + TableMetadata.buildFrom(intermediate1) + .setBranchSnapshot(testSnapshots.get(2), SnapshotRef.MAIN_BRANCH) + .build(); + + TableMetadata metadataAtNPlus3 = + TableMetadata.buildFrom(intermediate2) + .setBranchSnapshot(testSnapshots.get(3), SnapshotRef.MAIN_BRANCH) + .build(); + + // Add custom properties for commit with multiple branches + Map divergentProperties = new HashMap<>(metadataAtNPlus3.properties()); + List snapshots4 = testSnapshots.subList(0, 4); + divergentProperties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots4)); + + // Create refs for both MAIN (pointing to snapshot 3) and feature_a (pointing to snapshot 2) + Map multipleRefs = new HashMap<>(); + multipleRefs.put( + SnapshotRef.MAIN_BRANCH, + SnapshotRefParser.toJson( + SnapshotRef.branchBuilder(testSnapshots.get(3).snapshotId()).build())); + multipleRefs.put( + "feature_a", + SnapshotRefParser.toJson( + SnapshotRef.branchBuilder(testSnapshots.get(2).snapshotId()).build())); + + divergentProperties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(multipleRefs)); + + TableMetadata finalDivergentMetadata = + metadataAtNPlus3.replaceProperties(divergentProperties); + + // ========== COMMIT: Should succeed with multiple valid branches ========== + openHouseInternalTableOperations.doCommit(baseAtN, finalDivergentMetadata); + Mockito.verify(mockHouseTableMapper).toHouseTable(tblMetadataCaptor.capture(), Mockito.any()); + + TableMetadata capturedMetadata = tblMetadataCaptor.getValue(); + + // Verify all 4 snapshots are present + Assertions.assertEquals( + 4, + capturedMetadata.snapshots().size(), + "Divergent commit with multiple branches should contain all 4 snapshots"); + + // Verify main ref points to the expected snapshot + SnapshotRef mainRef = capturedMetadata.ref(SnapshotRef.MAIN_BRANCH); + Assertions.assertNotNull(mainRef, "Main branch ref should exist"); + Assertions.assertEquals( + testSnapshots.get(3).snapshotId(), + mainRef.snapshotId(), + "Main branch should point to the 4th snapshot"); + + // Verify feature_a ref points to the expected snapshot + SnapshotRef featureRef = capturedMetadata.ref("feature_a"); + Assertions.assertNotNull(featureRef, "Feature_a branch ref should exist"); + Assertions.assertEquals( + testSnapshots.get(2).snapshotId(), + featureRef.snapshotId(), + "Feature_a branch should point to the 3rd snapshot"); + } + } + + /** + * Test committing with divergent metadata where multiple branches point to the same snapshot. + * This is VALID when done through setBranchSnapshot() - the end state is allowed. + */ + @Test + void testMultipleDiffCommitWithMultipleBranchesPointingToSameSnapshot() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + try (MockedStatic ignoreWriteMock = + Mockito.mockStatic(TableMetadataParser.class)) { + + // ========== Create base at N with 1 snapshot ========== + TableMetadata baseAtN = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) + .build(); + + // ========== Create divergent metadata with MAIN and feature_a both pointing to snapshot 3 + // ========== + TableMetadata.Builder builder = TableMetadata.buildFrom(baseAtN); + // Add snapshots 1, 2, 3 without assigning to branches + builder.addSnapshot(testSnapshots.get(1)); + builder.addSnapshot(testSnapshots.get(2)); + builder.addSnapshot(testSnapshots.get(3)); + // Set BOTH branches to point to the same existing snapshot (using snapshot ID) + builder.setBranchSnapshot(testSnapshots.get(3).snapshotId(), SnapshotRef.MAIN_BRANCH); + builder.setBranchSnapshot(testSnapshots.get(3).snapshotId(), "feature_a"); + TableMetadata metadataWithBothBranches = builder.build(); + + // Add custom properties with snapshots + Map divergentProperties = + new HashMap<>(metadataWithBothBranches.properties()); + List snapshots4 = testSnapshots.subList(0, 4); + divergentProperties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots4)); + + // Create refs matching the setBranchSnapshot calls - both pointing to snapshot 3 + Map sameSnapshotRefs = new HashMap<>(); + sameSnapshotRefs.put( + SnapshotRef.MAIN_BRANCH, + SnapshotRefParser.toJson( + SnapshotRef.branchBuilder(testSnapshots.get(3).snapshotId()).build())); + sameSnapshotRefs.put( + "feature_a", + SnapshotRefParser.toJson( + SnapshotRef.branchBuilder(testSnapshots.get(3).snapshotId()).build())); + + divergentProperties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(sameSnapshotRefs)); + + TableMetadata finalDivergentMetadata = + metadataWithBothBranches.replaceProperties(divergentProperties); + + // ========== COMMIT: Should SUCCEED - this is a valid end state ========== + openHouseInternalTableOperations.doCommit(baseAtN, finalDivergentMetadata); + Mockito.verify(mockHouseTableMapper).toHouseTable(tblMetadataCaptor.capture(), Mockito.any()); + + TableMetadata capturedMetadata = tblMetadataCaptor.getValue(); + + // Verify all 4 snapshots are present + Assertions.assertEquals( + 4, + capturedMetadata.snapshots().size(), + "Commit with multiple branches pointing to same snapshot should contain all 4 snapshots"); + + // Verify BOTH refs point to the same snapshot + SnapshotRef mainRef = capturedMetadata.ref(SnapshotRef.MAIN_BRANCH); + Assertions.assertNotNull(mainRef, "Main branch ref should exist"); + Assertions.assertEquals( + testSnapshots.get(3).snapshotId(), + mainRef.snapshotId(), + "Main branch should point to the 4th snapshot"); + + SnapshotRef featureRef = capturedMetadata.ref("feature_a"); + Assertions.assertNotNull(featureRef, "Feature_a branch ref should exist"); + Assertions.assertEquals( + testSnapshots.get(3).snapshotId(), + featureRef.snapshotId(), + "Feature_a branch should also point to the 4th snapshot (same as main)"); + + // Verify they point to the SAME snapshot + Assertions.assertEquals( + mainRef.snapshotId(), + featureRef.snapshotId(), + "Both branches should point to the same snapshot ID"); + } + } + + /** + * Test committing with divergent metadata where multiple branches try to point to the same + * snapshot (ambiguous commit). This should throw an IllegalStateException. + */ + @Test + void testMultipleDiffCommitWithInvalidBranch() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + + try (MockedStatic ignoreWriteMock = + Mockito.mockStatic(TableMetadataParser.class)) { + + // ========== Create base at N with 1 snapshot ========== + TableMetadata baseAtN = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) + .build(); + + // ========== Create metadata with 4 snapshots but only snapshot 0 in refs ========== + // Build metadata with all 4 snapshots added, but keep MAIN pointing to snapshot 0 + TableMetadata.Builder builder = TableMetadata.buildFrom(baseAtN); + // Add snapshots 1, 2, 3 without assigning them to any branch + builder.addSnapshot(testSnapshots.get(1)); + builder.addSnapshot(testSnapshots.get(2)); + builder.addSnapshot(testSnapshots.get(3)); + TableMetadata metadataWithAllSnapshots = builder.build(); + + // Add custom properties with AMBIGUOUS branch refs - both pointing to same snapshot + Map divergentProperties = + new HashMap<>(metadataWithAllSnapshots.properties()); + List snapshots4 = testSnapshots.subList(0, 4); + divergentProperties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots4)); + + // Create INVALID refs: both MAIN and feature_a pointing to the SAME snapshot (ambiguous!) + Map ambiguousRefs = new HashMap<>(); + ambiguousRefs.put( + SnapshotRef.MAIN_BRANCH, + SnapshotRefParser.toJson( + SnapshotRef.branchBuilder(testSnapshots.get(3).snapshotId()).build())); + ambiguousRefs.put( + "feature_a", + SnapshotRefParser.toJson( + SnapshotRef.branchBuilder(testSnapshots.get(3).snapshotId()) + .build())); // Same snapshot! + + divergentProperties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(ambiguousRefs)); + + TableMetadata finalDivergentMetadata = + metadataWithAllSnapshots.replaceProperties(divergentProperties); + + // ========== COMMIT: Should throw CommitStateUnknownException due to ambiguous branches + // ========== + CommitStateUnknownException exception = + Assertions.assertThrows( + CommitStateUnknownException.class, + () -> openHouseInternalTableOperations.doCommit(baseAtN, finalDivergentMetadata), + "Should throw CommitStateUnknownException when multiple branches point to same snapshot"); + + // Verify error message indicates the ambiguous commit + String exceptionMessage = exception.getMessage(); + Assertions.assertTrue( + exceptionMessage.contains("Multiple branches") + && exceptionMessage.contains("same target snapshot"), + "Error message should indicate multiple branches targeting same snapshot: " + + exceptionMessage); + } + } } From abdf335f3d848a3148720b9932b43577f7ce8a8f Mon Sep 17 00:00:00 2001 From: cbb330 Date: Wed, 8 Oct 2025 10:55:50 -0700 Subject: [PATCH 12/35] refactoring pipeline --- .../OpenHouseInternalTableOperations.java | 872 ++++++++++-------- 1 file changed, 511 insertions(+), 361 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java index dc9ab7ead..a89a5d570 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java @@ -23,7 +23,6 @@ import java.io.IOException; import java.time.Clock; import java.time.Instant; -import java.util.ArrayList; import java.util.Collections; import java.util.HashMap; import java.util.List; @@ -35,8 +34,6 @@ import java.util.stream.Collectors; import lombok.AllArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.apache.commons.collections.CollectionUtils; -import org.apache.commons.collections.MapUtils; import org.apache.hadoop.fs.FileSystem; import org.apache.iceberg.BaseMetastoreTableOperations; import org.apache.iceberg.PartitionField; @@ -61,7 +58,6 @@ import org.apache.iceberg.expressions.Term; import org.apache.iceberg.io.FileIO; import org.apache.iceberg.relocated.com.google.common.base.Objects; -import org.springframework.data.util.Pair; @AllArgsConstructor @Slf4j @@ -230,6 +226,8 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { metadata = rebuildTblMetaWithSchema(metadata, CatalogConstants.EVOLVED_SCHEMA_KEY, true); } + metadata = applySnapshots(base, metadata); + int version = currentVersion() + 1; CommitStatus commitStatus = CommitStatus.FAILURE; @@ -261,8 +259,6 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { if (properties.containsKey(CatalogConstants.EVOLVED_SCHEMA_KEY)) { properties.remove(CatalogConstants.EVOLVED_SCHEMA_KEY); } - String serializedSnapshotsToPut = properties.remove(CatalogConstants.SNAPSHOTS_JSON_KEY); - String serializedSnapshotRefs = properties.remove(CatalogConstants.SNAPSHOTS_REFS_KEY); boolean isStageCreate = Boolean.parseBoolean(properties.remove(CatalogConstants.IS_STAGE_CREATE_KEY)); String sortOrderJson = properties.remove(CatalogConstants.SORT_ORDER_KEY); @@ -275,27 +271,6 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { updatedMetadata = updatedMetadata.replaceSortOrder(sortOrder); } - if (serializedSnapshotsToPut != null) { - List snapshotsToPut = - SnapshotsUtil.parseSnapshots(fileIO, serializedSnapshotsToPut); - Pair, List> snapshotsDiff = - SnapshotsUtil.symmetricDifferenceSplit(snapshotsToPut, updatedMetadata.snapshots()); - List appendedSnapshots = snapshotsDiff.getFirst(); - List deletedSnapshots = snapshotsDiff.getSecond(); - snapshotInspector.validateSnapshotsUpdate( - updatedMetadata, appendedSnapshots, deletedSnapshots); - Map snapshotRefs = - serializedSnapshotRefs == null - ? new HashMap<>() - : SnapshotsUtil.parseSnapshotRefs(serializedSnapshotRefs); - - // Multi-branch support is now enabled with snapshot ID matching - - updatedMetadata = - applySnapshotOperations(updatedMetadata, appendedSnapshots, snapshotRefs, true); - updatedMetadata = maybeDeleteSnapshots(updatedMetadata, deletedSnapshots); - } - final TableMetadata updatedMtDataRef = updatedMetadata; long metadataUpdateStartTime = System.currentTimeMillis(); try { @@ -506,403 +481,578 @@ static SortOrder rebuildSortOrder(SortOrder originalSortOrder, Schema newSchema) return builder.build(); } + // ==================== Functional Snapshot Application Pipeline ==================== + /** - * If this commit comes from Iceberg built-in retry in - * org.apache.iceberg.PropertiesUpdate#commit() Then throw fatal {@link CommitFailedException} to - * inform users. + * Immutable state object representing the complete snapshot diff and categorization. All fields + * are final and collections are unmodifiable. */ - private void failIfRetryUpdate(Map properties) { - if (properties.containsKey(CatalogConstants.COMMIT_KEY)) { - String userProvidedTblVer = properties.get(CatalogConstants.COMMIT_KEY); - - // If the commit is ever seen in the past, that indicates this commit is a retry and should - // abort - if (CACHE.getIfPresent(userProvidedTblVer) != null) { - throw new CommitFailedException( - String.format( - "The user provided table version [%s] for table [%s] is stale, please consider retry from application", - userProvidedTblVer, tableIdentifier)); - } else { - CACHE.put(userProvidedTblVer, 1); - } + @lombok.Value + @lombok.Builder + private static class SnapshotState { + List providedSnapshots; + Map providedRefs; + List existingSnapshots; + Map existingRefs; + + // Categorization + List wapSnapshots; + List cherryPickedSnapshots; + List regularSnapshots; + + // Diff results + List newSnapshots; + List existingRetainedSnapshots; + List deletedSnapshots; + + // Branch updates + Map branchUpdates; + + // Metrics for recording + int appendedCount; + int stagedCount; + int cherryPickedCount; + int deletedCount; + } - properties.remove(CatalogConstants.COMMIT_KEY); - } else { - // This should never occur except table-creation. However, when table-creation hits - // concurrency issue - // it throw AlreadyExistsException and will not trigger retry. - metricsReporter.count(InternalCatalogMetricsConstant.MISSING_COMMIT_KEY); + /** + * Applies snapshot updates from metadata properties using a functional pipeline. This method + * follows principles: immutability, pure functions, and composition. + * + *

Pipeline stages: 1. Extract snapshots from properties 2. Parse snapshots from JSON 3. Parse + * references from JSON 4. Compute complete state diff (categorize, identify changes) 5. Validate + * entire operation 6. Apply state changes 7. Record metrics/properties + * + * @param base The base table metadata (may be null for table creation) + * @param metadata The new metadata with properties containing snapshot updates + * @return Updated metadata with snapshots applied + */ + TableMetadata applySnapshots(TableMetadata base, TableMetadata metadata) { + // Check if snapshots update is requested + if (!metadata.properties().containsKey(CatalogConstants.SNAPSHOTS_JSON_KEY)) { + // No snapshot updates requested, return unchanged + return metadata; } + + return Optional.ofNullable(metadata.properties().get(CatalogConstants.SNAPSHOTS_JSON_KEY)) + .map( + snapshotsJson -> { + // Stage 1-3: Extract and parse + SnapshotState.SnapshotStateBuilder stateBuilder = SnapshotState.builder(); + + // Extract and parse snapshots (Stage 1-2) + List providedSnapshots = parseSnapshotsFromJson(snapshotsJson); + stateBuilder.providedSnapshots(Collections.unmodifiableList(providedSnapshots)); + + // Extract and parse references (Stage 3) + Map providedRefs = + Optional.ofNullable( + metadata.properties().get(CatalogConstants.SNAPSHOTS_REFS_KEY)) + .map(this::parseReferencesFromJson) + .orElse(Collections.emptyMap()); + stateBuilder.providedRefs(Collections.unmodifiableMap(providedRefs)); + + // Get existing state from base + List existingSnapshots = + Optional.ofNullable(base) + .map(TableMetadata::snapshots) + .orElse(Collections.emptyList()); + stateBuilder.existingSnapshots(Collections.unmodifiableList(existingSnapshots)); + + Map existingRefs = + Optional.ofNullable(base).map(TableMetadata::refs).orElse(Collections.emptyMap()); + stateBuilder.existingRefs(Collections.unmodifiableMap(existingRefs)); + + // Stage 4: Compute complete state diff + SnapshotState state = computeStateDiff(stateBuilder); + + // Stage 5: Validate entire operation + validateOperation(state, base); + + // Stage 6: Apply state changes + TableMetadata updated = applyStateChanges(metadata, state); + + // Stage 7: Record metrics/properties + return recordMetrics(updated, state); + }) + .orElse(metadata); // No snapshot updates if key not present } - public TableMetadata maybeDeleteSnapshots( - TableMetadata metadata, List snapshotsToDelete) { - TableMetadata result = metadata; - if (CollectionUtils.isNotEmpty(snapshotsToDelete)) { - // Validate that snapshots to delete are not referenced by any branches or tags - validateSnapshotsNotReferenced(metadata, snapshotsToDelete); + /** Stage 2: Parse snapshots from JSON string. Pure function - no side effects. */ + private List parseSnapshotsFromJson(String snapshotsJson) { + return SnapshotsUtil.parseSnapshots(fileIO, snapshotsJson); + } - Set snapshotIds = - snapshotsToDelete.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); - Map updatedProperties = new HashMap<>(result.properties()); - updatedProperties.put( - getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS), - snapshotsToDelete.stream() - .map(s -> Long.toString(s.snapshotId())) - .collect(Collectors.joining(","))); - result = - TableMetadata.buildFrom(result) - .setProperties(updatedProperties) - .build() - .removeSnapshotsIf(s -> snapshotIds.contains(s.snapshotId())); - metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR, (double) snapshotsToDelete.size()); - } - return result; - } - - /** Represents the semantic difference between current server state and client-desired state. */ - private static class StateDiff { - final List newSnapshots; - final Map branchUpdates; // branch -> snapshotId - final Map snapshotLookup; // snapshotId -> Snapshot for efficiency - - StateDiff( - List newSnapshots, - Map branchUpdates, - List allClientSnapshots) { - this.newSnapshots = List.copyOf(newSnapshots); - this.branchUpdates = Map.copyOf(branchUpdates); - this.snapshotLookup = - allClientSnapshots.stream() - .collect(Collectors.toMap(s -> String.valueOf(s.snapshotId()), s -> s)); - } + /** Stage 3: Parse references from JSON string. Pure function - no side effects. */ + private Map parseReferencesFromJson(String refsJson) { + return SnapshotsUtil.parseSnapshotRefs(refsJson); } - /** Checks if a branch needs to be updated based on current refs and new snapshot ID. */ - private boolean needsBranchUpdate(TableMetadata metadata, String branchName, long newSnapshotId) { - if (MapUtils.isEmpty(metadata.refs())) { - // No refs exist yet, this is a new branch - return true; - } + /** + * Stage 4: Compute complete state diff. Pure function that categorizes snapshots and identifies + * changes. + */ + private SnapshotState computeStateDiff(SnapshotState.SnapshotStateBuilder builder) { + SnapshotState partial = builder.build(); + + Map providedById = + partial.getProvidedSnapshots().stream() + .collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); + Map existingById = + partial.getExistingSnapshots().stream() + .collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); + + // Categorize all snapshots by type + SnapshotCategories categories = + categorizeAllSnapshots(partial.getProvidedSnapshots(), existingById); + + // Identify snapshot changes (new, retained, deleted) + SnapshotChanges changes = + identifySnapshotChanges( + partial.getProvidedSnapshots(), + partial.getExistingSnapshots(), + providedById, + existingById); + + // Identify branch updates + Map branchUpdates = + computeBranchUpdates(partial.getProvidedRefs(), partial.getExistingRefs()); + + // Compute metrics + SnapshotMetrics metrics = computeSnapshotMetrics(categories, changes, existingById); + + // Build complete state + return builder + .wapSnapshots(Collections.unmodifiableList(categories.wapSnapshots)) + .cherryPickedSnapshots(Collections.unmodifiableList(categories.cherryPickedSnapshots)) + .regularSnapshots(Collections.unmodifiableList(categories.regularSnapshots)) + .newSnapshots(Collections.unmodifiableList(changes.newSnapshots)) + .existingRetainedSnapshots(Collections.unmodifiableList(changes.existingRetainedSnapshots)) + .deletedSnapshots(Collections.unmodifiableList(changes.deletedSnapshots)) + .branchUpdates(Collections.unmodifiableMap(branchUpdates)) + .appendedCount(metrics.appendedCount) + .stagedCount(metrics.stagedCount) + .cherryPickedCount(metrics.cherryPickedCount) + .deletedCount(metrics.deletedCount) + .build(); + } - SnapshotRef currentRef = metadata.refs().get(branchName); - return currentRef == null || currentRef.snapshotId() != newSnapshotId; + /** Container for categorized snapshots. */ + @lombok.Value + private static class SnapshotCategories { + List wapSnapshots; + List cherryPickedSnapshots; + List regularSnapshots; } - /** Validates that no two branches are trying to point to the same snapshot (ambiguous commit). */ - private void validateNoBranchConflicts(Map branchUpdates) { - // Group branches by target snapshot ID - Map> snapshotToBranches = new HashMap<>(); - for (Map.Entry entry : branchUpdates.entrySet()) { - snapshotToBranches - .computeIfAbsent(entry.getValue(), k -> new ArrayList<>()) - .add(entry.getKey()); - } + /** Categorize all snapshots into WAP, cherry-picked, and regular. */ + private SnapshotCategories categorizeAllSnapshots( + List providedSnapshots, Map existingById) { + List wapSnapshots = categorizeWapSnapshots(providedSnapshots); + List cherryPickedSnapshots = + categorizeCherryPickedSnapshots(providedSnapshots, existingById); + List regularSnapshots = + categorizeRegularSnapshots(providedSnapshots, wapSnapshots, cherryPickedSnapshots); - // Check for conflicts (multiple branches pointing to same snapshot) - for (Map.Entry> entry : snapshotToBranches.entrySet()) { - List branches = entry.getValue(); - if (branches.size() > 1) { - throw new IllegalStateException( - String.format( - "Multiple branches (%s) specify the same target snapshot %d. " - + "This indicates an ambiguous commit operation - each snapshot can only be assigned to one branch.", - branches, entry.getKey())); - } - } + return new SnapshotCategories(wapSnapshots, cherryPickedSnapshots, regularSnapshots); } - /** Validates that snapshots to be deleted are not referenced by any branches or tags. */ - private void validateSnapshotsNotReferenced( - TableMetadata metadata, List snapshotsToDelete) { - if (MapUtils.isEmpty(metadata.refs()) || CollectionUtils.isEmpty(snapshotsToDelete)) { - return; // No refs to check or no snapshots to delete - } - - Set snapshotIdsToDelete = - snapshotsToDelete.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + /** Container for snapshot changes. */ + @lombok.Value + private static class SnapshotChanges { + List newSnapshots; + List existingRetainedSnapshots; + List deletedSnapshots; + } - // Check if any snapshot to delete is referenced by branches or tags - for (Map.Entry refEntry : metadata.refs().entrySet()) { - String refName = refEntry.getKey(); - SnapshotRef ref = refEntry.getValue(); + /** Identify which snapshots are new, retained, or deleted. */ + private SnapshotChanges identifySnapshotChanges( + List providedSnapshots, + List existingSnapshots, + Map providedById, + Map existingById) { - if (snapshotIdsToDelete.contains(ref.snapshotId())) { - List referencingRefs = - metadata.refs().entrySet().stream() - .filter(entry -> snapshotIdsToDelete.contains(entry.getValue().snapshotId())) - .map(Map.Entry::getKey) - .collect(Collectors.toList()); + List newSnapshots = + providedSnapshots.stream() + .filter(s -> !existingById.containsKey(s.snapshotId())) + .collect(Collectors.toList()); - throw new IllegalArgumentException( - String.format( - "Cannot expire %d. Still referenced by refs: %s", - ref.snapshotId(), referencingRefs)); - } - } - } + List existingRetainedSnapshots = + providedSnapshots.stream() + .filter(s -> existingById.containsKey(s.snapshotId())) + .collect(Collectors.toList()); - /** Records snapshot actions in table properties and reports metrics. */ - private void recordSnapshotActions( - TableMetadata metadata, - TableMetadata.Builder metadataBuilder, - List appendedSnapshots, - List stagedSnapshots, - List cherryPickedSnapshots) { + List deletedSnapshots = + existingSnapshots.stream() + .filter(s -> !providedById.containsKey(s.snapshotId())) + .collect(Collectors.toList()); - Map updatedProperties = new HashMap<>(metadata.properties()); + return new SnapshotChanges(newSnapshots, existingRetainedSnapshots, deletedSnapshots); + } - if (CollectionUtils.isNotEmpty(appendedSnapshots)) { - updatedProperties.put( - getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), - appendedSnapshots.stream().collect(Collectors.joining(","))); - metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR, appendedSnapshots.size()); - } + /** Container for snapshot metrics. */ + @lombok.Value + private static class SnapshotMetrics { + int appendedCount; + int stagedCount; + int cherryPickedCount; + int deletedCount; + } - if (CollectionUtils.isNotEmpty(stagedSnapshots)) { - updatedProperties.put( - getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS), - stagedSnapshots.stream().collect(Collectors.joining(","))); - metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR, stagedSnapshots.size()); - } + /** Compute metrics based on categorized snapshots and changes. */ + private SnapshotMetrics computeSnapshotMetrics( + SnapshotCategories categories, SnapshotChanges changes, Map existingById) { - if (CollectionUtils.isNotEmpty(cherryPickedSnapshots)) { - updatedProperties.put( - getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), - cherryPickedSnapshots.stream().collect(Collectors.joining(","))); - metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR, cherryPickedSnapshots.size()); - } + int appendedCount = + (int) + categories.regularSnapshots.stream() + .filter(s -> !existingById.containsKey(s.snapshotId())) + .count(); + int stagedCount = categories.wapSnapshots.size(); + int cherryPickedCount = categories.cherryPickedSnapshots.size(); + int deletedCount = changes.deletedSnapshots.size(); - metadataBuilder.setProperties(updatedProperties); + return new SnapshotMetrics(appendedCount, stagedCount, cherryPickedCount, deletedCount); } /** - * Applies client-requested changes to server state using functional approach. - * - *

Contract: currentState + clientDesiredState -> newState + metrics - * - *

Client sends desired final state, server computes semantic diff and applies it. + * Categorize WAP (Write-Audit-Publish) snapshots. A snapshot is WAP if it has the WAP ID in its + * summary. */ - public TableMetadata applySnapshotOperations( - TableMetadata currentMetadata, - List clientSnapshots, - Map clientRefs, - boolean recordAction) { + private List categorizeWapSnapshots(List snapshots) { + return snapshots.stream() + .filter( + s -> s.summary() != null && s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) + .collect(Collectors.toList()); + } - return computeStateDiff(currentMetadata, clientSnapshots, clientRefs) - .map( - diff -> { - TableMetadata newMetadata = applyStateDiff(currentMetadata, diff); - return recordAction - ? recordTransition(currentMetadata, newMetadata, diff) - : newMetadata; + /** + * Categorize cherry-picked snapshots. A snapshot is cherry-picked if it exists in the current + * metadata but has a different parent than in the provided snapshots (indicating it was moved to + * a different branch). + */ + private List categorizeCherryPickedSnapshots( + List providedSnapshots, Map existingById) { + + return providedSnapshots.stream() + .filter( + provided -> { + Snapshot existing = existingById.get(provided.snapshotId()); + if (existing == null) { + return false; // New snapshot, not cherry-picked + } + // Check if parent changed (indicating cherry-pick to different branch) + Long providedParent = provided.parentId(); + Long existingParent = existing.parentId(); + return !Objects.equal(providedParent, existingParent); }) - .orElse(currentMetadata); + .collect(Collectors.toList()); } - /** Computes semantic difference between current server state and client-desired state. */ - private Optional computeStateDiff( - TableMetadata currentMetadata, - List clientSnapshots, - Map clientRefs) { + /** + * Categorize regular (appended) snapshots. Regular snapshots are those that are not WAP or + * cherry-picked. + */ + private List categorizeRegularSnapshots( + List allSnapshots, + List wapSnapshots, + List cherryPickedSnapshots) { + + Set wapIds = wapSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + Set cherryPickedIds = + cherryPickedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + + return allSnapshots.stream() + .filter(s -> !wapIds.contains(s.snapshotId()) && !cherryPickedIds.contains(s.snapshotId())) + .collect(Collectors.toList()); + } - if (CollectionUtils.isEmpty(clientSnapshots) && MapUtils.isEmpty(clientRefs)) { - return Optional.empty(); // No changes requested - } + /** Compute branch updates by comparing provided and existing refs. */ + private Map computeBranchUpdates( + Map providedRefs, Map existingRefs) { - Set currentSnapshotIds = - currentMetadata.snapshots().stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + return providedRefs.entrySet().stream() + .filter( + entry -> { + SnapshotRef existing = existingRefs.get(entry.getKey()); + return existing == null || existing.snapshotId() != entry.getValue().snapshotId(); + }) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } - // Find truly new snapshots (not in current metadata) - List newSnapshots = - Optional.ofNullable(clientSnapshots).orElse(Collections.emptyList()).stream() - .filter(s -> !currentSnapshotIds.contains(s.snapshotId())) - .collect(Collectors.toList()); + /** Stage 5: Validate entire operation. Throws exceptions for invalid operations. */ + private void validateOperation(SnapshotState state, TableMetadata base) { + // Validation 1: Current snapshot not deleted without replacements + validateCurrentSnapshotNotDeleted(state, base); - // Find branch updates needed - Map branchUpdates = - Optional.ofNullable(clientRefs).orElse(Collections.emptyMap()).entrySet().stream() - .filter( - entry -> - needsBranchUpdate( - currentMetadata, entry.getKey(), entry.getValue().snapshotId())) - .collect(Collectors.toMap(Map.Entry::getKey, entry -> entry.getValue().snapshotId())); - - // Check for ambiguous commits: multiple branches trying to point to the same snapshot - validateNoBranchConflicts(branchUpdates); - - return Optional.of( - new StateDiff( - newSnapshots, - branchUpdates, - Optional.ofNullable(clientSnapshots).orElse(Collections.emptyList()))); - } - - /** Applies the computed state diff to create new metadata. */ - private TableMetadata applyStateDiff(TableMetadata currentMetadata, StateDiff diff) { - TableMetadata.Builder builder = TableMetadata.buildFrom(currentMetadata); - - // Add new snapshots (respecting Iceberg semantics) - diff.newSnapshots.forEach( - snapshot -> { - snapshotInspector.validateSnapshot(snapshot); - - if (isWapStaged(snapshot)) { - // WAP snapshots are always staged (never assigned to branches initially) - builder.addSnapshot(snapshot); - } else { - // All other snapshots: assign to branch if specified, otherwise stage - findTargetBranchForSnapshot(snapshot, diff.branchUpdates) - .ifPresentOrElse( - targetBranch -> builder.setBranchSnapshot(snapshot, targetBranch), - () -> builder.addSnapshot(snapshot)); - } - }); + // Validation 2: No ambiguous commits (multiple branches → same snapshot) + validateNoAmbiguousCommits(state); - // Update branch pointers to existing snapshots - diff.branchUpdates.entrySet().stream() - .filter(entry -> !isNewSnapshot(entry.getValue(), diff.newSnapshots)) - .forEach(entry -> builder.setBranchSnapshot(entry.getValue(), entry.getKey())); + // Validation 3: Deleted snapshots not referenced by branches/tags + validateDeletedSnapshotsNotReferenced(state); - return builder.build(); + // Validation 4: Individual snapshot validation using SnapshotInspector + validateIndividualSnapshots(state); } - /** Checks if snapshot is WAP staged (should not be assigned to any branch). */ - private boolean isWapStaged(Snapshot snapshot) { - return snapshot.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP); + /** + * Validate that current snapshot is not deleted without replacements. Package-private for + * testing. + */ + void validateCurrentSnapshotNotDeleted(SnapshotState state, TableMetadata base) { + if (base == null || base.currentSnapshot() == null) { + return; // No current snapshot to validate + } + + long currentSnapshotId = base.currentSnapshot().snapshotId(); + boolean currentDeleted = + state.getDeletedSnapshots().stream().anyMatch(s -> s.snapshotId() == currentSnapshotId); + + if (currentDeleted && state.getNewSnapshots().isEmpty()) { + throw new InvalidIcebergSnapshotException( + String.format( + "Cannot delete the current snapshot %s without adding replacement snapshots. " + + "Deleted: [%s], New: [%s]", + currentSnapshotId, + state.getDeletedSnapshots().stream() + .map(s -> Long.toString(s.snapshotId())) + .collect(Collectors.joining(", ")), + state.getNewSnapshots().stream() + .map(s -> Long.toString(s.snapshotId())) + .collect(Collectors.joining(", ")))); + } } - /** Checks if snapshot is cherry-picked (should go directly to target branch). */ - private boolean isCherryPicked(Snapshot snapshot) { - return snapshot.summary().containsKey(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP); + /** + * Validate no ambiguous commits (multiple branches pointing to same snapshot in one commit). + * Package-private for testing. + */ + void validateNoAmbiguousCommits(SnapshotState state) { + Map> snapshotToBranches = + state.getBranchUpdates().entrySet().stream() + .collect( + Collectors.groupingBy( + e -> e.getValue().snapshotId(), + Collectors.mapping(Map.Entry::getKey, Collectors.toList()))); + + snapshotToBranches.forEach( + (snapshotId, branches) -> { + if (branches.size() > 1) { + throw new InvalidIcebergSnapshotException( + String.format( + "Ambiguous commit: snapshot %s is referenced by multiple branches [%s] in a single commit. " + + "Each snapshot can only be referenced by one branch per commit.", + snapshotId, String.join(", ", branches))); + } + }); } /** - * Finds which branch this snapshot should be assigned to based on branch updates. Fails fast if - * multiple branches want the same snapshot (ambiguous commit). + * Validate that deleted snapshots are not referenced by any branches or tags. Package-private for + * testing. */ - private Optional findTargetBranchForSnapshot( - Snapshot snapshot, Map branchUpdates) { - List matchingBranches = - branchUpdates.entrySet().stream() - .filter(entry -> entry.getValue() == snapshot.snapshotId()) - .map(Map.Entry::getKey) - .toList(); - - if (matchingBranches.size() > 1) { - throw new IllegalStateException( - "Multiple branches (%s) specify the same target snapshot %d. " - + "This indicates an ambiguous commit operation - each snapshot can only be assigned to one branch." - .formatted(matchingBranches, snapshot.snapshotId())); - } + void validateDeletedSnapshotsNotReferenced(SnapshotState state) { + Set deletedIds = + state.getDeletedSnapshots().stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); - return matchingBranches.stream().findFirst(); + Map> referencedIdsToRefs = + state.getProvidedRefs().entrySet().stream() + .collect( + Collectors.groupingBy( + e -> e.getValue().snapshotId(), + Collectors.mapping(Map.Entry::getKey, Collectors.toList()))); + + Map> invalidDeletes = + deletedIds.stream() + .filter(referencedIdsToRefs::containsKey) + .collect(Collectors.toMap(id -> id, referencedIdsToRefs::get)); + + if (!invalidDeletes.isEmpty()) { + String details = + invalidDeletes.entrySet().stream() + .map( + e -> + String.format( + "snapshot %s (referenced by: %s)", + e.getKey(), String.join(", ", e.getValue()))) + .collect(Collectors.joining("; ")); + throw new InvalidIcebergSnapshotException( + String.format( + "Cannot delete snapshots that are still referenced by branches/tags: %s", details)); + } } - /** Checks if this snapshot ID is in the list of new snapshots being added. */ - private boolean isNewSnapshot(Long snapshotId, List newSnapshots) { - return newSnapshots.stream().anyMatch(s -> s.snapshotId() == snapshotId); + /** + * Validate individual snapshots using existing SnapshotInspector. Package-private for testing. + */ + void validateIndividualSnapshots(SnapshotState state) { + state + .getNewSnapshots() + .forEach( + snapshot -> { + if (snapshotInspector != null) { + snapshotInspector.validateSnapshot(snapshot); + } + }); } - /** Records metrics and properties about the state transition that occurred. */ - private TableMetadata recordTransition( - TableMetadata originalMetadata, TableMetadata newMetadata, StateDiff diff) { + /** + * Stage 6: Apply state changes to create new TableMetadata. Pure function - creates new metadata + * without mutating existing. + * + *

This method uses Iceberg's proper APIs: - removeSnapshots() to delete snapshots - + * addSnapshot() to add new snapshots - setBranchSnapshot() to set branch references + * + *

The order of operations matters: 1. Start with base metadata (buildFrom copies all existing + * state) 2. Remove deleted snapshots first (using proper removeSnapshots API) 3. Remove stale + * branch references 4. Add new snapshots and set branch pointers + */ + private TableMetadata applyStateChanges(TableMetadata metadata, SnapshotState state) { + TableMetadata.Builder builder = TableMetadata.buildFrom(metadata); + + // Step 1: Remove deleted snapshots using proper Iceberg API + if (!state.getDeletedSnapshots().isEmpty()) { + Set deletedIds = + state.getDeletedSnapshots().stream() + .map(Snapshot::snapshotId) + .collect(Collectors.toSet()); + builder.removeSnapshots(deletedIds); + } - Map properties = new HashMap<>(newMetadata.properties()); + // Step 2: Remove stale branch references (branches that are no longer in provided refs) + Set providedRefNames = state.getProvidedRefs().keySet(); + metadata.refs().keySet().stream() + .filter(refName -> !providedRefNames.contains(refName)) + .forEach(builder::removeRef); + + // Step 3: Identify existing snapshots (after deletions) + Set existingSnapshotIds = + metadata.snapshots().stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + Set deletedIds = + state.getDeletedSnapshots().stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + existingSnapshotIds.removeAll(deletedIds); + + // Step 4: Identify snapshots referenced by branches + Set referencedByBranches = + state.getProvidedRefs().values().stream() + .map(SnapshotRef::snapshotId) + .collect(Collectors.toSet()); + + // Step 5: Add unreferenced new snapshots (referenced ones are added via setBranchSnapshot) + state.getProvidedSnapshots().stream() + .filter(s -> !existingSnapshotIds.contains(s.snapshotId())) + .filter(s -> !referencedByBranches.contains(s.snapshotId())) + .forEach(builder::addSnapshot); + + // Step 6: Set branch pointers for all provided refs + state + .getProvidedRefs() + .forEach( + (branchName, ref) -> { + Snapshot snapshot = + state.getProvidedSnapshots().stream() + .filter(s -> s.snapshotId() == ref.snapshotId()) + .findFirst() + .orElseThrow( + () -> + new InvalidIcebergSnapshotException( + String.format( + "Branch %s references non-existent snapshot %s", + branchName, ref.snapshotId()))); + + if (existingSnapshotIds.contains(snapshot.snapshotId())) { + // Snapshot already exists - just update the branch pointer if needed + SnapshotRef existingRef = metadata.refs().get(branchName); + if (existingRef == null || existingRef.snapshotId() != ref.snapshotId()) { + builder.setRef(branchName, ref); + } + } else { + // Snapshot is new - setBranchSnapshot will add it and set the branch pointer + builder.setBranchSnapshot(snapshot, branchName); + } + }); - // Categorize new snapshots by their semantic type for metrics - Map> snapshotsByType = - diff.newSnapshots.stream() - .collect( - Collectors.groupingBy( - this::getSnapshotCategory, - Collectors.mapping(s -> String.valueOf(s.snapshotId()), Collectors.toList()))); - - // Record snapshot metrics by type - recordIfPresent( - properties, - snapshotsByType, - "appended", - CatalogConstants.APPENDED_SNAPSHOTS, - InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR); - recordIfPresent( - properties, - snapshotsByType, - "staged", - CatalogConstants.STAGED_SNAPSHOTS, - InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR); - - // For cherry-picked snapshots, record the SOURCE snapshot IDs that were cherry-picked - List cherryPickSourceIds = - diff.newSnapshots.stream() - .filter(this::isCherryPicked) - .map(this::getCherryPickSourceId) - .filter(Optional::isPresent) - .map(Optional::get) - .collect(Collectors.toList()); + return builder.build(); + } - if (!cherryPickSourceIds.isEmpty()) { - properties.put( + /** + * Stage 7: Record metrics and add properties to metadata. Returns new metadata with updated + * properties. + */ + private TableMetadata recordMetrics(TableMetadata metadata, SnapshotState state) { + Map newProperties = new HashMap<>(metadata.properties()); + + // Helper to format snapshot IDs as comma-separated string + java.util.function.Function, String> formatIds = + snapshots -> + snapshots.stream() + .map(s -> Long.toString(s.snapshotId())) + .collect(Collectors.joining(",")); + + // Record categorization metrics as comma-separated snapshot IDs + if (!state.getRegularSnapshots().isEmpty()) { + List newRegularSnapshots = + state.getRegularSnapshots().stream() + .filter(s -> state.getNewSnapshots().contains(s)) + .collect(Collectors.toList()); + if (!newRegularSnapshots.isEmpty()) { + newProperties.put( + getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), + formatIds.apply(newRegularSnapshots)); + } + } + if (!state.getWapSnapshots().isEmpty()) { + newProperties.put( + getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS), + formatIds.apply(state.getWapSnapshots())); + } + if (!state.getCherryPickedSnapshots().isEmpty()) { + newProperties.put( getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), - String.join(",", cherryPickSourceIds)); - metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR, cherryPickSourceIds.size()); + formatIds.apply(state.getCherryPickedSnapshots())); } - - // Record branch updates that don't involve new snapshots (pure ref moves) - List refOnlyCherryPicks = - diff.branchUpdates.entrySet().stream() - .filter(entry -> !isNewSnapshot(entry.getValue(), diff.newSnapshots)) - .map(entry -> String.valueOf(entry.getValue())) - .collect(Collectors.toList()); - - if (!refOnlyCherryPicks.isEmpty()) { - String existing = - properties.get(getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS)); - String combined = - existing != null - ? existing + "," + String.join(",", refOnlyCherryPicks) - : String.join(",", refOnlyCherryPicks); - properties.put(getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), combined); - metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR, refOnlyCherryPicks.size()); + if (!state.getDeletedSnapshots().isEmpty()) { + newProperties.put( + getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS), + formatIds.apply(state.getDeletedSnapshots())); } - return TableMetadata.buildFrom(newMetadata).setProperties(properties).build(); - } + // Remove the transient snapshot keys from properties + newProperties.remove(CatalogConstants.SNAPSHOTS_JSON_KEY); + newProperties.remove(CatalogConstants.SNAPSHOTS_REFS_KEY); - /** Categorizes snapshot for metrics based on its semantic type. */ - private String getSnapshotCategory(Snapshot snapshot) { - if (isWapStaged(snapshot)) return "staged"; - if (isCherryPicked(snapshot)) - return "appended"; // Cherry-picked snapshots are NEW, so they're "appended" - return "appended"; + return metadata.replaceProperties(newProperties); } - /** Extracts the source snapshot ID for cherry-picked snapshots. */ - private Optional getCherryPickSourceId(Snapshot snapshot) { - return Optional.ofNullable(snapshot.summary().get(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP)); - } + // ==================== End Functional Snapshot Application Pipeline ==================== - /** Records snapshot category in properties if snapshots exist. */ - private void recordIfPresent( - Map properties, - Map> categorized, - String category, - String propertyKey, - String metricKey) { + /** + * If this commit comes from Iceberg built-in retry in + * org.apache.iceberg.PropertiesUpdate#commit() Then throw fatal {@link CommitFailedException} to + * inform users. + */ + private void failIfRetryUpdate(Map properties) { + if (properties.containsKey(CatalogConstants.COMMIT_KEY)) { + String userProvidedTblVer = properties.get(CatalogConstants.COMMIT_KEY); - Optional.ofNullable(categorized.get(category)) - .filter(CollectionUtils::isNotEmpty) - .ifPresent( - snapshots -> { - properties.put(getCanonicalFieldName(propertyKey), String.join(",", snapshots)); - metricsReporter.count(metricKey, snapshots.size()); - }); + // If the commit is ever seen in the past, that indicates this commit is a retry and should + // abort + if (CACHE.getIfPresent(userProvidedTblVer) != null) { + throw new CommitFailedException( + String.format( + "The user provided table version [%s] for table [%s] is stale, please consider retry from application", + userProvidedTblVer, tableIdentifier)); + } else { + CACHE.put(userProvidedTblVer, 1); + } + + properties.remove(CatalogConstants.COMMIT_KEY); + } else { + // This should never occur except table-creation. However, when table-creation hits + // concurrency issue + // it throw AlreadyExistsException and will not trigger retry. + metricsReporter.count(InternalCatalogMetricsConstant.MISSING_COMMIT_KEY); + } } /** Helper function to dump contents for map in debugging mode. */ From 4087462e1462d11d5c21016bcc9fe4e2eb324f92 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Thu, 9 Oct 2025 11:10:16 -0700 Subject: [PATCH 13/35] working tests and restructured code --- .../OpenHouseInternalTableOperations.java | 178 +++- .../OpenHouseInternalTableOperationsTest.java | 820 +++++++++++------- 2 files changed, 630 insertions(+), 368 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java index a89a5d570..6a2c43305 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java @@ -4,6 +4,7 @@ import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; +import com.google.common.collect.Sets; import com.google.gson.Gson; import com.linkedin.openhouse.cluster.metrics.micrometer.MetricsReporter; import com.linkedin.openhouse.cluster.storage.Storage; @@ -521,7 +522,8 @@ private static class SnapshotState { * *

Pipeline stages: 1. Extract snapshots from properties 2. Parse snapshots from JSON 3. Parse * references from JSON 4. Compute complete state diff (categorize, identify changes) 5. Validate - * entire operation 6. Apply state changes 7. Record metrics/properties + * entire operation 6. Apply state changes (returns builder) 7. Add metric properties to builder + * 8. Build once at top level to preserve lastUpdatedMillis from snapshot operations * * @param base The base table metadata (may be null for table creation) * @param metadata The new metadata with properties containing snapshot updates @@ -569,11 +571,14 @@ TableMetadata applySnapshots(TableMetadata base, TableMetadata metadata) { // Stage 5: Validate entire operation validateOperation(state, base); - // Stage 6: Apply state changes - TableMetadata updated = applyStateChanges(metadata, state); + // Stage 6: Apply state changes - returns builder + TableMetadata.Builder builder = applyStateChanges(metadata, state); - // Stage 7: Record metrics/properties - return recordMetrics(updated, state); + // Stage 7: Record metrics and add metric properties to builder + builder = recordMetrics(builder, state); + + // Build once at the end to preserve lastUpdatedMillis from snapshot operations + return builder.build(); }) .orElse(metadata); // No snapshot updates if key not present } @@ -604,7 +609,11 @@ private SnapshotState computeStateDiff(SnapshotState.SnapshotStateBuilder builde // Categorize all snapshots by type SnapshotCategories categories = - categorizeAllSnapshots(partial.getProvidedSnapshots(), existingById); + categorizeAllSnapshots( + partial.getProvidedSnapshots(), + existingById, + partial.getExistingRefs(), + partial.getProvidedRefs()); // Identify snapshot changes (new, retained, deleted) SnapshotChanges changes = @@ -647,10 +656,24 @@ private static class SnapshotCategories { /** Categorize all snapshots into WAP, cherry-picked, and regular. */ private SnapshotCategories categorizeAllSnapshots( - List providedSnapshots, Map existingById) { - List wapSnapshots = categorizeWapSnapshots(providedSnapshots); + List providedSnapshots, + Map existingById, + Map existingRefs, + Map providedRefs) { + List wapSnapshots = + categorizeWapSnapshots(providedSnapshots, existingRefs, providedRefs); List cherryPickedSnapshots = - categorizeCherryPickedSnapshots(providedSnapshots, existingById); + categorizeCherryPickedSnapshots( + providedSnapshots, existingById, existingRefs, providedRefs); + + // Cherry-picked snapshots should not be considered WAP/staged anymore + Set cherryPickedIds = + cherryPickedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + wapSnapshots = + wapSnapshots.stream() + .filter(s -> !cherryPickedIds.contains(s.snapshotId())) + .collect(Collectors.toList()); + List regularSnapshots = categorizeRegularSnapshots(providedSnapshots, wapSnapshots, cherryPickedSnapshots); @@ -716,23 +739,60 @@ private SnapshotMetrics computeSnapshotMetrics( } /** - * Categorize WAP (Write-Audit-Publish) snapshots. A snapshot is WAP if it has the WAP ID in its - * summary. + * Categorize WAP (Write-Audit-Publish) snapshots. A snapshot is considered WAP/staged if it has + * the wap.id property AND is not on any branch in either the existing or provided metadata. This + * correctly handles: 1. Snapshots that were on branches in base - not WAP even if unreferenced in + * new metadata 2. Snapshots being published (staged -> branch) - not WAP as they're now on a + * branch + * + * @param snapshots List of provided snapshots + * @param existingRefs Existing snapshot refs from base metadata + * @param providedRefs Provided snapshot refs from new metadata + * @return List of WAP snapshots */ - private List categorizeWapSnapshots(List snapshots) { + private List categorizeWapSnapshots( + List snapshots, + Map existingRefs, + Map providedRefs) { + // Get set of snapshot IDs that are/were on branches + Set branchSnapshotIds = new java.util.HashSet<>(); + branchSnapshotIds.addAll( + existingRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet())); + branchSnapshotIds.addAll( + providedRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet())); + return snapshots.stream() .filter( s -> s.summary() != null && s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) + .filter(s -> !branchSnapshotIds.contains(s.snapshotId())) .collect(Collectors.toList()); } /** - * Categorize cherry-picked snapshots. A snapshot is cherry-picked if it exists in the current + * Categorize cherry-picked snapshots. A snapshot is cherry-picked if: 1. It exists in the current * metadata but has a different parent than in the provided snapshots (indicating it was moved to - * a different branch). + * a different branch), OR 2. It is referenced as the source of a cherry-pick by another + * snapshot's "source-snapshot-id", OR 3. It has wap.id AND was staged (not on a branch) in + * existing refs AND is now on a branch in provided refs (indicating it's being published) */ private List categorizeCherryPickedSnapshots( - List providedSnapshots, Map existingById) { + List providedSnapshots, + Map existingById, + Map existingRefs, + Map providedRefs) { + + // Find snapshots that are sources of cherry-picks + Set cherryPickSourceIds = + providedSnapshots.stream() + .filter(s -> s.summary() != null && s.summary().containsKey("source-snapshot-id")) + .map(s -> Long.parseLong(s.summary().get("source-snapshot-id"))) + .collect(Collectors.toSet()); + + // Get snapshot IDs on branches + Set existingBranchSnapshotIds = + existingRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet()); + Set providedBranchSnapshotIds = + providedRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet()); return providedSnapshots.stream() .filter( @@ -744,7 +804,20 @@ private List categorizeCherryPickedSnapshots( // Check if parent changed (indicating cherry-pick to different branch) Long providedParent = provided.parentId(); Long existingParent = existing.parentId(); - return !Objects.equal(providedParent, existingParent); + boolean parentChanged = !Objects.equal(providedParent, existingParent); + + // Check if this snapshot is the source of a cherry-pick + boolean isCherryPickSource = cherryPickSourceIds.contains(provided.snapshotId()); + + // Check if this is a WAP snapshot being published (staged -> branch) + boolean hasWapId = + provided.summary() != null + && provided.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP); + boolean wasStaged = !existingBranchSnapshotIds.contains(provided.snapshotId()); + boolean isNowOnBranch = providedBranchSnapshotIds.contains(provided.snapshotId()); + boolean isBeingPublished = hasWapId && wasStaged && isNowOnBranch; + + return parentChanged || isCherryPickSource || isBeingPublished; }) .collect(Collectors.toList()); } @@ -897,8 +970,8 @@ void validateIndividualSnapshots(SnapshotState state) { } /** - * Stage 6: Apply state changes to create new TableMetadata. Pure function - creates new metadata - * without mutating existing. + * Stage 6: Apply state changes to create TableMetadata builder. Returns builder (not built) to + * allow metric properties to be added before the final build, preserving lastUpdatedMillis. * *

This method uses Iceberg's proper APIs: - removeSnapshots() to delete snapshots - * addSnapshot() to add new snapshots - setBranchSnapshot() to set branch references @@ -906,8 +979,10 @@ void validateIndividualSnapshots(SnapshotState state) { *

The order of operations matters: 1. Start with base metadata (buildFrom copies all existing * state) 2. Remove deleted snapshots first (using proper removeSnapshots API) 3. Remove stale * branch references 4. Add new snapshots and set branch pointers + * + * @return Builder with all snapshot changes applied but not yet built */ - private TableMetadata applyStateChanges(TableMetadata metadata, SnapshotState state) { + private TableMetadata.Builder applyStateChanges(TableMetadata metadata, SnapshotState state) { TableMetadata.Builder builder = TableMetadata.buildFrom(metadata); // Step 1: Remove deleted snapshots using proper Iceberg API @@ -972,15 +1047,36 @@ private TableMetadata applyStateChanges(TableMetadata metadata, SnapshotState st } }); - return builder.build(); + return builder; } /** - * Stage 7: Record metrics and add properties to metadata. Returns new metadata with updated - * properties. + * Stage 7: Add metric properties to builder. Returns the builder for final build in + * applySnapshots. This allows the single build to preserve lastUpdatedMillis from snapshot + * operations. + * + * @param builder Builder with snapshot changes already applied + * @param state Snapshot state containing metrics to record + * @return Builder with metric properties added, ready to be built */ - private TableMetadata recordMetrics(TableMetadata metadata, SnapshotState state) { - Map newProperties = new HashMap<>(metadata.properties()); + private TableMetadata.Builder recordMetrics(TableMetadata.Builder builder, SnapshotState state) { + // Emit metrics to reporter + if (state.getAppendedCount() > 0) { + metricsReporter.count( + InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR, state.getAppendedCount()); + } + if (state.getStagedCount() > 0) { + metricsReporter.count( + InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR, state.getStagedCount()); + } + if (state.getCherryPickedCount() > 0) { + metricsReporter.count( + InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR, state.getCherryPickedCount()); + } + if (state.getDeletedCount() > 0) { + metricsReporter.count( + InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR, state.getDeletedCount()); + } // Helper to format snapshot IDs as comma-separated string java.util.function.Function, String> formatIds = @@ -996,32 +1092,36 @@ private TableMetadata recordMetrics(TableMetadata metadata, SnapshotState state) .filter(s -> state.getNewSnapshots().contains(s)) .collect(Collectors.toList()); if (!newRegularSnapshots.isEmpty()) { - newProperties.put( - getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), - formatIds.apply(newRegularSnapshots)); + builder.setProperties( + Collections.singletonMap( + getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), + formatIds.apply(newRegularSnapshots))); } } if (!state.getWapSnapshots().isEmpty()) { - newProperties.put( - getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS), - formatIds.apply(state.getWapSnapshots())); + builder.setProperties( + Collections.singletonMap( + getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS), + formatIds.apply(state.getWapSnapshots()))); } if (!state.getCherryPickedSnapshots().isEmpty()) { - newProperties.put( - getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), - formatIds.apply(state.getCherryPickedSnapshots())); + builder.setProperties( + Collections.singletonMap( + getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), + formatIds.apply(state.getCherryPickedSnapshots()))); } if (!state.getDeletedSnapshots().isEmpty()) { - newProperties.put( - getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS), - formatIds.apply(state.getDeletedSnapshots())); + builder.setProperties( + Collections.singletonMap( + getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS), + formatIds.apply(state.getDeletedSnapshots()))); } // Remove the transient snapshot keys from properties - newProperties.remove(CatalogConstants.SNAPSHOTS_JSON_KEY); - newProperties.remove(CatalogConstants.SNAPSHOTS_REFS_KEY); + builder.removeProperties( + Sets.newHashSet(CatalogConstants.SNAPSHOTS_JSON_KEY, CatalogConstants.SNAPSHOTS_REFS_KEY)); - return metadata.replaceProperties(newProperties); + return builder; } // ==================== End Functional Snapshot Application Pipeline ==================== diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java index 7bb945c44..2ff4d1e21 100644 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java @@ -8,6 +8,7 @@ import com.linkedin.openhouse.cluster.storage.StorageType; import com.linkedin.openhouse.cluster.storage.local.LocalStorage; import com.linkedin.openhouse.cluster.storage.local.LocalStorageClient; +import com.linkedin.openhouse.internal.catalog.exception.InvalidIcebergSnapshotException; import com.linkedin.openhouse.internal.catalog.fileio.FileIOManager; import com.linkedin.openhouse.internal.catalog.mapper.HouseTableMapper; import com.linkedin.openhouse.internal.catalog.model.HouseTable; @@ -30,7 +31,6 @@ import java.util.UUID; import java.util.function.Consumer; import java.util.stream.Collectors; -import java.util.stream.IntStream; import lombok.SneakyThrows; import org.apache.commons.compress.utils.Lists; import org.apache.hadoop.conf.Configuration; @@ -482,52 +482,6 @@ void testDoCommitExceptionHandling() { () -> openHouseInternalTableOperations.doCommit(base, metadata)); } - @Test - void testDoCommitWithValidSnapshotDeletion() throws IOException { - TableMetadata metadata = - BASE_TABLE_METADATA.replaceProperties(ImmutableMap.of("random", "value")); - List testSnapshots = IcebergTestUtil.getSnapshots(); - Map properties = new HashMap<>(metadata.properties()); - - // The key insight: SNAPSHOTS_JSON_KEY determines what snapshots SHOULD exist after commit - // Only include snapshot 2 - this means snapshots 0 and 1 should be deleted - properties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, - SnapshotsUtil.serializedSnapshots(testSnapshots.subList(2, 3))); // Only snapshot 2 - properties.put( - CatalogConstants.SNAPSHOTS_REFS_KEY, - SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot( - testSnapshots.get(2)))); // snapshot 2 -> main - properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); - metadata = metadata.replaceProperties(properties); - - // Create initial metadata with snapshots 0, 1, 2 where only snapshot 2 is referenced - TableMetadata metadataWithSnapshots = - TableMetadata.buildFrom(metadata) - .addSnapshot(testSnapshots.get(0)) // Unreferenced - will be deleted - .addSnapshot(testSnapshots.get(1)) // Unreferenced - will be deleted - .setBranchSnapshot( - testSnapshots.get(2), SnapshotRef.MAIN_BRANCH) // Referenced - will be kept - .build(); - - // Target metadata: same branch setup but snapshots 0,1 removed via SNAPSHOTS_JSON_KEY - TableMetadata metadataWithSnapshotsDeleted = - TableMetadata.buildFrom(metadata) - .setBranchSnapshot( - testSnapshots.get(2), SnapshotRef.MAIN_BRANCH) // Only snapshot 2 remains - .build(); - - // This should succeed because snapshots 0 and 1 are unreferenced and can be safely deleted - Assertions.assertDoesNotThrow( - () -> - openHouseInternalTableOperations.doCommit( - metadataWithSnapshots, metadataWithSnapshotsDeleted)); - - // ideally we also verify that snapshots 0 and 1 are deleted, but doCommit doesn't return the - // metadata with the deleted snapshots - } - @Test void testDoCommitSnapshotsValidationThrowsException() throws IOException { TableMetadata metadata = @@ -567,7 +521,7 @@ void testDoCommitSnapshotsValidationThrowsException() throws IOException { // This should throw exception because snapshot 1 is marked for deletion but still referenced by // main Assertions.assertThrows( - CommitStateUnknownException.class, + InvalidIcebergSnapshotException.class, () -> openHouseInternalTableOperations.doCommit( metadataWithSnapshots, metadataWithSnapshotsDeleted), @@ -595,12 +549,10 @@ void testDoCommitAppendStageOnlySnapshotsInitialVersion() throws IOException { .map(s -> Long.toString(s.snapshotId())) .collect(Collectors.joining(",")), updatedProperties.get(getCanonicalFieldName("staged_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); + Assertions.assertNull( + updatedProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); Mockito.verify(mockHouseTableRepository, Mockito.times(1)).save(Mockito.eq(mockHouseTable)); } } @@ -640,35 +592,52 @@ void testDoCommitAppendStageOnlySnapshotsExistingVersion() throws IOException { .map(s -> Long.toString(s.snapshotId())) .collect(Collectors.joining(",")), updatedProperties.get(getCanonicalFieldName("staged_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); + Assertions.assertNull( + updatedProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); Mockito.verify(mockHouseTableRepository, Mockito.times(1)).save(Mockito.eq(mockHouseTable)); } } @Test void testAppendSnapshotsWithOldSnapshots() throws IOException { - TableMetadata metadata = + // Create base metadata (existing table state) + TableMetadata baseMetadata = TableMetadata.buildFrom(BASE_TABLE_METADATA) - .setPreviousFileLocation("tmp_location") + .setPreviousFileLocation("tmp_location") // this is key .setLocation(BASE_TABLE_METADATA.metadataFileLocation()) .build(); + // all snapshots are from the past and snapshots add should fail the validation List snapshots = IcebergTestUtil.getSnapshots(); + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.obtainSnapshotRefsFromSnapshot(snapshots.get(snapshots.size() - 1)))); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + Assertions.assertThrows( IllegalArgumentException.class, - () -> - openHouseInternalTableOperations.applySnapshotOperations( - metadata, snapshots, ImmutableMap.of(), false)); + () -> openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadata)); + // the latest snapshots have larger timestamp than the previous metadata timestamp, so it should // pass the validation snapshots.addAll(IcebergTestUtil.getFutureSnapshots()); - openHouseInternalTableOperations.applySnapshotOperations( - metadata, snapshots, ImmutableMap.of(), false); + Map propertiesWithFuture = new HashMap<>(baseMetadata.properties()); + propertiesWithFuture.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); + propertiesWithFuture.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.obtainSnapshotRefsFromSnapshot(snapshots.get(snapshots.size() - 1)))); + + TableMetadata newMetadataWithFuture = baseMetadata.replaceProperties(propertiesWithFuture); + openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadataWithFuture); } @Test @@ -702,15 +671,12 @@ void testDoCommitCherryPickSnapshotBaseUnchanged() throws IOException { Map updatedProperties = tblMetadataCaptor.getValue().properties(); // verify the staged snapshot is cherry picked by use the existing one - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("staged_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("staged_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); Assertions.assertEquals( Long.toString(testWapSnapshots.get(0).snapshotId()), updatedProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); Mockito.verify(mockHouseTableRepository, Mockito.times(1)).save(Mockito.eq(mockHouseTable)); } } @@ -727,7 +693,7 @@ void testDoCommitCherryPickSnapshotBaseChanged() throws IOException { Map properties = new HashMap<>(base.properties()); try (MockedStatic ignoreWriteMock = Mockito.mockStatic(TableMetadataParser.class)) { - // cherry pick the staged snapshot whose base has changed + // cherry-pick the staged snapshot whose base has changed properties.put( CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(testWapSnapshots)); properties.put( @@ -742,17 +708,15 @@ void testDoCommitCherryPickSnapshotBaseChanged() throws IOException { Mockito.verify(mockHouseTableMapper).toHouseTable(tblMetadataCaptor.capture(), Mockito.any()); Map updatedProperties = tblMetadataCaptor.getValue().properties(); - // verify the staged snapshot is cherry picked by creating a new snapshot and append it - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("staged_snapshots"))); + // verify the staged snapshot is cherry-picked by creating a new snapshot and append it + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("staged_snapshots"))); Assertions.assertEquals( Long.toString(testWapSnapshots.get(2).snapshotId()), updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); Assertions.assertEquals( Long.toString(testWapSnapshots.get(1).snapshotId()), updatedProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); Mockito.verify(mockHouseTableRepository, Mockito.times(1)).save(Mockito.eq(mockHouseTable)); } } @@ -781,15 +745,12 @@ void testDoCommitCherryPickFirstSnapshot() throws IOException { Map updatedProperties = tblMetadataCaptor.getValue().properties(); // verify the staged snapshot is cherry picked by using the existing one - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("staged_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("staged_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); Assertions.assertEquals( Long.toString(testWapSnapshots.get(0).snapshotId()), updatedProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); Mockito.verify(mockHouseTableRepository, Mockito.times(1)).save(Mockito.eq(mockHouseTable)); } } @@ -812,14 +773,11 @@ void testDoCommitDeleteLastStagedSnapshotWhenNoRefs() throws IOException { Map updatedProperties = tblMetadataCaptor.getValue().properties(); // verify nothing happens - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("staged_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); - Assertions.assertEquals( - null, updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("staged_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("appended_snapshots"))); + Assertions.assertNull( + updatedProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); + Assertions.assertNull(updatedProperties.get(getCanonicalFieldName("deleted_snapshots"))); Mockito.verify(mockHouseTableRepository, Mockito.times(1)).save(Mockito.eq(mockHouseTable)); } } @@ -1283,31 +1241,38 @@ void testDeleteSnapshotWithMainReference() throws IOException { .build(); // Get the current head snapshot that is referenced by main branch - Snapshot referencedSnapshot = testSnapshots.get(testSnapshots.size() - 1); + Snapshot referencedSnapshot = testSnapshots.get(3); - // Attempt to delete a snapshot that is currently referenced by a branch - List snapshotsToDelete = List.of(referencedSnapshot); + // Create new metadata that attempts to delete the referenced snapshot + // The SNAPSHOTS_JSON_KEY will only include first 3 snapshots (excluding the referenced one) + // But SNAPSHOTS_REFS_KEY will still reference snapshot 3, causing a conflict + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, + SnapshotsUtil.serializedSnapshots( + testSnapshots.subList(0, 3))); // Only snapshots 0-2, excluding referenced snapshot 3 + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.obtainSnapshotRefsFromSnapshot( + referencedSnapshot))); // Still references snapshot 3 - // Capture final variables for lambda - final TableMetadata finalBase = baseMetadata; - final List finalSnapshotsToDelete = snapshotsToDelete; + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); // This MUST throw IllegalArgumentException for referenced snapshots - IllegalArgumentException exception = + InvalidIcebergSnapshotException exception = Assertions.assertThrows( - IllegalArgumentException.class, - () -> - openHouseInternalTableOperations.maybeDeleteSnapshots( - finalBase, finalSnapshotsToDelete), - "Should throw IllegalArgumentException when trying to delete referenced snapshot"); + InvalidIcebergSnapshotException.class, + () -> openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadata), + "Should throw InvalidIcebergSnapshotException when trying to delete referenced snapshot"); // Verify error message mentions the reference String expectedMessage = - "Cannot expire " + referencedSnapshot.snapshotId() + ". Still referenced by refs:"; + "Cannot delete the current snapshot " + + referencedSnapshot.snapshotId() + + " without adding replacement snapshots"; Assertions.assertTrue( - exception.getMessage().contains(expectedMessage) - || exception.getMessage().contains("Still referenced by") - || exception.getMessage().contains("referenced"), + exception.getMessage().contains(expectedMessage), "Error message should indicate snapshot is still referenced: " + exception.getMessage()); } @@ -1316,7 +1281,7 @@ void testDeleteSnapshotWithNoReference() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); // Create base metadata with multiple snapshots - TableMetadata base = + TableMetadata baseMetadata = TableMetadata.buildFrom(BASE_TABLE_METADATA) .addSnapshot(testSnapshots.get(0)) // Unreferenced - can be deleted .addSnapshot(testSnapshots.get(1)) // Unreferenced - can be deleted @@ -1326,12 +1291,25 @@ void testDeleteSnapshotWithNoReference() throws IOException { .build(); // Delete unreferenced snapshots (first two snapshots) - List unreferencedSnapshots = testSnapshots.subList(0, 2); + // New metadata keeps snapshots 2 and 3 + Snapshot referencedSnapshot = testSnapshots.get(3); + List remainingSnapshots = testSnapshots.subList(2, 4); + + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(remainingSnapshots)); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.obtainSnapshotRefsFromSnapshot(referencedSnapshot))); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); TableMetadata result = - openHouseInternalTableOperations.maybeDeleteSnapshots(base, unreferencedSnapshots); + openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadata); // Verify unreferenced snapshots were removed + List unreferencedSnapshots = testSnapshots.subList(0, 2); for (Snapshot unreferenced : unreferencedSnapshots) { boolean snapshotExists = result.snapshots().stream().anyMatch(s -> s.snapshotId() == unreferenced.snapshotId()); @@ -1341,21 +1319,20 @@ void testDeleteSnapshotWithNoReference() throws IOException { } // Verify referenced snapshot still exists - Snapshot referencedSnapshot = testSnapshots.get(3); boolean referencedExists = result.snapshots().stream() .anyMatch(s -> s.snapshotId() == referencedSnapshot.snapshotId()); Assertions.assertTrue(referencedExists, "Referenced snapshot should still exist"); // Verify deletion tracking - Map properties = result.properties(); - String deletedSnapshots = - properties.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS)); - Assertions.assertNotNull(deletedSnapshots); + Map resultProperties = result.properties(); + String deletedSnapshotsStr = + resultProperties.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS)); + Assertions.assertNotNull(deletedSnapshotsStr); for (Snapshot unreferenced : unreferencedSnapshots) { Assertions.assertTrue( - deletedSnapshots.contains(Long.toString(unreferenced.snapshotId())), + deletedSnapshotsStr.contains(Long.toString(unreferenced.snapshotId())), "Unreferenced snapshot should be tracked as deleted"); } } @@ -1364,49 +1341,55 @@ void testDeleteSnapshotWithNoReference() throws IOException { void testDeleteSnapshotWithMultipleReference() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); - // Create metadata with snapshot referenced by multiple branches - // Reference the same snapshot from multiple branches - Snapshot sharedSnapshot = testSnapshots.get(1); + // Create metadata with 2 snapshots: one referenced by multiple branches, one unreferenced + Snapshot sharedSnapshot = testSnapshots.get(0); // This will be referenced by both branches + Snapshot mainSnapshot = testSnapshots.get(1); // This one stays but is not referenced + TableMetadata baseMetadata = TableMetadata.buildFrom(BASE_TABLE_METADATA) - .addSnapshot(sharedSnapshot) // Add snapshot first + .addSnapshot(sharedSnapshot) + .addSnapshot(mainSnapshot) .setRef( SnapshotRef.MAIN_BRANCH, - SnapshotRef.branchBuilder(sharedSnapshot.snapshotId()).build()) + SnapshotRef.branchBuilder(mainSnapshot.snapshotId()).build()) .setRef( "feature_branch", SnapshotRef.branchBuilder(sharedSnapshot.snapshotId()).build()) + .setRef( + "feature_branch1", SnapshotRef.branchBuilder(sharedSnapshot.snapshotId()).build()) .build(); - // Add other snapshots to the metadata (skip index 1 - shared snapshot already added) - List snapshotsToAdd = - IntStream.range(0, testSnapshots.size()) - .filter(i -> i != 1) - .mapToObj(testSnapshots::get) - .collect(Collectors.toList()); - for (Snapshot snapshot : snapshotsToAdd) { - baseMetadata = TableMetadata.buildFrom(baseMetadata).addSnapshot(snapshot).build(); - } + // Attempt to delete the shared snapshot by creating new metadata without it + // Keep the unreferenced snapshot so we're not deleting everything + List remainingSnapshots = List.of(mainSnapshot); + + // Keep refs pointing to the shared snapshot (causing conflict) + Map refs = baseMetadata.refs(); + Map serializedRefs = + refs.entrySet().stream() + .collect( + Collectors.toMap( + Map.Entry::getKey, + e -> org.apache.iceberg.SnapshotRefParser.toJson(e.getValue()))); - // Attempt to delete the shared snapshot - List snapshotsToDelete = List.of(sharedSnapshot); + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(remainingSnapshots)); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(serializedRefs)); - // Capture final variables for lambda - final TableMetadata finalBase = baseMetadata; - final List finalSnapshotsToDelete = snapshotsToDelete; + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); - // This MUST throw IllegalArgumentException for snapshots referenced by multiple branches - IllegalArgumentException exception = + // This MUST throw InvalidIcebergSnapshotException for snapshots referenced by multiple branches + InvalidIcebergSnapshotException exception = Assertions.assertThrows( - IllegalArgumentException.class, - () -> - openHouseInternalTableOperations.maybeDeleteSnapshots( - finalBase, finalSnapshotsToDelete), - "Should throw IllegalArgumentException when trying to delete snapshot referenced by multiple branches"); + InvalidIcebergSnapshotException.class, + () -> openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadata), + "Should throw InvalidIcebergSnapshotException when trying to delete snapshot referenced by multiple branches"); - // Verify error message mentions multiple references + // Verify error message mentions the snapshot is still referenced String exceptionMessage = exception.getMessage(); Assertions.assertTrue( - exceptionMessage.contains("Still referenced by refs"), + exceptionMessage.contains("Still referenced by refs") + || exceptionMessage.contains("still referenced"), "Error message should indicate snapshot is still referenced by branches: " + exceptionMessage); } @@ -1431,26 +1414,44 @@ void testDeleteSnapshotWithBranchReference() throws IOException { TableMetadata.buildFrom(baseMetadata).addSnapshot(testSnapshots.get(i)).build(); } - // Attempt to delete snapshot that has a tag reference - List snapshotsToDelete = List.of(taggedSnapshot); + // Make baseMetadata effectively final for lambda usage + final TableMetadata finalBaseMetadata = baseMetadata; - // Capture final variables for lambda - final TableMetadata finalBase = baseMetadata; - final List finalSnapshotsToDelete = snapshotsToDelete; + // Attempt to delete snapshot that has a tag reference by creating new metadata without it + List remainingSnapshots = + finalBaseMetadata.snapshots().stream() + .filter(s -> s.snapshotId() != taggedSnapshot.snapshotId()) + .collect(Collectors.toList()); - // This MUST throw IllegalArgumentException for snapshots referenced by tags - IllegalArgumentException exception = + Map properties = new HashMap<>(finalBaseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(remainingSnapshots)); + // Keep refs pointing to the tagged snapshot (causing conflict) + Map serializedRefs = + finalBaseMetadata.refs().entrySet().stream() + .collect( + Collectors.toMap( + Map.Entry::getKey, + e -> org.apache.iceberg.SnapshotRefParser.toJson(e.getValue()))); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(serializedRefs)); + + TableMetadata newMetadata = finalBaseMetadata.replaceProperties(properties); + + // This MUST throw InvalidIcebergSnapshotException for snapshots referenced by tags + InvalidIcebergSnapshotException exception = Assertions.assertThrows( - IllegalArgumentException.class, - () -> - openHouseInternalTableOperations.maybeDeleteSnapshots( - finalBase, finalSnapshotsToDelete), - "Should throw IllegalArgumentException when trying to delete snapshot referenced by tag"); + InvalidIcebergSnapshotException.class, + () -> openHouseInternalTableOperations.applySnapshots(finalBaseMetadata, newMetadata), + "Should throw InvalidIcebergSnapshotException when trying to delete snapshot referenced by tag"); // Verify error message mentions tag reference String exceptionMessage = exception.getMessage(); + String expectedMessage = + "Cannot delete snapshots that are still referenced by branches/tags: snapshot " + + taggedSnapshot.snapshotId() + + " (referenced by: feature_branch)"; Assertions.assertTrue( - exceptionMessage.contains("Still referenced by refs"), + exceptionMessage.contains(expectedMessage), "Error message should indicate snapshot is still referenced by branches: " + exceptionMessage); } @@ -1460,29 +1461,39 @@ void testDeleteEmptySnapshotList() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); // Create base metadata - TableMetadata base = BASE_TABLE_METADATA; + TableMetadata baseMetadata = BASE_TABLE_METADATA; for (Snapshot snapshot : testSnapshots) { - base = - TableMetadata.buildFrom(base) + baseMetadata = + TableMetadata.buildFrom(baseMetadata) .setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH) .build(); } - // Delete empty list - List emptyList = List.of(); + // Delete empty list - new metadata is same as base (no snapshots deleted) + Snapshot lastSnapshot = testSnapshots.get(testSnapshots.size() - 1); + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, + SnapshotsUtil.serializedSnapshots(baseMetadata.snapshots())); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap(IcebergTestUtil.obtainSnapshotRefsFromSnapshot(lastSnapshot))); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); - TableMetadata result = openHouseInternalTableOperations.maybeDeleteSnapshots(base, emptyList); + TableMetadata result = + openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadata); // Verify no changes were made Assertions.assertEquals( - base.snapshots().size(), + baseMetadata.snapshots().size(), result.snapshots().size(), "No snapshots should be deleted when list is empty"); // Verify no deletion tracking properties were added - Map properties = result.properties(); + Map resultProperties = result.properties(); String deletedSnapshots = - properties.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS)); + resultProperties.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS)); Assertions.assertNull(deletedSnapshots, "No deleted snapshots property should be set"); } @@ -1491,27 +1502,39 @@ void testDeleteNullSnapshotList() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); // Create base metadata - TableMetadata base = BASE_TABLE_METADATA; + TableMetadata baseMetadata = BASE_TABLE_METADATA; for (Snapshot snapshot : testSnapshots) { - base = - TableMetadata.buildFrom(base) + baseMetadata = + TableMetadata.buildFrom(baseMetadata) .setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH) .build(); } - // Delete null list - TableMetadata result = openHouseInternalTableOperations.maybeDeleteSnapshots(base, null); + // Delete null list - new metadata is same as base (no snapshots deleted) + Snapshot lastSnapshot = testSnapshots.get(testSnapshots.size() - 1); + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, + SnapshotsUtil.serializedSnapshots(baseMetadata.snapshots())); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap(IcebergTestUtil.obtainSnapshotRefsFromSnapshot(lastSnapshot))); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + + TableMetadata result = + openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadata); // Verify no changes were made Assertions.assertEquals( - base.snapshots().size(), + baseMetadata.snapshots().size(), result.snapshots().size(), "No snapshots should be deleted when list is null"); // Verify no deletion tracking properties were added - Map properties = result.properties(); + Map resultProperties = result.properties(); String deletedSnapshots = - properties.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS)); + resultProperties.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS)); Assertions.assertNull(deletedSnapshots, "No deleted snapshots property should be set"); } @@ -1520,10 +1543,10 @@ void testDeleteNonExistentSnapshot() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); // Create base metadata - TableMetadata base = BASE_TABLE_METADATA; + TableMetadata baseMetadata = BASE_TABLE_METADATA; for (Snapshot snapshot : testSnapshots) { - base = - TableMetadata.buildFrom(base) + baseMetadata = + TableMetadata.buildFrom(baseMetadata) .setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH) .build(); } @@ -1532,25 +1555,32 @@ void testDeleteNonExistentSnapshot() throws IOException { List extraSnapshots = IcebergTestUtil.getExtraSnapshots(); Snapshot nonExistentSnapshot = extraSnapshots.get(0); - List snapshotsToDelete = List.of(nonExistentSnapshot); + // New metadata is same as base (non-existent snapshot can't be removed) + Snapshot lastSnapshot = testSnapshots.get(testSnapshots.size() - 1); + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, + SnapshotsUtil.serializedSnapshots(baseMetadata.snapshots())); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap(IcebergTestUtil.obtainSnapshotRefsFromSnapshot(lastSnapshot))); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); TableMetadata result = - openHouseInternalTableOperations.maybeDeleteSnapshots(base, snapshotsToDelete); + openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadata); // Verify original snapshots are unchanged Assertions.assertEquals( - base.snapshots().size(), + baseMetadata.snapshots().size(), result.snapshots().size(), "Snapshot count should be unchanged when deleting non-existent snapshot"); - // Verify deletion is still tracked (documenting current behavior) - Map properties = result.properties(); + // Verify deletion is not tracked (since no actual deletion occurred) + Map resultProperties = result.properties(); String deletedSnapshots = - properties.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS)); - Assertions.assertNotNull(deletedSnapshots); - Assertions.assertTrue( - deletedSnapshots.contains(Long.toString(nonExistentSnapshot.snapshotId())), - "Non-existent snapshot should still be tracked as deleted"); + resultProperties.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS)); + Assertions.assertNull(deletedSnapshots, "No deleted snapshots should be tracked"); } @Test @@ -1558,22 +1588,34 @@ void testDeleteSnapshotMetricsRecorded() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); // Create base metadata - TableMetadata base = BASE_TABLE_METADATA; + TableMetadata baseMetadata = BASE_TABLE_METADATA; for (Snapshot snapshot : testSnapshots) { - base = TableMetadata.buildFrom(base).addSnapshot(snapshot).build(); + baseMetadata = TableMetadata.buildFrom(baseMetadata).addSnapshot(snapshot).build(); } - // Delete some snapshots - List snapshotsToDelete = testSnapshots.subList(0, 2); + // Make baseMetadata effectively final for lambda usage + final TableMetadata finalBaseMetadata = baseMetadata; + + // Delete some snapshots (first two snapshots) + List remainingSnapshots = testSnapshots.subList(2, testSnapshots.size()); + + Map properties = new HashMap<>(finalBaseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(remainingSnapshots)); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap(new HashMap<>())); // No refs since all are unreferenced + + TableMetadata newMetadata = finalBaseMetadata.replaceProperties(properties); // Use the operations instance with mock metrics reporter - openHouseInternalTableOperationsWithMockMetrics.maybeDeleteSnapshots(base, snapshotsToDelete); + openHouseInternalTableOperationsWithMockMetrics.applySnapshots(finalBaseMetadata, newMetadata); // Verify metrics were recorded Mockito.verify(mockMetricsReporter) .count( eq(InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR), - eq((double) snapshotsToDelete.size())); + eq((double) 2)); // 2 snapshots deleted } @Test @@ -1581,7 +1623,7 @@ void testDeleteSnapshotMetricsRecordedBranch() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); // Create base metadata with snapshots that have branch references - TableMetadata base = + TableMetadata baseMetadata = TableMetadata.buildFrom(BASE_TABLE_METADATA) .addSnapshot(testSnapshots.get(0)) // Unreferenced - can be deleted .addSnapshot(testSnapshots.get(1)) // Unreferenced - can be deleted @@ -1589,17 +1631,28 @@ void testDeleteSnapshotMetricsRecordedBranch() throws IOException { testSnapshots.get(2), SnapshotRef.MAIN_BRANCH) // Referenced - cannot be deleted .build(); - // Delete unreferenced snapshots (emits metrics for basic deletion) - List snapshotsToDelete = testSnapshots.subList(0, 2); + // Delete unreferenced snapshots (first two snapshots) + Snapshot referencedSnapshot = testSnapshots.get(2); + List remainingSnapshots = List.of(referencedSnapshot); + + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(remainingSnapshots)); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.obtainSnapshotRefsFromSnapshot(referencedSnapshot))); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); // Use the operations instance with mock metrics reporter - openHouseInternalTableOperationsWithMockMetrics.maybeDeleteSnapshots(base, snapshotsToDelete); + openHouseInternalTableOperationsWithMockMetrics.applySnapshots(baseMetadata, newMetadata); // Verify metrics were recorded for the basic deletion Mockito.verify(mockMetricsReporter) .count( eq(InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR), - eq((double) snapshotsToDelete.size())); + eq((double) 2)); // 2 snapshots deleted } @Test @@ -1607,63 +1660,94 @@ void testDeleteSnapshotMetricsRecordedNonExistent() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); // Create base metadata - TableMetadata base = BASE_TABLE_METADATA; + TableMetadata baseMetadata = BASE_TABLE_METADATA; for (Snapshot snapshot : testSnapshots) { - base = - TableMetadata.buildFrom(base) + baseMetadata = + TableMetadata.buildFrom(baseMetadata) .setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH) .build(); } + // Make baseMetadata effectively final for lambda usage + final TableMetadata finalBaseMetadata = baseMetadata; + // Create a snapshot that doesn't exist in the metadata List extraSnapshots = IcebergTestUtil.getExtraSnapshots(); Snapshot nonExistentSnapshot = extraSnapshots.get(0); - List snapshotsToDelete = List.of(nonExistentSnapshot); + + // New metadata is same as base (non-existent snapshot can't be removed) + Snapshot lastSnapshot = testSnapshots.get(testSnapshots.size() - 1); + Map properties = new HashMap<>(finalBaseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, + SnapshotsUtil.serializedSnapshots(finalBaseMetadata.snapshots())); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap(IcebergTestUtil.obtainSnapshotRefsFromSnapshot(lastSnapshot))); + + TableMetadata newMetadata = finalBaseMetadata.replaceProperties(properties); // Use the operations instance with mock metrics reporter - openHouseInternalTableOperationsWithMockMetrics.maybeDeleteSnapshots(base, snapshotsToDelete); + openHouseInternalTableOperationsWithMockMetrics.applySnapshots(finalBaseMetadata, newMetadata); - // Verify metrics are still recorded even for non-existent snapshots - Mockito.verify(mockMetricsReporter) - .count( - eq(InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR), - eq((double) snapshotsToDelete.size())); + // Verify metrics are not recorded for non-existent snapshots (no actual deletion) + Mockito.verify(mockMetricsReporter, Mockito.never()) + .count(eq(InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR), Mockito.anyDouble()); } @Test void testDeleteAllSnapshotsFailsWhenMainBranchReferenced() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); - // Create base metadata with all snapshots, where the last one is referenced by main branch - TableMetadata tempBase = - testSnapshots.subList(0, testSnapshots.size() - 1).stream() - .reduce( - BASE_TABLE_METADATA, - (metadata, snapshot) -> - TableMetadata.buildFrom(metadata).addSnapshot(snapshot).build(), - (m1, m2) -> m2); - final TableMetadata base = - TableMetadata.buildFrom(tempBase) - .setBranchSnapshot(testSnapshots.get(testSnapshots.size() - 1), SnapshotRef.MAIN_BRANCH) + // Create metadata with 2 snapshots: one referenced by multiple branches, one unreferenced + Snapshot unreferencedSnapshot = + testSnapshots.get(0); // This will be referenced by both branches + Snapshot mainSnapshot = testSnapshots.get(1); // This one stays but is not referenced + + TableMetadata baseMetadata = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .addSnapshot(unreferencedSnapshot) + .addSnapshot(mainSnapshot) + .setRef( + SnapshotRef.MAIN_BRANCH, + SnapshotRef.branchBuilder(mainSnapshot.snapshotId()).build()) .build(); - // Attempt to delete ALL snapshots (including the one referenced by main) - List allSnapshots = new ArrayList<>(testSnapshots); + // Attempt to delete the shared snapshot by creating new metadata without it + // Keep the unreferenced snapshot so we're not deleting everything + List remainingSnapshots = List.of(mainSnapshot); - // This should fail because we cannot delete the snapshot referenced by main branch - IllegalArgumentException exception = + // Keep refs pointing to the shared snapshot (causing conflict) + Map refs = baseMetadata.refs(); + Map serializedRefs = + refs.entrySet().stream() + .collect( + Collectors.toMap( + Map.Entry::getKey, + e -> org.apache.iceberg.SnapshotRefParser.toJson(e.getValue()))); + + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(List.of())); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap(IcebergTestUtil.obtainSnapshotRefsFromSnapshot(mainSnapshot))); + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + + // This MUST throw InvalidIcebergSnapshotException for snapshots referenced by multiple branches + InvalidIcebergSnapshotException exception = Assertions.assertThrows( - IllegalArgumentException.class, - () -> openHouseInternalTableOperations.maybeDeleteSnapshots(base, allSnapshots), - "Should throw IllegalArgumentException when trying to delete all snapshots including main branch reference"); + InvalidIcebergSnapshotException.class, + () -> openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadata), + "Should throw InvalidIcebergSnapshotException when trying to delete snapshot referenced by multiple branches"); - // Verify error message indicates the snapshot is still referenced + // Verify error message mentions the snapshot is still referenced String exceptionMessage = exception.getMessage(); - Assertions.assertTrue( - exceptionMessage.contains("Still referenced by refs") - || exceptionMessage.contains("referenced") - || exceptionMessage.contains("Cannot expire"), - "Error message should indicate snapshot is still referenced: " + exceptionMessage); + String expectedMessage = + "Cannot delete the current snapshot " + + mainSnapshot.snapshotId() + + " without adding replacement snapshots."; + Assertions.assertTrue(exceptionMessage.contains(expectedMessage)); } @Test @@ -1671,20 +1755,30 @@ void testDeleteAllUnreferencedSnapshotsSucceeds() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); // Create base metadata with unreferenced snapshots only (no main branch or other refs) - TableMetadata tempBase = BASE_TABLE_METADATA; + TableMetadata baseMetadata = BASE_TABLE_METADATA; for (Snapshot snapshot : testSnapshots) { - tempBase = TableMetadata.buildFrom(tempBase).addSnapshot(snapshot).build(); + baseMetadata = TableMetadata.buildFrom(baseMetadata).addSnapshot(snapshot).build(); } - final TableMetadata base = tempBase; // Note: No setBranchSnapshot or setRef calls - all snapshots are unreferenced + // Make baseMetadata effectively final for lambda usage + final TableMetadata finalBaseMetadata = baseMetadata; + // Attempt to delete all unreferenced snapshots - List allSnapshots = new ArrayList<>(testSnapshots); + Map properties = new HashMap<>(finalBaseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, + SnapshotsUtil.serializedSnapshots(List.of())); // Empty - all snapshots deleted + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap(new HashMap<>())); // No refs + + TableMetadata newMetadata = finalBaseMetadata.replaceProperties(properties); // This should succeed since no snapshots are referenced by any branch/tag TableMetadata result = Assertions.assertDoesNotThrow( - () -> openHouseInternalTableOperations.maybeDeleteSnapshots(base, allSnapshots), + () -> openHouseInternalTableOperations.applySnapshots(finalBaseMetadata, newMetadata), "Should succeed when deleting all unreferenced snapshots"); // Verify all snapshots were removed from the metadata @@ -1694,12 +1788,12 @@ void testDeleteAllUnreferencedSnapshotsSucceeds() throws IOException { "All unreferenced snapshots should be deleted, resulting in empty snapshots list"); // Verify deletion tracking shows all snapshots were deleted - Map properties = result.properties(); + Map resultProperties = result.properties(); String deletedSnapshots = - properties.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS)); + resultProperties.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS)); Assertions.assertNotNull(deletedSnapshots, "Deleted snapshots should be tracked"); - for (Snapshot snapshot : allSnapshots) { + for (Snapshot snapshot : testSnapshots) { Assertions.assertTrue( deletedSnapshots.contains(Long.toString(snapshot.snapshotId())), "Snapshot " + snapshot.snapshotId() + " should be tracked as deleted"); @@ -1711,13 +1805,13 @@ void testValidMultipleBranchesWithDifferentSnapshots() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); // Create base metadata - TableMetadata base = + TableMetadata baseMetadata = TableMetadata.buildFrom(BASE_TABLE_METADATA) .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) .build(); - // Add multiple new snapshots - List newSnapshots = testSnapshots.subList(1, 4); // snapshots 1, 2, 3 + // New metadata includes all snapshots (base + new ones) + List allSnapshots = testSnapshots.subList(0, 4); // snapshots 0, 1, 2, 3 // Create snapshotRefs where each branch points to a DIFFERENT snapshot (valid scenario) Map validRefs = new HashMap<>(); @@ -1725,11 +1819,24 @@ void testValidMultipleBranchesWithDifferentSnapshots() throws IOException { validRefs.put("branch_b", SnapshotRef.branchBuilder(testSnapshots.get(2).snapshotId()).build()); validRefs.put("branch_c", SnapshotRef.branchBuilder(testSnapshots.get(3).snapshotId()).build()); + // Serialize the refs + Map serializedRefs = + validRefs.entrySet().stream() + .collect( + Collectors.toMap( + Map.Entry::getKey, + e -> org.apache.iceberg.SnapshotRefParser.toJson(e.getValue()))); + + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(allSnapshots)); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(serializedRefs)); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + // This should NOT throw an exception Assertions.assertDoesNotThrow( - () -> - openHouseInternalTableOperations.applySnapshotOperations( - base, newSnapshots, validRefs, false), + () -> openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadata), "Should NOT throw exception when branches target different snapshots"); } @@ -1739,7 +1846,7 @@ void testStandardWAPScenario() throws IOException { List wapSnapshots = IcebergTestUtil.getWapSnapshots(); // Create base with existing snapshots and a WAP snapshot - TableMetadata base = + TableMetadata baseMetadata = TableMetadata.buildFrom(BASE_TABLE_METADATA) .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) .addSnapshot(wapSnapshots.get(0)) // WAP snapshot (not referenced by any branch) @@ -1747,17 +1854,33 @@ void testStandardWAPScenario() throws IOException { // Standard WAP scenario: pull the WAP snapshot into main branch Snapshot wapSnapshot = wapSnapshots.get(0); - List newSnapshots = List.of(); // No new snapshots, just referencing the existing WAP + + // New metadata keeps the same snapshots but changes the main branch ref to point to WAP + // snapshot + List allSnapshots = List.of(testSnapshots.get(0), wapSnapshot); // Create refs to pull WAP snapshot into main branch Map refs = new HashMap<>(); refs.put(SnapshotRef.MAIN_BRANCH, SnapshotRef.branchBuilder(wapSnapshot.snapshotId()).build()); + // Serialize the refs + Map serializedRefs = + refs.entrySet().stream() + .collect( + Collectors.toMap( + Map.Entry::getKey, + e -> org.apache.iceberg.SnapshotRefParser.toJson(e.getValue()))); + + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(allSnapshots)); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(serializedRefs)); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + // Should succeed - standard WAP workflow where WAP snapshot becomes the new main Assertions.assertDoesNotThrow( - () -> - openHouseInternalTableOperations.applySnapshotOperations( - base, newSnapshots, refs, false), + () -> openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadata), "Should successfully pull WAP snapshot into main branch"); } @@ -1942,85 +2065,121 @@ void testMultipleDiffCommitWithValidBranch() throws IOException { */ @Test void testMultipleDiffCommitWithMultipleBranchesPointingToSameSnapshot() throws IOException { + // Combine regular snapshots (4) + extra snapshots (4) to get 8 total snapshots List testSnapshots = IcebergTestUtil.getSnapshots(); + List extraSnapshots = IcebergTestUtil.getExtraSnapshots(); + List allSnapshots = new ArrayList<>(); + allSnapshots.addAll(testSnapshots); + allSnapshots.addAll(extraSnapshots); + + // ========== Create base metadata with 2 branches ========== + // Base has snapshots 0, 1, 2, 3 with MAIN at snapshot 0 and feature_a at snapshot 1 + TableMetadata.Builder baseBuilder = TableMetadata.buildFrom(BASE_TABLE_METADATA); + baseBuilder.addSnapshot(allSnapshots.get(0)); + baseBuilder.addSnapshot(allSnapshots.get(1)); + baseBuilder.addSnapshot(allSnapshots.get(2)); + baseBuilder.addSnapshot(allSnapshots.get(3)); + baseBuilder.setBranchSnapshot(allSnapshots.get(0).snapshotId(), SnapshotRef.MAIN_BRANCH); + baseBuilder.setBranchSnapshot(allSnapshots.get(1).snapshotId(), "feature_a"); + TableMetadata baseMetadata = baseBuilder.build(); + + // Add custom properties with base snapshots + Map baseProperties = new HashMap<>(baseMetadata.properties()); + List baseSnapshots = allSnapshots.subList(0, 4); + baseProperties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(baseSnapshots)); + + Map baseRefs = new HashMap<>(); + baseRefs.put( + SnapshotRef.MAIN_BRANCH, + SnapshotRefParser.toJson( + SnapshotRef.branchBuilder(allSnapshots.get(0).snapshotId()).build())); + baseRefs.put( + "feature_a", + SnapshotRefParser.toJson( + SnapshotRef.branchBuilder(allSnapshots.get(1).snapshotId()).build())); + + baseProperties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(baseRefs)); + + TableMetadata finalBaseMetadata = baseMetadata.replaceProperties(baseProperties); + + // ========== Create new metadata with 3 branches, all advanced 2 snapshots further ========== + // New metadata has snapshots 0-7 with MAIN at snapshot 2, feature_a at snapshot 3, feature_b at + // snapshot 4 + TableMetadata.Builder newBuilder = TableMetadata.buildFrom(BASE_TABLE_METADATA); + for (int i = 0; i < 8; i++) { + newBuilder.addSnapshot(allSnapshots.get(i)); + } + newBuilder.setBranchSnapshot(allSnapshots.get(2).snapshotId(), SnapshotRef.MAIN_BRANCH); + newBuilder.setBranchSnapshot(allSnapshots.get(3).snapshotId(), "feature_a"); + newBuilder.setBranchSnapshot(allSnapshots.get(4).snapshotId(), "feature_b"); + TableMetadata newMetadata = newBuilder.build(); + + // Add custom properties with new snapshots + Map newProperties = new HashMap<>(newMetadata.properties()); + List newSnapshots = allSnapshots.subList(0, 8); + newProperties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(newSnapshots)); + + Map newRefs = new HashMap<>(); + newRefs.put( + SnapshotRef.MAIN_BRANCH, + SnapshotRefParser.toJson( + SnapshotRef.branchBuilder(allSnapshots.get(2).snapshotId()).build())); + newRefs.put( + "feature_a", + SnapshotRefParser.toJson( + SnapshotRef.branchBuilder(allSnapshots.get(3).snapshotId()).build())); + newRefs.put( + "feature_b", + SnapshotRefParser.toJson( + SnapshotRef.branchBuilder(allSnapshots.get(4).snapshotId()).build())); + + newProperties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(newRefs)); + + TableMetadata finalNewMetadata = newMetadata.replaceProperties(newProperties); + + // ========== COMMIT: Should SUCCEED ========== + openHouseInternalTableOperations.doCommit(finalBaseMetadata, finalNewMetadata); + Mockito.verify(mockHouseTableMapper).toHouseTable(tblMetadataCaptor.capture(), Mockito.any()); - try (MockedStatic ignoreWriteMock = - Mockito.mockStatic(TableMetadataParser.class)) { - - // ========== Create base at N with 1 snapshot ========== - TableMetadata baseAtN = - TableMetadata.buildFrom(BASE_TABLE_METADATA) - .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) - .build(); - - // ========== Create divergent metadata with MAIN and feature_a both pointing to snapshot 3 - // ========== - TableMetadata.Builder builder = TableMetadata.buildFrom(baseAtN); - // Add snapshots 1, 2, 3 without assigning to branches - builder.addSnapshot(testSnapshots.get(1)); - builder.addSnapshot(testSnapshots.get(2)); - builder.addSnapshot(testSnapshots.get(3)); - // Set BOTH branches to point to the same existing snapshot (using snapshot ID) - builder.setBranchSnapshot(testSnapshots.get(3).snapshotId(), SnapshotRef.MAIN_BRANCH); - builder.setBranchSnapshot(testSnapshots.get(3).snapshotId(), "feature_a"); - TableMetadata metadataWithBothBranches = builder.build(); - - // Add custom properties with snapshots - Map divergentProperties = - new HashMap<>(metadataWithBothBranches.properties()); - List snapshots4 = testSnapshots.subList(0, 4); - divergentProperties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots4)); - - // Create refs matching the setBranchSnapshot calls - both pointing to snapshot 3 - Map sameSnapshotRefs = new HashMap<>(); - sameSnapshotRefs.put( - SnapshotRef.MAIN_BRANCH, - SnapshotRefParser.toJson( - SnapshotRef.branchBuilder(testSnapshots.get(3).snapshotId()).build())); - sameSnapshotRefs.put( - "feature_a", - SnapshotRefParser.toJson( - SnapshotRef.branchBuilder(testSnapshots.get(3).snapshotId()).build())); - - divergentProperties.put( - CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(sameSnapshotRefs)); - - TableMetadata finalDivergentMetadata = - metadataWithBothBranches.replaceProperties(divergentProperties); + TableMetadata capturedMetadata = tblMetadataCaptor.getValue(); - // ========== COMMIT: Should SUCCEED - this is a valid end state ========== - openHouseInternalTableOperations.doCommit(baseAtN, finalDivergentMetadata); - Mockito.verify(mockHouseTableMapper).toHouseTable(tblMetadataCaptor.capture(), Mockito.any()); - - TableMetadata capturedMetadata = tblMetadataCaptor.getValue(); + // Verify all 8 snapshots are present + Assertions.assertEquals( + 8, capturedMetadata.snapshots().size(), "Commit should contain all 8 snapshots"); - // Verify all 4 snapshots are present - Assertions.assertEquals( - 4, - capturedMetadata.snapshots().size(), - "Commit with multiple branches pointing to same snapshot should contain all 4 snapshots"); + // Verify MAIN branch advanced 2 snapshots (from snapshot 0 to snapshot 2) + SnapshotRef mainRef = capturedMetadata.ref(SnapshotRef.MAIN_BRANCH); + Assertions.assertNotNull(mainRef, "Main branch ref should exist"); + Assertions.assertEquals( + allSnapshots.get(2).snapshotId(), + mainRef.snapshotId(), + "Main branch should point to snapshot 2 (advanced 2 snapshots from snapshot 0)"); - // Verify BOTH refs point to the same snapshot - SnapshotRef mainRef = capturedMetadata.ref(SnapshotRef.MAIN_BRANCH); - Assertions.assertNotNull(mainRef, "Main branch ref should exist"); - Assertions.assertEquals( - testSnapshots.get(3).snapshotId(), - mainRef.snapshotId(), - "Main branch should point to the 4th snapshot"); + // Verify feature_a branch advanced 2 snapshots (from snapshot 1 to snapshot 3) + SnapshotRef featureARef = capturedMetadata.ref("feature_a"); + Assertions.assertNotNull(featureARef, "Feature_a branch ref should exist"); + Assertions.assertEquals( + allSnapshots.get(3).snapshotId(), + featureARef.snapshotId(), + "Feature_a branch should point to snapshot 3 (advanced 2 snapshots from snapshot 1)"); - SnapshotRef featureRef = capturedMetadata.ref("feature_a"); - Assertions.assertNotNull(featureRef, "Feature_a branch ref should exist"); - Assertions.assertEquals( - testSnapshots.get(3).snapshotId(), - featureRef.snapshotId(), - "Feature_a branch should also point to the 4th snapshot (same as main)"); + // Verify feature_b branch exists and points to snapshot 4 (new branch in this commit) + SnapshotRef featureBRef = capturedMetadata.ref("feature_b"); + Assertions.assertNotNull(featureBRef, "Feature_b branch ref should exist"); + Assertions.assertEquals( + allSnapshots.get(4).snapshotId(), + featureBRef.snapshotId(), + "Feature_b branch should point to snapshot 4"); - // Verify they point to the SAME snapshot + // Verify correct lineage: snapshots should be in order + List capturedSnapshots = capturedMetadata.snapshots(); + for (int i = 0; i < 8; i++) { Assertions.assertEquals( - mainRef.snapshotId(), - featureRef.snapshotId(), - "Both branches should point to the same snapshot ID"); + allSnapshots.get(i).snapshotId(), + capturedSnapshots.get(i).snapshotId(), + "Snapshot " + i + " should be preserved in correct order"); } } @@ -2077,17 +2236,20 @@ void testMultipleDiffCommitWithInvalidBranch() throws IOException { // ========== COMMIT: Should throw CommitStateUnknownException due to ambiguous branches // ========== - CommitStateUnknownException exception = + InvalidIcebergSnapshotException exception = Assertions.assertThrows( - CommitStateUnknownException.class, + InvalidIcebergSnapshotException.class, () -> openHouseInternalTableOperations.doCommit(baseAtN, finalDivergentMetadata), - "Should throw CommitStateUnknownException when multiple branches point to same snapshot"); + "Should throw InvalidIcebergSnapshotException when multiple branches point to same snapshot"); // Verify error message indicates the ambiguous commit String exceptionMessage = exception.getMessage(); + String expectedMessage = + "Ambiguous commit: snapshot " + + testSnapshots.get(3).snapshotId() + + " is referenced by multiple branches [feature_a, main] in a single commit. Each snapshot can only be referenced by one branch per commit."; Assertions.assertTrue( - exceptionMessage.contains("Multiple branches") - && exceptionMessage.contains("same target snapshot"), + exceptionMessage.contains(expectedMessage), "Error message should indicate multiple branches targeting same snapshot: " + exceptionMessage); } From a101d729831548ed83897994c8a98d04843bb5e2 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Thu, 9 Oct 2025 11:35:03 -0700 Subject: [PATCH 14/35] adding comments --- .../OpenHouseInternalTableOperationsTest.java | 203 ++++++++++++++++-- 1 file changed, 183 insertions(+), 20 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java index 2ff4d1e21..e0c5f6513 100644 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java @@ -129,6 +129,10 @@ void setup() { when(localStorage.getType()).thenReturn(StorageType.LOCAL); } + /** + * Tests committing snapshots to a table with no existing snapshots (initial version). Verifies + * that all snapshots are appended and tracked in table properties. + */ @Test void testDoCommitAppendSnapshotsInitialVersion() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -164,6 +168,10 @@ void testDoCommitAppendSnapshotsInitialVersion() throws IOException { } } + /** + * Tests committing additional snapshots to a table that already has existing snapshots. Verifies + * that only new snapshots are appended and tracked appropriately. + */ @Test void testDoCommitAppendSnapshotsExistingVersion() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -208,6 +216,10 @@ void testDoCommitAppendSnapshotsExistingVersion() throws IOException { } } + /** + * Tests committing changes that both append new snapshots and delete existing ones. Verifies that + * both appended and deleted snapshots are correctly tracked in properties. + */ @Test void testDoCommitAppendAndDeleteSnapshots() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -266,6 +278,10 @@ void testDoCommitAppendAndDeleteSnapshots() throws IOException { } } + /** + * Tests that metadata file updates are performed for replicated table initial version commits. + * Verifies that updateMetadataField is called with the correct parameters for replicated tables. + */ @Test void testDoCommitUpdateMetadataForInitalVersionCommit() throws IOException { Map properties = new HashMap<>(); @@ -326,6 +342,10 @@ void testDoCommitUpdateMetadataForInitalVersionCommit() throws IOException { verify(mockLocalStorageClient).getNativeClient(); } + /** + * Tests that metadata file updates are not performed for non-replicated tables. Verifies that + * updateMetadataField is never called when the table is not replicated. + */ @Test void testDoCommitUpdateMetadataNotCalledForNonReplicatedTable() throws IOException { Map properties = new HashMap<>(); @@ -352,6 +372,10 @@ void testDoCommitUpdateMetadataNotCalledForNonReplicatedTable() throws IOExcepti Mockito.verify(mockHouseTableRepository, Mockito.times(1)).save(Mockito.any(HouseTable.class)); } + /** + * Tests that metadata file updates are not performed for non-initial version commits. Verifies + * that updateMetadataField is only called during table creation, not for subsequent updates. + */ @Test void testDoCommitUpdateMetadataNotCalledForNonInitialVersionCommit() throws IOException { Map properties = new HashMap<>(); @@ -385,6 +409,10 @@ void testDoCommitUpdateMetadataNotCalledForNonInitialVersionCommit() throws IOEx Mockito.verify(mockHouseTableRepository, Mockito.times(1)).save(Mockito.any(HouseTable.class)); } + /** + * Tests committing changes that delete some snapshots while keeping others. Verifies that deleted + * snapshots are properly tracked in table properties. + */ @Test void testDoCommitDeleteSnapshots() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -433,6 +461,10 @@ void testDoCommitDeleteSnapshots() throws IOException { } } + /** + * Tests that commits to staged tables do not persist to the repository. Verifies that table + * metadata is set locally but save() and findById() are never called. + */ @Test void testDoCommitDoesntPersistForStagedTable() { TableMetadata metadata = @@ -454,6 +486,10 @@ void testDoCommitDoesntPersistForStagedTable() { .get()); } + /** + * Tests that repository exceptions are properly converted to Iceberg exceptions. Verifies that + * various repository exceptions map to CommitFailedException or CommitStateUnknownException. + */ @Test void testDoCommitExceptionHandling() { TableMetadata base = BASE_TABLE_METADATA; @@ -482,6 +518,11 @@ void testDoCommitExceptionHandling() { () -> openHouseInternalTableOperations.doCommit(base, metadata)); } + /** + * Tests that attempting to delete a snapshot that is still referenced by a branch throws an + * exception. Verifies that InvalidIcebergSnapshotException is thrown when snapshot refs conflict + * with deletions. + */ @Test void testDoCommitSnapshotsValidationThrowsException() throws IOException { TableMetadata metadata = @@ -528,6 +569,10 @@ void testDoCommitSnapshotsValidationThrowsException() throws IOException { "Should throw exception when trying to delete referenced snapshots"); } + /** + * Tests committing WAP (write-audit-publish) staged snapshots to an initial version table. + * Verifies that snapshots are marked as staged but not appended to the main branch. + */ @Test void testDoCommitAppendStageOnlySnapshotsInitialVersion() throws IOException { List testWapSnapshots = IcebergTestUtil.getWapSnapshots().subList(0, 2); @@ -557,6 +602,10 @@ void testDoCommitAppendStageOnlySnapshotsInitialVersion() throws IOException { } } + /** + * Tests committing WAP staged snapshots to a table with existing snapshots. Verifies that new + * snapshots are tracked as staged without being appended to main. + */ @Test void testDoCommitAppendStageOnlySnapshotsExistingVersion() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -600,6 +649,11 @@ void testDoCommitAppendStageOnlySnapshotsExistingVersion() throws IOException { } } + /** + * Tests validation that rejects appending snapshots older than the current metadata timestamp. + * Verifies that IllegalArgumentException is thrown for stale snapshots unless newer ones are + * included. + */ @Test void testAppendSnapshotsWithOldSnapshots() throws IOException { // Create base metadata (existing table state) @@ -640,6 +694,10 @@ void testAppendSnapshotsWithOldSnapshots() throws IOException { openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadataWithFuture); } + /** + * Tests cherry-picking a staged snapshot to main when the base snapshot hasn't changed. Verifies + * that the existing staged snapshot is promoted without creating a new snapshot. + */ @Test void testDoCommitCherryPickSnapshotBaseUnchanged() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -681,6 +739,10 @@ void testDoCommitCherryPickSnapshotBaseUnchanged() throws IOException { } } + /** + * Tests cherry-picking a staged snapshot when the base has changed since staging. Verifies that a + * new snapshot is created and appended to track the rebased changes. + */ @Test void testDoCommitCherryPickSnapshotBaseChanged() throws IOException { List testWapSnapshots = IcebergTestUtil.getWapSnapshots(); @@ -721,6 +783,10 @@ void testDoCommitCherryPickSnapshotBaseChanged() throws IOException { } } + /** + * Tests cherry-picking the first staged snapshot (with no parent) to the main branch. Verifies + * that the staged snapshot is promoted directly without creating a new snapshot. + */ @Test void testDoCommitCherryPickFirstSnapshot() throws IOException { List testWapSnapshots = IcebergTestUtil.getWapSnapshots().subList(0, 1); @@ -755,6 +821,10 @@ void testDoCommitCherryPickFirstSnapshot() throws IOException { } } + /** + * Tests deleting the last staged snapshot when no references point to it. Verifies that no + * snapshot operations are tracked since the snapshot was unreferenced. + */ @Test void testDoCommitDeleteLastStagedSnapshotWhenNoRefs() throws IOException { List testWapSnapshots = IcebergTestUtil.getWapSnapshots().subList(0, 1); @@ -782,6 +852,10 @@ void testDoCommitDeleteLastStagedSnapshotWhenNoRefs() throws IOException { } } + /** + * Tests rebuilding an unpartitioned table's partition spec with a new schema. Verifies that the + * rebuilt spec remains unpartitioned. + */ @Test void testRebuildPartitionSpecUnpartitioned() { Schema originalSchema = @@ -796,6 +870,10 @@ void testRebuildPartitionSpecUnpartitioned() { Assertions.assertTrue(rebuiltSpec.isUnpartitioned()); } + /** + * Tests rebuilding partition spec when the new schema has the same field IDs as the original. + * Verifies that partition fields are correctly mapped using matching field IDs. + */ @Test void testRebuildPartitionSpec_NewSchemaSameFieldIds() { Schema originalSchema = @@ -833,6 +911,11 @@ void testRebuildPartitionSpec_NewSchemaSameFieldIds() { Assertions.assertEquals(3, rebuiltSpec.fields().get(2).sourceId()); } + /** + * Tests rebuilding partition spec when the new schema has different field IDs for same field + * names. Verifies that partition fields are correctly remapped to new field IDs based on field + * names. + */ @Test void testRebuildPartitionSpec_NewSchemaDifferentFieldIds() { Schema originalSchema = @@ -878,6 +961,10 @@ void testRebuildPartitionSpec_NewSchemaDifferentFieldIds() { Assertions.assertEquals(2, rebuiltSpec.fields().get(2).sourceId()); } + /** + * Tests rebuilding partition spec when a partition field is missing from the new schema. Verifies + * that an IllegalArgumentException is thrown for the missing field. + */ @Test void testRebuildPartitionSpec_fieldMissingInNewSchema() { Schema originalSchema = @@ -899,6 +986,10 @@ void testRebuildPartitionSpec_fieldMissingInNewSchema() { "Field field1 does not exist in the new schema", exception.getMessage()); } + /** + * Tests rebuilding sort order when the new schema has the same field IDs as the original. + * Verifies that sort fields are correctly mapped using matching field IDs. + */ @Test void testRebuildSortOrder_NewSchemaSameFieldIds() { Schema originalSchema = @@ -925,6 +1016,10 @@ void testRebuildSortOrder_NewSchemaSameFieldIds() { Assertions.assertEquals(2, rebuiltSortOrder.fields().get(1).sourceId()); } + /** + * Tests rebuilding sort order when the new schema has different field IDs for same field names. + * Verifies that sort fields are correctly remapped to new field IDs based on field names. + */ @Test void testRebuildSortOrder_NewSchemaDifferentFieldIds() { Schema originalSchema = @@ -951,6 +1046,10 @@ void testRebuildSortOrder_NewSchemaDifferentFieldIds() { Assertions.assertEquals(1, rebuiltSortOrder.fields().get(1).sourceId()); } + /** + * Tests rebuilding sort order when a sort field is missing from the new schema. Verifies that an + * IllegalArgumentException is thrown for the missing field. + */ @Test void testRebuildSortOrder_fieldMissingInNewSchema() { Schema originalSchema = @@ -969,6 +1068,10 @@ void testRebuildSortOrder_fieldMissingInNewSchema() { "Field field1 does not exist in the new schema", exception.getMessage()); } + /** + * Tests that refresh metadata operations record metrics with database tag but not table tag. + * Verifies that only the database dimension is included to avoid high cardinality. + */ @Test void testRefreshMetadataIncludesDatabaseTag() { testMetricIncludesDatabaseTag( @@ -978,6 +1081,10 @@ void testRefreshMetadataIncludesDatabaseTag() { "Timer should not have table tag (removed because the table tag has super high cardinality and overloads metric emission max size)"); } + /** + * Tests that commit metadata update operations record metrics with database tag but not table + * tag. Verifies that only the database dimension is included to avoid high cardinality. + */ @Test void testCommitMetadataUpdateIncludesDatabaseTag() { testMetricIncludesDatabaseTag( @@ -987,6 +1094,10 @@ void testCommitMetadataUpdateIncludesDatabaseTag() { "Timer should not have table tag (only database dimension should be included)"); } + /** + * Tests that refresh metadata latency timer has histogram buckets configured. Verifies that the + * metrics can be used for histogram-based monitoring and alerting. + */ @Test void testRefreshMetadataLatencyHasHistogramBuckets() { testMetricHasHistogramBuckets( @@ -995,6 +1106,10 @@ void testRefreshMetadataLatencyHasHistogramBuckets() { this::executeRefreshMetadata); } + /** + * Tests that commit metadata update latency timer has histogram buckets configured. Verifies that + * the metrics can be used for histogram-based monitoring and alerting. + */ @Test void testCommitMetadataUpdateLatencyHasHistogramBuckets() { testMetricHasHistogramBuckets( @@ -1226,6 +1341,10 @@ private void verifyMetricHistogramBuckets( // ===== SNAPSHOT DELETION SAFETY TESTS ===== + /** + * Tests that attempting to delete a snapshot referenced by the main branch throws an exception. + * Verifies that InvalidIcebergSnapshotException is thrown with appropriate error message. + */ @Test void testDeleteSnapshotWithMainReference() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -1276,6 +1395,10 @@ void testDeleteSnapshotWithMainReference() throws IOException { "Error message should indicate snapshot is still referenced: " + exception.getMessage()); } + /** + * Tests that unreferenced snapshots can be successfully deleted from the table. Verifies that + * deleted snapshots are removed from metadata and tracked in properties. + */ @Test void testDeleteSnapshotWithNoReference() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -1337,6 +1460,11 @@ void testDeleteSnapshotWithNoReference() throws IOException { } } + /** + * Tests that attempting to delete a snapshot referenced by multiple branches throws an exception. + * Verifies that InvalidIcebergSnapshotException is thrown indicating the snapshot is still + * referenced. + */ @Test void testDeleteSnapshotWithMultipleReference() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -1394,6 +1522,10 @@ void testDeleteSnapshotWithMultipleReference() throws IOException { + exceptionMessage); } + /** + * Tests that attempting to delete a snapshot referenced by a tag throws an exception. Verifies + * that InvalidIcebergSnapshotException is thrown with branch/tag reference details. + */ @Test void testDeleteSnapshotWithBranchReference() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -1456,6 +1588,10 @@ void testDeleteSnapshotWithBranchReference() throws IOException { + exceptionMessage); } + /** + * Tests that attempting to delete an empty list of snapshots makes no changes to the table. + * Verifies that no snapshots are deleted and no deletion properties are set. + */ @Test void testDeleteEmptySnapshotList() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -1497,6 +1633,10 @@ void testDeleteEmptySnapshotList() throws IOException { Assertions.assertNull(deletedSnapshots, "No deleted snapshots property should be set"); } + /** + * Tests that attempting to delete a null list of snapshots makes no changes to the table. + * Verifies that no snapshots are deleted and no deletion properties are set. + */ @Test void testDeleteNullSnapshotList() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -1538,6 +1678,10 @@ void testDeleteNullSnapshotList() throws IOException { Assertions.assertNull(deletedSnapshots, "No deleted snapshots property should be set"); } + /** + * Tests that attempting to delete a snapshot that doesn't exist in the metadata has no effect. + * Verifies that snapshot count remains unchanged and no deletion tracking occurs. + */ @Test void testDeleteNonExistentSnapshot() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -1583,6 +1727,10 @@ void testDeleteNonExistentSnapshot() throws IOException { Assertions.assertNull(deletedSnapshots, "No deleted snapshots should be tracked"); } + /** + * Tests that snapshot deletion operations record the correct metrics. Verifies that + * SNAPSHOTS_DELETED_CTR counter is incremented by the number of deleted snapshots. + */ @Test void testDeleteSnapshotMetricsRecorded() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -1618,6 +1766,10 @@ void testDeleteSnapshotMetricsRecorded() throws IOException { eq((double) 2)); // 2 snapshots deleted } + /** + * Tests that snapshot deletion metrics are recorded when deleting unreferenced snapshots. + * Verifies that SNAPSHOTS_DELETED_CTR counter tracks deletions with branch references present. + */ @Test void testDeleteSnapshotMetricsRecordedBranch() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -1655,6 +1807,10 @@ void testDeleteSnapshotMetricsRecordedBranch() throws IOException { eq((double) 2)); // 2 snapshots deleted } + /** + * Tests that snapshot deletion metrics are not recorded when no actual deletion occurs. Verifies + * that SNAPSHOTS_DELETED_CTR counter is not called for non-existent snapshots. + */ @Test void testDeleteSnapshotMetricsRecordedNonExistent() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -1695,6 +1851,11 @@ void testDeleteSnapshotMetricsRecordedNonExistent() throws IOException { .count(eq(InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR), Mockito.anyDouble()); } + /** + * Tests that attempting to delete all snapshots fails when the main branch references a snapshot. + * Verifies that InvalidIcebergSnapshotException is thrown to prevent deleting referenced + * snapshots. + */ @Test void testDeleteAllSnapshotsFailsWhenMainBranchReferenced() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -1750,6 +1911,10 @@ void testDeleteAllSnapshotsFailsWhenMainBranchReferenced() throws IOException { Assertions.assertTrue(exceptionMessage.contains(expectedMessage)); } + /** + * Tests that deleting all unreferenced snapshots succeeds without errors. Verifies that all + * snapshots can be deleted when no branches or tags reference them. + */ @Test void testDeleteAllUnreferencedSnapshotsSucceeds() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -1800,6 +1965,10 @@ void testDeleteAllUnreferencedSnapshotsSucceeds() throws IOException { } } + /** + * Tests that multiple branches can point to different snapshots without conflicts. Verifies that + * commits with multiple valid branch references succeed without exceptions. + */ @Test void testValidMultipleBranchesWithDifferentSnapshots() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -1840,6 +2009,10 @@ void testValidMultipleBranchesWithDifferentSnapshots() throws IOException { "Should NOT throw exception when branches target different snapshots"); } + /** + * Tests the standard Write-Audit-Publish (WAP) workflow where a staged snapshot becomes main. + * Verifies that pulling a WAP snapshot into the main branch succeeds without errors. + */ @Test void testStandardWAPScenario() throws IOException { List testSnapshots = IcebergTestUtil.getSnapshots(); @@ -1885,17 +2058,8 @@ void testStandardWAPScenario() throws IOException { } /** - * Integration test that verifies committing with base and metadata that are at least two commits - * divergent. This simulates scenarios where: - * - *

    - *
  • Base metadata is at version N - *
  • New metadata represents state at version N+2 or later (skipping intermediate versions) - *
  • The commit should still succeed and write complete metadata - *
- * - *

This test validates that Iceberg can handle "jump" commits where the metadata being - * committed has evolved significantly from the base. + * Tests committing metadata that has diverged multiple versions from the base (N to N+3). + * Verifies that "jump" commits succeed with all snapshots and references correctly applied. */ @Test void testMultipleDiffCommit() throws IOException { @@ -1974,8 +2138,8 @@ void testMultipleDiffCommit() throws IOException { } /** - * Test committing with divergent metadata and multiple valid branches. Base is at N with MAIN, - * metadata is at N+3 with both MAIN and feature_a branches pointing to different snapshots. + * Tests divergent commit (N to N+3) with multiple branches pointing to different snapshots. + * Verifies that divergent commits succeed when branch references are valid and non-conflicting. */ @Test void testMultipleDiffCommitWithValidBranch() throws IOException { @@ -2060,8 +2224,9 @@ void testMultipleDiffCommitWithValidBranch() throws IOException { } /** - * Test committing with divergent metadata where multiple branches point to the same snapshot. - * This is VALID when done through setBranchSnapshot() - the end state is allowed. + * Tests committing with multiple branches advancing forward, each pointing to different + * snapshots. Verifies that complex multi-branch commits succeed when each branch has a unique + * target snapshot. */ @Test void testMultipleDiffCommitWithMultipleBranchesPointingToSameSnapshot() throws IOException { @@ -2139,7 +2304,7 @@ void testMultipleDiffCommitWithMultipleBranchesPointingToSameSnapshot() throws I TableMetadata finalNewMetadata = newMetadata.replaceProperties(newProperties); - // ========== COMMIT: Should SUCCEED ========== + // commit should succeed openHouseInternalTableOperations.doCommit(finalBaseMetadata, finalNewMetadata); Mockito.verify(mockHouseTableMapper).toHouseTable(tblMetadataCaptor.capture(), Mockito.any()); @@ -2184,8 +2349,8 @@ void testMultipleDiffCommitWithMultipleBranchesPointingToSameSnapshot() throws I } /** - * Test committing with divergent metadata where multiple branches try to point to the same - * snapshot (ambiguous commit). This should throw an IllegalStateException. + * Tests that committing with multiple branches pointing to the same snapshot throws an exception. + * Verifies that InvalidIcebergSnapshotException is thrown for ambiguous branch configurations. */ @Test void testMultipleDiffCommitWithInvalidBranch() throws IOException { @@ -2234,8 +2399,6 @@ void testMultipleDiffCommitWithInvalidBranch() throws IOException { TableMetadata finalDivergentMetadata = metadataWithAllSnapshots.replaceProperties(divergentProperties); - // ========== COMMIT: Should throw CommitStateUnknownException due to ambiguous branches - // ========== InvalidIcebergSnapshotException exception = Assertions.assertThrows( InvalidIcebergSnapshotException.class, From 11be4381fa5c5cc2ebc4dd7b28b2480fd8bca242 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Thu, 9 Oct 2025 16:29:17 -0700 Subject: [PATCH 15/35] working tests --- .../OpenHouseInternalTableOperationsTest.java | 123 +++++++++++++++++- 1 file changed, 120 insertions(+), 3 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java index e0c5f6513..f514ed162 100644 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java @@ -201,7 +201,7 @@ void testDoCommitAppendSnapshotsExistingVersion() throws IOException { Assertions.assertEquals( 5, updatedProperties - .size()); /*write.parquet.compression-codec, location, lastModifiedTime, version and deleted_snapshots*/ + .size()); /*write.parquet.compression-codec, location, lastModifiedTime, version and appended_snapshots*/ Assertions.assertEquals( TEST_LOCATION, updatedProperties.get(getCanonicalFieldName("tableVersion"))); @@ -1339,8 +1339,6 @@ private void verifyMetricHistogramBuckets( Assertions.assertFalse(Double.isNaN(maxTime), "Timer max time should not be NaN"); } - // ===== SNAPSHOT DELETION SAFETY TESTS ===== - /** * Tests that attempting to delete a snapshot referenced by the main branch throws an exception. * Verifies that InvalidIcebergSnapshotException is thrown with appropriate error message. @@ -2417,4 +2415,123 @@ void testMultipleDiffCommitWithInvalidBranch() throws IOException { + exceptionMessage); } } + + /** + * Tests divergent commit (N to N+3) that includes both regular snapshots and WAP staged + * snapshots. Verifies that staged snapshots remain properly tracked as staged even during a + * multi-version jump commit. + */ + @Test + void testMultipleDiffCommitWithWAPSnapshots() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + List wapSnapshots = IcebergTestUtil.getWapSnapshots(); + + try (MockedStatic ignoreWriteMock = + Mockito.mockStatic(TableMetadataParser.class)) { + + // ========== Create base at N with 1 snapshot ========== + TableMetadata baseAtN = + TableMetadata.buildFrom(BASE_TABLE_METADATA) + .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) + .build(); + + // ========== Create divergent metadata at N+3 with 2 regular + 2 WAP snapshots ========== + // Simulate evolving through N+1 and N+2 without committing + // The new metadata will have: + // - testSnapshots[0] (existing in base, main branch) + // - testSnapshots[1] (new, main branch will advance here) + // - wapSnapshots[0] (new, staged - no branch reference) + // - wapSnapshots[1] (new, staged - no branch reference) + + TableMetadata metadataAtNPlus3 = + TableMetadata.buildFrom(baseAtN) + .setBranchSnapshot(testSnapshots.get(1), SnapshotRef.MAIN_BRANCH) + .addSnapshot(wapSnapshots.get(0)) + .addSnapshot(wapSnapshots.get(1)) + .build(); + + // Add custom properties for commit + Map divergentProperties = new HashMap<>(metadataAtNPlus3.properties()); + + // Include 2 regular snapshots (0, 1) and 2 WAP snapshots (0, 1) + List allSnapshots = new ArrayList<>(); + allSnapshots.add(testSnapshots.get(0)); + allSnapshots.add(testSnapshots.get(1)); + allSnapshots.add(wapSnapshots.get(0)); + allSnapshots.add(wapSnapshots.get(1)); + + divergentProperties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(allSnapshots)); + + // Only main branch ref pointing to testSnapshots[1], WAP snapshots have no refs + divergentProperties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.obtainSnapshotRefsFromSnapshot(testSnapshots.get(1)))); + divergentProperties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); + + TableMetadata finalDivergentMetadata = + metadataAtNPlus3.replaceProperties(divergentProperties); + + // ========== COMMIT: Base at N, Metadata at N+3 (divergent by 3 commits) ========== + openHouseInternalTableOperations.doCommit(baseAtN, finalDivergentMetadata); + Mockito.verify(mockHouseTableMapper).toHouseTable(tblMetadataCaptor.capture(), Mockito.any()); + + TableMetadata capturedMetadata = tblMetadataCaptor.getValue(); + Map updatedProperties = capturedMetadata.properties(); + + // Verify the divergent commit contains all 4 snapshots + Assertions.assertEquals( + 4, + capturedMetadata.snapshots().size(), + "Divergent commit should contain all 4 snapshots (2 regular + 2 WAP)"); + + Set expectedSnapshotIds = + allSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + Set actualSnapshotIds = + capturedMetadata.snapshots().stream() + .map(Snapshot::snapshotId) + .collect(Collectors.toSet()); + Assertions.assertEquals( + expectedSnapshotIds, + actualSnapshotIds, + "All snapshot IDs (regular + WAP) should be present after divergent commit"); + + // Verify main ref points to the expected snapshot (testSnapshots[1]) + SnapshotRef mainRef = capturedMetadata.ref(SnapshotRef.MAIN_BRANCH); + Assertions.assertNotNull(mainRef, "Main branch ref should exist"); + Assertions.assertEquals( + testSnapshots.get(1).snapshotId(), + mainRef.snapshotId(), + "Main branch should point to testSnapshots[1] after divergent commit"); + + // Verify WAP snapshots are tracked as staged + String stagedSnapshots = updatedProperties.get(getCanonicalFieldName("staged_snapshots")); + Assertions.assertNotNull(stagedSnapshots, "Staged snapshots should be tracked"); + Set stagedSnapshotIds = Set.of(stagedSnapshots.split(",")); + Assertions.assertTrue( + stagedSnapshotIds.contains(Long.toString(wapSnapshots.get(0).snapshotId())), + "WAP snapshot 0 should be tracked as staged"); + Assertions.assertTrue( + stagedSnapshotIds.contains(Long.toString(wapSnapshots.get(1).snapshotId())), + "WAP snapshot 1 should be tracked as staged"); + + // Verify regular snapshot is tracked as appended (not testSnapshots[0] since it was in base) + String appendedSnapshots = updatedProperties.get(getCanonicalFieldName("appended_snapshots")); + Assertions.assertNotNull(appendedSnapshots, "Appended snapshots should be tracked"); + Assertions.assertEquals( + Long.toString(testSnapshots.get(1).snapshotId()), + appendedSnapshots, + "testSnapshots[1] should be tracked as appended"); + + Assertions.assertNull( + updatedProperties.get(getCanonicalFieldName("cherry_picked_snapshots")), + "No snapshots should be cherry-picked in this scenario"); + Assertions.assertNull( + updatedProperties.get(getCanonicalFieldName("deleted_snapshots")), + "No snapshots should be deleted in this scenario"); + + Mockito.verify(mockHouseTableRepository, Mockito.times(1)).save(Mockito.eq(mockHouseTable)); + } + } } From c7426b43c8af56ca76fd25bf38d3ef5feae52271 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Thu, 9 Oct 2025 21:07:31 -0700 Subject: [PATCH 16/35] complete refactor + new tests --- .../catalog/OpenHouseInternalCatalog.java | 14 +- .../OpenHouseInternalTableOperations.java | 658 +----------------- .../internal/catalog/SnapshotDiffApplier.java | 468 +++++++++++++ .../internal/catalog/SnapshotInspector.java | 96 --- .../OpenHouseInternalTableOperationsTest.java | 87 ++- .../catalog/SnapshotDiffApplierTest.java | 359 ++++++++++ .../catalog/SnapshotInspectorTest.java | 171 ----- .../RepositoryTestWithSettableComponents.java | 30 +- .../tables/e2e/h2/SpringH2Application.java | 17 - .../tablestest/SpringH2TestApplication.java | 18 - 10 files changed, 916 insertions(+), 1002 deletions(-) create mode 100644 iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java delete mode 100644 iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotInspector.java create mode 100644 iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java delete mode 100644 iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotInspectorTest.java diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalCatalog.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalCatalog.java index f77f7a0cb..2743c6579 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalCatalog.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalCatalog.java @@ -59,22 +59,24 @@ public class OpenHouseInternalCatalog extends BaseMetastoreCatalog { @Autowired StorageType storageType; - @Autowired SnapshotInspector snapshotInspector; - @Autowired HouseTableMapper houseTableMapper; @Autowired MeterRegistry meterRegistry; @Override protected TableOperations newTableOps(TableIdentifier tableIdentifier) { + FileIO fileIO = resolveFileIO(tableIdentifier); + MetricsReporter metricsReporter = + new MetricsReporter(this.meterRegistry, METRICS_PREFIX, Lists.newArrayList()); + SnapshotDiffApplier snapshotDiffApplier = new SnapshotDiffApplier(metricsReporter); return new OpenHouseInternalTableOperations( houseTableRepository, - resolveFileIO(tableIdentifier), - snapshotInspector, + fileIO, houseTableMapper, tableIdentifier, - new MetricsReporter(this.meterRegistry, METRICS_PREFIX, Lists.newArrayList()), - fileIOManager); + metricsReporter, + fileIOManager, + snapshotDiffApplier); } @Override diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java index 6a2c43305..d96d9d6b1 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperations.java @@ -4,7 +4,6 @@ import com.google.common.cache.Cache; import com.google.common.cache.CacheBuilder; -import com.google.common.collect.Sets; import com.google.gson.Gson; import com.linkedin.openhouse.cluster.metrics.micrometer.MetricsReporter; import com.linkedin.openhouse.cluster.storage.Storage; @@ -24,15 +23,11 @@ import java.io.IOException; import java.time.Clock; import java.time.Instant; -import java.util.Collections; import java.util.HashMap; -import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.Set; import java.util.UUID; import java.util.concurrent.TimeUnit; -import java.util.stream.Collectors; import lombok.AllArgsConstructor; import lombok.extern.slf4j.Slf4j; import org.apache.hadoop.fs.FileSystem; @@ -41,9 +36,6 @@ import org.apache.iceberg.PartitionSpec; import org.apache.iceberg.Schema; import org.apache.iceberg.SchemaParser; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.SnapshotSummary; import org.apache.iceberg.SortDirection; import org.apache.iceberg.SortField; import org.apache.iceberg.SortOrder; @@ -68,8 +60,6 @@ public class OpenHouseInternalTableOperations extends BaseMetastoreTableOperatio FileIO fileIO; - SnapshotInspector snapshotInspector; - HouseTableMapper houseTableMapper; TableIdentifier tableIdentifier; @@ -78,6 +68,8 @@ public class OpenHouseInternalTableOperations extends BaseMetastoreTableOperatio FileIOManager fileIOManager; + SnapshotDiffApplier snapshotDiffApplier; + private static final Gson GSON = new Gson(); private static final Cache CACHE = @@ -227,7 +219,7 @@ protected void doCommit(TableMetadata base, TableMetadata metadata) { metadata = rebuildTblMetaWithSchema(metadata, CatalogConstants.EVOLVED_SCHEMA_KEY, true); } - metadata = applySnapshots(base, metadata); + metadata = snapshotDiffApplier.applySnapshots(base, metadata); int version = currentVersion() + 1; CommitStatus commitStatus = CommitStatus.FAILURE; @@ -482,650 +474,6 @@ static SortOrder rebuildSortOrder(SortOrder originalSortOrder, Schema newSchema) return builder.build(); } - // ==================== Functional Snapshot Application Pipeline ==================== - - /** - * Immutable state object representing the complete snapshot diff and categorization. All fields - * are final and collections are unmodifiable. - */ - @lombok.Value - @lombok.Builder - private static class SnapshotState { - List providedSnapshots; - Map providedRefs; - List existingSnapshots; - Map existingRefs; - - // Categorization - List wapSnapshots; - List cherryPickedSnapshots; - List regularSnapshots; - - // Diff results - List newSnapshots; - List existingRetainedSnapshots; - List deletedSnapshots; - - // Branch updates - Map branchUpdates; - - // Metrics for recording - int appendedCount; - int stagedCount; - int cherryPickedCount; - int deletedCount; - } - - /** - * Applies snapshot updates from metadata properties using a functional pipeline. This method - * follows principles: immutability, pure functions, and composition. - * - *

Pipeline stages: 1. Extract snapshots from properties 2. Parse snapshots from JSON 3. Parse - * references from JSON 4. Compute complete state diff (categorize, identify changes) 5. Validate - * entire operation 6. Apply state changes (returns builder) 7. Add metric properties to builder - * 8. Build once at top level to preserve lastUpdatedMillis from snapshot operations - * - * @param base The base table metadata (may be null for table creation) - * @param metadata The new metadata with properties containing snapshot updates - * @return Updated metadata with snapshots applied - */ - TableMetadata applySnapshots(TableMetadata base, TableMetadata metadata) { - // Check if snapshots update is requested - if (!metadata.properties().containsKey(CatalogConstants.SNAPSHOTS_JSON_KEY)) { - // No snapshot updates requested, return unchanged - return metadata; - } - - return Optional.ofNullable(metadata.properties().get(CatalogConstants.SNAPSHOTS_JSON_KEY)) - .map( - snapshotsJson -> { - // Stage 1-3: Extract and parse - SnapshotState.SnapshotStateBuilder stateBuilder = SnapshotState.builder(); - - // Extract and parse snapshots (Stage 1-2) - List providedSnapshots = parseSnapshotsFromJson(snapshotsJson); - stateBuilder.providedSnapshots(Collections.unmodifiableList(providedSnapshots)); - - // Extract and parse references (Stage 3) - Map providedRefs = - Optional.ofNullable( - metadata.properties().get(CatalogConstants.SNAPSHOTS_REFS_KEY)) - .map(this::parseReferencesFromJson) - .orElse(Collections.emptyMap()); - stateBuilder.providedRefs(Collections.unmodifiableMap(providedRefs)); - - // Get existing state from base - List existingSnapshots = - Optional.ofNullable(base) - .map(TableMetadata::snapshots) - .orElse(Collections.emptyList()); - stateBuilder.existingSnapshots(Collections.unmodifiableList(existingSnapshots)); - - Map existingRefs = - Optional.ofNullable(base).map(TableMetadata::refs).orElse(Collections.emptyMap()); - stateBuilder.existingRefs(Collections.unmodifiableMap(existingRefs)); - - // Stage 4: Compute complete state diff - SnapshotState state = computeStateDiff(stateBuilder); - - // Stage 5: Validate entire operation - validateOperation(state, base); - - // Stage 6: Apply state changes - returns builder - TableMetadata.Builder builder = applyStateChanges(metadata, state); - - // Stage 7: Record metrics and add metric properties to builder - builder = recordMetrics(builder, state); - - // Build once at the end to preserve lastUpdatedMillis from snapshot operations - return builder.build(); - }) - .orElse(metadata); // No snapshot updates if key not present - } - - /** Stage 2: Parse snapshots from JSON string. Pure function - no side effects. */ - private List parseSnapshotsFromJson(String snapshotsJson) { - return SnapshotsUtil.parseSnapshots(fileIO, snapshotsJson); - } - - /** Stage 3: Parse references from JSON string. Pure function - no side effects. */ - private Map parseReferencesFromJson(String refsJson) { - return SnapshotsUtil.parseSnapshotRefs(refsJson); - } - - /** - * Stage 4: Compute complete state diff. Pure function that categorizes snapshots and identifies - * changes. - */ - private SnapshotState computeStateDiff(SnapshotState.SnapshotStateBuilder builder) { - SnapshotState partial = builder.build(); - - Map providedById = - partial.getProvidedSnapshots().stream() - .collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); - Map existingById = - partial.getExistingSnapshots().stream() - .collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); - - // Categorize all snapshots by type - SnapshotCategories categories = - categorizeAllSnapshots( - partial.getProvidedSnapshots(), - existingById, - partial.getExistingRefs(), - partial.getProvidedRefs()); - - // Identify snapshot changes (new, retained, deleted) - SnapshotChanges changes = - identifySnapshotChanges( - partial.getProvidedSnapshots(), - partial.getExistingSnapshots(), - providedById, - existingById); - - // Identify branch updates - Map branchUpdates = - computeBranchUpdates(partial.getProvidedRefs(), partial.getExistingRefs()); - - // Compute metrics - SnapshotMetrics metrics = computeSnapshotMetrics(categories, changes, existingById); - - // Build complete state - return builder - .wapSnapshots(Collections.unmodifiableList(categories.wapSnapshots)) - .cherryPickedSnapshots(Collections.unmodifiableList(categories.cherryPickedSnapshots)) - .regularSnapshots(Collections.unmodifiableList(categories.regularSnapshots)) - .newSnapshots(Collections.unmodifiableList(changes.newSnapshots)) - .existingRetainedSnapshots(Collections.unmodifiableList(changes.existingRetainedSnapshots)) - .deletedSnapshots(Collections.unmodifiableList(changes.deletedSnapshots)) - .branchUpdates(Collections.unmodifiableMap(branchUpdates)) - .appendedCount(metrics.appendedCount) - .stagedCount(metrics.stagedCount) - .cherryPickedCount(metrics.cherryPickedCount) - .deletedCount(metrics.deletedCount) - .build(); - } - - /** Container for categorized snapshots. */ - @lombok.Value - private static class SnapshotCategories { - List wapSnapshots; - List cherryPickedSnapshots; - List regularSnapshots; - } - - /** Categorize all snapshots into WAP, cherry-picked, and regular. */ - private SnapshotCategories categorizeAllSnapshots( - List providedSnapshots, - Map existingById, - Map existingRefs, - Map providedRefs) { - List wapSnapshots = - categorizeWapSnapshots(providedSnapshots, existingRefs, providedRefs); - List cherryPickedSnapshots = - categorizeCherryPickedSnapshots( - providedSnapshots, existingById, existingRefs, providedRefs); - - // Cherry-picked snapshots should not be considered WAP/staged anymore - Set cherryPickedIds = - cherryPickedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); - wapSnapshots = - wapSnapshots.stream() - .filter(s -> !cherryPickedIds.contains(s.snapshotId())) - .collect(Collectors.toList()); - - List regularSnapshots = - categorizeRegularSnapshots(providedSnapshots, wapSnapshots, cherryPickedSnapshots); - - return new SnapshotCategories(wapSnapshots, cherryPickedSnapshots, regularSnapshots); - } - - /** Container for snapshot changes. */ - @lombok.Value - private static class SnapshotChanges { - List newSnapshots; - List existingRetainedSnapshots; - List deletedSnapshots; - } - - /** Identify which snapshots are new, retained, or deleted. */ - private SnapshotChanges identifySnapshotChanges( - List providedSnapshots, - List existingSnapshots, - Map providedById, - Map existingById) { - - List newSnapshots = - providedSnapshots.stream() - .filter(s -> !existingById.containsKey(s.snapshotId())) - .collect(Collectors.toList()); - - List existingRetainedSnapshots = - providedSnapshots.stream() - .filter(s -> existingById.containsKey(s.snapshotId())) - .collect(Collectors.toList()); - - List deletedSnapshots = - existingSnapshots.stream() - .filter(s -> !providedById.containsKey(s.snapshotId())) - .collect(Collectors.toList()); - - return new SnapshotChanges(newSnapshots, existingRetainedSnapshots, deletedSnapshots); - } - - /** Container for snapshot metrics. */ - @lombok.Value - private static class SnapshotMetrics { - int appendedCount; - int stagedCount; - int cherryPickedCount; - int deletedCount; - } - - /** Compute metrics based on categorized snapshots and changes. */ - private SnapshotMetrics computeSnapshotMetrics( - SnapshotCategories categories, SnapshotChanges changes, Map existingById) { - - int appendedCount = - (int) - categories.regularSnapshots.stream() - .filter(s -> !existingById.containsKey(s.snapshotId())) - .count(); - int stagedCount = categories.wapSnapshots.size(); - int cherryPickedCount = categories.cherryPickedSnapshots.size(); - int deletedCount = changes.deletedSnapshots.size(); - - return new SnapshotMetrics(appendedCount, stagedCount, cherryPickedCount, deletedCount); - } - - /** - * Categorize WAP (Write-Audit-Publish) snapshots. A snapshot is considered WAP/staged if it has - * the wap.id property AND is not on any branch in either the existing or provided metadata. This - * correctly handles: 1. Snapshots that were on branches in base - not WAP even if unreferenced in - * new metadata 2. Snapshots being published (staged -> branch) - not WAP as they're now on a - * branch - * - * @param snapshots List of provided snapshots - * @param existingRefs Existing snapshot refs from base metadata - * @param providedRefs Provided snapshot refs from new metadata - * @return List of WAP snapshots - */ - private List categorizeWapSnapshots( - List snapshots, - Map existingRefs, - Map providedRefs) { - // Get set of snapshot IDs that are/were on branches - Set branchSnapshotIds = new java.util.HashSet<>(); - branchSnapshotIds.addAll( - existingRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet())); - branchSnapshotIds.addAll( - providedRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet())); - - return snapshots.stream() - .filter( - s -> s.summary() != null && s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) - .filter(s -> !branchSnapshotIds.contains(s.snapshotId())) - .collect(Collectors.toList()); - } - - /** - * Categorize cherry-picked snapshots. A snapshot is cherry-picked if: 1. It exists in the current - * metadata but has a different parent than in the provided snapshots (indicating it was moved to - * a different branch), OR 2. It is referenced as the source of a cherry-pick by another - * snapshot's "source-snapshot-id", OR 3. It has wap.id AND was staged (not on a branch) in - * existing refs AND is now on a branch in provided refs (indicating it's being published) - */ - private List categorizeCherryPickedSnapshots( - List providedSnapshots, - Map existingById, - Map existingRefs, - Map providedRefs) { - - // Find snapshots that are sources of cherry-picks - Set cherryPickSourceIds = - providedSnapshots.stream() - .filter(s -> s.summary() != null && s.summary().containsKey("source-snapshot-id")) - .map(s -> Long.parseLong(s.summary().get("source-snapshot-id"))) - .collect(Collectors.toSet()); - - // Get snapshot IDs on branches - Set existingBranchSnapshotIds = - existingRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet()); - Set providedBranchSnapshotIds = - providedRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet()); - - return providedSnapshots.stream() - .filter( - provided -> { - Snapshot existing = existingById.get(provided.snapshotId()); - if (existing == null) { - return false; // New snapshot, not cherry-picked - } - // Check if parent changed (indicating cherry-pick to different branch) - Long providedParent = provided.parentId(); - Long existingParent = existing.parentId(); - boolean parentChanged = !Objects.equal(providedParent, existingParent); - - // Check if this snapshot is the source of a cherry-pick - boolean isCherryPickSource = cherryPickSourceIds.contains(provided.snapshotId()); - - // Check if this is a WAP snapshot being published (staged -> branch) - boolean hasWapId = - provided.summary() != null - && provided.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP); - boolean wasStaged = !existingBranchSnapshotIds.contains(provided.snapshotId()); - boolean isNowOnBranch = providedBranchSnapshotIds.contains(provided.snapshotId()); - boolean isBeingPublished = hasWapId && wasStaged && isNowOnBranch; - - return parentChanged || isCherryPickSource || isBeingPublished; - }) - .collect(Collectors.toList()); - } - - /** - * Categorize regular (appended) snapshots. Regular snapshots are those that are not WAP or - * cherry-picked. - */ - private List categorizeRegularSnapshots( - List allSnapshots, - List wapSnapshots, - List cherryPickedSnapshots) { - - Set wapIds = wapSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); - Set cherryPickedIds = - cherryPickedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); - - return allSnapshots.stream() - .filter(s -> !wapIds.contains(s.snapshotId()) && !cherryPickedIds.contains(s.snapshotId())) - .collect(Collectors.toList()); - } - - /** Compute branch updates by comparing provided and existing refs. */ - private Map computeBranchUpdates( - Map providedRefs, Map existingRefs) { - - return providedRefs.entrySet().stream() - .filter( - entry -> { - SnapshotRef existing = existingRefs.get(entry.getKey()); - return existing == null || existing.snapshotId() != entry.getValue().snapshotId(); - }) - .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); - } - - /** Stage 5: Validate entire operation. Throws exceptions for invalid operations. */ - private void validateOperation(SnapshotState state, TableMetadata base) { - // Validation 1: Current snapshot not deleted without replacements - validateCurrentSnapshotNotDeleted(state, base); - - // Validation 2: No ambiguous commits (multiple branches → same snapshot) - validateNoAmbiguousCommits(state); - - // Validation 3: Deleted snapshots not referenced by branches/tags - validateDeletedSnapshotsNotReferenced(state); - - // Validation 4: Individual snapshot validation using SnapshotInspector - validateIndividualSnapshots(state); - } - - /** - * Validate that current snapshot is not deleted without replacements. Package-private for - * testing. - */ - void validateCurrentSnapshotNotDeleted(SnapshotState state, TableMetadata base) { - if (base == null || base.currentSnapshot() == null) { - return; // No current snapshot to validate - } - - long currentSnapshotId = base.currentSnapshot().snapshotId(); - boolean currentDeleted = - state.getDeletedSnapshots().stream().anyMatch(s -> s.snapshotId() == currentSnapshotId); - - if (currentDeleted && state.getNewSnapshots().isEmpty()) { - throw new InvalidIcebergSnapshotException( - String.format( - "Cannot delete the current snapshot %s without adding replacement snapshots. " - + "Deleted: [%s], New: [%s]", - currentSnapshotId, - state.getDeletedSnapshots().stream() - .map(s -> Long.toString(s.snapshotId())) - .collect(Collectors.joining(", ")), - state.getNewSnapshots().stream() - .map(s -> Long.toString(s.snapshotId())) - .collect(Collectors.joining(", ")))); - } - } - - /** - * Validate no ambiguous commits (multiple branches pointing to same snapshot in one commit). - * Package-private for testing. - */ - void validateNoAmbiguousCommits(SnapshotState state) { - Map> snapshotToBranches = - state.getBranchUpdates().entrySet().stream() - .collect( - Collectors.groupingBy( - e -> e.getValue().snapshotId(), - Collectors.mapping(Map.Entry::getKey, Collectors.toList()))); - - snapshotToBranches.forEach( - (snapshotId, branches) -> { - if (branches.size() > 1) { - throw new InvalidIcebergSnapshotException( - String.format( - "Ambiguous commit: snapshot %s is referenced by multiple branches [%s] in a single commit. " - + "Each snapshot can only be referenced by one branch per commit.", - snapshotId, String.join(", ", branches))); - } - }); - } - - /** - * Validate that deleted snapshots are not referenced by any branches or tags. Package-private for - * testing. - */ - void validateDeletedSnapshotsNotReferenced(SnapshotState state) { - Set deletedIds = - state.getDeletedSnapshots().stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); - - Map> referencedIdsToRefs = - state.getProvidedRefs().entrySet().stream() - .collect( - Collectors.groupingBy( - e -> e.getValue().snapshotId(), - Collectors.mapping(Map.Entry::getKey, Collectors.toList()))); - - Map> invalidDeletes = - deletedIds.stream() - .filter(referencedIdsToRefs::containsKey) - .collect(Collectors.toMap(id -> id, referencedIdsToRefs::get)); - - if (!invalidDeletes.isEmpty()) { - String details = - invalidDeletes.entrySet().stream() - .map( - e -> - String.format( - "snapshot %s (referenced by: %s)", - e.getKey(), String.join(", ", e.getValue()))) - .collect(Collectors.joining("; ")); - throw new InvalidIcebergSnapshotException( - String.format( - "Cannot delete snapshots that are still referenced by branches/tags: %s", details)); - } - } - - /** - * Validate individual snapshots using existing SnapshotInspector. Package-private for testing. - */ - void validateIndividualSnapshots(SnapshotState state) { - state - .getNewSnapshots() - .forEach( - snapshot -> { - if (snapshotInspector != null) { - snapshotInspector.validateSnapshot(snapshot); - } - }); - } - - /** - * Stage 6: Apply state changes to create TableMetadata builder. Returns builder (not built) to - * allow metric properties to be added before the final build, preserving lastUpdatedMillis. - * - *

This method uses Iceberg's proper APIs: - removeSnapshots() to delete snapshots - - * addSnapshot() to add new snapshots - setBranchSnapshot() to set branch references - * - *

The order of operations matters: 1. Start with base metadata (buildFrom copies all existing - * state) 2. Remove deleted snapshots first (using proper removeSnapshots API) 3. Remove stale - * branch references 4. Add new snapshots and set branch pointers - * - * @return Builder with all snapshot changes applied but not yet built - */ - private TableMetadata.Builder applyStateChanges(TableMetadata metadata, SnapshotState state) { - TableMetadata.Builder builder = TableMetadata.buildFrom(metadata); - - // Step 1: Remove deleted snapshots using proper Iceberg API - if (!state.getDeletedSnapshots().isEmpty()) { - Set deletedIds = - state.getDeletedSnapshots().stream() - .map(Snapshot::snapshotId) - .collect(Collectors.toSet()); - builder.removeSnapshots(deletedIds); - } - - // Step 2: Remove stale branch references (branches that are no longer in provided refs) - Set providedRefNames = state.getProvidedRefs().keySet(); - metadata.refs().keySet().stream() - .filter(refName -> !providedRefNames.contains(refName)) - .forEach(builder::removeRef); - - // Step 3: Identify existing snapshots (after deletions) - Set existingSnapshotIds = - metadata.snapshots().stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); - Set deletedIds = - state.getDeletedSnapshots().stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); - existingSnapshotIds.removeAll(deletedIds); - - // Step 4: Identify snapshots referenced by branches - Set referencedByBranches = - state.getProvidedRefs().values().stream() - .map(SnapshotRef::snapshotId) - .collect(Collectors.toSet()); - - // Step 5: Add unreferenced new snapshots (referenced ones are added via setBranchSnapshot) - state.getProvidedSnapshots().stream() - .filter(s -> !existingSnapshotIds.contains(s.snapshotId())) - .filter(s -> !referencedByBranches.contains(s.snapshotId())) - .forEach(builder::addSnapshot); - - // Step 6: Set branch pointers for all provided refs - state - .getProvidedRefs() - .forEach( - (branchName, ref) -> { - Snapshot snapshot = - state.getProvidedSnapshots().stream() - .filter(s -> s.snapshotId() == ref.snapshotId()) - .findFirst() - .orElseThrow( - () -> - new InvalidIcebergSnapshotException( - String.format( - "Branch %s references non-existent snapshot %s", - branchName, ref.snapshotId()))); - - if (existingSnapshotIds.contains(snapshot.snapshotId())) { - // Snapshot already exists - just update the branch pointer if needed - SnapshotRef existingRef = metadata.refs().get(branchName); - if (existingRef == null || existingRef.snapshotId() != ref.snapshotId()) { - builder.setRef(branchName, ref); - } - } else { - // Snapshot is new - setBranchSnapshot will add it and set the branch pointer - builder.setBranchSnapshot(snapshot, branchName); - } - }); - - return builder; - } - - /** - * Stage 7: Add metric properties to builder. Returns the builder for final build in - * applySnapshots. This allows the single build to preserve lastUpdatedMillis from snapshot - * operations. - * - * @param builder Builder with snapshot changes already applied - * @param state Snapshot state containing metrics to record - * @return Builder with metric properties added, ready to be built - */ - private TableMetadata.Builder recordMetrics(TableMetadata.Builder builder, SnapshotState state) { - // Emit metrics to reporter - if (state.getAppendedCount() > 0) { - metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR, state.getAppendedCount()); - } - if (state.getStagedCount() > 0) { - metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR, state.getStagedCount()); - } - if (state.getCherryPickedCount() > 0) { - metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR, state.getCherryPickedCount()); - } - if (state.getDeletedCount() > 0) { - metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR, state.getDeletedCount()); - } - - // Helper to format snapshot IDs as comma-separated string - java.util.function.Function, String> formatIds = - snapshots -> - snapshots.stream() - .map(s -> Long.toString(s.snapshotId())) - .collect(Collectors.joining(",")); - - // Record categorization metrics as comma-separated snapshot IDs - if (!state.getRegularSnapshots().isEmpty()) { - List newRegularSnapshots = - state.getRegularSnapshots().stream() - .filter(s -> state.getNewSnapshots().contains(s)) - .collect(Collectors.toList()); - if (!newRegularSnapshots.isEmpty()) { - builder.setProperties( - Collections.singletonMap( - getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), - formatIds.apply(newRegularSnapshots))); - } - } - if (!state.getWapSnapshots().isEmpty()) { - builder.setProperties( - Collections.singletonMap( - getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS), - formatIds.apply(state.getWapSnapshots()))); - } - if (!state.getCherryPickedSnapshots().isEmpty()) { - builder.setProperties( - Collections.singletonMap( - getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), - formatIds.apply(state.getCherryPickedSnapshots()))); - } - if (!state.getDeletedSnapshots().isEmpty()) { - builder.setProperties( - Collections.singletonMap( - getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS), - formatIds.apply(state.getDeletedSnapshots()))); - } - - // Remove the transient snapshot keys from properties - builder.removeProperties( - Sets.newHashSet(CatalogConstants.SNAPSHOTS_JSON_KEY, CatalogConstants.SNAPSHOTS_REFS_KEY)); - - return builder; - } - - // ==================== End Functional Snapshot Application Pipeline ==================== - /** * If this commit comes from Iceberg built-in retry in * org.apache.iceberg.PropertiesUpdate#commit() Then throw fatal {@link CommitFailedException} to diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java new file mode 100644 index 000000000..347cf4f7d --- /dev/null +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java @@ -0,0 +1,468 @@ +package com.linkedin.openhouse.internal.catalog; + +import static com.linkedin.openhouse.internal.catalog.mapper.HouseTableSerdeUtils.getCanonicalFieldName; + +import com.google.common.collect.Sets; +import com.linkedin.openhouse.cluster.metrics.micrometer.MetricsReporter; +import com.linkedin.openhouse.internal.catalog.exception.InvalidIcebergSnapshotException; +import java.util.Collections; +import java.util.List; +import java.util.Map; +import java.util.Optional; +import java.util.Set; +import java.util.stream.Collectors; +import lombok.AllArgsConstructor; +import lombok.extern.slf4j.Slf4j; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.SnapshotSummary; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.relocated.com.google.common.base.Objects; + +/** + * Service responsible for applying snapshot changes to Iceberg table metadata. + * + *

This class handles the complex logic of computing snapshot diffs, validating changes, and + * applying them to table metadata. It supports various snapshot operations including: + * + *

    + *
  • Adding new snapshots (regular commits) + *
  • Staging snapshots (WAP - Write-Audit-Publish) + *
  • Cherry-picking snapshots across branches + *
  • Deleting snapshots + *
  • Updating branch references + *
+ * + *

The service performs comprehensive validation to ensure data integrity and prevent invalid + * operations such as deleting referenced snapshots or creating ambiguous branch references. + */ +@AllArgsConstructor +@Slf4j +public class SnapshotDiffApplier { + + private final MetricsReporter metricsReporter; + + /** + * Applies snapshot updates from metadata properties. Simple and clear: parse input, compute diff, + * validate, apply, record metrics, build. + * + * @param base The base table metadata (may be null for table creation) + * @param metadata The new metadata with properties containing snapshot updates + * @return Updated metadata with snapshots applied + */ + public TableMetadata applySnapshots(TableMetadata base, TableMetadata metadata) { + String snapshotsJson = metadata.properties().get(CatalogConstants.SNAPSHOTS_JSON_KEY); + if (snapshotsJson == null) { + return metadata; + } + + // Parse input + List providedSnapshots = SnapshotsUtil.parseSnapshots(null, snapshotsJson); + Map providedRefs = + Optional.ofNullable(metadata.properties().get(CatalogConstants.SNAPSHOTS_REFS_KEY)) + .map(SnapshotsUtil::parseSnapshotRefs) + .orElse(Collections.emptyMap()); + + List existingSnapshots = base != null ? base.snapshots() : Collections.emptyList(); + Map existingRefs = base != null ? base.refs() : Collections.emptyMap(); + + // Compute diff (all maps created once in constructor) + SnapshotDiff diff = + new SnapshotDiff(providedSnapshots, providedRefs, existingSnapshots, existingRefs); + + // Validate, apply, record metrics, build + diff.validate(base); + TableMetadata.Builder builder = diff.applyTo(metadata); + diff.recordMetrics(builder); + return builder.build(); + } + + /** + * State object that computes and caches all snapshot analysis. Computes all maps once in the + * constructor to avoid redundant operations. Provides clear methods for validation and + * application. + */ + private class SnapshotDiff { + // Input state + private final List providedSnapshots; + private final Map providedRefs; + private final List existingSnapshots; + private final Map existingRefs; + + // Computed maps (created once) + private final Map providedById; + private final Map existingById; + private final Set existingBranchIds; + private final Set providedBranchIds; + + // Categorized snapshots + private final List wapSnapshots; + private final List cherryPickedSnapshots; + private final List regularSnapshots; + + // Changes + private final List newSnapshots; + private final List deletedSnapshots; + private final Map branchUpdates; + + SnapshotDiff( + List providedSnapshots, + Map providedRefs, + List existingSnapshots, + Map existingRefs) { + this.providedSnapshots = providedSnapshots; + this.providedRefs = providedRefs; + this.existingSnapshots = existingSnapshots; + this.existingRefs = existingRefs; + + // Compute all maps once + this.providedById = + providedSnapshots.stream().collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); + this.existingById = + existingSnapshots.stream().collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); + this.existingBranchIds = + existingRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet()); + this.providedBranchIds = + providedRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet()); + + // Compute categorization (order matters: cherry-picked filters WAP) + List initialWapSnapshots = computeWapSnapshots(); + this.cherryPickedSnapshots = computeCherryPickedSnapshots(); + this.wapSnapshots = filterWapFromCherryPicked(initialWapSnapshots); + this.regularSnapshots = computeRegularSnapshots(); + + // Compute changes + this.newSnapshots = + providedSnapshots.stream() + .filter(s -> !existingById.containsKey(s.snapshotId())) + .collect(Collectors.toList()); + this.deletedSnapshots = + existingSnapshots.stream() + .filter(s -> !providedById.containsKey(s.snapshotId())) + .collect(Collectors.toList()); + this.branchUpdates = computeBranchUpdates(); + } + + private List computeWapSnapshots() { + Set allBranchIds = + java.util.stream.Stream.concat(existingBranchIds.stream(), providedBranchIds.stream()) + .collect(Collectors.toSet()); + + return providedSnapshots.stream() + .filter( + s -> + s.summary() != null + && s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP) + && !allBranchIds.contains(s.snapshotId())) + .collect(Collectors.toList()); + } + + private List computeCherryPickedSnapshots() { + Set cherryPickSourceIds = + providedSnapshots.stream() + .filter(s -> s.summary() != null && s.summary().containsKey("source-snapshot-id")) + .map(s -> Long.parseLong(s.summary().get("source-snapshot-id"))) + .collect(Collectors.toSet()); + + return providedSnapshots.stream() + .filter( + provided -> { + Snapshot existing = existingById.get(provided.snapshotId()); + if (existing == null) { + return false; + } + + // Parent changed (moved to different branch) + if (!Objects.equal(provided.parentId(), existing.parentId())) { + return true; + } + + // Is source of cherry-pick + if (cherryPickSourceIds.contains(provided.snapshotId())) { + return true; + } + + // WAP snapshot being published (staged → branch) + boolean hasWapId = + provided.summary() != null + && provided.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP); + boolean wasStaged = !existingBranchIds.contains(provided.snapshotId()); + boolean isNowOnBranch = providedBranchIds.contains(provided.snapshotId()); + return hasWapId && wasStaged && isNowOnBranch; + }) + .collect(Collectors.toList()); + } + + private List filterWapFromCherryPicked(List initialWapSnapshots) { + Set cherryPickedIds = + cherryPickedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + return initialWapSnapshots.stream() + .filter(s -> !cherryPickedIds.contains(s.snapshotId())) + .collect(Collectors.toList()); + } + + private List computeRegularSnapshots() { + Set excludedIds = + java.util.stream.Stream.concat( + wapSnapshots.stream().map(Snapshot::snapshotId), + cherryPickedSnapshots.stream().map(Snapshot::snapshotId)) + .collect(Collectors.toSet()); + + return providedSnapshots.stream() + .filter(s -> !excludedIds.contains(s.snapshotId())) + .collect(Collectors.toList()); + } + + private Map computeBranchUpdates() { + return providedRefs.entrySet().stream() + .filter( + entry -> { + SnapshotRef existing = existingRefs.get(entry.getKey()); + return existing == null || existing.snapshotId() != entry.getValue().snapshotId(); + }) + .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); + } + + /** + * Validates all snapshot changes before applying them to table metadata. Runs multiple + * validation checks to ensure snapshot operations are safe and consistent. + * + * @param base The base table metadata to validate against (may be null for table creation) + * @throws InvalidIcebergSnapshotException if any validation check fails + */ + void validate(TableMetadata base) { + validateCurrentSnapshotNotDeleted(base); + validateNoAmbiguousCommits(); + validateDeletedSnapshotsNotReferenced(); + } + + /** + * Validates that the current snapshot is not deleted without providing replacement snapshots. + * This prevents leaving the table in an inconsistent state where the current snapshot pointer + * would reference a non-existent snapshot. + * + * @param base The base table metadata containing the current snapshot (may be null for table + * creation) + * @throws InvalidIcebergSnapshotException if the current snapshot is being deleted without + * replacements + */ + private void validateCurrentSnapshotNotDeleted(TableMetadata base) { + if (base == null || base.currentSnapshot() == null) { + return; + } + + long currentSnapshotId = base.currentSnapshot().snapshotId(); + boolean currentDeleted = + deletedSnapshots.stream().anyMatch(s -> s.snapshotId() == currentSnapshotId); + + if (currentDeleted && newSnapshots.isEmpty()) { + throw new InvalidIcebergSnapshotException( + String.format( + "Cannot delete the current snapshot %s without adding replacement snapshots. " + + "Deleted: [%s], New: [%s]", + currentSnapshotId, + deletedSnapshots.stream() + .map(s -> Long.toString(s.snapshotId())) + .collect(Collectors.joining(", ")), + newSnapshots.stream() + .map(s -> Long.toString(s.snapshotId())) + .collect(Collectors.joining(", ")))); + } + } + + /** + * Validates that no single snapshot is referenced by multiple branches in the same commit. This + * prevents ambiguous commits where it's unclear which branch should be the primary reference + * for a snapshot. Each snapshot can only be associated with one branch per commit to maintain + * clear lineage and avoid conflicts. + * + * @throws InvalidIcebergSnapshotException if a snapshot is referenced by multiple branches + */ + private void validateNoAmbiguousCommits() { + Map> snapshotToBranches = + branchUpdates.entrySet().stream() + .collect( + Collectors.groupingBy( + e -> e.getValue().snapshotId(), + Collectors.mapping(Map.Entry::getKey, Collectors.toList()))); + + snapshotToBranches.forEach( + (snapshotId, branches) -> { + if (branches.size() > 1) { + throw new InvalidIcebergSnapshotException( + String.format( + "Ambiguous commit: snapshot %s is referenced by multiple branches [%s] in a single commit. " + + "Each snapshot can only be referenced by one branch per commit.", + snapshotId, String.join(", ", branches))); + } + }); + } + + /** + * Validates that snapshots being deleted are not still referenced by any branches or tags. This + * prevents data loss and maintains referential integrity by ensuring that all branch and tag + * pointers reference valid snapshots that will continue to exist after the commit. + * + * @throws InvalidIcebergSnapshotException if any deleted snapshot is still referenced by a + * branch or tag + */ + private void validateDeletedSnapshotsNotReferenced() { + Set deletedIds = + deletedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + + Map> referencedIdsToRefs = + providedRefs.entrySet().stream() + .collect( + Collectors.groupingBy( + e -> e.getValue().snapshotId(), + Collectors.mapping(Map.Entry::getKey, Collectors.toList()))); + + List invalidDeleteDetails = + deletedIds.stream() + .filter(referencedIdsToRefs::containsKey) + .map( + id -> + String.format( + "snapshot %s (referenced by: %s)", + id, String.join(", ", referencedIdsToRefs.get(id)))) + .collect(Collectors.toList()); + + if (!invalidDeleteDetails.isEmpty()) { + throw new InvalidIcebergSnapshotException( + String.format( + "Cannot delete snapshots that are still referenced by branches/tags: %s", + String.join("; ", invalidDeleteDetails))); + } + } + + TableMetadata.Builder applyTo(TableMetadata metadata) { + TableMetadata.Builder builder = TableMetadata.buildFrom(metadata); + + // Remove deleted snapshots + if (!deletedSnapshots.isEmpty()) { + Set deletedIds = + deletedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + builder.removeSnapshots(deletedIds); + } + + // Remove stale branch references + metadata.refs().keySet().stream() + .filter(refName -> !providedRefs.containsKey(refName)) + .forEach(builder::removeRef); + + // Track existing snapshot IDs after deletions + Set existingAfterDeletion = + metadata.snapshots().stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + deletedSnapshots.forEach(s -> existingAfterDeletion.remove(s.snapshotId())); + + // Add unreferenced new snapshots + providedSnapshots.stream() + .filter( + s -> + !existingAfterDeletion.contains(s.snapshotId()) + && !providedBranchIds.contains(s.snapshotId())) + .forEach(builder::addSnapshot); + + // Set branch pointers + providedRefs.forEach( + (branchName, ref) -> { + Snapshot snapshot = providedById.get(ref.snapshotId()); + if (snapshot == null) { + throw new InvalidIcebergSnapshotException( + String.format( + "Branch %s references non-existent snapshot %s", + branchName, ref.snapshotId())); + } + + if (existingAfterDeletion.contains(snapshot.snapshotId())) { + SnapshotRef existingRef = metadata.refs().get(branchName); + if (existingRef == null || existingRef.snapshotId() != ref.snapshotId()) { + builder.setRef(branchName, ref); + } + } else { + builder.setBranchSnapshot(snapshot, branchName); + } + }); + + return builder; + } + + void recordMetrics(TableMetadata.Builder builder) { + int appendedCount = + (int) + regularSnapshots.stream() + .filter(s -> !existingById.containsKey(s.snapshotId())) + .count(); + int stagedCount = wapSnapshots.size(); + int cherryPickedCount = cherryPickedSnapshots.size(); + int deletedCount = deletedSnapshots.size(); + + if (appendedCount > 0) { + metricsReporter.count(InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR, appendedCount); + } + if (stagedCount > 0) { + metricsReporter.count(InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR, stagedCount); + } + if (cherryPickedCount > 0) { + metricsReporter.count( + InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR, cherryPickedCount); + } + if (deletedCount > 0) { + metricsReporter.count(InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR, deletedCount); + } + + // Record snapshot IDs in properties + List newRegularSnapshots = + regularSnapshots.stream().filter(newSnapshots::contains).collect(Collectors.toList()); + if (!newRegularSnapshots.isEmpty()) { + builder.setProperties( + Collections.singletonMap( + getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), + formatSnapshotIds(newRegularSnapshots))); + } + if (!wapSnapshots.isEmpty()) { + builder.setProperties( + Collections.singletonMap( + getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS), + formatSnapshotIds(wapSnapshots))); + } + if (!cherryPickedSnapshots.isEmpty()) { + builder.setProperties( + Collections.singletonMap( + getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), + formatSnapshotIds(cherryPickedSnapshots))); + } + if (!deletedSnapshots.isEmpty()) { + builder.setProperties( + Collections.singletonMap( + getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS), + formatSnapshotIds(deletedSnapshots))); + } + + builder.removeProperties( + Sets.newHashSet( + CatalogConstants.SNAPSHOTS_JSON_KEY, CatalogConstants.SNAPSHOTS_REFS_KEY)); + } + } + + /** + * Formats a list of snapshots as a comma-separated string of snapshot IDs. Optimized + * implementation using StringBuilder for better performance with large lists. + * + * @param snapshots List of snapshots to format + * @return Comma-separated string of snapshot IDs, or empty string if list is empty + */ + private String formatSnapshotIds(List snapshots) { + if (snapshots.isEmpty()) { + return ""; + } + StringBuilder sb = new StringBuilder(); + for (int i = 0; i < snapshots.size(); i++) { + if (i > 0) { + sb.append(','); + } + sb.append(snapshots.get(i).snapshotId()); + } + return sb.toString(); + } +} diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotInspector.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotInspector.java deleted file mode 100644 index dc7dd06c2..000000000 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotInspector.java +++ /dev/null @@ -1,96 +0,0 @@ -package com.linkedin.openhouse.internal.catalog; - -import com.linkedin.openhouse.internal.catalog.exception.InvalidIcebergSnapshotException; -import java.io.UncheckedIOException; -import java.util.List; -import java.util.function.Consumer; -import java.util.function.Supplier; -import java.util.stream.StreamSupport; -import org.apache.hadoop.fs.Path; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DeleteFile; -import org.apache.iceberg.ManifestFile; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.TableMetadata; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.annotations.VisibleForTesting; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.stereotype.Component; - -/** - * A inspector class providing functionalities that inspect components of {@link Snapshot} provided - * by clients and decide if OpenHouse need to take additional steps to incorporate it or decide - * whether to incorporate at all. - * - *

Instance of this class will be injected into {@link OpenHouseInternalTableOperations} in - * runtime. - */ -@Component -public class SnapshotInspector { - @Autowired private Consumer> fileSecurer; - /** - * TODO: ADD Validation for snapshot: Sequence-number based, schema-id based, see iceberg spec for - * details. Throwing exceptions when failures occurred. - * - * @param providedSnapshot deserialized {@link Snapshot} object that clients provided. - * @throws InvalidIcebergSnapshotException Exception thrown from the process validating the - * snapshot provided by client. - */ - void validateSnapshot(Snapshot providedSnapshot) throws InvalidIcebergSnapshotException { - // TODO: Fill this method. - } - - void validateSnapshotsUpdate( - TableMetadata metadata, List addedSnapshots, List deletedSnapshots) { - if (metadata.currentSnapshot() == null) { - // no need to verify attempt to delete current snapshot if it doesn't exist - // deletedSnapshots is necessarily empty when original snapshots list is empty - return; - } - if (!addedSnapshots.isEmpty()) { - // latest snapshot can be deleted if new snapshots are added. - return; - } - long latestSnapshotId = metadata.currentSnapshot().snapshotId(); - if (!deletedSnapshots.isEmpty() - && deletedSnapshots.get(deletedSnapshots.size() - 1).snapshotId() == latestSnapshotId) { - throw new InvalidIcebergSnapshotException( - String.format("Cannot delete the latest snapshot %s", latestSnapshotId)); - } - } - - /** - * A sister method to {@link #validateSnapshot(Snapshot)} that change the file-level permission to - * be OpenHouse exclusive to avoid unexpected changes from unauthorized parties. Throwing - * exceptions when failures occurred. - * - * @param providedSnapshot deserialized {@link Snapshot} object that clients provided. - * @param fileIO {@link FileIO} object - * @throws UncheckedIOException Exception thrown from the process securing the files associated - * with {@param providedSnapshot}. - */ - @VisibleForTesting - void secureSnapshot(Snapshot providedSnapshot, FileIO fileIO) throws UncheckedIOException { - secureDataFile(providedSnapshot.addedDataFiles(fileIO)); - secureDeleteFile(providedSnapshot.addedDeleteFiles(fileIO)); - secureManifestFile(providedSnapshot.allManifests(fileIO)); - } - - private void secureDataFile(Iterable dataFiles) { - StreamSupport.stream(dataFiles.spliterator(), false) - .map(x -> (Supplier) (() -> new Path(x.path().toString()))) - .forEach(fileSecurer); - } - - private void secureDeleteFile(Iterable deleteFiles) { - StreamSupport.stream(deleteFiles.spliterator(), false) - .map(x -> (Supplier) (() -> new Path(x.path().toString()))) - .forEach(fileSecurer); - } - - private void secureManifestFile(List manifestFiles) throws UncheckedIOException { - manifestFiles.stream() - .map(x -> (Supplier) (() -> new Path(x.path()))) - .forEach(fileSecurer); - } -} diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java index f514ed162..c5c186eb5 100644 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java @@ -103,26 +103,31 @@ void setup() { Mockito.when(mockHouseTableMapper.toHouseTable(Mockito.any(TableMetadata.class), Mockito.any())) .thenReturn(mockHouseTable); HadoopFileIO fileIO = new HadoopFileIO(new Configuration()); + MetricsReporter metricsReporter = + new MetricsReporter(new SimpleMeterRegistry(), "TEST_CATALOG", Lists.newArrayList()); + SnapshotDiffApplier snapshotDiffApplier = new SnapshotDiffApplier(metricsReporter); openHouseInternalTableOperations = new OpenHouseInternalTableOperations( mockHouseTableRepository, fileIO, - Mockito.mock(SnapshotInspector.class), mockHouseTableMapper, TEST_TABLE_IDENTIFIER, - new MetricsReporter(new SimpleMeterRegistry(), "TEST_CATALOG", Lists.newArrayList()), - fileIOManager); + metricsReporter, + fileIOManager, + snapshotDiffApplier); // Create a separate instance with mock metrics reporter for testing metrics + SnapshotDiffApplier snapshotDiffApplierWithMockMetrics = + new SnapshotDiffApplier(mockMetricsReporter); openHouseInternalTableOperationsWithMockMetrics = new OpenHouseInternalTableOperations( mockHouseTableRepository, fileIO, - Mockito.mock(SnapshotInspector.class), mockHouseTableMapper, TEST_TABLE_IDENTIFIER, mockMetricsReporter, - fileIOManager); + fileIOManager, + snapshotDiffApplierWithMockMetrics); LocalStorage localStorage = mock(LocalStorage.class); when(fileIOManager.getStorage(fileIO)).thenReturn(localStorage); @@ -677,7 +682,9 @@ void testAppendSnapshotsWithOldSnapshots() throws IOException { Assertions.assertThrows( IllegalArgumentException.class, - () -> openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadata)); + () -> + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata)); // the latest snapshots have larger timestamp than the previous metadata timestamp, so it should // pass the validation @@ -691,7 +698,8 @@ void testAppendSnapshotsWithOldSnapshots() throws IOException { IcebergTestUtil.obtainSnapshotRefsFromSnapshot(snapshots.get(snapshots.size() - 1)))); TableMetadata newMetadataWithFuture = baseMetadata.replaceProperties(propertiesWithFuture); - openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadataWithFuture); + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadataWithFuture); } /** @@ -1136,17 +1144,19 @@ private void testMetricIncludesDatabaseTag( SimpleMeterRegistry meterRegistry = new SimpleMeterRegistry(); MetricsReporter realMetricsReporter = new MetricsReporter(meterRegistry, "TEST_CATALOG", Lists.newArrayList()); + HadoopFileIO fileIO = new HadoopFileIO(new Configuration()); + SnapshotDiffApplier snapshotDiffApplier = new SnapshotDiffApplier(realMetricsReporter); // Create instance with real metrics reporter OpenHouseInternalTableOperations operationsWithRealMetrics = new OpenHouseInternalTableOperations( mockHouseTableRepository, - new HadoopFileIO(new Configuration()), - Mockito.mock(SnapshotInspector.class), + fileIO, mockHouseTableMapper, TEST_TABLE_IDENTIFIER, realMetricsReporter, - fileIOManager); + fileIOManager, + snapshotDiffApplier); // Setup test-specific mocks setupFunction.accept(operationsWithRealMetrics); @@ -1199,17 +1209,19 @@ private void testMetricHasHistogramBuckets( MetricsReporter realMetricsReporter = new MetricsReporter(meterRegistry, "TEST_CATALOG", Lists.newArrayList()); + HadoopFileIO fileIO = new HadoopFileIO(new Configuration()); + SnapshotDiffApplier snapshotDiffApplier = new SnapshotDiffApplier(realMetricsReporter); // Create instance with real metrics reporter OpenHouseInternalTableOperations operationsWithRealMetrics = new OpenHouseInternalTableOperations( mockHouseTableRepository, - new HadoopFileIO(new Configuration()), - Mockito.mock(SnapshotInspector.class), + fileIO, mockHouseTableMapper, TEST_TABLE_IDENTIFIER, realMetricsReporter, - fileIOManager); + fileIOManager, + snapshotDiffApplier); // Setup test-specific mocks setupFunction.accept(operationsWithRealMetrics); @@ -1380,7 +1392,9 @@ void testDeleteSnapshotWithMainReference() throws IOException { InvalidIcebergSnapshotException exception = Assertions.assertThrows( InvalidIcebergSnapshotException.class, - () -> openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadata), + () -> + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata), "Should throw InvalidIcebergSnapshotException when trying to delete referenced snapshot"); // Verify error message mentions the reference @@ -1427,7 +1441,8 @@ void testDeleteSnapshotWithNoReference() throws IOException { TableMetadata newMetadata = baseMetadata.replaceProperties(properties); TableMetadata result = - openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadata); + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata); // Verify unreferenced snapshots were removed List unreferencedSnapshots = testSnapshots.subList(0, 2); @@ -1508,7 +1523,9 @@ void testDeleteSnapshotWithMultipleReference() throws IOException { InvalidIcebergSnapshotException exception = Assertions.assertThrows( InvalidIcebergSnapshotException.class, - () -> openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadata), + () -> + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata), "Should throw InvalidIcebergSnapshotException when trying to delete snapshot referenced by multiple branches"); // Verify error message mentions the snapshot is still referenced @@ -1571,7 +1588,9 @@ void testDeleteSnapshotWithBranchReference() throws IOException { InvalidIcebergSnapshotException exception = Assertions.assertThrows( InvalidIcebergSnapshotException.class, - () -> openHouseInternalTableOperations.applySnapshots(finalBaseMetadata, newMetadata), + () -> + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + finalBaseMetadata, newMetadata), "Should throw InvalidIcebergSnapshotException when trying to delete snapshot referenced by tag"); // Verify error message mentions tag reference @@ -1616,7 +1635,8 @@ void testDeleteEmptySnapshotList() throws IOException { TableMetadata newMetadata = baseMetadata.replaceProperties(properties); TableMetadata result = - openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadata); + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata); // Verify no changes were made Assertions.assertEquals( @@ -1661,7 +1681,8 @@ void testDeleteNullSnapshotList() throws IOException { TableMetadata newMetadata = baseMetadata.replaceProperties(properties); TableMetadata result = - openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadata); + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata); // Verify no changes were made Assertions.assertEquals( @@ -1710,7 +1731,8 @@ void testDeleteNonExistentSnapshot() throws IOException { TableMetadata newMetadata = baseMetadata.replaceProperties(properties); TableMetadata result = - openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadata); + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata); // Verify original snapshots are unchanged Assertions.assertEquals( @@ -1755,7 +1777,8 @@ void testDeleteSnapshotMetricsRecorded() throws IOException { TableMetadata newMetadata = finalBaseMetadata.replaceProperties(properties); // Use the operations instance with mock metrics reporter - openHouseInternalTableOperationsWithMockMetrics.applySnapshots(finalBaseMetadata, newMetadata); + openHouseInternalTableOperationsWithMockMetrics.snapshotDiffApplier.applySnapshots( + finalBaseMetadata, newMetadata); // Verify metrics were recorded Mockito.verify(mockMetricsReporter) @@ -1796,7 +1819,8 @@ void testDeleteSnapshotMetricsRecordedBranch() throws IOException { TableMetadata newMetadata = baseMetadata.replaceProperties(properties); // Use the operations instance with mock metrics reporter - openHouseInternalTableOperationsWithMockMetrics.applySnapshots(baseMetadata, newMetadata); + openHouseInternalTableOperationsWithMockMetrics.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata); // Verify metrics were recorded for the basic deletion Mockito.verify(mockMetricsReporter) @@ -1842,7 +1866,8 @@ void testDeleteSnapshotMetricsRecordedNonExistent() throws IOException { TableMetadata newMetadata = finalBaseMetadata.replaceProperties(properties); // Use the operations instance with mock metrics reporter - openHouseInternalTableOperationsWithMockMetrics.applySnapshots(finalBaseMetadata, newMetadata); + openHouseInternalTableOperationsWithMockMetrics.snapshotDiffApplier.applySnapshots( + finalBaseMetadata, newMetadata); // Verify metrics are not recorded for non-existent snapshots (no actual deletion) Mockito.verify(mockMetricsReporter, Mockito.never()) @@ -1897,7 +1922,9 @@ void testDeleteAllSnapshotsFailsWhenMainBranchReferenced() throws IOException { InvalidIcebergSnapshotException exception = Assertions.assertThrows( InvalidIcebergSnapshotException.class, - () -> openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadata), + () -> + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata), "Should throw InvalidIcebergSnapshotException when trying to delete snapshot referenced by multiple branches"); // Verify error message mentions the snapshot is still referenced @@ -1941,7 +1968,9 @@ void testDeleteAllUnreferencedSnapshotsSucceeds() throws IOException { // This should succeed since no snapshots are referenced by any branch/tag TableMetadata result = Assertions.assertDoesNotThrow( - () -> openHouseInternalTableOperations.applySnapshots(finalBaseMetadata, newMetadata), + () -> + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + finalBaseMetadata, newMetadata), "Should succeed when deleting all unreferenced snapshots"); // Verify all snapshots were removed from the metadata @@ -2003,7 +2032,9 @@ void testValidMultipleBranchesWithDifferentSnapshots() throws IOException { // This should NOT throw an exception Assertions.assertDoesNotThrow( - () -> openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadata), + () -> + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata), "Should NOT throw exception when branches target different snapshots"); } @@ -2051,7 +2082,9 @@ void testStandardWAPScenario() throws IOException { // Should succeed - standard WAP workflow where WAP snapshot becomes the new main Assertions.assertDoesNotThrow( - () -> openHouseInternalTableOperations.applySnapshots(baseMetadata, newMetadata), + () -> + openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( + baseMetadata, newMetadata), "Should successfully pull WAP snapshot into main branch"); } diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java new file mode 100644 index 000000000..4fa913b4d --- /dev/null +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java @@ -0,0 +1,359 @@ +package com.linkedin.openhouse.internal.catalog; + +import static com.linkedin.openhouse.internal.catalog.mapper.HouseTableSerdeUtils.getCanonicalFieldName; +import static org.junit.jupiter.api.Assertions.*; +import static org.mockito.Mockito.*; + +import com.linkedin.openhouse.cluster.metrics.micrometer.MetricsReporter; +import com.linkedin.openhouse.internal.catalog.exception.InvalidIcebergSnapshotException; +import java.io.IOException; +import java.nio.file.Files; +import java.util.ArrayList; +import java.util.Collections; +import java.util.HashMap; +import java.util.List; +import java.util.Map; +import java.util.UUID; +import lombok.SneakyThrows; +import org.apache.iceberg.PartitionSpec; +import org.apache.iceberg.Schema; +import org.apache.iceberg.Snapshot; +import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.SortOrder; +import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.types.Types; +import org.junit.jupiter.api.BeforeEach; +import org.junit.jupiter.api.Test; +import org.mockito.Mockito; + +public class SnapshotDiffApplierTest { + + private SnapshotDiffApplier snapshotDiffApplier; + private MetricsReporter mockMetricsReporter; + private TableMetadata baseMetadata; + private static final String TEST_TABLE_LOCATION = getTempLocation(); + + @SneakyThrows + private static String getTempLocation() { + return Files.createTempDirectory(UUID.randomUUID().toString()).toString(); + } + + @BeforeEach + void setup() { + mockMetricsReporter = Mockito.mock(MetricsReporter.class); + snapshotDiffApplier = new SnapshotDiffApplier(mockMetricsReporter); + + Schema schema = + new Schema( + Types.NestedField.required(1, "id", Types.LongType.get()), + Types.NestedField.optional(2, "data", Types.StringType.get())); + + baseMetadata = + TableMetadata.newTableMetadata( + schema, + PartitionSpec.unpartitioned(), + SortOrder.unsorted(), + TEST_TABLE_LOCATION, + new HashMap<>()); + } + + @Test + void testApplySnapshots_noSnapshotsJson_returnsUnmodified() { + TableMetadata result = snapshotDiffApplier.applySnapshots(null, baseMetadata); + + assertEquals(baseMetadata, result); + verifyNoInteractions(mockMetricsReporter); + } + + @Test + void testApplySnapshots_nullBase_handlesTableCreation() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.obtainSnapshotRefsFromSnapshot(snapshots.get(snapshots.size() - 1)))); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + TableMetadata result = snapshotDiffApplier.applySnapshots(null, newMetadata); + + assertNotNull(result); + assertEquals(snapshots.size(), result.snapshots().size()); + } + + @Test + void testApplySnapshots_addNewSnapshots_success() throws IOException { + List initialSnapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, initialSnapshots); + + List allSnapshots = new ArrayList<>(initialSnapshots); + allSnapshots.addAll(IcebergTestUtil.getExtraSnapshots()); + + Map properties = new HashMap<>(baseWithSnapshots.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(allSnapshots)); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.obtainSnapshotRefsFromSnapshot( + allSnapshots.get(allSnapshots.size() - 1)))); + + TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); + + assertNotNull(result); + assertTrue(result.snapshots().size() > baseWithSnapshots.snapshots().size()); + + verify(mockMetricsReporter, atLeastOnce()).count(anyString(), anyDouble()); + } + + @Test + void testValidateCurrentSnapshotNotDeleted_whenCurrentDeleted_throwsException() + throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); + + Map properties = new HashMap<>(baseWithSnapshots.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, + SnapshotsUtil.serializedSnapshots(Collections.emptyList())); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(new HashMap<>())); + + TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); + + InvalidIcebergSnapshotException exception = + assertThrows( + InvalidIcebergSnapshotException.class, + () -> snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata)); + + assertTrue(exception.getMessage().contains("Cannot delete the current snapshot")); + } + + @Test + void testValidateNoAmbiguousCommits_whenSnapshotReferencedByMultipleBranches_throwsException() + throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); + + Snapshot targetSnapshot = snapshots.get(0); + + Map snapshotRefs = new HashMap<>(); + SnapshotRef ref = SnapshotRef.branchBuilder(targetSnapshot.snapshotId()).build(); + snapshotRefs.put("branch1", org.apache.iceberg.SnapshotRefParser.toJson(ref)); + snapshotRefs.put("branch2", org.apache.iceberg.SnapshotRefParser.toJson(ref)); + + Map properties = new HashMap<>(baseWithSnapshots.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(snapshotRefs)); + + TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); + + InvalidIcebergSnapshotException exception = + assertThrows( + InvalidIcebergSnapshotException.class, + () -> snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata)); + + assertTrue(exception.getMessage().contains("Ambiguous commit")); + assertTrue(exception.getMessage().contains("referenced by multiple branches")); + } + + @Test + void + testValidateDeletedSnapshotsNotReferenced_whenDeletedSnapshotStillReferenced_throwsException() + throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); + + Snapshot snapshotToDelete = snapshots.get(0); + List remainingSnapshots = snapshots.subList(1, snapshots.size()); + + Map snapshotRefs = new HashMap<>(); + SnapshotRef ref = SnapshotRef.branchBuilder(snapshotToDelete.snapshotId()).build(); + snapshotRefs.put(SnapshotRef.MAIN_BRANCH, org.apache.iceberg.SnapshotRefParser.toJson(ref)); + + Map properties = new HashMap<>(baseWithSnapshots.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(remainingSnapshots)); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(snapshotRefs)); + + TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); + + InvalidIcebergSnapshotException exception = + assertThrows( + InvalidIcebergSnapshotException.class, + () -> snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata)); + + assertTrue(exception.getMessage().contains("Cannot delete snapshots")); + assertTrue(exception.getMessage().contains("still referenced")); + } + + @Test + void testApplySnapshots_withWapSnapshots_recordsCorrectMetrics() throws IOException { + List baseSnapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, baseSnapshots); + + List wapSnapshots = IcebergTestUtil.getWapSnapshots(); + List allSnapshots = new ArrayList<>(baseSnapshots); + allSnapshots.addAll(wapSnapshots); + + Map properties = new HashMap<>(baseWithSnapshots.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(allSnapshots)); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.obtainSnapshotRefsFromSnapshot( + baseSnapshots.get(baseSnapshots.size() - 1)))); + + TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); + + assertNotNull(result); + + verify(mockMetricsReporter) + .count(eq(InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR), anyDouble()); + } + + @Test + void testApplySnapshots_deleteSnapshots_recordsCorrectMetrics() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); + + List remainingSnapshots = snapshots.subList(1, snapshots.size()); + + Map properties = new HashMap<>(baseWithSnapshots.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(remainingSnapshots)); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.obtainSnapshotRefsFromSnapshot( + remainingSnapshots.get(remainingSnapshots.size() - 1)))); + + TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); + + assertNotNull(result); + assertEquals(remainingSnapshots.size(), result.snapshots().size()); + + verify(mockMetricsReporter) + .count(eq(InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR), eq(1.0)); + } + + @Test + void testApplySnapshots_recordsSnapshotIdsInProperties() throws IOException { + List baseSnapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, baseSnapshots); + + List newSnapshotsList = IcebergTestUtil.getExtraSnapshots(); + List allSnapshots = new ArrayList<>(baseSnapshots); + allSnapshots.addAll(newSnapshotsList); + + Map properties = new HashMap<>(baseWithSnapshots.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(allSnapshots)); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.obtainSnapshotRefsFromSnapshot( + allSnapshots.get(allSnapshots.size() - 1)))); + + TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); + + assertNotNull(result); + + String appendedSnapshots = + result.properties().get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS)); + assertNotNull(appendedSnapshots, "Appended snapshots should be recorded in properties"); + + assertTrue(appendedSnapshots.contains(",") || !appendedSnapshots.isEmpty()); + } + + @Test + void testApplySnapshots_removesSnapshotKeysFromProperties() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); + properties.put( + CatalogConstants.SNAPSHOTS_REFS_KEY, + SnapshotsUtil.serializeMap( + IcebergTestUtil.obtainSnapshotRefsFromSnapshot(snapshots.get(snapshots.size() - 1)))); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + TableMetadata result = snapshotDiffApplier.applySnapshots(null, newMetadata); + + assertNotNull(result); + + assertFalse( + result.properties().containsKey(CatalogConstants.SNAPSHOTS_JSON_KEY), + "Snapshots JSON key should be removed from final properties"); + assertFalse( + result.properties().containsKey(CatalogConstants.SNAPSHOTS_REFS_KEY), + "Snapshots refs key should be removed from final properties"); + } + + @Test + void testApplySnapshots_branchUpdates_appliesCorrectly() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); + + Snapshot newBranchTarget = snapshots.get(1); + Map snapshotRefs = + IcebergTestUtil.obtainSnapshotRefsFromSnapshot(newBranchTarget); + + Map properties = new HashMap<>(baseWithSnapshots.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(snapshotRefs)); + + TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); + + assertNotNull(result); + assertNotNull(result.currentSnapshot()); + assertEquals(newBranchTarget.snapshotId(), result.currentSnapshot().snapshotId()); + } + + @Test + void testApplySnapshots_multipleBranchUpdates_success() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); + + Map snapshotRefs = new HashMap<>(); + SnapshotRef mainRef = SnapshotRef.branchBuilder(snapshots.get(0).snapshotId()).build(); + SnapshotRef devRef = SnapshotRef.branchBuilder(snapshots.get(1).snapshotId()).build(); + snapshotRefs.put(SnapshotRef.MAIN_BRANCH, org.apache.iceberg.SnapshotRefParser.toJson(mainRef)); + snapshotRefs.put("dev", org.apache.iceberg.SnapshotRefParser.toJson(devRef)); + + Map properties = new HashMap<>(baseWithSnapshots.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(snapshotRefs)); + + TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); + + assertNotNull(result); + assertEquals(2, result.refs().size()); + } + + private TableMetadata addSnapshotsToMetadata(TableMetadata metadata, List snapshots) { + TableMetadata.Builder builder = TableMetadata.buildFrom(metadata); + for (Snapshot snapshot : snapshots) { + builder.addSnapshot(snapshot); + } + if (!snapshots.isEmpty()) { + Snapshot lastSnapshot = snapshots.get(snapshots.size() - 1); + SnapshotRef ref = SnapshotRef.branchBuilder(lastSnapshot.snapshotId()).build(); + builder.setRef(SnapshotRef.MAIN_BRANCH, ref); + } + return builder.build(); + } +} diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotInspectorTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotInspectorTest.java deleted file mode 100644 index 3fb9ced17..000000000 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotInspectorTest.java +++ /dev/null @@ -1,171 +0,0 @@ -package com.linkedin.openhouse.internal.catalog; - -import com.linkedin.openhouse.internal.catalog.exception.InvalidIcebergSnapshotException; -import com.linkedin.openhouse.internal.catalog.mapper.HouseTableMapperTest; -import java.io.IOException; -import java.nio.charset.StandardCharsets; -import java.nio.file.Files; -import java.nio.file.Path; -import java.nio.file.Paths; -import java.nio.file.attribute.FileAttribute; -import java.nio.file.attribute.PosixFilePermission; -import java.nio.file.attribute.PosixFilePermissions; -import java.util.Collections; -import java.util.List; -import java.util.Set; -import java.util.UUID; -import org.apache.hadoop.conf.Configuration; -import org.apache.hadoop.fs.FileSystem; -import org.apache.iceberg.DataFile; -import org.apache.iceberg.DataFiles; -import org.apache.iceberg.ManifestFiles; -import org.apache.iceberg.ManifestWriter; -import org.apache.iceberg.PartitionSpec; -import org.apache.iceberg.Schema; -import org.apache.iceberg.Snapshot; -import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.TableMetadata; -import org.apache.iceberg.hadoop.HadoopOutputFile; -import org.apache.iceberg.io.FileIO; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableList; -import org.apache.iceberg.relocated.com.google.common.collect.ImmutableMap; -import org.apache.iceberg.relocated.com.google.common.collect.Lists; -import org.apache.iceberg.types.Types; -import org.junit.jupiter.api.Assertions; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.io.TempDir; -import org.mockito.Mockito; -import org.springframework.beans.factory.annotation.Autowired; -import org.springframework.boot.test.context.SpringBootTest; -import org.springframework.context.annotation.Import; - -@SpringBootTest -@Import(HouseTableMapperTest.MockConfiguration.class) -class SnapshotInspectorTest { - - @Autowired SnapshotInspector snapshotInspector; - - @TempDir static Path tempDir; - - private static final TableMetadata NO_SNAPSHOTS_METADATA = - TableMetadata.newTableMetadata( - new Schema( - Types.NestedField.required(1, "data", Types.StringType.get()), - Types.NestedField.required(2, "ts", Types.TimestampType.withoutZone())), - PartitionSpec.unpartitioned(), - UUID.randomUUID().toString(), - ImmutableMap.of()); - - @Test - void testValidateSnapshotsUpdateWithNoSnapshotMetadata() throws IOException { - - List testSnapshots = IcebergTestUtil.getSnapshots(); - // No exception since added as well deleted snapshots are allowed to support replication - // use case which performs table commit with added and deleted snapshots. - Assertions.assertDoesNotThrow( - () -> - snapshotInspector.validateSnapshotsUpdate( - NO_SNAPSHOTS_METADATA, testSnapshots.subList(0, 1), testSnapshots.subList(1, 4))); - Assertions.assertDoesNotThrow( - () -> - snapshotInspector.validateSnapshotsUpdate( - NO_SNAPSHOTS_METADATA, testSnapshots, Collections.emptyList())); - Assertions.assertDoesNotThrow( - () -> - snapshotInspector.validateSnapshotsUpdate( - NO_SNAPSHOTS_METADATA, Collections.emptyList(), testSnapshots)); - } - - @Test - void testValidateSnapshotsUpdateWithSnapshotMetadata() throws IOException { - List testSnapshots = IcebergTestUtil.getSnapshots(); - List extraTestSnapshots = IcebergTestUtil.getExtraSnapshots(); - TableMetadata metadataWithSnapshots = - TableMetadata.buildFrom(NO_SNAPSHOTS_METADATA) - .setBranchSnapshot(testSnapshots.get(testSnapshots.size() - 1), SnapshotRef.MAIN_BRANCH) - .build(); - Assertions.assertDoesNotThrow( - () -> - snapshotInspector.validateSnapshotsUpdate( - metadataWithSnapshots, testSnapshots, Collections.emptyList())); - // No validation error if snapshots are added and deleted - Assertions.assertDoesNotThrow( - () -> - snapshotInspector.validateSnapshotsUpdate( - metadataWithSnapshots, testSnapshots, testSnapshots)); - // No validation error if snapshots are added and deleted - Assertions.assertDoesNotThrow( - () -> - snapshotInspector.validateSnapshotsUpdate( - metadataWithSnapshots, extraTestSnapshots, testSnapshots)); - Assertions.assertThrows( - InvalidIcebergSnapshotException.class, - () -> - snapshotInspector.validateSnapshotsUpdate( - metadataWithSnapshots, Collections.emptyList(), testSnapshots)); - Assertions.assertDoesNotThrow( - () -> - snapshotInspector.validateSnapshotsUpdate( - metadataWithSnapshots, - Collections.emptyList(), - testSnapshots.subList(0, testSnapshots.size() - 1))); - } - - @Test - void testSecureSnapshot() throws IOException { - // The default file attribute that sets the permission as 777 when a file is created. - FileAttribute> attr = - PosixFilePermissions.asFileAttribute(PosixFilePermissions.fromString("rwxrwxrwx")); - - // Mock DataFile and ManifestFile - Snapshot mockSnapshot = Mockito.mock(org.apache.iceberg.Snapshot.class); - Path tempFile1 = Files.createFile(tempDir.resolve("data1.parquet"), attr); - Path tempFile2 = Files.createFile(tempDir.resolve("data2.parquet"), attr); - Path tempFile3 = Files.createFile(tempDir.resolve("manifest"), attr); - - // Mock FileIO - FileIO fileIO = Mockito.mock(org.apache.iceberg.io.FileIO.class); - - List dataFileList = - ImmutableList.of( - createDataFile(tempFile1.toString()), createDataFile(tempFile2.toString())); - - ManifestWriter manifestWriter = - ManifestFiles.write( - PartitionSpec.unpartitioned(), - HadoopOutputFile.fromLocation(tempFile3.toString(), new Configuration())); - manifestWriter.close(); - - Mockito.when(mockSnapshot.allManifests(fileIO)) - .thenReturn(ImmutableList.of(manifestWriter.toManifestFile())); - Mockito.when(mockSnapshot.addedDataFiles(fileIO)).thenReturn(dataFileList); - snapshotInspector.secureSnapshot(mockSnapshot, fileIO); - - /* Verify the perms of files are modified as com.linkedin.openhouse.internal.catalog.MockApplication.perm does */ - FileSystem fileSystem = FileSystem.get(new Configuration()); - Assertions.assertEquals( - fileSystem - .getFileStatus(new org.apache.hadoop.fs.Path(tempFile1.toString())) - .getPermission(), - MockApplication.FS_PERMISSION); - Assertions.assertEquals( - fileSystem - .getFileStatus(new org.apache.hadoop.fs.Path(tempFile2.toString())) - .getPermission(), - MockApplication.FS_PERMISSION); - Assertions.assertEquals( - fileSystem - .getFileStatus(new org.apache.hadoop.fs.Path(tempFile3.toString())) - .getPermission(), - MockApplication.FS_PERMISSION); - } - - public static DataFile createDataFile(String dataPath) throws IOException { - Files.write(Paths.get(dataPath), Lists.newArrayList(), StandardCharsets.UTF_8); - return DataFiles.builder(PartitionSpec.unpartitioned()) - .withPath(dataPath) - .withFileSizeInBytes(10) - .withRecordCount(1) - .build(); - } -} diff --git a/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/RepositoryTestWithSettableComponents.java b/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/RepositoryTestWithSettableComponents.java index 85044da5e..c6073aff7 100644 --- a/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/RepositoryTestWithSettableComponents.java +++ b/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/RepositoryTestWithSettableComponents.java @@ -8,7 +8,6 @@ import com.linkedin.openhouse.cluster.storage.StorageManager; import com.linkedin.openhouse.common.test.cluster.PropertyOverrideContextInitializer; import com.linkedin.openhouse.internal.catalog.OpenHouseInternalTableOperations; -import com.linkedin.openhouse.internal.catalog.SnapshotInspector; import com.linkedin.openhouse.internal.catalog.fileio.FileIOManager; import com.linkedin.openhouse.internal.catalog.mapper.HouseTableMapper; import com.linkedin.openhouse.internal.catalog.model.HouseTable; @@ -60,8 +59,6 @@ public class RepositoryTestWithSettableComponents { @Autowired FileIOManager fileIOManager; - @Autowired SnapshotInspector snapshotInspector; - @Autowired HouseTableMapper houseTableMapper; @Autowired MeterRegistry meterRegistry; @@ -97,15 +94,18 @@ void testNoRetryInternalRepo() { // construct a real table object to prepare subsequent client call for table-update (that they // will fail) + MetricsReporter metricsReporter = + new MetricsReporter(this.meterRegistry, "test", Lists.newArrayList()); + SnapshotDiffApplier snapshotDiffApplier = new SnapshotDiffApplier(metricsReporter); OpenHouseInternalTableOperations actualOps = new OpenHouseInternalTableOperations( houseTablesRepository, fileIO, - snapshotInspector, houseTableMapper, tableIdentifier, - new MetricsReporter(this.meterRegistry, "test", Lists.newArrayList()), - fileIOManager); + metricsReporter, + fileIOManager, + snapshotDiffApplier); ((SettableCatalogForTest) catalog).setOperation(actualOps); TableDto creationDTO = TABLE_DTO.toBuilder().tableVersion(INITIAL_TABLE_VERSION).build(); creationDTO = openHouseInternalRepository.save(creationDTO); @@ -114,15 +114,18 @@ void testNoRetryInternalRepo() { // injecting mocked htsRepo within a tableOperation that fails doCommit method. // The requirement to trigger htsRepo.save call are: Detectable updates in Transaction itself. + MetricsReporter metricsReporter2 = + new MetricsReporter(this.meterRegistry, "test", Lists.newArrayList()); + SnapshotDiffApplier snapshotDiffApplier2 = new SnapshotDiffApplier(metricsReporter2); OpenHouseInternalTableOperations mockOps = new OpenHouseInternalTableOperations( htsRepo, fileIO, - snapshotInspector, houseTableMapper, tableIdentifier, - new MetricsReporter(this.meterRegistry, "test", Lists.newArrayList()), - fileIOManager); + metricsReporter2, + fileIOManager, + snapshotDiffApplier2); OpenHouseInternalTableOperations spyOperations = Mockito.spy(mockOps); doReturn(actualOps.current()).when(spyOperations).refresh(); BaseTable spyOptsMockedTable = Mockito.spy(new BaseTable(spyOperations, realTable.name())); @@ -195,15 +198,18 @@ void testFailedHtsRepoWhenGet() { for (Class c : exs) { HouseTableRepository htsRepo = provideFailedHtsRepoWhenGet(c); + MetricsReporter metricsReporter = + new MetricsReporter(this.meterRegistry, "test", Lists.newArrayList()); + SnapshotDiffApplier snapshotDiffApplier = new SnapshotDiffApplier(metricsReporter); OpenHouseInternalTableOperations mockOps = new OpenHouseInternalTableOperations( htsRepo, fileIO, - snapshotInspector, houseTableMapper, tableIdentifier, - new MetricsReporter(this.meterRegistry, "test", Lists.newArrayList()), - fileIOManager); + metricsReporter, + fileIOManager, + snapshotDiffApplier); OpenHouseInternalTableOperations spyOperations = Mockito.spy(mockOps); BaseTable spyOptsMockedTable = Mockito.spy( diff --git a/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/SpringH2Application.java b/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/SpringH2Application.java index d845e1b39..7cf0528ec 100644 --- a/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/SpringH2Application.java +++ b/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/SpringH2Application.java @@ -5,9 +5,6 @@ import com.linkedin.openhouse.common.audit.model.ServiceAuditEvent; import com.linkedin.openhouse.tables.audit.DummyTableAuditHandler; import com.linkedin.openhouse.tables.audit.model.TableAuditEvent; -import java.util.function.Consumer; -import java.util.function.Supplier; -import org.apache.hadoop.fs.Path; import org.mockito.Mockito; import org.springframework.boot.SpringApplication; import org.springframework.boot.actuate.autoconfigure.security.servlet.ManagementWebSecurityAutoConfiguration; @@ -17,7 +14,6 @@ import org.springframework.boot.autoconfigure.security.servlet.SecurityAutoConfiguration; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.ComponentScan; -import org.springframework.context.annotation.Primary; @SpringBootApplication @ComponentScan( @@ -53,19 +49,6 @@ public static void main(String[] args) { SpringApplication.run(SpringH2Application.class, args); } - /** - * File secure used for testing purpose. We cannot directly use the actual - * SnapshotInspector#fileSecurer as that changes file to a user group that is not guaranteed to - * exist across different platforms thus creating environment dependencies for unit tests. - */ - @Bean - @Primary - Consumer> provideTestFileSecurer() { - return pathSupplier -> { - // This is a no-op Consumer. It does nothing with the supplied Path. - }; - } - @Bean public AuditHandler serviceAuditHandler() { return Mockito.mock(DummyServiceAuditHandler.class); diff --git a/tables-test-fixtures/tables-test-fixtures-iceberg-1.2/src/main/java/com/linkedin/openhouse/tablestest/SpringH2TestApplication.java b/tables-test-fixtures/tables-test-fixtures-iceberg-1.2/src/main/java/com/linkedin/openhouse/tablestest/SpringH2TestApplication.java index 343c38d0d..0d85a24b0 100644 --- a/tables-test-fixtures/tables-test-fixtures-iceberg-1.2/src/main/java/com/linkedin/openhouse/tablestest/SpringH2TestApplication.java +++ b/tables-test-fixtures/tables-test-fixtures-iceberg-1.2/src/main/java/com/linkedin/openhouse/tablestest/SpringH2TestApplication.java @@ -1,17 +1,12 @@ package com.linkedin.openhouse.tablestest; -import java.util.function.Consumer; -import java.util.function.Supplier; -import org.apache.hadoop.fs.Path; import org.springframework.boot.SpringApplication; import org.springframework.boot.actuate.autoconfigure.security.servlet.ManagementWebSecurityAutoConfiguration; import org.springframework.boot.autoconfigure.EnableAutoConfiguration; import org.springframework.boot.autoconfigure.SpringBootApplication; import org.springframework.boot.autoconfigure.domain.EntityScan; import org.springframework.boot.autoconfigure.security.servlet.SecurityAutoConfiguration; -import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.ComponentScan; -import org.springframework.context.annotation.Primary; @SpringBootApplication @ComponentScan( @@ -47,17 +42,4 @@ public class SpringH2TestApplication { public static void main(String[] args) { SpringApplication.run(SpringH2TestApplication.class, args); } - - /** - * File secure used for testing purpose. We cannot directly use the actual - * SnapshotInspector#fileSecurer as that changes file to a user group that is not guaranteed to - * exist across different platforms thus creating environment dependencies for unit tests. - */ - @Bean - @Primary - Consumer> provideTestFileSecurer() { - return pathSupplier -> { - // This is a no-op Consumer. It does nothing with the supplied Path. - }; - } } From afe2627c509744affc5480ddd50a268471b68a0f Mon Sep 17 00:00:00 2001 From: cbb330 Date: Thu, 9 Oct 2025 22:05:32 -0700 Subject: [PATCH 17/35] fixing broken tests --- .../e2e/h2/RepositoryTestWithSettableComponents.java | 8 ++++---- .../openhouse/tables/settable/SettableTestConfig.java | 11 +++++++++++ 2 files changed, 15 insertions(+), 4 deletions(-) diff --git a/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/RepositoryTestWithSettableComponents.java b/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/RepositoryTestWithSettableComponents.java index c6073aff7..ceb3f9e26 100644 --- a/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/RepositoryTestWithSettableComponents.java +++ b/services/tables/src/test/java/com/linkedin/openhouse/tables/e2e/h2/RepositoryTestWithSettableComponents.java @@ -8,6 +8,7 @@ import com.linkedin.openhouse.cluster.storage.StorageManager; import com.linkedin.openhouse.common.test.cluster.PropertyOverrideContextInitializer; import com.linkedin.openhouse.internal.catalog.OpenHouseInternalTableOperations; +import com.linkedin.openhouse.internal.catalog.SnapshotDiffApplier; import com.linkedin.openhouse.internal.catalog.fileio.FileIOManager; import com.linkedin.openhouse.internal.catalog.mapper.HouseTableMapper; import com.linkedin.openhouse.internal.catalog.model.HouseTable; @@ -63,6 +64,8 @@ public class RepositoryTestWithSettableComponents { @Autowired MeterRegistry meterRegistry; + @Autowired SnapshotDiffApplier snapshotDiffApplier; + FileIO fileIO; @PostConstruct @@ -96,7 +99,6 @@ void testNoRetryInternalRepo() { // will fail) MetricsReporter metricsReporter = new MetricsReporter(this.meterRegistry, "test", Lists.newArrayList()); - SnapshotDiffApplier snapshotDiffApplier = new SnapshotDiffApplier(metricsReporter); OpenHouseInternalTableOperations actualOps = new OpenHouseInternalTableOperations( houseTablesRepository, @@ -116,7 +118,6 @@ void testNoRetryInternalRepo() { // The requirement to trigger htsRepo.save call are: Detectable updates in Transaction itself. MetricsReporter metricsReporter2 = new MetricsReporter(this.meterRegistry, "test", Lists.newArrayList()); - SnapshotDiffApplier snapshotDiffApplier2 = new SnapshotDiffApplier(metricsReporter2); OpenHouseInternalTableOperations mockOps = new OpenHouseInternalTableOperations( htsRepo, @@ -125,7 +126,7 @@ void testNoRetryInternalRepo() { tableIdentifier, metricsReporter2, fileIOManager, - snapshotDiffApplier2); + snapshotDiffApplier); OpenHouseInternalTableOperations spyOperations = Mockito.spy(mockOps); doReturn(actualOps.current()).when(spyOperations).refresh(); BaseTable spyOptsMockedTable = Mockito.spy(new BaseTable(spyOperations, realTable.name())); @@ -200,7 +201,6 @@ void testFailedHtsRepoWhenGet() { HouseTableRepository htsRepo = provideFailedHtsRepoWhenGet(c); MetricsReporter metricsReporter = new MetricsReporter(this.meterRegistry, "test", Lists.newArrayList()); - SnapshotDiffApplier snapshotDiffApplier = new SnapshotDiffApplier(metricsReporter); OpenHouseInternalTableOperations mockOps = new OpenHouseInternalTableOperations( htsRepo, diff --git a/services/tables/src/test/java/com/linkedin/openhouse/tables/settable/SettableTestConfig.java b/services/tables/src/test/java/com/linkedin/openhouse/tables/settable/SettableTestConfig.java index 400b92b0f..f7d4f0124 100644 --- a/services/tables/src/test/java/com/linkedin/openhouse/tables/settable/SettableTestConfig.java +++ b/services/tables/src/test/java/com/linkedin/openhouse/tables/settable/SettableTestConfig.java @@ -1,8 +1,12 @@ package com.linkedin.openhouse.tables.settable; +import com.linkedin.openhouse.cluster.metrics.micrometer.MetricsReporter; +import com.linkedin.openhouse.internal.catalog.SnapshotDiffApplier; import com.linkedin.openhouse.tables.repository.OpenHouseInternalRepository; import com.linkedin.openhouse.tables.repository.impl.SettableInternalRepositoryForTest; +import io.micrometer.core.instrument.MeterRegistry; import org.apache.iceberg.catalog.Catalog; +import org.apache.iceberg.relocated.com.google.common.collect.Lists; import org.springframework.boot.test.context.TestConfiguration; import org.springframework.context.annotation.Bean; import org.springframework.context.annotation.Primary; @@ -20,4 +24,11 @@ public Catalog provideTestCatalog() { public OpenHouseInternalRepository provideTestInternalRepo() { return new SettableInternalRepositoryForTest(); } + + @Bean + public SnapshotDiffApplier snapshotDiffApplier(MeterRegistry meterRegistry) { + MetricsReporter metricsReporter = + new MetricsReporter(meterRegistry, "test", Lists.newArrayList()); + return new SnapshotDiffApplier(metricsReporter); + } } From 6ba98f517300072c7b5020739b6206e89beac5b2 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Mon, 20 Oct 2025 23:04:23 -0700 Subject: [PATCH 18/35] centralizing maps/lists in constructor and reusing in applyTo --- .../internal/catalog/SnapshotDiffApplier.java | 46 +++++++++---------- 1 file changed, 22 insertions(+), 24 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java index 347cf4f7d..ddf0b8d71 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java @@ -104,6 +104,11 @@ private class SnapshotDiff { private final List newSnapshots; private final List deletedSnapshots; private final Map branchUpdates; + private final Set deletedIds; + private final List newRegularSnapshots; + private final Set staleRefs; + private final Set existingAfterDeletionIds; + private final List unreferencedNewSnapshots; SnapshotDiff( List providedSnapshots, @@ -141,6 +146,19 @@ private class SnapshotDiff { .filter(s -> !providedById.containsKey(s.snapshotId())) .collect(Collectors.toList()); this.branchUpdates = computeBranchUpdates(); + this.deletedIds = + deletedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + this.newRegularSnapshots = + regularSnapshots.stream().filter(newSnapshots::contains).collect(Collectors.toList()); + this.staleRefs = Sets.difference(existingRefs.keySet(), providedRefs.keySet()); + this.existingAfterDeletionIds = Sets.difference(existingById.keySet(), deletedIds); + this.unreferencedNewSnapshots = + providedSnapshots.stream() + .filter( + s -> + !existingAfterDeletionIds.contains(s.snapshotId()) + && !providedBranchIds.contains(s.snapshotId())) + .collect(Collectors.toList()); } private List computeWapSnapshots() { @@ -252,8 +270,7 @@ private void validateCurrentSnapshotNotDeleted(TableMetadata base) { } long currentSnapshotId = base.currentSnapshot().snapshotId(); - boolean currentDeleted = - deletedSnapshots.stream().anyMatch(s -> s.snapshotId() == currentSnapshotId); + boolean currentDeleted = deletedIds.contains(currentSnapshotId); if (currentDeleted && newSnapshots.isEmpty()) { throw new InvalidIcebergSnapshotException( @@ -307,9 +324,6 @@ private void validateNoAmbiguousCommits() { * branch or tag */ private void validateDeletedSnapshotsNotReferenced() { - Set deletedIds = - deletedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); - Map> referencedIdsToRefs = providedRefs.entrySet().stream() .collect( @@ -340,28 +354,14 @@ TableMetadata.Builder applyTo(TableMetadata metadata) { // Remove deleted snapshots if (!deletedSnapshots.isEmpty()) { - Set deletedIds = - deletedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); builder.removeSnapshots(deletedIds); } // Remove stale branch references - metadata.refs().keySet().stream() - .filter(refName -> !providedRefs.containsKey(refName)) - .forEach(builder::removeRef); - - // Track existing snapshot IDs after deletions - Set existingAfterDeletion = - metadata.snapshots().stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); - deletedSnapshots.forEach(s -> existingAfterDeletion.remove(s.snapshotId())); + staleRefs.forEach(builder::removeRef); // Add unreferenced new snapshots - providedSnapshots.stream() - .filter( - s -> - !existingAfterDeletion.contains(s.snapshotId()) - && !providedBranchIds.contains(s.snapshotId())) - .forEach(builder::addSnapshot); + unreferencedNewSnapshots.forEach(builder::addSnapshot); // Set branch pointers providedRefs.forEach( @@ -374,7 +374,7 @@ TableMetadata.Builder applyTo(TableMetadata metadata) { branchName, ref.snapshotId())); } - if (existingAfterDeletion.contains(snapshot.snapshotId())) { + if (existingAfterDeletionIds.contains(snapshot.snapshotId())) { SnapshotRef existingRef = metadata.refs().get(branchName); if (existingRef == null || existingRef.snapshotId() != ref.snapshotId()) { builder.setRef(branchName, ref); @@ -412,8 +412,6 @@ void recordMetrics(TableMetadata.Builder builder) { } // Record snapshot IDs in properties - List newRegularSnapshots = - regularSnapshots.stream().filter(newSnapshots::contains).collect(Collectors.toList()); if (!newRegularSnapshots.isEmpty()) { builder.setProperties( Collections.singletonMap( From 39b6cf1b8d27600e74d3d9f664be962edd2535b0 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Wed, 22 Oct 2025 18:19:04 -0700 Subject: [PATCH 19/35] responding to comments --- .../internal/catalog/SnapshotDiffApplier.java | 120 +++++++++--------- .../spark/catalogtest/BranchTestSpark3_5.java | 13 +- 2 files changed, 71 insertions(+), 62 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java index ddf0b8d71..4f2f0ccce 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java @@ -68,11 +68,12 @@ public TableMetadata applySnapshots(TableMetadata base, TableMetadata metadata) // Compute diff (all maps created once in constructor) SnapshotDiff diff = - new SnapshotDiff(providedSnapshots, providedRefs, existingSnapshots, existingRefs); + new SnapshotDiff( + providedSnapshots, providedRefs, existingSnapshots, existingRefs, metadata); // Validate, apply, record metrics, build diff.validate(base); - TableMetadata.Builder builder = diff.applyTo(metadata); + TableMetadata.Builder builder = diff.applyTo(); diff.recordMetrics(builder); return builder.build(); } @@ -88,12 +89,14 @@ private class SnapshotDiff { private final Map providedRefs; private final List existingSnapshots; private final Map existingRefs; + private final TableMetadata metadata; // Computed maps (created once) - private final Map providedById; - private final Map existingById; - private final Set existingBranchIds; - private final Set providedBranchIds; + private final Map providedSnapshotByIds; + private final Map existingSnapshotByIds; + private final Set metadataSnapshotIds; + private final Set existingBranchRefIds; + private final Set providedBranchRefIds; // Categorized snapshots private final List wapSnapshots; @@ -114,36 +117,48 @@ private class SnapshotDiff { List providedSnapshots, Map providedRefs, List existingSnapshots, - Map existingRefs) { + Map existingRefs, + TableMetadata metadata) { this.providedSnapshots = providedSnapshots; this.providedRefs = providedRefs; this.existingSnapshots = existingSnapshots; this.existingRefs = existingRefs; + this.metadata = metadata; // Compute all maps once - this.providedById = + this.providedSnapshotByIds = providedSnapshots.stream().collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); - this.existingById = + this.existingSnapshotByIds = existingSnapshots.stream().collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); - this.existingBranchIds = + this.metadataSnapshotIds = + metadata.snapshots().stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + this.existingBranchRefIds = existingRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet()); - this.providedBranchIds = + this.providedBranchRefIds = providedRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet()); - // Compute categorization (order matters: cherry-picked filters WAP) - List initialWapSnapshots = computeWapSnapshots(); + // Compute categorization - process in dependency order + // 1. Cherry-picked has highest priority (includes WAP being published) + // 2. WAP snapshots (staged, not published) + // 3. Regular snapshots (everything else) this.cherryPickedSnapshots = computeCherryPickedSnapshots(); - this.wapSnapshots = filterWapFromCherryPicked(initialWapSnapshots); - this.regularSnapshots = computeRegularSnapshots(); + Set cherryPickedIds = + cherryPickedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + + this.wapSnapshots = computeWapSnapshots(cherryPickedIds); + Set wapIds = + wapSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + + this.regularSnapshots = computeRegularSnapshots(cherryPickedIds, wapIds); // Compute changes this.newSnapshots = providedSnapshots.stream() - .filter(s -> !existingById.containsKey(s.snapshotId())) + .filter(s -> !existingSnapshotByIds.containsKey(s.snapshotId())) .collect(Collectors.toList()); this.deletedSnapshots = existingSnapshots.stream() - .filter(s -> !providedById.containsKey(s.snapshotId())) + .filter(s -> !providedSnapshotByIds.containsKey(s.snapshotId())) .collect(Collectors.toList()); this.branchUpdates = computeBranchUpdates(); this.deletedIds = @@ -151,27 +166,31 @@ private class SnapshotDiff { this.newRegularSnapshots = regularSnapshots.stream().filter(newSnapshots::contains).collect(Collectors.toList()); this.staleRefs = Sets.difference(existingRefs.keySet(), providedRefs.keySet()); - this.existingAfterDeletionIds = Sets.difference(existingById.keySet(), deletedIds); + this.existingAfterDeletionIds = Sets.difference(existingSnapshotByIds.keySet(), deletedIds); this.unreferencedNewSnapshots = providedSnapshots.stream() .filter( s -> !existingAfterDeletionIds.contains(s.snapshotId()) - && !providedBranchIds.contains(s.snapshotId())) + && !providedBranchRefIds.contains(s.snapshotId()) + && !metadataSnapshotIds.contains(s.snapshotId())) .collect(Collectors.toList()); } - private List computeWapSnapshots() { - Set allBranchIds = - java.util.stream.Stream.concat(existingBranchIds.stream(), providedBranchIds.stream()) + private List computeWapSnapshots(Set excludeCherryPicked) { + // Depends on: cherry-picked IDs (to exclude WAP snapshots being published) + Set allBranchRefIds = + java.util.stream.Stream.concat( + existingBranchRefIds.stream(), providedBranchRefIds.stream()) .collect(Collectors.toSet()); return providedSnapshots.stream() + .filter(s -> !excludeCherryPicked.contains(s.snapshotId())) .filter( s -> s.summary() != null && s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP) - && !allBranchIds.contains(s.snapshotId())) + && !allBranchRefIds.contains(s.snapshotId())) .collect(Collectors.toList()); } @@ -185,7 +204,7 @@ private List computeCherryPickedSnapshots() { return providedSnapshots.stream() .filter( provided -> { - Snapshot existing = existingById.get(provided.snapshotId()); + Snapshot existing = existingSnapshotByIds.get(provided.snapshotId()); if (existing == null) { return false; } @@ -204,30 +223,19 @@ private List computeCherryPickedSnapshots() { boolean hasWapId = provided.summary() != null && provided.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP); - boolean wasStaged = !existingBranchIds.contains(provided.snapshotId()); - boolean isNowOnBranch = providedBranchIds.contains(provided.snapshotId()); + boolean wasStaged = !existingBranchRefIds.contains(provided.snapshotId()); + boolean isNowOnBranch = providedBranchRefIds.contains(provided.snapshotId()); return hasWapId && wasStaged && isNowOnBranch; }) .collect(Collectors.toList()); } - private List filterWapFromCherryPicked(List initialWapSnapshots) { - Set cherryPickedIds = - cherryPickedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); - return initialWapSnapshots.stream() - .filter(s -> !cherryPickedIds.contains(s.snapshotId())) - .collect(Collectors.toList()); - } - - private List computeRegularSnapshots() { - Set excludedIds = - java.util.stream.Stream.concat( - wapSnapshots.stream().map(Snapshot::snapshotId), - cherryPickedSnapshots.stream().map(Snapshot::snapshotId)) - .collect(Collectors.toSet()); - + private List computeRegularSnapshots( + Set excludeCherryPicked, Set excludeWap) { + // Depends on: cherry-picked and WAP IDs (everything else is regular) return providedSnapshots.stream() - .filter(s -> !excludedIds.contains(s.snapshotId())) + .filter(s -> !excludeCherryPicked.contains(s.snapshotId())) + .filter(s -> !excludeWap.contains(s.snapshotId())) .collect(Collectors.toList()); } @@ -349,7 +357,7 @@ private void validateDeletedSnapshotsNotReferenced() { } } - TableMetadata.Builder applyTo(TableMetadata metadata) { + TableMetadata.Builder applyTo() { TableMetadata.Builder builder = TableMetadata.buildFrom(metadata); // Remove deleted snapshots @@ -366,7 +374,7 @@ TableMetadata.Builder applyTo(TableMetadata metadata) { // Set branch pointers providedRefs.forEach( (branchName, ref) -> { - Snapshot snapshot = providedById.get(ref.snapshotId()); + Snapshot snapshot = providedSnapshotByIds.get(ref.snapshotId()); if (snapshot == null) { throw new InvalidIcebergSnapshotException( String.format( @@ -374,7 +382,12 @@ TableMetadata.Builder applyTo(TableMetadata metadata) { branchName, ref.snapshotId())); } - if (existingAfterDeletionIds.contains(snapshot.snapshotId())) { + // Check if snapshot is already in metadata (after deletions) + boolean snapshotExistsInMetadata = + metadataSnapshotIds.contains(snapshot.snapshotId()) + && !deletedIds.contains(snapshot.snapshotId()); + + if (snapshotExistsInMetadata) { SnapshotRef existingRef = metadata.refs().get(branchName); if (existingRef == null || existingRef.snapshotId() != ref.snapshotId()) { builder.setRef(branchName, ref); @@ -391,7 +404,7 @@ void recordMetrics(TableMetadata.Builder builder) { int appendedCount = (int) regularSnapshots.stream() - .filter(s -> !existingById.containsKey(s.snapshotId())) + .filter(s -> !existingSnapshotByIds.containsKey(s.snapshotId())) .count(); int stagedCount = wapSnapshots.size(); int cherryPickedCount = cherryPickedSnapshots.size(); @@ -451,16 +464,9 @@ void recordMetrics(TableMetadata.Builder builder) { * @return Comma-separated string of snapshot IDs, or empty string if list is empty */ private String formatSnapshotIds(List snapshots) { - if (snapshots.isEmpty()) { - return ""; - } - StringBuilder sb = new StringBuilder(); - for (int i = 0; i < snapshots.size(); i++) { - if (i > 0) { - sb.append(','); - } - sb.append(snapshots.get(i).snapshotId()); - } - return sb.toString(); + return snapshots.stream() + .map(Snapshot::snapshotId) + .map(String::valueOf) + .collect(Collectors.joining(",")); } } diff --git a/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java b/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java index 488750620..478059289 100644 --- a/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java +++ b/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java @@ -103,8 +103,9 @@ public void testBasicBranchOperations() throws Exception { List refs = spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); assertEquals(2, refs.size()); - assertEquals("feature_a", refs.get(0).getString(0)); - assertEquals("main", refs.get(1).getString(0)); + Set refNames = refs.stream().map(row -> row.getString(0)).collect(Collectors.toSet()); + assertTrue(refNames.contains("feature_a")); + assertTrue(refNames.contains("main")); } } @@ -2004,7 +2005,8 @@ public void testBackwardCompatibilityMainBranchOnly() throws Exception { assertEquals(3, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); List refs = spark.sql("SELECT name FROM " + tableName + ".refs").collectAsList(); assertEquals(1, refs.size()); - assertEquals("main", refs.get(0).getString(0)); + Set refNames = refs.stream().map(row -> row.getString(0)).collect(Collectors.toSet()); + assertTrue(refNames.contains("main")); // Traditional snapshot queries should work assertTrue( @@ -2279,8 +2281,9 @@ public void testErrorInsertToNonExistentBranch() throws Exception { List refs = spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); assertEquals(2, refs.size()); - assertEquals("feature_a", refs.get(0).getString(0)); - assertEquals("main", refs.get(1).getString(0)); + Set refNames = refs.stream().map(row -> row.getString(0)).collect(Collectors.toSet()); + assertTrue(refNames.contains("feature_a")); + assertTrue(refNames.contains("main")); } } From fb0ff1babf34c6d156decd486180bd8818cc140a Mon Sep 17 00:00:00 2001 From: cbb330 Date: Mon, 3 Nov 2025 18:57:32 -0800 Subject: [PATCH 20/35] formatting --- .../internal/catalog/SnapshotDiffApplier.java | 490 +--- .../OpenHouseInternalTableOperationsTest.java | 304 --- .../spark/catalogtest/BranchTestSpark3_5.java | 2370 ----------------- 3 files changed, 135 insertions(+), 3029 deletions(-) delete mode 100644 integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java index 4f2f0ccce..90dbc2b85 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java @@ -2,10 +2,11 @@ import static com.linkedin.openhouse.internal.catalog.mapper.HouseTableSerdeUtils.getCanonicalFieldName; -import com.google.common.collect.Sets; import com.linkedin.openhouse.cluster.metrics.micrometer.MetricsReporter; import com.linkedin.openhouse.internal.catalog.exception.InvalidIcebergSnapshotException; +import java.util.ArrayList; import java.util.Collections; +import java.util.HashMap; import java.util.List; import java.util.Map; import java.util.Optional; @@ -13,28 +14,19 @@ import java.util.stream.Collectors; import lombok.AllArgsConstructor; import lombok.extern.slf4j.Slf4j; +import org.apache.commons.collections.CollectionUtils; +import org.apache.commons.collections.MapUtils; import org.apache.iceberg.Snapshot; import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.SnapshotSummary; import org.apache.iceberg.TableMetadata; -import org.apache.iceberg.relocated.com.google.common.base.Objects; /** * Service responsible for applying snapshot changes to Iceberg table metadata. * - *

This class handles the complex logic of computing snapshot diffs, validating changes, and - * applying them to table metadata. It supports various snapshot operations including: - * - *

    - *
  • Adding new snapshots (regular commits) - *
  • Staging snapshots (WAP - Write-Audit-Publish) - *
  • Cherry-picking snapshots across branches - *
  • Deleting snapshots - *
  • Updating branch references - *
- * - *

The service performs comprehensive validation to ensure data integrity and prevent invalid - * operations such as deleting referenced snapshots or creating ambiguous branch references. + *

This class extracts snapshot logic from OpenHouseInternalTableOperations while maintaining the + * same behavior. The main entry point applySnapshots() has a clear flow: parse input → compute diff + * → validate → apply. */ @AllArgsConstructor @Slf4j @@ -43,8 +35,8 @@ public class SnapshotDiffApplier { private final MetricsReporter metricsReporter; /** - * Applies snapshot updates from metadata properties. Simple and clear: parse input, compute diff, - * validate, apply, record metrics, build. + * Applies snapshot updates from metadata properties. Clear flow: parse input, compute diff, + * validate, apply, build. * * @param base The base table metadata (may be null for table creation) * @param metadata The new metadata with properties containing snapshot updates @@ -61,97 +53,54 @@ public TableMetadata applySnapshots(TableMetadata base, TableMetadata metadata) Map providedRefs = Optional.ofNullable(metadata.properties().get(CatalogConstants.SNAPSHOTS_REFS_KEY)) .map(SnapshotsUtil::parseSnapshotRefs) - .orElse(Collections.emptyMap()); + .orElse(new HashMap<>()); List existingSnapshots = base != null ? base.snapshots() : Collections.emptyList(); - Map existingRefs = base != null ? base.refs() : Collections.emptyMap(); - // Compute diff (all maps created once in constructor) + // Compute diff (minimal maps in constructor) SnapshotDiff diff = - new SnapshotDiff( - providedSnapshots, providedRefs, existingSnapshots, existingRefs, metadata); + new SnapshotDiff(providedSnapshots, existingSnapshots, metadata, providedRefs); - // Validate, apply, record metrics, build + // Validate, apply, build diff.validate(base); - TableMetadata.Builder builder = diff.applyTo(); - diff.recordMetrics(builder); + TableMetadata.Builder builder = diff.applyTo(metadata); return builder.build(); } /** - * State object that computes and caches all snapshot analysis. Computes all maps once in the - * constructor to avoid redundant operations. Provides clear methods for validation and - * application. + * State object that computes minimal snapshot diff. Computes only essential maps in the + * constructor for the refactoring. Provides simple validation and application methods. */ private class SnapshotDiff { // Input state private final List providedSnapshots; - private final Map providedRefs; private final List existingSnapshots; - private final Map existingRefs; private final TableMetadata metadata; + private final Map providedRefs; - // Computed maps (created once) + // Computed maps (minimal for original behavior) private final Map providedSnapshotByIds; private final Map existingSnapshotByIds; - private final Set metadataSnapshotIds; - private final Set existingBranchRefIds; - private final Set providedBranchRefIds; - - // Categorized snapshots - private final List wapSnapshots; - private final List cherryPickedSnapshots; - private final List regularSnapshots; - - // Changes private final List newSnapshots; private final List deletedSnapshots; - private final Map branchUpdates; - private final Set deletedIds; - private final List newRegularSnapshots; - private final Set staleRefs; - private final Set existingAfterDeletionIds; - private final List unreferencedNewSnapshots; SnapshotDiff( List providedSnapshots, - Map providedRefs, List existingSnapshots, - Map existingRefs, - TableMetadata metadata) { + TableMetadata metadata, + Map providedRefs) { this.providedSnapshots = providedSnapshots; - this.providedRefs = providedRefs; this.existingSnapshots = existingSnapshots; - this.existingRefs = existingRefs; this.metadata = metadata; + this.providedRefs = providedRefs; - // Compute all maps once + // Compute basic maps this.providedSnapshotByIds = providedSnapshots.stream().collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); this.existingSnapshotByIds = existingSnapshots.stream().collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); - this.metadataSnapshotIds = - metadata.snapshots().stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); - this.existingBranchRefIds = - existingRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet()); - this.providedBranchRefIds = - providedRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet()); - - // Compute categorization - process in dependency order - // 1. Cherry-picked has highest priority (includes WAP being published) - // 2. WAP snapshots (staged, not published) - // 3. Regular snapshots (everything else) - this.cherryPickedSnapshots = computeCherryPickedSnapshots(); - Set cherryPickedIds = - cherryPickedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); - this.wapSnapshots = computeWapSnapshots(cherryPickedIds); - Set wapIds = - wapSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); - - this.regularSnapshots = computeRegularSnapshots(cherryPickedIds, wapIds); - - // Compute changes + // Compute diff (symmetric difference) this.newSnapshots = providedSnapshots.stream() .filter(s -> !existingSnapshotByIds.containsKey(s.snapshotId())) @@ -160,313 +109,144 @@ private class SnapshotDiff { existingSnapshots.stream() .filter(s -> !providedSnapshotByIds.containsKey(s.snapshotId())) .collect(Collectors.toList()); - this.branchUpdates = computeBranchUpdates(); - this.deletedIds = - deletedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); - this.newRegularSnapshots = - regularSnapshots.stream().filter(newSnapshots::contains).collect(Collectors.toList()); - this.staleRefs = Sets.difference(existingRefs.keySet(), providedRefs.keySet()); - this.existingAfterDeletionIds = Sets.difference(existingSnapshotByIds.keySet(), deletedIds); - this.unreferencedNewSnapshots = - providedSnapshots.stream() - .filter( - s -> - !existingAfterDeletionIds.contains(s.snapshotId()) - && !providedBranchRefIds.contains(s.snapshotId()) - && !metadataSnapshotIds.contains(s.snapshotId())) - .collect(Collectors.toList()); - } - - private List computeWapSnapshots(Set excludeCherryPicked) { - // Depends on: cherry-picked IDs (to exclude WAP snapshots being published) - Set allBranchRefIds = - java.util.stream.Stream.concat( - existingBranchRefIds.stream(), providedBranchRefIds.stream()) - .collect(Collectors.toSet()); - - return providedSnapshots.stream() - .filter(s -> !excludeCherryPicked.contains(s.snapshotId())) - .filter( - s -> - s.summary() != null - && s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP) - && !allBranchRefIds.contains(s.snapshotId())) - .collect(Collectors.toList()); - } - - private List computeCherryPickedSnapshots() { - Set cherryPickSourceIds = - providedSnapshots.stream() - .filter(s -> s.summary() != null && s.summary().containsKey("source-snapshot-id")) - .map(s -> Long.parseLong(s.summary().get("source-snapshot-id"))) - .collect(Collectors.toSet()); - - return providedSnapshots.stream() - .filter( - provided -> { - Snapshot existing = existingSnapshotByIds.get(provided.snapshotId()); - if (existing == null) { - return false; - } - - // Parent changed (moved to different branch) - if (!Objects.equal(provided.parentId(), existing.parentId())) { - return true; - } - - // Is source of cherry-pick - if (cherryPickSourceIds.contains(provided.snapshotId())) { - return true; - } - - // WAP snapshot being published (staged → branch) - boolean hasWapId = - provided.summary() != null - && provided.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP); - boolean wasStaged = !existingBranchRefIds.contains(provided.snapshotId()); - boolean isNowOnBranch = providedBranchRefIds.contains(provided.snapshotId()); - return hasWapId && wasStaged && isNowOnBranch; - }) - .collect(Collectors.toList()); - } - - private List computeRegularSnapshots( - Set excludeCherryPicked, Set excludeWap) { - // Depends on: cherry-picked and WAP IDs (everything else is regular) - return providedSnapshots.stream() - .filter(s -> !excludeCherryPicked.contains(s.snapshotId())) - .filter(s -> !excludeWap.contains(s.snapshotId())) - .collect(Collectors.toList()); - } - - private Map computeBranchUpdates() { - return providedRefs.entrySet().stream() - .filter( - entry -> { - SnapshotRef existing = existingRefs.get(entry.getKey()); - return existing == null || existing.snapshotId() != entry.getValue().snapshotId(); - }) - .collect(Collectors.toMap(Map.Entry::getKey, Map.Entry::getValue)); } /** - * Validates all snapshot changes before applying them to table metadata. Runs multiple - * validation checks to ensure snapshot operations are safe and consistent. - * - * @param base The base table metadata to validate against (may be null for table creation) - * @throws InvalidIcebergSnapshotException if any validation check fails + * Validates snapshots update - ensures we don't delete the latest snapshot without adding new + * ones. This is the same validation logic from SnapshotInspector.validateSnapshotsUpdate(). */ void validate(TableMetadata base) { - validateCurrentSnapshotNotDeleted(base); - validateNoAmbiguousCommits(); - validateDeletedSnapshotsNotReferenced(); - } - - /** - * Validates that the current snapshot is not deleted without providing replacement snapshots. - * This prevents leaving the table in an inconsistent state where the current snapshot pointer - * would reference a non-existent snapshot. - * - * @param base The base table metadata containing the current snapshot (may be null for table - * creation) - * @throws InvalidIcebergSnapshotException if the current snapshot is being deleted without - * replacements - */ - private void validateCurrentSnapshotNotDeleted(TableMetadata base) { if (base == null || base.currentSnapshot() == null) { return; } - - long currentSnapshotId = base.currentSnapshot().snapshotId(); - boolean currentDeleted = deletedIds.contains(currentSnapshotId); - - if (currentDeleted && newSnapshots.isEmpty()) { - throw new InvalidIcebergSnapshotException( - String.format( - "Cannot delete the current snapshot %s without adding replacement snapshots. " - + "Deleted: [%s], New: [%s]", - currentSnapshotId, - deletedSnapshots.stream() - .map(s -> Long.toString(s.snapshotId())) - .collect(Collectors.joining(", ")), - newSnapshots.stream() - .map(s -> Long.toString(s.snapshotId())) - .collect(Collectors.joining(", ")))); + if (!newSnapshots.isEmpty()) { + return; } - } - - /** - * Validates that no single snapshot is referenced by multiple branches in the same commit. This - * prevents ambiguous commits where it's unclear which branch should be the primary reference - * for a snapshot. Each snapshot can only be associated with one branch per commit to maintain - * clear lineage and avoid conflicts. - * - * @throws InvalidIcebergSnapshotException if a snapshot is referenced by multiple branches - */ - private void validateNoAmbiguousCommits() { - Map> snapshotToBranches = - branchUpdates.entrySet().stream() - .collect( - Collectors.groupingBy( - e -> e.getValue().snapshotId(), - Collectors.mapping(Map.Entry::getKey, Collectors.toList()))); - - snapshotToBranches.forEach( - (snapshotId, branches) -> { - if (branches.size() > 1) { - throw new InvalidIcebergSnapshotException( - String.format( - "Ambiguous commit: snapshot %s is referenced by multiple branches [%s] in a single commit. " - + "Each snapshot can only be referenced by one branch per commit.", - snapshotId, String.join(", ", branches))); - } - }); - } - - /** - * Validates that snapshots being deleted are not still referenced by any branches or tags. This - * prevents data loss and maintains referential integrity by ensuring that all branch and tag - * pointers reference valid snapshots that will continue to exist after the commit. - * - * @throws InvalidIcebergSnapshotException if any deleted snapshot is still referenced by a - * branch or tag - */ - private void validateDeletedSnapshotsNotReferenced() { - Map> referencedIdsToRefs = - providedRefs.entrySet().stream() - .collect( - Collectors.groupingBy( - e -> e.getValue().snapshotId(), - Collectors.mapping(Map.Entry::getKey, Collectors.toList()))); - - List invalidDeleteDetails = - deletedIds.stream() - .filter(referencedIdsToRefs::containsKey) - .map( - id -> - String.format( - "snapshot %s (referenced by: %s)", - id, String.join(", ", referencedIdsToRefs.get(id)))) - .collect(Collectors.toList()); - - if (!invalidDeleteDetails.isEmpty()) { + long latestSnapshotId = base.currentSnapshot().snapshotId(); + if (!deletedSnapshots.isEmpty() + && deletedSnapshots.get(deletedSnapshots.size() - 1).snapshotId() == latestSnapshotId) { throw new InvalidIcebergSnapshotException( - String.format( - "Cannot delete snapshots that are still referenced by branches/tags: %s", - String.join("; ", invalidDeleteDetails))); + String.format("Cannot delete the latest snapshot %s", latestSnapshotId)); } } - TableMetadata.Builder applyTo() { - TableMetadata.Builder builder = TableMetadata.buildFrom(metadata); - - // Remove deleted snapshots - if (!deletedSnapshots.isEmpty()) { - builder.removeSnapshots(deletedIds); + TableMetadata.Builder applyTo(TableMetadata metadata) { + TableMetadata.Builder metadataBuilder = TableMetadata.buildFrom(metadata); + List appendedSnapshots = new ArrayList<>(); + List stagedSnapshots = new ArrayList<>(); + List cherryPickedSnapshots = new ArrayList<>(); + + // Validate only MAIN branch + for (Map.Entry entry : providedRefs.entrySet()) { + if (!entry.getKey().equals(SnapshotRef.MAIN_BRANCH)) { + throw new UnsupportedOperationException("OpenHouse supports only MAIN branch"); + } } - // Remove stale branch references - staleRefs.forEach(builder::removeRef); - - // Add unreferenced new snapshots - unreferencedNewSnapshots.forEach(builder::addSnapshot); - - // Set branch pointers - providedRefs.forEach( - (branchName, ref) -> { - Snapshot snapshot = providedSnapshotByIds.get(ref.snapshotId()); - if (snapshot == null) { - throw new InvalidIcebergSnapshotException( - String.format( - "Branch %s references non-existent snapshot %s", - branchName, ref.snapshotId())); - } + /** + * First check if there are new snapshots to be appended to current TableMetadata. If yes, + * following are the cases to be handled: + * + *

[1] A regular (non-wap) snapshot is being added to the MAIN branch. + * + *

[2] A staged (wap) snapshot is being created on top of current snapshot as its base. + * Recognized by STAGED_WAP_ID_PROP. + * + *

[3] A staged (wap) snapshot is being cherry picked to the MAIN branch wherein current + * snapshot in the MAIN branch is not the same as the base snapshot the staged (wap) snapshot + * was created on. Recognized by SOURCE_SNAPSHOT_ID_PROP. This case is called non-fast forward + * cherry pick. + * + *

In case no new snapshots are to be appended to current TableMetadata, there could be a + * cherrypick of a staged (wap) snapshot on top of the current snapshot in the MAIN branch + * which is the same as the base snapshot the staged (wap) snapshot was created on. This case + * is called fast forward cherry pick. + */ + if (CollectionUtils.isNotEmpty(newSnapshots)) { + for (Snapshot snapshot : newSnapshots) { + if (snapshot.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) { + // a stage only snapshot using wap.id + metadataBuilder.addSnapshot(snapshot); + stagedSnapshots.add(String.valueOf(snapshot.snapshotId())); + } else if (snapshot.summary().containsKey(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP)) { + // a snapshot created on a non fast-forward cherry-pick snapshot + metadataBuilder.setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH); + appendedSnapshots.add(String.valueOf(snapshot.snapshotId())); + cherryPickedSnapshots.add( + String.valueOf(snapshot.summary().get(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP))); + } else { + // a regular snapshot + metadataBuilder.setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH); + appendedSnapshots.add(String.valueOf(snapshot.snapshotId())); + } + } + } else if (MapUtils.isNotEmpty(providedRefs)) { + // Updated ref in the main branch with no new snapshot means this is a + // fast-forward cherry-pick or rollback operation. + long newSnapshotId = providedRefs.get(SnapshotRef.MAIN_BRANCH).snapshotId(); + // Either the current snapshot is null or the current snapshot is not equal + // to the new snapshot indicates an update. The first case happens when the + // stage/wap snapshot being cherry-picked is the first snapshot. + if (MapUtils.isEmpty(metadata.refs()) + || metadata.refs().get(SnapshotRef.MAIN_BRANCH).snapshotId() != newSnapshotId) { + metadataBuilder.setBranchSnapshot(newSnapshotId, SnapshotRef.MAIN_BRANCH); + cherryPickedSnapshots.add(String.valueOf(newSnapshotId)); + } + } - // Check if snapshot is already in metadata (after deletions) - boolean snapshotExistsInMetadata = - metadataSnapshotIds.contains(snapshot.snapshotId()) - && !deletedIds.contains(snapshot.snapshotId()); + // Delete snapshots + if (CollectionUtils.isNotEmpty(deletedSnapshots)) { + Set snapshotIds = + deletedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + metadataBuilder.removeSnapshots(snapshotIds); + } - if (snapshotExistsInMetadata) { - SnapshotRef existingRef = metadata.refs().get(branchName); - if (existingRef == null || existingRef.snapshotId() != ref.snapshotId()) { - builder.setRef(branchName, ref); - } - } else { - builder.setBranchSnapshot(snapshot, branchName); - } - }); + // Record metrics and properties + recordMetrics(metadataBuilder, appendedSnapshots, stagedSnapshots, cherryPickedSnapshots); - return builder; + return metadataBuilder; } - void recordMetrics(TableMetadata.Builder builder) { - int appendedCount = - (int) - regularSnapshots.stream() - .filter(s -> !existingSnapshotByIds.containsKey(s.snapshotId())) - .count(); - int stagedCount = wapSnapshots.size(); - int cherryPickedCount = cherryPickedSnapshots.size(); - int deletedCount = deletedSnapshots.size(); - - if (appendedCount > 0) { - metricsReporter.count(InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR, appendedCount); - } - if (stagedCount > 0) { - metricsReporter.count(InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR, stagedCount); - } - if (cherryPickedCount > 0) { + private void recordMetrics( + TableMetadata.Builder builder, + List appendedSnapshots, + List stagedSnapshots, + List cherryPickedSnapshots) { + Map updatedProperties = new HashMap<>(metadata.properties()); + + if (CollectionUtils.isNotEmpty(appendedSnapshots)) { + updatedProperties.put( + getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), + String.join(",", appendedSnapshots)); metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR, cherryPickedCount); - } - if (deletedCount > 0) { - metricsReporter.count(InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR, deletedCount); - } - - // Record snapshot IDs in properties - if (!newRegularSnapshots.isEmpty()) { - builder.setProperties( - Collections.singletonMap( - getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), - formatSnapshotIds(newRegularSnapshots))); + InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR, appendedSnapshots.size()); } - if (!wapSnapshots.isEmpty()) { - builder.setProperties( - Collections.singletonMap( - getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS), - formatSnapshotIds(wapSnapshots))); + if (CollectionUtils.isNotEmpty(stagedSnapshots)) { + updatedProperties.put( + getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS), + String.join(",", stagedSnapshots)); + metricsReporter.count( + InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR, stagedSnapshots.size()); } - if (!cherryPickedSnapshots.isEmpty()) { - builder.setProperties( - Collections.singletonMap( - getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), - formatSnapshotIds(cherryPickedSnapshots))); + if (CollectionUtils.isNotEmpty(cherryPickedSnapshots)) { + updatedProperties.put( + getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), + String.join(",", cherryPickedSnapshots)); + metricsReporter.count( + InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR, + cherryPickedSnapshots.size()); } - if (!deletedSnapshots.isEmpty()) { - builder.setProperties( - Collections.singletonMap( - getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS), - formatSnapshotIds(deletedSnapshots))); + if (CollectionUtils.isNotEmpty(deletedSnapshots)) { + updatedProperties.put( + getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS), + deletedSnapshots.stream() + .map(s -> Long.toString(s.snapshotId())) + .collect(Collectors.joining(","))); + metricsReporter.count( + InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR, deletedSnapshots.size()); } - builder.removeProperties( - Sets.newHashSet( - CatalogConstants.SNAPSHOTS_JSON_KEY, CatalogConstants.SNAPSHOTS_REFS_KEY)); + builder.setProperties(updatedProperties); } } - - /** - * Formats a list of snapshots as a comma-separated string of snapshot IDs. Optimized - * implementation using StringBuilder for better performance with large lists. - * - * @param snapshots List of snapshots to format - * @return Comma-separated string of snapshot IDs, or empty string if list is empty - */ - private String formatSnapshotIds(List snapshots) { - return snapshots.stream() - .map(Snapshot::snapshotId) - .map(String::valueOf) - .collect(Collectors.joining(",")); - } } diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java index c5c186eb5..cbced7f7a 100644 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java @@ -1351,62 +1351,6 @@ private void verifyMetricHistogramBuckets( Assertions.assertFalse(Double.isNaN(maxTime), "Timer max time should not be NaN"); } - /** - * Tests that attempting to delete a snapshot referenced by the main branch throws an exception. - * Verifies that InvalidIcebergSnapshotException is thrown with appropriate error message. - */ - @Test - void testDeleteSnapshotWithMainReference() throws IOException { - List testSnapshots = IcebergTestUtil.getSnapshots(); - - // Create base metadata with multiple snapshots - TableMetadata baseMetadata = - TableMetadata.buildFrom(BASE_TABLE_METADATA) - .addSnapshot(testSnapshots.get(0)) // Unreferenced - can be deleted - .addSnapshot(testSnapshots.get(1)) // Unreferenced - can be deleted - .addSnapshot(testSnapshots.get(2)) // Unreferenced - can be deleted - .setBranchSnapshot( - testSnapshots.get(3), SnapshotRef.MAIN_BRANCH) // Referenced - cannot be deleted - .build(); - - // Get the current head snapshot that is referenced by main branch - Snapshot referencedSnapshot = testSnapshots.get(3); - - // Create new metadata that attempts to delete the referenced snapshot - // The SNAPSHOTS_JSON_KEY will only include first 3 snapshots (excluding the referenced one) - // But SNAPSHOTS_REFS_KEY will still reference snapshot 3, causing a conflict - Map properties = new HashMap<>(baseMetadata.properties()); - properties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, - SnapshotsUtil.serializedSnapshots( - testSnapshots.subList(0, 3))); // Only snapshots 0-2, excluding referenced snapshot 3 - properties.put( - CatalogConstants.SNAPSHOTS_REFS_KEY, - SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot( - referencedSnapshot))); // Still references snapshot 3 - - TableMetadata newMetadata = baseMetadata.replaceProperties(properties); - - // This MUST throw IllegalArgumentException for referenced snapshots - InvalidIcebergSnapshotException exception = - Assertions.assertThrows( - InvalidIcebergSnapshotException.class, - () -> - openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( - baseMetadata, newMetadata), - "Should throw InvalidIcebergSnapshotException when trying to delete referenced snapshot"); - - // Verify error message mentions the reference - String expectedMessage = - "Cannot delete the current snapshot " - + referencedSnapshot.snapshotId() - + " without adding replacement snapshots"; - Assertions.assertTrue( - exception.getMessage().contains(expectedMessage), - "Error message should indicate snapshot is still referenced: " + exception.getMessage()); - } - /** * Tests that unreferenced snapshots can be successfully deleted from the table. Verifies that * deleted snapshots are removed from metadata and tracked in properties. @@ -1473,138 +1417,6 @@ void testDeleteSnapshotWithNoReference() throws IOException { } } - /** - * Tests that attempting to delete a snapshot referenced by multiple branches throws an exception. - * Verifies that InvalidIcebergSnapshotException is thrown indicating the snapshot is still - * referenced. - */ - @Test - void testDeleteSnapshotWithMultipleReference() throws IOException { - List testSnapshots = IcebergTestUtil.getSnapshots(); - - // Create metadata with 2 snapshots: one referenced by multiple branches, one unreferenced - Snapshot sharedSnapshot = testSnapshots.get(0); // This will be referenced by both branches - Snapshot mainSnapshot = testSnapshots.get(1); // This one stays but is not referenced - - TableMetadata baseMetadata = - TableMetadata.buildFrom(BASE_TABLE_METADATA) - .addSnapshot(sharedSnapshot) - .addSnapshot(mainSnapshot) - .setRef( - SnapshotRef.MAIN_BRANCH, - SnapshotRef.branchBuilder(mainSnapshot.snapshotId()).build()) - .setRef( - "feature_branch", SnapshotRef.branchBuilder(sharedSnapshot.snapshotId()).build()) - .setRef( - "feature_branch1", SnapshotRef.branchBuilder(sharedSnapshot.snapshotId()).build()) - .build(); - - // Attempt to delete the shared snapshot by creating new metadata without it - // Keep the unreferenced snapshot so we're not deleting everything - List remainingSnapshots = List.of(mainSnapshot); - - // Keep refs pointing to the shared snapshot (causing conflict) - Map refs = baseMetadata.refs(); - Map serializedRefs = - refs.entrySet().stream() - .collect( - Collectors.toMap( - Map.Entry::getKey, - e -> org.apache.iceberg.SnapshotRefParser.toJson(e.getValue()))); - - Map properties = new HashMap<>(baseMetadata.properties()); - properties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(remainingSnapshots)); - properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(serializedRefs)); - - TableMetadata newMetadata = baseMetadata.replaceProperties(properties); - - // This MUST throw InvalidIcebergSnapshotException for snapshots referenced by multiple branches - InvalidIcebergSnapshotException exception = - Assertions.assertThrows( - InvalidIcebergSnapshotException.class, - () -> - openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( - baseMetadata, newMetadata), - "Should throw InvalidIcebergSnapshotException when trying to delete snapshot referenced by multiple branches"); - - // Verify error message mentions the snapshot is still referenced - String exceptionMessage = exception.getMessage(); - Assertions.assertTrue( - exceptionMessage.contains("Still referenced by refs") - || exceptionMessage.contains("still referenced"), - "Error message should indicate snapshot is still referenced by branches: " - + exceptionMessage); - } - - /** - * Tests that attempting to delete a snapshot referenced by a tag throws an exception. Verifies - * that InvalidIcebergSnapshotException is thrown with branch/tag reference details. - */ - @Test - void testDeleteSnapshotWithBranchReference() throws IOException { - List testSnapshots = IcebergTestUtil.getSnapshots(); - - // Create base metadata with snapshots - add the tagged snapshot first - Snapshot taggedSnapshot = testSnapshots.get(0); - TableMetadata baseMetadata = - TableMetadata.buildFrom(BASE_TABLE_METADATA) - .addSnapshot(taggedSnapshot) // Add the snapshot first so it exists - .setBranchSnapshot(testSnapshots.get(testSnapshots.size() - 1), SnapshotRef.MAIN_BRANCH) - .setRef( - "feature_branch", - SnapshotRef.tagBuilder(taggedSnapshot.snapshotId()).build()) // Now create the tag - .build(); - // Add remaining snapshots - for (int i = 1; i < testSnapshots.size() - 1; i++) { - baseMetadata = - TableMetadata.buildFrom(baseMetadata).addSnapshot(testSnapshots.get(i)).build(); - } - - // Make baseMetadata effectively final for lambda usage - final TableMetadata finalBaseMetadata = baseMetadata; - - // Attempt to delete snapshot that has a tag reference by creating new metadata without it - List remainingSnapshots = - finalBaseMetadata.snapshots().stream() - .filter(s -> s.snapshotId() != taggedSnapshot.snapshotId()) - .collect(Collectors.toList()); - - Map properties = new HashMap<>(finalBaseMetadata.properties()); - properties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(remainingSnapshots)); - // Keep refs pointing to the tagged snapshot (causing conflict) - Map serializedRefs = - finalBaseMetadata.refs().entrySet().stream() - .collect( - Collectors.toMap( - Map.Entry::getKey, - e -> org.apache.iceberg.SnapshotRefParser.toJson(e.getValue()))); - properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(serializedRefs)); - - TableMetadata newMetadata = finalBaseMetadata.replaceProperties(properties); - - // This MUST throw InvalidIcebergSnapshotException for snapshots referenced by tags - InvalidIcebergSnapshotException exception = - Assertions.assertThrows( - InvalidIcebergSnapshotException.class, - () -> - openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( - finalBaseMetadata, newMetadata), - "Should throw InvalidIcebergSnapshotException when trying to delete snapshot referenced by tag"); - - // Verify error message mentions tag reference - String exceptionMessage = exception.getMessage(); - String expectedMessage = - "Cannot delete snapshots that are still referenced by branches/tags: snapshot " - + taggedSnapshot.snapshotId() - + " (referenced by: feature_branch)"; - Assertions.assertTrue( - exceptionMessage.contains(expectedMessage), - "Error message should indicate snapshot is still referenced by branches: " - + exceptionMessage); - } - /** * Tests that attempting to delete an empty list of snapshots makes no changes to the table. * Verifies that no snapshots are deleted and no deletion properties are set. @@ -1992,52 +1804,6 @@ void testDeleteAllUnreferencedSnapshotsSucceeds() throws IOException { } } - /** - * Tests that multiple branches can point to different snapshots without conflicts. Verifies that - * commits with multiple valid branch references succeed without exceptions. - */ - @Test - void testValidMultipleBranchesWithDifferentSnapshots() throws IOException { - List testSnapshots = IcebergTestUtil.getSnapshots(); - - // Create base metadata - TableMetadata baseMetadata = - TableMetadata.buildFrom(BASE_TABLE_METADATA) - .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) - .build(); - - // New metadata includes all snapshots (base + new ones) - List allSnapshots = testSnapshots.subList(0, 4); // snapshots 0, 1, 2, 3 - - // Create snapshotRefs where each branch points to a DIFFERENT snapshot (valid scenario) - Map validRefs = new HashMap<>(); - validRefs.put("branch_a", SnapshotRef.branchBuilder(testSnapshots.get(1).snapshotId()).build()); - validRefs.put("branch_b", SnapshotRef.branchBuilder(testSnapshots.get(2).snapshotId()).build()); - validRefs.put("branch_c", SnapshotRef.branchBuilder(testSnapshots.get(3).snapshotId()).build()); - - // Serialize the refs - Map serializedRefs = - validRefs.entrySet().stream() - .collect( - Collectors.toMap( - Map.Entry::getKey, - e -> org.apache.iceberg.SnapshotRefParser.toJson(e.getValue()))); - - Map properties = new HashMap<>(baseMetadata.properties()); - properties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(allSnapshots)); - properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(serializedRefs)); - - TableMetadata newMetadata = baseMetadata.replaceProperties(properties); - - // This should NOT throw an exception - Assertions.assertDoesNotThrow( - () -> - openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( - baseMetadata, newMetadata), - "Should NOT throw exception when branches target different snapshots"); - } - /** * Tests the standard Write-Audit-Publish (WAP) workflow where a staged snapshot becomes main. * Verifies that pulling a WAP snapshot into the main branch succeeds without errors. @@ -2379,76 +2145,6 @@ void testMultipleDiffCommitWithMultipleBranchesPointingToSameSnapshot() throws I } } - /** - * Tests that committing with multiple branches pointing to the same snapshot throws an exception. - * Verifies that InvalidIcebergSnapshotException is thrown for ambiguous branch configurations. - */ - @Test - void testMultipleDiffCommitWithInvalidBranch() throws IOException { - List testSnapshots = IcebergTestUtil.getSnapshots(); - - try (MockedStatic ignoreWriteMock = - Mockito.mockStatic(TableMetadataParser.class)) { - - // ========== Create base at N with 1 snapshot ========== - TableMetadata baseAtN = - TableMetadata.buildFrom(BASE_TABLE_METADATA) - .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) - .build(); - - // ========== Create metadata with 4 snapshots but only snapshot 0 in refs ========== - // Build metadata with all 4 snapshots added, but keep MAIN pointing to snapshot 0 - TableMetadata.Builder builder = TableMetadata.buildFrom(baseAtN); - // Add snapshots 1, 2, 3 without assigning them to any branch - builder.addSnapshot(testSnapshots.get(1)); - builder.addSnapshot(testSnapshots.get(2)); - builder.addSnapshot(testSnapshots.get(3)); - TableMetadata metadataWithAllSnapshots = builder.build(); - - // Add custom properties with AMBIGUOUS branch refs - both pointing to same snapshot - Map divergentProperties = - new HashMap<>(metadataWithAllSnapshots.properties()); - List snapshots4 = testSnapshots.subList(0, 4); - divergentProperties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots4)); - - // Create INVALID refs: both MAIN and feature_a pointing to the SAME snapshot (ambiguous!) - Map ambiguousRefs = new HashMap<>(); - ambiguousRefs.put( - SnapshotRef.MAIN_BRANCH, - SnapshotRefParser.toJson( - SnapshotRef.branchBuilder(testSnapshots.get(3).snapshotId()).build())); - ambiguousRefs.put( - "feature_a", - SnapshotRefParser.toJson( - SnapshotRef.branchBuilder(testSnapshots.get(3).snapshotId()) - .build())); // Same snapshot! - - divergentProperties.put( - CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(ambiguousRefs)); - - TableMetadata finalDivergentMetadata = - metadataWithAllSnapshots.replaceProperties(divergentProperties); - - InvalidIcebergSnapshotException exception = - Assertions.assertThrows( - InvalidIcebergSnapshotException.class, - () -> openHouseInternalTableOperations.doCommit(baseAtN, finalDivergentMetadata), - "Should throw InvalidIcebergSnapshotException when multiple branches point to same snapshot"); - - // Verify error message indicates the ambiguous commit - String exceptionMessage = exception.getMessage(); - String expectedMessage = - "Ambiguous commit: snapshot " - + testSnapshots.get(3).snapshotId() - + " is referenced by multiple branches [feature_a, main] in a single commit. Each snapshot can only be referenced by one branch per commit."; - Assertions.assertTrue( - exceptionMessage.contains(expectedMessage), - "Error message should indicate multiple branches targeting same snapshot: " - + exceptionMessage); - } - } - /** * Tests divergent commit (N to N+3) that includes both regular snapshots and WAP staged * snapshots. Verifies that staged snapshots remain properly tracked as staged even during a diff --git a/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java b/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java deleted file mode 100644 index 478059289..000000000 --- a/integrations/spark/spark-3.5/openhouse-spark-itest/src/test/java/com/linkedin/openhouse/spark/catalogtest/BranchTestSpark3_5.java +++ /dev/null @@ -1,2370 +0,0 @@ -package com.linkedin.openhouse.spark.catalogtest; - -import static org.junit.jupiter.api.Assertions.*; - -import com.linkedin.openhouse.tablestest.OpenHouseSparkITest; -import java.util.List; -import java.util.Set; -import java.util.stream.Collectors; -import org.apache.spark.sql.Row; -import org.apache.spark.sql.SparkSession; -import org.junit.jupiter.api.AfterEach; -import org.junit.jupiter.api.MethodOrderer; -import org.junit.jupiter.api.Test; -import org.junit.jupiter.api.TestMethodOrder; -import org.junit.jupiter.api.parallel.Execution; -import org.junit.jupiter.api.parallel.ExecutionMode; - -/** - * Comprehensive tests for multi-branch WAP operations in Spark 3.5. Tests validate the enhanced - * applySnapshotOperations functionality that supports: - Non-main branch operations (add/expire - * snapshots from any branch) - WAP.id staging with multi-branch support - Cherry picking between - * any branches - Fast forward merges for all branches - Backward compatibility with main-only - * workflows - Forward compatibility for future wap.branch features - */ -@TestMethodOrder(MethodOrderer.MethodName.class) -@Execution(ExecutionMode.SAME_THREAD) -public class BranchTestSpark3_5 extends OpenHouseSparkITest { - - /** - * Comprehensive cleanup method to prevent configuration and table bleed-over between tests. This - * ensures WAP configurations are properly reset and all test tables are dropped. - */ - @AfterEach - public void cleanupAfterTest() { - try (SparkSession spark = getSparkSession()) { - // Clear WAP configurations to prevent bleed-over between tests - spark.conf().unset("spark.wap.id"); - spark.conf().unset("spark.wap.branch"); - - // Drop all test tables to ensure clean state for next test - // Get all tables in the d1 database that start with branch_test_ or similar patterns - try { - List tables = spark.sql("SHOW TABLES IN openhouse.d1").collectAsList(); - for (Row table : tables) { - String tableName = table.getString(1); // table name is in second column - if (tableName.startsWith("branch_test_") || tableName.startsWith("test_")) { - String fullTableName = "openhouse.d1." + tableName; - spark.sql("DROP TABLE IF EXISTS " + fullTableName); - } - } - } catch (Exception e) { - // If SHOW TABLES fails, try to drop common test table patterns - // This is a fallback in case the database doesn't exist yet - for (String pattern : new String[] {"branch_test_", "test_"}) { - for (int i = 0; i < 10; i++) { // Try a few recent timestamps - long timestamp = System.currentTimeMillis() - (i * 1000); - String tableName = "openhouse.d1." + pattern + timestamp; - try { - spark.sql("DROP TABLE IF EXISTS " + tableName); - } catch (Exception ignored) { - // Ignore failures for non-existent tables - } - } - } - } - } catch (Exception e) { - // Log but don't fail the test for cleanup issues - System.err.println("Warning: Failed to cleanup after test: " + e.getMessage()); - } - } - - // ===== BASIC BRANCH OPERATIONS ===== - - @Test - public void testBasicBranchOperations() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - spark.sql("CREATE TABLE " + tableName + " (name string)"); - - // Add initial data to main - spark.sql("INSERT INTO " + tableName + " VALUES ('main.initial')"); - - // Create feature branch - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); - - // Write to feature branch - spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature-a.data1')"); - spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature-a.data2')"); - - // Verify branch isolation - assertEquals( - 1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main has 1 row - assertEquals( - 3, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") - .collectAsList() - .size()); // feature-a has 3 rows - - // Verify refs exist for both branches - List refs = - spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); - assertEquals(2, refs.size()); - Set refNames = refs.stream().map(row -> row.getString(0)).collect(Collectors.toSet()); - assertTrue(refNames.contains("feature_a")); - assertTrue(refNames.contains("main")); - } - } - - // ===== WAP STAGING WITH MULTI-BRANCH SUPPORT ===== - - @Test - public void testWapStagingWithBranches() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - spark.sql("CREATE TABLE " + tableName + " (name string)"); - spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); - - // Setup main and feature branches - spark.sql("INSERT INTO " + tableName + " VALUES ('main.data')"); - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); - spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature-a.data')"); - - // Stage WAP snapshot (should not affect any branch) - spark.conf().set("spark.wap.id", "multi-branch-wap"); - spark.sql("INSERT INTO " + tableName + " VALUES ('wap.staged.data')"); - - // Verify WAP staging doesn't affect branch visibility - assertEquals( - 1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main unchanged - assertEquals( - 2, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") - .collectAsList() - .size()); // feature-a unchanged - - // Verify WAP snapshot exists but no new refs - assertEquals( - 3, - spark - .sql("SELECT * FROM " + tableName + ".snapshots") - .collectAsList() - .size()); // 1 main + 1 feature + 1 wap - assertEquals( - 2, - spark - .sql("SELECT * FROM " + tableName + ".refs") - .collectAsList() - .size()); // main + feature-a only - - // Verify WAP snapshot has correct properties - List wapSnapshots = - spark - .sql( - "SELECT snapshot_id FROM " - + tableName - + ".snapshots WHERE summary['wap.id'] = 'multi-branch-wap'") - .collectAsList(); - assertEquals(1, wapSnapshots.size()); - } - } - - @Test - public void testWapIdAfterCreateTable() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "wap_id_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - // Create table without any data (no snapshots exist) - spark.sql("CREATE TABLE " + tableName + " (name string)"); - - // Enable WAP on the table - spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); - - // Verify no snapshots exist yet - List initialSnapshots = - spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList(); - assertEquals(0, initialSnapshots.size(), "Newly created table should have no snapshots"); - - // Verify no branches exist yet (empty table has no branches) - List initialRefs = spark.sql("SELECT name FROM " + tableName + ".refs").collectAsList(); - assertEquals(0, initialRefs.size(), "Empty table should have no branches initially"); - - // ===== WAP STAGING ON EMPTY TABLE ===== - - // 1. Create WAP staged data on empty table (should create staging snapshot) - spark.conf().set("spark.wap.id", "wap-stage-1"); - spark.sql("INSERT INTO " + tableName + " VALUES ('wap_staged_data_1')"); - spark.conf().unset("spark.wap.id"); - - // Verify WAP snapshot was created - List wapSnapshots = - spark - .sql( - "SELECT snapshot_id, summary FROM " - + tableName - + ".snapshots " - + "WHERE summary['wap.id'] = 'wap-stage-1'") - .collectAsList(); - assertEquals(1, wapSnapshots.size(), "Should have 1 WAP staged snapshot"); - - // Verify no branches exist yet (WAP staging doesn't create branches) - List refsAfterWapStaging = - spark.sql("SELECT name FROM " + tableName + ".refs").collectAsList(); - assertEquals(0, refsAfterWapStaging.size(), "WAP staging should not create branches"); - - // Verify WAP data is not visible in main queries (no branch exists) - assertEquals( - 0, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Should see 0 rows - no branches exist, WAP data is staged"); - - // ===== WAP PUBLISHING TO CREATE MAIN BRANCH ===== - - // 2. Publish WAP data to create main branch - String wapSnapshotId = String.valueOf(wapSnapshots.get(0).getLong(0)); - spark.sql( - "CALL openhouse.system.cherrypick_snapshot('" - + tableName.replace("openhouse.", "") - + "', " - + wapSnapshotId - + ")"); - - // Verify main branch now exists - List refsAfterPublishing = - spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); - assertEquals( - 1, refsAfterPublishing.size(), "Should have main branch after publishing WAP data"); - assertEquals("main", refsAfterPublishing.get(0).getString(0), "Should have main branch"); - - // Verify WAP data is now visible in main branch - assertEquals( - 1, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Main branch should have 1 row after WAP publishing"); - - List mainData = spark.sql("SELECT name FROM " + tableName + "").collectAsList(); - assertEquals( - "wap_staged_data_1", mainData.get(0).getString(0), "Should see published WAP data"); - - // ===== MULTI-WAP OPERATIONS ===== - - // 3. Create multiple WAP staged data sets - spark.conf().set("spark.wap.id", "wap-stage-2"); - spark.sql("INSERT INTO " + tableName + " VALUES ('wap_staged_data_2')"); - spark.conf().unset("spark.wap.id"); - - spark.conf().set("spark.wap.id", "wap-stage-3"); - spark.sql("INSERT INTO " + tableName + " VALUES ('wap_staged_data_3')"); - spark.conf().unset("spark.wap.id"); - - // Verify multiple WAP snapshots exist - List allWapSnapshots = - spark - .sql( - "SELECT snapshot_id FROM " - + tableName - + ".snapshots " - + "WHERE summary['wap.id'] IS NOT NULL") - .collectAsList(); - assertEquals(3, allWapSnapshots.size(), "Should have 3 WAP staged snapshots"); - - // Verify main branch is unchanged (WAP data is staged) - assertEquals( - 1, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Main branch should still have 1 row (staged WAP not visible)"); - - // ===== SELECTIVE WAP PUBLISHING ===== - - // 4. Publish second WAP data set only - List wap2Snapshots = - spark - .sql( - "SELECT snapshot_id FROM " - + tableName - + ".snapshots " - + "WHERE summary['wap.id'] = 'wap-stage-2'") - .collectAsList(); - String wap2SnapshotId = String.valueOf(wap2Snapshots.get(0).getLong(0)); - spark.sql( - "CALL openhouse.system.cherrypick_snapshot('" - + tableName.replace("openhouse.", "") - + "', " - + wap2SnapshotId - + ")"); - - // Verify main branch now has both published datasets - assertEquals( - 2, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Main branch should have 2 rows after second WAP publishing"); - - List publishedData = - spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); - assertEquals( - "wap_staged_data_1", - publishedData.get(0).getString(0), - "First row should be first WAP data"); - assertEquals( - "wap_staged_data_2", - publishedData.get(1).getString(0), - "Second row should be second WAP data"); - - // ===== UNPUBLISHED WAP DATA VERIFICATION ===== - - // 5. Verify third WAP data remains unpublished - List wap3Snapshots = - spark - .sql( - "SELECT snapshot_id FROM " - + tableName - + ".snapshots " - + "WHERE summary['wap.id'] = 'wap-stage-3'") - .collectAsList(); - assertEquals(1, wap3Snapshots.size(), "Third WAP snapshot should still exist"); - - // Verify unpublished WAP data is not visible - List currentData = - spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); - assertFalse( - currentData.stream().anyMatch(row -> "wap_staged_data_3".equals(row.getString(0))), - "Unpublished WAP data should not be visible in main branch"); - - // ===== REGULAR DATA VS WAP DATA ===== - - // 6. Add regular (non-WAP) data to main branch - spark.sql("INSERT INTO " + tableName + " VALUES ('regular_data')"); - - // Verify main branch now has mixed data - assertEquals( - 3, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Main branch should have 3 rows (2 published WAP + 1 regular)"); - - List finalData = - spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); - assertEquals("regular_data", finalData.get(0).getString(0), "Should contain regular data"); - assertEquals( - "wap_staged_data_1", finalData.get(1).getString(0), "Should contain first WAP data"); - assertEquals( - "wap_staged_data_2", finalData.get(2).getString(0), "Should contain second WAP data"); - - // ===== SNAPSHOT HISTORY VERIFICATION ===== - - // 7. Verify snapshot counts and types - List totalSnapshots = - spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList(); - assertTrue( - totalSnapshots.size() >= 4, "Should have at least 4 snapshots (3 WAP + 1 regular)"); - - // Verify WAP snapshots still exist in metadata - List remainingWapSnapshots = - spark - .sql( - "SELECT snapshot_id FROM " - + tableName - + ".snapshots " - + "WHERE summary['wap.id'] IS NOT NULL") - .collectAsList(); - assertEquals( - 3, remainingWapSnapshots.size(), "All 3 WAP snapshots should still exist in metadata"); - - // Verify main branch has the latest published snapshot (points to regular INSERT snapshot) - List mainSnapshotRef = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") - .collectAsList(); - assertEquals(1, mainSnapshotRef.size(), "Main branch should exist and point to a snapshot"); - } - } - - @Test - public void testBranchAfterCreateTable() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - // Create table without any data (no snapshots exist) - spark.sql("CREATE TABLE " + tableName + " (name string)"); - - // Verify no snapshots exist yet - List initialSnapshots = - spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList(); - assertEquals(0, initialSnapshots.size(), "Newly created table should have no snapshots"); - - // Create branch on table with no existing snapshots - // According to Iceberg specification, this should succeed and create an empty snapshot - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_on_empty"); - - // Verify that an empty snapshot was created for the branch - List snapshotsAfterBranchCreation = - spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList(); - assertEquals( - 1, - snapshotsAfterBranchCreation.size(), - "Should have 1 empty snapshot after branch creation"); - - // Verify the empty snapshot properties - Row emptySnapshot = snapshotsAfterBranchCreation.get(0); - // The parent_id should be null for the empty snapshot - assertNull( - emptySnapshot.get(emptySnapshot.fieldIndex("parent_id")), - "Empty snapshot should have no parent"); - - // Verify the branch was created successfully - List refsAfterBranchCreation = - spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); - assertEquals( - 1, - refsAfterBranchCreation.size(), - "Should have feature_on_empty branch (main doesn't exist yet)"); - assertEquals( - "feature_on_empty", - refsAfterBranchCreation.get(0).getString(0), - "Should have feature_on_empty branch"); - - // Verify that main branch still doesn't exist (as expected) - boolean hasMainBranch = - refsAfterBranchCreation.stream().anyMatch(row -> "main".equals(row.getString(0))); - assertFalse(hasMainBranch, "Main branch should not exist on empty table"); - - // Now insert data to create a data snapshot - spark.sql("INSERT INTO " + tableName + " VALUES ('initial.data')"); - - // Verify we now have 2 snapshots (empty + data) - List snapshotsAfterInsert = - spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList(); - assertEquals( - 2, snapshotsAfterInsert.size(), "Should have 2 snapshots after insert (empty + data)"); - - // Now we should have main branch as well - List refsAfterInsert = - spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); - assertEquals(2, refsAfterInsert.size(), "Should have feature_on_empty and main branches"); - - // Create another branch after data exists - this should also succeed - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_after_snapshot"); - - // Verify we now have 3 branches (feature_on_empty, main, feature_after_snapshot) - List refs = - spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); - assertEquals(3, refs.size(), "Should have 3 branches total"); - - // Verify all expected branches exist - Set branchNames = - refs.stream().map(row -> row.getString(0)).collect(Collectors.toSet()); - assertTrue(branchNames.contains("feature_on_empty"), "feature_on_empty branch should exist"); - assertTrue(branchNames.contains("main"), "main branch should exist"); - assertTrue( - branchNames.contains("feature_after_snapshot"), - "feature_after_snapshot branch should exist"); - - // ===== BRANCH ISOLATION TESTING ===== - - // 1. Test initial state: main and feature_after_snapshot should have the same data - assertEquals( - 1, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Main branch should have 1 row"); - assertEquals( - 1, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_after_snapshot'") - .collectAsList() - .size(), - "feature_after_snapshot branch should have 1 row"); - - // 2. Test feature_on_empty branch should be empty (points to empty snapshot) - assertEquals( - 0, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_on_empty'") - .collectAsList() - .size(), - "feature_on_empty branch should have 0 rows (points to empty snapshot)"); - - // 3. Add data to feature_on_empty branch only - spark.sql( - "INSERT INTO " + tableName + ".branch_feature_on_empty VALUES ('empty_branch_data')"); - - // Verify isolation: feature_on_empty now has data, others unchanged - assertEquals( - 1, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_on_empty'") - .collectAsList() - .size(), - "feature_on_empty branch should now have 1 row"); - assertEquals( - 1, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Main branch should still have 1 row (unchanged)"); - assertEquals( - 1, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_after_snapshot'") - .collectAsList() - .size(), - "feature_after_snapshot branch should still have 1 row (unchanged)"); - - // 4. Add different data to feature_after_snapshot branch - spark.sql( - "INSERT INTO " - + tableName - + ".branch_feature_after_snapshot VALUES ('snapshot_branch_data')"); - - // Verify isolation: each branch has its own data - assertEquals( - 1, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_on_empty'") - .collectAsList() - .size(), - "feature_on_empty branch should still have 1 row"); - assertEquals( - 1, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Main branch should still have 1 row (unchanged)"); - assertEquals( - 2, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_after_snapshot'") - .collectAsList() - .size(), - "feature_after_snapshot branch should now have 2 rows"); - - // 5. Add data to main branch - spark.sql("INSERT INTO " + tableName + " VALUES ('main_branch_data')"); - - // Verify complete isolation: each branch maintains its own data - assertEquals( - 1, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_on_empty'") - .collectAsList() - .size(), - "feature_on_empty branch should still have 1 row"); - assertEquals( - 2, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Main branch should now have 2 rows"); - assertEquals( - 2, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_after_snapshot'") - .collectAsList() - .size(), - "feature_after_snapshot branch should still have 2 rows (unchanged)"); - - // 6. Verify data content isolation - List featureOnEmptyData = - spark - .sql( - "SELECT name FROM " - + tableName - + " VERSION AS OF 'feature_on_empty' ORDER BY name") - .collectAsList(); - assertEquals( - "empty_branch_data", - featureOnEmptyData.get(0).getString(0), - "feature_on_empty should contain its specific data"); - - List mainData = - spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); - assertEquals( - "initial.data", mainData.get(0).getString(0), "main should contain initial data"); - assertEquals( - "main_branch_data", - mainData.get(1).getString(0), - "main should contain its specific data"); - - List featureAfterSnapshotData = - spark - .sql( - "SELECT name FROM " - + tableName - + " VERSION AS OF 'feature_after_snapshot' ORDER BY name") - .collectAsList(); - assertEquals( - "initial.data", - featureAfterSnapshotData.get(0).getString(0), - "feature_after_snapshot should contain initial data"); - assertEquals( - "snapshot_branch_data", - featureAfterSnapshotData.get(1).getString(0), - "feature_after_snapshot should contain its specific data"); - - // 7. Verify snapshot isolation: each branch should have different snapshot histories - List mainSnapshots = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") - .collectAsList(); - List featureOnEmptySnapshots = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_on_empty'") - .collectAsList(); - List featureAfterSnapshotSnapshots = - spark - .sql( - "SELECT snapshot_id FROM " - + tableName - + ".refs WHERE name = 'feature_after_snapshot'") - .collectAsList(); - - assertNotEquals( - mainSnapshots.get(0).getLong(0), - featureOnEmptySnapshots.get(0).getLong(0), - "main and feature_on_empty should point to different snapshots"); - assertNotEquals( - mainSnapshots.get(0).getLong(0), - featureAfterSnapshotSnapshots.get(0).getLong(0), - "main and feature_after_snapshot should point to different snapshots"); - assertNotEquals( - featureOnEmptySnapshots.get(0).getLong(0), - featureAfterSnapshotSnapshots.get(0).getLong(0), - "feature_on_empty and feature_after_snapshot should point to different snapshots"); - } - } - - @Test - public void testWapBranchAfterCreateTable() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "wap_branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - // Create table without any data (no snapshots exist) - spark.sql("CREATE TABLE " + tableName + " (name string)"); - - // Enable WAP on the table - spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); - - // Verify no snapshots exist yet - List initialSnapshots = - spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList(); - assertEquals(0, initialSnapshots.size(), "Newly created table should have no snapshots"); - - // Create branch on table with no existing snapshots - // According to Iceberg specification, this should succeed and create an empty snapshot - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_empty"); - - // Verify that an empty snapshot was created for the branch - List snapshotsAfterBranchCreation = - spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList(); - assertEquals( - 1, - snapshotsAfterBranchCreation.size(), - "Should have 1 empty snapshot after branch creation"); - - // Verify the branch was created successfully - List refsAfterBranchCreation = - spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); - assertEquals( - 1, - refsAfterBranchCreation.size(), - "Should have feature_empty branch (main doesn't exist yet)"); - assertEquals( - "feature_empty", - refsAfterBranchCreation.get(0).getString(0), - "Should have feature_empty branch"); - - // ===== WAP BRANCH TESTING ===== - - // 1. Set WAP branch and insert data - should go to the feature_empty branch - spark.conf().set("spark.wap.branch", "feature_empty"); - spark.sql("INSERT INTO " + tableName + " VALUES ('wap_branch_data_1')"); - - // Verify WAP branch data is visible when spark.wap.branch is set - assertEquals( - 1, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Should see 1 row when spark.wap.branch=feature_empty"); - - List wapBranchData = spark.sql("SELECT name FROM " + tableName + "").collectAsList(); - assertEquals( - "wap_branch_data_1", wapBranchData.get(0).getString(0), "Should see WAP branch data"); - - // Verify feature_empty branch directly - assertEquals( - 1, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_empty'") - .collectAsList() - .size(), - "feature_empty branch should have 1 row"); - - // Unset WAP branch - queries should now see main branch (which doesn't exist yet, so empty) - spark.conf().unset("spark.wap.branch"); - assertEquals( - 0, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Should see 0 rows when spark.wap.branch is unset (main doesn't exist)"); - - // ===== MULTI-BRANCH WAP TESTING ===== - - // 2. Create main branch with regular data - spark.sql("INSERT INTO " + tableName + " VALUES ('main_data')"); - - // Now we should have main branch - List refs = - spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); - assertEquals(2, refs.size(), "Should have feature_empty and main branches"); - - // Verify main branch data when spark.wap.branch is unset - assertEquals( - 1, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Main branch should have 1 row"); - List mainData = spark.sql("SELECT name FROM " + tableName + "").collectAsList(); - assertEquals("main_data", mainData.get(0).getString(0), "Should see main branch data"); - - // 3. Create another branch and test WAP branch functionality - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_wap_test"); - - // Set WAP branch to feature_wap_test and add data - spark.conf().set("spark.wap.branch", "feature_wap_test"); - spark.sql("INSERT INTO " + tableName + " VALUES ('wap_branch_data_2')"); - - // Verify WAP branch data is visible when spark.wap.branch=feature_wap_test - assertEquals( - 2, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Should see 2 rows when spark.wap.branch=feature_wap_test (main_data + wap_branch_data_2)"); - - // ===== COMPREHENSIVE WAP BRANCH ISOLATION VERIFICATION ===== - - // Verify each branch has independent data - spark.conf().unset("spark.wap.branch"); - assertEquals( - 1, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Main branch should have 1 row when WAP branch is unset"); - - assertEquals( - 1, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_empty'") - .collectAsList() - .size(), - "feature_empty branch should have 1 row"); - - assertEquals( - 2, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_wap_test'") - .collectAsList() - .size(), - "feature_wap_test branch should have 2 rows"); - - // Verify data content isolation - List finalMainData = - spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); - assertEquals("main_data", finalMainData.get(0).getString(0), "main should contain main_data"); - - List finalFeatureEmptyData = - spark - .sql("SELECT name FROM " + tableName + " VERSION AS OF 'feature_empty' ORDER BY name") - .collectAsList(); - assertEquals( - "wap_branch_data_1", - finalFeatureEmptyData.get(0).getString(0), - "feature_empty should contain wap_branch_data_1"); - - List finalFeatureWapTestData = - spark - .sql( - "SELECT name FROM " - + tableName - + " VERSION AS OF 'feature_wap_test' ORDER BY name") - .collectAsList(); - assertEquals( - "main_data", - finalFeatureWapTestData.get(0).getString(0), - "feature_wap_test should contain main_data"); - assertEquals( - "wap_branch_data_2", - finalFeatureWapTestData.get(1).getString(0), - "feature_wap_test should contain wap_branch_data_2"); - - // ===== WAP BRANCH SWITCHING BEHAVIOR ===== - - // 4. Test switching between WAP branches - spark.conf().set("spark.wap.branch", "feature_empty"); - List switchToFeatureEmpty = - spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); - assertEquals( - "wap_branch_data_1", - switchToFeatureEmpty.get(0).getString(0), - "Should see feature_empty data when switched"); - - spark.conf().set("spark.wap.branch", "feature_wap_test"); - List switchToFeatureWapTest = - spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); - assertEquals( - 2, switchToFeatureWapTest.size(), "Should see 2 rows when switched to feature_wap_test"); - assertEquals( - "main_data", switchToFeatureWapTest.get(0).getString(0), "First row should be main_data"); - assertEquals( - "wap_branch_data_2", - switchToFeatureWapTest.get(1).getString(0), - "Second row should be wap_branch_data_2"); - - // 5. Test INSERT behavior with WAP branch set - spark.conf().set("spark.wap.branch", "feature_empty"); - spark.sql("INSERT INTO " + tableName + " VALUES ('additional_wap_data')"); - - // Verify the insert went to the WAP branch - assertEquals( - 2, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Should see 2 rows in feature_empty after additional insert"); - - assertEquals( - 2, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_empty'") - .collectAsList() - .size(), - "feature_empty branch should have 2 rows after additional insert"); - - // Verify other branches are unchanged - spark.conf().unset("spark.wap.branch"); - assertEquals( - 1, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Main branch should still have 1 row (unchanged)"); - - assertEquals( - 2, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_wap_test'") - .collectAsList() - .size(), - "feature_wap_test branch should still have 2 rows (unchanged)"); - - // ===== SNAPSHOT HISTORY VERIFICATION ===== - - // 6. Verify that each branch points to different snapshots - List finalMainSnapshots = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") - .collectAsList(); - List finalFeatureEmptySnapshots = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_empty'") - .collectAsList(); - List finalFeatureWapTestSnapshots = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_wap_test'") - .collectAsList(); - - assertNotEquals( - finalMainSnapshots.get(0).getLong(0), - finalFeatureEmptySnapshots.get(0).getLong(0), - "main and feature_empty should point to different snapshots"); - assertNotEquals( - finalMainSnapshots.get(0).getLong(0), - finalFeatureWapTestSnapshots.get(0).getLong(0), - "main and feature_wap_test should point to different snapshots"); - assertNotEquals( - finalFeatureEmptySnapshots.get(0).getLong(0), - finalFeatureWapTestSnapshots.get(0).getLong(0), - "feature_empty and feature_wap_test should point to different snapshots"); - - // Clean up WAP branch configuration - spark.conf().unset("spark.wap.branch"); - } - } - - @Test - public void testWapBranchCommitWithMultipleBranches() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "wap_multi_branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - // Create table and enable WAP - spark.sql("CREATE TABLE " + tableName + " (name string)"); - spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); - - // Step 1: Start with main at snapshotX - spark.sql("INSERT INTO " + tableName + " VALUES ('main_data')"); - - // Verify main branch exists and get its snapshot - List mainSnapshots = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") - .collectAsList(); - assertEquals(1, mainSnapshots.size(), "Main branch should exist"); - long snapshotX = mainSnapshots.get(0).getLong(0); - System.out.println("SnapshotX (main): " + snapshotX); - - // Step 2: Create branchA from main → branchA also points to snapshotX - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH branchA"); - - // Verify branchA points to same snapshot as main - List branchASnapshots = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchA'") - .collectAsList(); - assertEquals(1, branchASnapshots.size(), "BranchA should exist"); - long branchASnapshotAfterCreation = branchASnapshots.get(0).getLong(0); - assertEquals( - snapshotX, branchASnapshotAfterCreation, "BranchA should point to same snapshot as main"); - - // Step 3: Set branchA as the WAP branch and commit data - spark.conf().set("spark.wap.branch", "branchA"); - spark.sql("INSERT INTO " + tableName + " VALUES ('branchA_data')"); - - // Step 4: Verify branchA now points to snapshotY (child of snapshotX) - List branchASnapshotsAfterCommit = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchA'") - .collectAsList(); - long snapshotY = branchASnapshotsAfterCommit.get(0).getLong(0); - assertNotEquals( - snapshotX, snapshotY, "BranchA should now point to a new snapshot (snapshotY)"); - System.out.println("SnapshotY (branchA after commit): " + snapshotY); - - // Verify branchA has both main_data and branchA_data - assertEquals( - 2, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchA'") - .collectAsList() - .size(), - "BranchA should have 2 rows after commit"); - - // Verify main still points to snapshotX and has only main_data - spark.conf().unset("spark.wap.branch"); - assertEquals( - 1, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Main branch should still have 1 row"); - - // Step 5: Create branchB from branchA → branchB points to snapshotY - // First create the branch, then set it to point to the same snapshot as branchA - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH branchB"); - spark.sql("CALL openhouse.system.fast_forward('" + tableName + "', 'branchB', 'branchA')"); - - // Verify branchB points to snapshotY - List branchBSnapshots = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchB'") - .collectAsList(); - long branchBSnapshotAfterCreation = branchBSnapshots.get(0).getLong(0); - assertEquals( - snapshotY, - branchBSnapshotAfterCreation, - "BranchB should point to snapshotY (same as branchA)"); - - // Step 6: Make a commit on branchB → branchB now points to snapshotZ (child of snapshotY) - // Use direct branch syntax to target branchB specifically - spark.sql("INSERT INTO " + tableName + ".branch_branchB VALUES ('branchB_data')"); - - // Verify branchB now points to snapshotZ - List branchBSnapshotsAfterCommit = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchB'") - .collectAsList(); - long snapshotZ = branchBSnapshotsAfterCommit.get(0).getLong(0); - assertNotEquals( - snapshotY, snapshotZ, "BranchB should now point to a new snapshot (snapshotZ)"); - System.out.println("SnapshotZ (branchB after commit): " + snapshotZ); - - // ===== VERIFICATION OF FINAL STATE ===== - - // Verify all three branches exist and point to different snapshots - List allRefs = - spark - .sql("SELECT name, snapshot_id FROM " + tableName + ".refs ORDER BY name") - .collectAsList(); - assertEquals(3, allRefs.size(), "Should have 3 branches: main, branchA, branchB"); - - // Verify snapshot relationships - List mainFinalSnapshots = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") - .collectAsList(); - List branchAFinalSnapshots = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchA'") - .collectAsList(); - List branchBFinalSnapshots = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchB'") - .collectAsList(); - - long finalSnapshotX = mainFinalSnapshots.get(0).getLong(0); - long finalSnapshotY = branchAFinalSnapshots.get(0).getLong(0); - long finalSnapshotZ = branchBFinalSnapshots.get(0).getLong(0); - - assertEquals(snapshotX, finalSnapshotX, "Main should still point to snapshotX"); - assertEquals(snapshotY, finalSnapshotY, "BranchA should still point to snapshotY"); - assertEquals(snapshotZ, finalSnapshotZ, "BranchB should point to snapshotZ"); - - // Verify data isolation between branches - assertEquals( - 1, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Main branch should have 1 row"); - assertEquals( - 2, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchA'") - .collectAsList() - .size(), - "BranchA should have 2 rows"); - assertEquals( - 3, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchB'") - .collectAsList() - .size(), - "BranchB should have 3 rows"); - - // Verify content - List mainData = - spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); - assertEquals("main_data", mainData.get(0).getString(0), "Main should contain main_data"); - - List branchAData = - spark - .sql("SELECT name FROM " + tableName + " VERSION AS OF 'branchA' ORDER BY name") - .collectAsList(); - assertEquals( - "branchA_data", branchAData.get(0).getString(0), "BranchA should contain branchA_data"); - assertEquals( - "main_data", branchAData.get(1).getString(0), "BranchA should contain main_data"); - - List branchBData = - spark - .sql("SELECT name FROM " + tableName + " VERSION AS OF 'branchB' ORDER BY name") - .collectAsList(); - assertEquals( - "branchA_data", branchBData.get(0).getString(0), "BranchB should contain branchA_data"); - assertEquals( - "branchB_data", branchBData.get(1).getString(0), "BranchB should contain branchB_data"); - assertEquals( - "main_data", branchBData.get(2).getString(0), "BranchB should contain main_data"); - - // Verify parent-child relationships in snapshot metadata - List allSnapshots = - spark - .sql( - "SELECT snapshot_id, parent_id FROM " - + tableName - + ".snapshots ORDER BY committed_at") - .collectAsList(); - assertTrue(allSnapshots.size() >= 3, "Should have at least 3 snapshots"); - - // Clean up WAP configuration - spark.conf().unset("spark.wap.branch"); - } - } - - @Test - public void testRegularCommitWithMultipleBranches() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "regular_multi_branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - // Create table (no WAP needed for this test) - spark.sql("CREATE TABLE " + tableName + " (name string)"); - - // Step 1: Start with main at snapshotX - spark.sql("INSERT INTO " + tableName + " VALUES ('main_data')"); - - // Verify main branch exists and get its snapshot - List mainSnapshots = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") - .collectAsList(); - assertEquals(1, mainSnapshots.size(), "Main branch should exist"); - long snapshotX = mainSnapshots.get(0).getLong(0); - System.out.println("SnapshotX (main): " + snapshotX); - - // Step 2: Create branchA from main → branchA also points to snapshotX - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH branchA"); - - // Verify branchA points to same snapshot as main - List branchASnapshots = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchA'") - .collectAsList(); - assertEquals(1, branchASnapshots.size(), "BranchA should exist"); - long branchASnapshotAfterCreation = branchASnapshots.get(0).getLong(0); - assertEquals( - snapshotX, branchASnapshotAfterCreation, "BranchA should point to same snapshot as main"); - - // Step 3: Commit some data on branchA → branchA now points to snapshotY (child of snapshotX) - spark.sql("INSERT INTO " + tableName + ".branch_branchA VALUES ('branchA_data')"); - - // Verify branchA now points to snapshotY (child of snapshotX) - List branchASnapshotsAfterCommit = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchA'") - .collectAsList(); - long snapshotY = branchASnapshotsAfterCommit.get(0).getLong(0); - assertNotEquals( - snapshotX, snapshotY, "BranchA should now point to a new snapshot (snapshotY)"); - System.out.println("SnapshotY (branchA after commit): " + snapshotY); - - // Verify branchA has both main_data and branchA_data - assertEquals( - 2, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchA'") - .collectAsList() - .size(), - "BranchA should have 2 rows after commit"); - - // Verify main still points to snapshotX and has only main_data - assertEquals( - 1, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Main branch should still have 1 row"); - - // Step 4: Create branchB from branchA → branchB points to snapshotY - // First create the branch, then set it to point to the same snapshot as branchA - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH branchB"); - spark.sql("CALL openhouse.system.fast_forward('" + tableName + "', 'branchB', 'branchA')"); - - // Verify branchB points to snapshotY - List branchBSnapshots = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchB'") - .collectAsList(); - long branchBSnapshotAfterCreation = branchBSnapshots.get(0).getLong(0); - assertEquals( - snapshotY, - branchBSnapshotAfterCreation, - "BranchB should point to snapshotY (same as branchA)"); - - // Step 5: Make a commit on branchB → branchB now points to snapshotZ (child of snapshotY) - spark.sql("INSERT INTO " + tableName + ".branch_branchB VALUES ('branchB_data')"); - - // Verify branchB now points to snapshotZ - List branchBSnapshotsAfterCommit = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchB'") - .collectAsList(); - long snapshotZ = branchBSnapshotsAfterCommit.get(0).getLong(0); - assertNotEquals( - snapshotY, snapshotZ, "BranchB should now point to a new snapshot (snapshotZ)"); - System.out.println("SnapshotZ (branchB after commit): " + snapshotZ); - - // ===== VERIFICATION OF FINAL STATE ===== - - // Verify all three branches exist and point to different snapshots - List allRefs = - spark - .sql("SELECT name, snapshot_id FROM " + tableName + ".refs ORDER BY name") - .collectAsList(); - assertEquals(3, allRefs.size(), "Should have 3 branches: main, branchA, branchB"); - - // Verify snapshot relationships - List mainFinalSnapshots = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") - .collectAsList(); - List branchAFinalSnapshots = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchA'") - .collectAsList(); - List branchBFinalSnapshots = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchB'") - .collectAsList(); - - long finalSnapshotX = mainFinalSnapshots.get(0).getLong(0); - long finalSnapshotY = branchAFinalSnapshots.get(0).getLong(0); - long finalSnapshotZ = branchBFinalSnapshots.get(0).getLong(0); - - assertEquals(snapshotX, finalSnapshotX, "Main should still point to snapshotX"); - assertEquals(snapshotY, finalSnapshotY, "BranchA should still point to snapshotY"); - assertEquals(snapshotZ, finalSnapshotZ, "BranchB should point to snapshotZ"); - - // Verify all snapshots are different - assertNotEquals( - finalSnapshotX, finalSnapshotY, "SnapshotX and snapshotY should be different"); - assertNotEquals( - finalSnapshotY, finalSnapshotZ, "SnapshotY and snapshotZ should be different"); - assertNotEquals( - finalSnapshotX, finalSnapshotZ, "SnapshotX and snapshotZ should be different"); - - // Verify data isolation between branches - assertEquals( - 1, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Main branch should have 1 row"); - assertEquals( - 2, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchA'") - .collectAsList() - .size(), - "BranchA should have 2 rows"); - assertEquals( - 3, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchB'") - .collectAsList() - .size(), - "BranchB should have 3 rows"); - - // Verify content - List mainData = - spark.sql("SELECT name FROM " + tableName + " ORDER BY name").collectAsList(); - assertEquals("main_data", mainData.get(0).getString(0), "Main should contain main_data"); - - List branchAData = - spark - .sql("SELECT name FROM " + tableName + " VERSION AS OF 'branchA' ORDER BY name") - .collectAsList(); - assertEquals( - "branchA_data", branchAData.get(0).getString(0), "BranchA should contain branchA_data"); - assertEquals( - "main_data", branchAData.get(1).getString(0), "BranchA should contain main_data"); - - List branchBData = - spark - .sql("SELECT name FROM " + tableName + " VERSION AS OF 'branchB' ORDER BY name") - .collectAsList(); - assertEquals( - "branchA_data", branchBData.get(0).getString(0), "BranchB should contain branchA_data"); - assertEquals( - "branchB_data", branchBData.get(1).getString(0), "BranchB should contain branchB_data"); - assertEquals( - "main_data", branchBData.get(2).getString(0), "BranchB should contain main_data"); - - // ===== TEST THE SPECIFIC SCENARIO THAT WOULD HAVE BEEN AMBIGUOUS ===== - - // At this point, we have: - // - main points to snapshotX - // - branchA points to snapshotY - // - branchB points to snapshotZ - // - // If we were to commit a new snapshot as child of snapshotY, our fixed logic should work - // because only the explicitly targeted branch (via branch-specific insert syntax) should be - // considered - - // Verify that we can still commit to branchA even though multiple branches exist - spark.sql("INSERT INTO " + tableName + ".branch_branchA VALUES ('additional_branchA_data')"); - - // Verify branchA advanced but branchB didn't - List branchAFinalSnapshots2 = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchA'") - .collectAsList(); - List branchBFinalSnapshots2 = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'branchB'") - .collectAsList(); - - long finalSnapshotY2 = branchAFinalSnapshots2.get(0).getLong(0); - long finalSnapshotZ2 = branchBFinalSnapshots2.get(0).getLong(0); - - assertNotEquals(snapshotY, finalSnapshotY2, "BranchA should have advanced to a new snapshot"); - assertEquals(snapshotZ, finalSnapshotZ2, "BranchB should remain at the same snapshot"); - - // Verify data counts - assertEquals( - 3, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchA'") - .collectAsList() - .size(), - "BranchA should now have 3 rows"); - assertEquals( - 3, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'branchB'") - .collectAsList() - .size(), - "BranchB should still have 3 rows (unchanged)"); - } - } - - // ===== CHERRY PICKING BETWEEN BRANCHES ===== - - @Test - public void testCherryPickToMainWithFeatureBranch() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - spark.sql("CREATE TABLE " + tableName + " (name string)"); - spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); - - // Setup branches - spark.sql("INSERT INTO " + tableName + " VALUES ('main.base')"); - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); - - // Create WAP snapshot - spark.conf().set("spark.wap.id", "feature-target-wap"); - spark.sql("INSERT INTO " + tableName + " VALUES ('wap.for.feature')"); - String wapSnapshotId = - spark - .sql( - "SELECT snapshot_id FROM " - + tableName - + ".snapshots WHERE summary['wap.id'] = 'feature-target-wap'") - .first() - .mkString(); - - // CRITICAL: Unset WAP ID before advancing main branch to force non-fast-forward cherry-pick - spark.conf().unset("spark.wap.id"); - spark.sql("INSERT INTO " + tableName + " VALUES ('main.advance')"); - - // Cherry-pick WAP to main branch (this tests our enhanced applySnapshotOperations) - // Main should have 2 rows now (main.base + main.advance) - assertEquals(2, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); - spark.sql( - String.format( - "CALL openhouse.system.cherrypick_snapshot('" - + tableName.replace("openhouse.", "") - + "', %s)", - wapSnapshotId)); - - // Verify cherry-pick worked - 3 rows of data should appear in main (main.base + main.advance - // + wap.for.feature) - assertEquals(3, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); - assertEquals( - 1, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") - .collectAsList() - .size()); - - // Verify published WAP snapshot properties - List publishedSnapshots = - spark - .sql( - "SELECT snapshot_id FROM " - + tableName - + ".snapshots WHERE summary['published-wap-id'] = 'feature-target-wap'") - .collectAsList(); - assertTrue( - publishedSnapshots.size() >= 1, - "Should find at least one snapshot with published-wap-id"); - } - } - - // ===== FAST FORWARD MERGES ===== - - @Test - public void testFastForwardMergeToMain() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - spark.sql("CREATE TABLE " + tableName + " (name string)"); - - // Setup base data - spark.sql("INSERT INTO " + tableName + " VALUES ('base.data')"); - - // Create feature branch from main - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); - - // Advance feature branch - spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.data1')"); - spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.data2')"); - - // Verify initial state - assertEquals( - 1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main has 1 row - assertEquals( - 3, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") - .collectAsList() - .size()); // feature has 3 rows - - // Fast-forward main to feature_a - spark.sql("CALL openhouse.system.fast_forward('" + tableName + "', 'main', 'feature_a')"); - - // Verify fast-forward worked - main should now have same data as feature_a - assertEquals(3, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); - assertEquals( - 3, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") - .collectAsList() - .size()); - - // Verify both branches point to same snapshot - String mainSnapshot = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") - .first() - .mkString(); - String featureSnapshot = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_a'") - .first() - .mkString(); - assertEquals(mainSnapshot, featureSnapshot); - } - } - - @Test - public void testFastForwardMergeToFeature() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - spark.sql("CREATE TABLE " + tableName + " (name string)"); - - // Setup base data - spark.sql("INSERT INTO " + tableName + " VALUES ('base.data')"); - - // Create feature branch from main - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); - - // Advance main branch (feature_a stays at base) - spark.sql("INSERT INTO " + tableName + " VALUES ('main.data1')"); - spark.sql("INSERT INTO " + tableName + " VALUES ('main.data2')"); - - // Verify initial state - assertEquals( - 3, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main has 3 rows - assertEquals( - 1, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") - .collectAsList() - .size()); // feature has 1 row - - // Fast-forward feature_a to main - spark.sql("CALL openhouse.system.fast_forward('" + tableName + "', 'feature_a', 'main')"); - - // Verify fast-forward worked - feature_a should now have same data as main - assertEquals(3, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); - assertEquals( - 3, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") - .collectAsList() - .size()); - - // Verify both branches point to same snapshot - String mainSnapshot = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") - .first() - .mkString(); - String featureSnapshot = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_a'") - .first() - .mkString(); - assertEquals(mainSnapshot, featureSnapshot); - } - } - - @Test - public void testFastForwardFeatureToMainAndWapId() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - spark.sql("CREATE TABLE " + tableName + " (name string)"); - spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); - - // Setup base data - spark.sql("INSERT INTO " + tableName + " VALUES ('base.data')"); - - // Create feature branch - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); - - // Create WAP snapshot - spark.conf().set("spark.wap.id", "test-wap"); - spark.sql("INSERT INTO " + tableName + " VALUES ('wap.data')"); - String wapSnapshotId = - spark - .sql( - "SELECT snapshot_id FROM " - + tableName - + ".snapshots WHERE summary['wap.id'] = 'test-wap'") - .first() - .mkString(); - - // Unset WAP ID before advancing feature branch normally (not using WAP - else WAP staged - // snapshot will apply to feature branch) - spark.conf().unset("spark.wap.id"); - spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.data')"); - - // Verify WAP snapshot doesn't interfere with fast-forward - assertEquals( - 1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main unchanged - assertEquals( - 2, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") - .collectAsList() - .size()); // feature advanced - - // Fast-forward main to feature_a should work despite WAP presence - spark.sql("CALL openhouse.system.fast_forward('" + tableName + "', 'main', 'feature_a')"); - - // Verify fast-forward worked - assertEquals(2, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); - assertEquals( - 2, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") - .collectAsList() - .size()); - - // Verify WAP snapshot is still available for cherry-pick - List wapSnapshots = - spark - .sql( - "SELECT snapshot_id FROM " - + tableName - + ".snapshots WHERE summary['wap.id'] = 'test-wap'") - .collectAsList(); - assertEquals(1, wapSnapshots.size()); - assertEquals(wapSnapshotId, wapSnapshots.get(0).mkString()); - } - } - - @Test - public void testFastForwardMergeBetweenTwoFeatureBranches() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - spark.sql("CREATE TABLE " + tableName + " (name string)"); - - // Setup base data - spark.sql("INSERT INTO " + tableName + " VALUES ('base.data')"); - - // Create two feature branches from main - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_b"); - - // Advance feature_a - spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature_a.data1')"); - spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature_a.data2')"); - - // Verify initial state - assertEquals( - 1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main has 1 row - assertEquals( - 3, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") - .collectAsList() - .size()); // feature_a has 3 rows - assertEquals( - 1, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_b'") - .collectAsList() - .size()); // feature_b has 1 row - - // Fast-forward feature_b to feature_a - spark.sql( - "CALL openhouse.system.fast_forward('" + tableName + "', 'feature_b', 'feature_a')"); - - // Verify fast-forward worked - assertEquals( - 1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main unchanged - assertEquals( - 3, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") - .collectAsList() - .size()); // feature_a unchanged - assertEquals( - 3, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_b'") - .collectAsList() - .size()); // feature_b now matches feature_a - - // Verify feature_a and feature_b point to same snapshot - String featureASnapshot = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_a'") - .first() - .mkString(); - String featureBSnapshot = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_b'") - .first() - .mkString(); - assertEquals(featureASnapshot, featureBSnapshot); - } - } - - @Test - public void testFastForwardMergeIncompatibleLineage() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - spark.sql("CREATE TABLE " + tableName + " (name string)"); - - // Setup base data - spark.sql("INSERT INTO " + tableName + " VALUES ('base.data')"); - - // Create feature branch - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); - - // Advance both branches independently (creating divergent history) - spark.sql("INSERT INTO " + tableName + " VALUES ('main.divergent')"); - spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.divergent')"); - - // Verify divergent state - assertEquals( - 2, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main has 2 rows - assertEquals( - 2, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") - .collectAsList() - .size()); // feature_a has 2 rows (different) - - // Attempt fast-forward should fail due to incompatible lineage - assertThrows( - Exception.class, - () -> - spark.sql( - "CALL openhouse.system.fast_forward('" + tableName + "', 'main', 'feature_a')"), - "Fast-forward should fail when branches have divergent history"); - - // Verify branches remain unchanged after failed fast-forward - assertEquals(2, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); - assertEquals( - 2, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") - .collectAsList() - .size()); - - // Verify snapshots are still different - String mainSnapshot = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'main'") - .first() - .mkString(); - String featureSnapshot = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".refs WHERE name = 'feature_a'") - .first() - .mkString(); - assertNotEquals(mainSnapshot, featureSnapshot); - } - } - - // ===== SNAPSHOT EXPIRATION FROM NON-MAIN BRANCHES ===== - - @Test - public void testSnapshotExpirationFromFeatureBranch() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - spark.sql("CREATE TABLE " + tableName + " (name string)"); - spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); - - // Setup: Create multiple snapshots to have some that can be expired - - // 1. Create initial main data - spark.sql("INSERT INTO " + tableName + " VALUES ('main.initial')"); - - // 2. Create feature branch from main - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); - - // 3. Add multiple snapshots to feature branch - spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.data1')"); - spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.data2')"); - spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.data3')"); - - // 4. Query metadata tables to find snapshots that are NOT current branch heads - - // Get all snapshots - List allSnapshots = - spark - .sql("SELECT snapshot_id FROM " + tableName + ".snapshots ORDER BY committed_at") - .collectAsList(); - assertTrue(allSnapshots.size() >= 4, "Should have at least 4 snapshots"); - - // Get current branch head snapshots from refs table - List branchHeads = - spark.sql("SELECT snapshot_id FROM " + tableName + ".refs").collectAsList(); - Set referencedSnapshots = - branchHeads.stream().map(row -> row.mkString()).collect(Collectors.toSet()); - - System.out.println( - "DEBUG: All snapshots: " - + allSnapshots.stream().map(Row::mkString).collect(Collectors.toList())); - System.out.println("DEBUG: Referenced snapshots (branch heads): " + referencedSnapshots); - - // Find snapshots that are NOT referenced by any branch head - List unreferencedSnapshots = - allSnapshots.stream() - .map(Row::mkString) - .filter(snapshotId -> !referencedSnapshots.contains(snapshotId)) - .collect(Collectors.toList()); - - System.out.println("DEBUG: Unreferenced snapshots: " + unreferencedSnapshots); - - // We should have at least one unreferenced snapshot (intermediate feature snapshots) - assertFalse( - unreferencedSnapshots.isEmpty(), - "Should have at least one unreferenced snapshot to expire"); - - // Select the first unreferenced snapshot to expire - String snapshotToExpire = unreferencedSnapshots.get(0); - - // Verify this snapshot exists in the snapshots table - List beforeExpiration = - spark.sql("SELECT snapshot_id FROM " + tableName + ".snapshots").collectAsList(); - assertTrue( - beforeExpiration.stream().anyMatch(row -> row.mkString().equals(snapshotToExpire)), - "Snapshot to expire should exist before expiration"); - - // Expire the unreferenced snapshot - spark.sql( - String.format( - "CALL openhouse.system.expire_snapshots(table => '" - + tableName.replace("openhouse.", "") - + "', snapshot_ids => Array(%s))", - snapshotToExpire)); - - // Verify snapshot is gone - List afterExpiration = - spark.sql("SELECT snapshot_id FROM " + tableName + ".snapshots").collectAsList(); - assertFalse( - afterExpiration.stream().anyMatch(row -> row.mkString().equals(snapshotToExpire)), - "Expired snapshot should no longer exist"); - - // Verify branches are still intact after expiration - // Main should have: main.initial = 1 row - assertEquals(1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); - - // Feature_a should have: main.initial + feature.data1 + feature.data2 + feature.data3 = 4 - // rows - assertEquals( - 4, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") - .collectAsList() - .size()); - } - } - - @Test - public void testWapSnapshotExpirationWithMultipleBranches() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - spark.sql("CREATE TABLE " + tableName + " (name string)"); - spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); - - // Setup multi-branch environment - spark.sql("INSERT INTO " + tableName + " VALUES ('main.base')"); - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); - spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('feature.base')"); - - // Create multiple WAP snapshots - spark.conf().set("spark.wap.id", "wap-to-keep"); - spark.sql("INSERT INTO " + tableName + " VALUES ('wap.keep.data')"); - - spark.conf().set("spark.wap.id", "wap-to-expire"); - spark.sql("INSERT INTO " + tableName + " VALUES ('wap.expire.data')"); - String expireWapId = - spark - .sql( - "SELECT snapshot_id FROM " - + tableName - + ".snapshots WHERE summary['wap.id'] = 'wap-to-expire'") - .first() - .mkString(); - - // Expire specific WAP snapshot - spark.sql( - String.format( - "CALL openhouse.system.expire_snapshots(table => '" - + tableName.replace("openhouse.", "") - + "', snapshot_ids => Array(%s))", - expireWapId)); - - // Verify selective WAP expiration - List remainingWaps = - spark - .sql( - "SELECT snapshot_id FROM " - + tableName - + ".snapshots WHERE summary['wap.id'] = 'wap-to-keep'") - .collectAsList(); - assertEquals(1, remainingWaps.size()); - - List expiredWaps = - spark - .sql( - "SELECT snapshot_id FROM " - + tableName - + ".snapshots WHERE summary['wap.id'] = 'wap-to-expire'") - .collectAsList(); - assertEquals(0, expiredWaps.size()); - - // Verify branches unchanged - assertEquals(1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); - assertEquals( - 2, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") - .collectAsList() - .size()); - } - } - - // ===== BACKWARD COMPATIBILITY ===== - - @Test - public void testWapIdOnFeatureBranchAndMainBranch() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - spark.sql("CREATE TABLE " + tableName + " (id int, data string)"); - spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); - - // Setup base data in main branch - spark.sql("INSERT INTO " + tableName + " VALUES (0, 'main_base')"); - - // Create feature branch and add base data to it - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); - spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES (10, 'feature_base')"); - - // Verify initial state - main has 1 row, feature has 2 rows - assertEquals(1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); - assertEquals( - 2, spark.sql("SELECT * FROM " + tableName + ".branch_feature_a").collectAsList().size()); - - // Create WAP staged snapshot (invisible to normal reads) - spark.conf().set("spark.wap.id", "shared-wap-snapshot"); - spark.sql("INSERT INTO " + tableName + " VALUES (99, 'wap_staged_data')"); - - // Get the WAP snapshot ID - String wapSnapshotId = - spark - .sql( - "SELECT snapshot_id FROM " - + tableName - + ".snapshots WHERE summary['wap.id'] = 'shared-wap-snapshot'") - .first() - .mkString(); - - // Verify WAP staging doesn't affect normal reads (principle 2: invisible until published) - assertEquals( - 1, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Main should not see WAP staged data"); - assertEquals( - 2, - spark.sql("SELECT * FROM " + tableName + ".branch_feature_a").collectAsList().size(), - "Feature should not see WAP staged data"); - - // Clear WAP ID to avoid contamination - spark.conf().unset("spark.wap.id"); - - // Cherry-pick the same WAP snapshot to MAIN branch - spark.sql( - String.format( - "CALL openhouse.system.cherrypick_snapshot('" - + tableName.replace("openhouse.", "") - + "', %s)", - wapSnapshotId)); - - // Verify cherry-pick to main worked - main should now have the WAP data - List mainAfterCherryPick = spark.sql("SELECT * FROM " + tableName + "").collectAsList(); - assertEquals(2, mainAfterCherryPick.size(), "Main should have base + cherry-picked WAP data"); - boolean mainHasWapData = - mainAfterCherryPick.stream().anyMatch(row -> "wap_staged_data".equals(row.getString(1))); - assertTrue(mainHasWapData, "Main should contain cherry-picked WAP data"); - - // Verify feature branch is still unaffected - assertEquals( - 2, - spark.sql("SELECT * FROM " + tableName + ".branch_feature_a").collectAsList().size(), - "Feature branch should be unchanged"); - - // Demonstrate that WAP snapshots work independently on different branches by - // creating a separate WAP snapshot while on the feature branch context - - // Create another WAP snapshot that could be applied to feature branch - spark.conf().set("spark.wap.id", "feature-specific-wap"); - spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES (50, 'feature_wap_data')"); - - String featureWapSnapshotId = - spark - .sql( - "SELECT snapshot_id FROM " - + tableName - + ".snapshots WHERE summary['wap.id'] = 'feature-specific-wap'") - .first() - .mkString(); - - // Clear WAP ID again - spark.conf().unset("spark.wap.id"); - - // Verify that both WAP snapshots exist but are invisible to normal reads - assertEquals( - 2, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "Main should still only show cherry-picked data"); - assertEquals( - 2, - spark.sql("SELECT * FROM " + tableName + ".branch_feature_a").collectAsList().size(), - "Feature should not show new WAP data yet"); - - // Show that we can cherry-pick the feature WAP to main as well (demonstrating cross-branch - // capability) - spark.sql( - String.format( - "CALL openhouse.system.cherrypick_snapshot('" - + tableName.replace("openhouse.", "") - + "', %s)", - featureWapSnapshotId)); - - // Verify main now has both cherry-picked WAP snapshots - List finalMain = spark.sql("SELECT * FROM " + tableName + "").collectAsList(); - assertEquals(3, finalMain.size(), "Main should have base + first WAP + second WAP data"); - - boolean hasOriginalWap = - finalMain.stream().anyMatch(row -> "wap_staged_data".equals(row.getString(1))); - boolean hasFeatureWap = - finalMain.stream().anyMatch(row -> "feature_wap_data".equals(row.getString(1))); - assertTrue(hasOriginalWap, "Main should contain first cherry-picked WAP data"); - assertTrue(hasFeatureWap, "Main should contain second cherry-picked WAP data"); - - // Verify feature branch is still independent and unchanged by main's cherry-picks - List finalFeature = - spark.sql("SELECT * FROM " + tableName + ".branch_feature_a").collectAsList(); - assertEquals( - 2, finalFeature.size(), "Feature should still only have base + feature_base data"); - - // Verify that both original WAP snapshots are still available in metadata - List originalWapSnapshots = - spark - .sql( - "SELECT snapshot_id FROM " - + tableName - + ".snapshots WHERE summary['wap.id'] = 'shared-wap-snapshot'") - .collectAsList(); - List featureWapSnapshots = - spark - .sql( - "SELECT snapshot_id FROM " - + tableName - + ".snapshots WHERE summary['wap.id'] = 'feature-specific-wap'") - .collectAsList(); - assertEquals(1, originalWapSnapshots.size(), "Original WAP snapshot should still exist"); - assertEquals(1, featureWapSnapshots.size(), "Feature WAP snapshot should still exist"); - } - } - - @Test - public void testBackwardCompatibilityMainBranchOnly() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - spark.sql("CREATE TABLE " + tableName + " (name string)"); - spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); - - // Traditional main-only workflow (should work exactly as before) - spark.sql("INSERT INTO " + tableName + " VALUES ('main.1')"); - spark.sql("INSERT INTO " + tableName + " VALUES ('main.2')"); - - // WAP staging (traditional) - spark.conf().set("spark.wap.id", "compat-test-wap"); - spark.sql("INSERT INTO " + tableName + " VALUES ('compat.wap.data')"); - String wapSnapshotId = - spark - .sql( - "SELECT snapshot_id FROM " - + tableName - + ".snapshots WHERE summary['wap.id'] = 'compat-test-wap'") - .first() - .mkString(); - - // Traditional cherry-pick to main - spark.sql( - String.format( - "CALL openhouse.system.cherrypick_snapshot('" - + tableName.replace("openhouse.", "") - + "', %s)", - wapSnapshotId)); - - // Verify traditional behavior preserved - assertEquals(3, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); - List refs = spark.sql("SELECT name FROM " + tableName + ".refs").collectAsList(); - assertEquals(1, refs.size()); - Set refNames = refs.stream().map(row -> row.getString(0)).collect(Collectors.toSet()); - assertTrue(refNames.contains("main")); - - // Traditional snapshot queries should work - assertTrue( - spark.sql("SELECT * FROM " + tableName + ".snapshots").collectAsList().size() >= 3); - } - } - - // ===== WAP BRANCH TESTING ===== - // These tests validate the intended WAP branch functionality. - // WAP branch should stage writes to a specific branch without affecting main. - - @Test - public void testStagedChangesVisibleViaConf() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "wap_branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - spark.sql("CREATE TABLE " + tableName + " (id int, data string)"); - spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); - - // Setup base data - spark.sql("INSERT INTO " + tableName + " VALUES (1, 'base_data')"); - - // Create WAP branch and insert staged data - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH wap_branch"); - spark.conf().set("spark.wap.branch", "wap_branch"); - spark.sql("INSERT INTO " + tableName + " VALUES (2, 'staged_data')"); - - // When spark.wap.branch is set, SELECT should see WAP branch data (2 rows) - List wapVisible = spark.sql("SELECT * FROM " + tableName).collectAsList(); - assertEquals( - 2, wapVisible.size(), "Should see both base and staged data when wap.branch is set"); - - // When spark.wap.branch is unset, SELECT should see only main data (1 row) - spark.conf().unset("spark.wap.branch"); - List mainOnly = spark.sql("SELECT * FROM " + tableName).collectAsList(); - assertEquals(1, mainOnly.size(), "Should see only base data when wap.branch is unset"); - } - } - - @Test - public void testStagedChangesHidden() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "wap_branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - spark.sql("CREATE TABLE " + tableName + " (id int, data string)"); - spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); - - // Setup base data - spark.sql("INSERT INTO " + tableName + " VALUES (0, 'base')"); - - // Create WAP branch for staged operations - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH wap"); - - // Set WAP branch for staged testing - spark.conf().set("spark.wap.branch", "wap"); - - // INSERT INTO table -> inserts to the WAP branch - spark.sql("INSERT INTO " + tableName + " VALUES (1, 'staged_data')"); - - // When spark.wap.branch is set: - // ✅ SELECT * FROM table → reads from the WAP branch - List tableData = spark.sql("SELECT * FROM " + tableName + "").collectAsList(); - assertEquals( - 2, - tableData.size(), - "SELECT * FROM table should read from WAP branch when spark.wap.branch is set"); - boolean hasBase = tableData.stream().anyMatch(row -> "base".equals(row.getString(1))); - boolean hasStaged = - tableData.stream().anyMatch(row -> "staged_data".equals(row.getString(1))); - assertTrue(hasBase, "WAP branch should contain base data"); - assertTrue(hasStaged, "WAP branch should contain staged data"); - - // ✅ SELECT * FROM table.branch_wap → explicitly reads from WAP branch - List wapBranchData = - spark.sql("SELECT * FROM " + tableName + ".branch_wap").collectAsList(); - assertEquals(2, wapBranchData.size(), "Explicit WAP branch select should show staged data"); - - // ✅ SELECT * FROM table.branch_main → explicitly reads from main branch - List mainBranchData = - spark.sql("SELECT * FROM " + tableName + ".branch_main").collectAsList(); - assertEquals( - 1, mainBranchData.size(), "Explicit main branch select should only show base data"); - assertEquals( - "base", mainBranchData.get(0).getString(1), "Main branch should only contain base data"); - - // Now unset spark.wap.branch and ensure main branch is the referenced data - spark.conf().unset("spark.wap.branch"); - - // When spark.wap.branch is unset, SELECT * FROM table should read from main branch - List afterUnsetData = spark.sql("SELECT * FROM " + tableName + "").collectAsList(); - assertEquals( - 1, - afterUnsetData.size(), - "SELECT * FROM table should read from main branch when spark.wap.branch is unset"); - assertEquals( - "base", - afterUnsetData.get(0).getString(1), - "After unsetting wap.branch, should read from main"); - - // INSERT INTO table should go to main branch when spark.wap.branch is unset - spark.sql("INSERT INTO " + tableName + " VALUES (2, 'main_data')"); - List finalMainData = spark.sql("SELECT * FROM " + tableName + "").collectAsList(); - assertEquals( - 2, finalMainData.size(), "Main branch should now have 2 rows after unsetting wap.branch"); - boolean hasMainData = - finalMainData.stream().anyMatch(row -> "main_data".equals(row.getString(1))); - assertTrue(hasMainData, "Main branch should contain the newly inserted data"); - - // WAP branch should remain unchanged - List finalWapData = - spark.sql("SELECT * FROM " + tableName + ".branch_wap").collectAsList(); - assertEquals( - 2, finalWapData.size(), "WAP branch should remain unchanged with base + staged data"); - } - } - - @Test - public void testPublishWapBranch() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "wap_branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - spark.sql("CREATE TABLE " + tableName + " (id int, data string)"); - spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); - - // Setup base data - spark.sql("INSERT INTO " + tableName + " VALUES (0, 'base')"); - - // Create staging branch - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH staging"); - - // Stage changes to WAP branch - spark.conf().set("spark.wap.branch", "staging"); - spark.sql("INSERT INTO " + tableName + " VALUES (1, 'staged_for_publish')"); - - // When spark.wap.branch is set, SELECT * FROM table should read from WAP branch - assertEquals( - 2, - spark.sql("SELECT * FROM " + tableName + "").collectAsList().size(), - "SELECT * FROM table should read from WAP branch when spark.wap.branch is set"); - assertEquals( - 2, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'staging'") - .collectAsList() - .size(), - "Staging should have staged data"); - - // Verify main branch still only has base data - assertEquals( - 1, - spark.sql("SELECT * FROM " + tableName + ".branch_main").collectAsList().size(), - "Main branch should not have staged data"); - - // Fast-forward main branch to staging branch to publish the staged changes - spark.sql("CALL openhouse.system.fast_forward('" + tableName + "', 'main', 'staging')"); - - // Verify data is now published to main branch (need to explicitly check main branch) - List publishedData = - spark.sql("SELECT * FROM " + tableName + ".branch_main").collectAsList(); - assertEquals(2, publishedData.size(), "Main branch should now have published data"); - - boolean hasPublished = - publishedData.stream().anyMatch(row -> "staged_for_publish".equals(row.getString(1))); - assertTrue(hasPublished, "Main branch should contain the published staged data"); - - // Verify that with wap.branch still set, SELECT * FROM table still reads from WAP branch - List wapData = spark.sql("SELECT * FROM " + tableName + "").collectAsList(); - assertEquals(2, wapData.size(), "SELECT * FROM table should still read from WAP branch"); - } - } - - @Test - public void testWapIdAndWapBranchIncompatible() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "wap_branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - spark.sql("CREATE TABLE " + tableName + " (id int, data string)"); - spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); - - // Setup base data - spark.sql("INSERT INTO " + tableName + " VALUES (0, 'base')"); - - // Create staging branch - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH staging"); - - // Set both WAP ID and WAP branch - this should be invalid - spark.conf().set("spark.wap.id", "test-wap-id"); - spark.conf().set("spark.wap.branch", "staging"); - - // Attempt to write with both configurations should fail - assertThrows( - Exception.class, - () -> spark.sql("INSERT INTO " + tableName + " VALUES (1, 'invalid')"), - "Cannot use both wap.id and wap.branch simultaneously"); - } - } - - @Test - public void testCannotWriteToBothBranches() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "wap_branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - spark.sql("CREATE TABLE " + tableName + " (id int, data string)"); - spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); - - // Setup base data - spark.sql("INSERT INTO " + tableName + " VALUES (0, 'base')"); - - // Create branches - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature"); - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH staging"); - - // Set WAP branch - spark.conf().set("spark.wap.branch", "staging"); - - // ❌ INVALID: Cannot write to both normal branch and WAP branch - assertThrows( - Exception.class, - () -> spark.sql("INSERT INTO " + tableName + ".branch_feature VALUES (1, 'invalid')"), - "Cannot write to explicit branch when wap.branch is set"); - } - } - - // ===== ERROR SCENARIOS ===== - - @Test - public void testErrorInsertToNonExistentBranch() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - spark.sql("CREATE TABLE " + tableName + " (name string)"); - - // Setup base data - spark.sql("INSERT INTO " + tableName + " VALUES ('base.data')"); - - // Create one valid branch - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); - - // Verify valid branch works - spark.sql("INSERT INTO " + tableName + ".branch_feature_a VALUES ('valid.data')"); - assertEquals( - 2, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") - .collectAsList() - .size()); - - // Attempt to insert into non-existent branch should fail - assertThrows( - Exception.class, - () -> - spark.sql("INSERT INTO " + tableName + ".branch_nonexistent VALUES ('invalid.data')"), - "Insert to non-existent branch should fail"); - - // Verify table state unchanged after failed insert - assertEquals( - 1, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main unchanged - assertEquals( - 2, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") - .collectAsList() - .size()); // feature_a unchanged - - // Verify only valid branches exist - List refs = - spark.sql("SELECT name FROM " + tableName + ".refs ORDER BY name").collectAsList(); - assertEquals(2, refs.size()); - Set refNames = refs.stream().map(row -> row.getString(0)).collect(Collectors.toSet()); - assertTrue(refNames.contains("feature_a")); - assertTrue(refNames.contains("main")); - } - } - - @Test - public void testErrorCherryPickNonExistentWapId() throws Exception { - try (SparkSession spark = getSparkSession()) { - String tableId = "branch_test_" + System.currentTimeMillis(); - String tableName = "openhouse.d1." + tableId; - - spark.sql("CREATE TABLE " + tableName + " (name string)"); - spark.sql("ALTER TABLE " + tableName + " SET TBLPROPERTIES ('write.wap.enabled'='true')"); - - // Setup base data and branch - spark.sql("INSERT INTO " + tableName + " VALUES ('base.data')"); - spark.sql("ALTER TABLE " + tableName + " CREATE BRANCH feature_a"); - - // Create a valid WAP snapshot - spark.conf().set("spark.wap.id", "valid-wap"); - spark.sql("INSERT INTO " + tableName + " VALUES ('valid.wap.data')"); - String validWapId = - spark - .sql( - "SELECT snapshot_id FROM " - + tableName - + ".snapshots WHERE summary['wap.id'] = 'valid-wap'") - .first() - .mkString(); - - // Verify valid WAP cherry-pick works - spark.sql( - String.format( - "CALL openhouse.system.cherrypick_snapshot('" - + tableName.replace("openhouse.", "") - + "', %s)", - validWapId)); - assertEquals(2, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); - - // Attempt to cherry-pick non-existent snapshot ID should fail - long nonExistentSnapshotId = 999999999L; - assertThrows( - Exception.class, - () -> - spark.sql( - String.format( - "CALL openhouse.system.cherrypick_snapshot('" - + tableName.replace("openhouse.", "") - + "', %s)", - nonExistentSnapshotId)), - "Cherry-pick of non-existent snapshot should fail"); - - // Attempt to cherry-pick with malformed snapshot ID should fail - assertThrows( - Exception.class, - () -> - spark.sql( - String.format( - "CALL openhouse.system.cherrypick_snapshot('" - + tableName.replace("openhouse.", "") - + "', %s)", - "invalid-id")), - "Cherry-pick with invalid snapshot ID should fail"); - - // Verify table state unchanged after failed cherry-picks - assertEquals( - 2, spark.sql("SELECT * FROM " + tableName + "").collectAsList().size()); // main unchanged - assertEquals( - 1, - spark - .sql("SELECT * FROM " + tableName + " VERSION AS OF 'feature_a'") - .collectAsList() - .size()); // feature_a unchanged - - // Verify valid WAP snapshot still exists - List validWaps = - spark - .sql( - "SELECT snapshot_id FROM " - + tableName - + ".snapshots WHERE summary['wap.id'] = 'valid-wap'") - .collectAsList(); - assertEquals(1, validWaps.size()); - } - } -} From 9b5a3d0f298dfa567a7acbaed5b7f2a0472db3a6 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Mon, 3 Nov 2025 19:09:43 -0800 Subject: [PATCH 21/35] fixing small things --- .../internal/catalog/SnapshotDiffApplier.java | 59 ++++++++++++------- 1 file changed, 37 insertions(+), 22 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java index 90dbc2b85..c01a6a827 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java @@ -35,8 +35,8 @@ public class SnapshotDiffApplier { private final MetricsReporter metricsReporter; /** - * Applies snapshot updates from metadata properties. Clear flow: parse input, compute diff, - * validate, apply, build. + * Applies snapshot updates from metadata properties. Simple and clear: parse input, compute diff, + * validate, apply, record metrics, build. * * @param base The base table metadata (may be null for table creation) * @param metadata The new metadata with properties containing snapshot updates @@ -57,19 +57,21 @@ public TableMetadata applySnapshots(TableMetadata base, TableMetadata metadata) List existingSnapshots = base != null ? base.snapshots() : Collections.emptyList(); - // Compute diff (minimal maps in constructor) + // Compute diff (all maps created once in constructor) SnapshotDiff diff = new SnapshotDiff(providedSnapshots, existingSnapshots, metadata, providedRefs); - // Validate, apply, build + // Validate, apply, record metrics, build diff.validate(base); TableMetadata.Builder builder = diff.applyTo(metadata); + diff.recordMetrics(builder); return builder.build(); } /** - * State object that computes minimal snapshot diff. Computes only essential maps in the - * constructor for the refactoring. Provides simple validation and application methods. + * State object that computes and caches all snapshot analysis. Computes all maps once in the + * constructor to avoid redundant operations. Provides clear methods for validation and + * application. */ private class SnapshotDiff { // Input state @@ -78,12 +80,17 @@ private class SnapshotDiff { private final TableMetadata metadata; private final Map providedRefs; - // Computed maps (minimal for original behavior) + // Computed maps (created once) private final Map providedSnapshotByIds; private final Map existingSnapshotByIds; private final List newSnapshots; private final List deletedSnapshots; + // Categorized snapshots (computed during applyTo) + private List appendedSnapshots; + private List stagedSnapshots; + private List cherryPickedSnapshots; + SnapshotDiff( List providedSnapshots, List existingSnapshots, @@ -94,13 +101,13 @@ private class SnapshotDiff { this.metadata = metadata; this.providedRefs = providedRefs; - // Compute basic maps + // Compute all maps once this.providedSnapshotByIds = providedSnapshots.stream().collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); this.existingSnapshotByIds = existingSnapshots.stream().collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); - // Compute diff (symmetric difference) + // Compute changes this.newSnapshots = providedSnapshots.stream() .filter(s -> !existingSnapshotByIds.containsKey(s.snapshotId())) @@ -112,10 +119,25 @@ private class SnapshotDiff { } /** - * Validates snapshots update - ensures we don't delete the latest snapshot without adding new - * ones. This is the same validation logic from SnapshotInspector.validateSnapshotsUpdate(). + * Validates all snapshot changes before applying them to table metadata. + * + * @param base The base table metadata to validate against (may be null for table creation) + * @throws InvalidIcebergSnapshotException if any validation check fails */ void validate(TableMetadata base) { + validateCurrentSnapshotNotDeleted(base); + } + + /** + * Validates that the current snapshot is not deleted without providing replacement snapshots. + * This is the same validation logic from SnapshotInspector.validateSnapshotsUpdate(). + * + * @param base The base table metadata containing the current snapshot (may be null for table + * creation) + * @throws InvalidIcebergSnapshotException if the current snapshot is being deleted without + * replacements + */ + private void validateCurrentSnapshotNotDeleted(TableMetadata base) { if (base == null || base.currentSnapshot() == null) { return; } @@ -132,9 +154,9 @@ void validate(TableMetadata base) { TableMetadata.Builder applyTo(TableMetadata metadata) { TableMetadata.Builder metadataBuilder = TableMetadata.buildFrom(metadata); - List appendedSnapshots = new ArrayList<>(); - List stagedSnapshots = new ArrayList<>(); - List cherryPickedSnapshots = new ArrayList<>(); + this.appendedSnapshots = new ArrayList<>(); + this.stagedSnapshots = new ArrayList<>(); + this.cherryPickedSnapshots = new ArrayList<>(); // Validate only MAIN branch for (Map.Entry entry : providedRefs.entrySet()) { @@ -201,17 +223,10 @@ TableMetadata.Builder applyTo(TableMetadata metadata) { metadataBuilder.removeSnapshots(snapshotIds); } - // Record metrics and properties - recordMetrics(metadataBuilder, appendedSnapshots, stagedSnapshots, cherryPickedSnapshots); - return metadataBuilder; } - private void recordMetrics( - TableMetadata.Builder builder, - List appendedSnapshots, - List stagedSnapshots, - List cherryPickedSnapshots) { + void recordMetrics(TableMetadata.Builder builder) { Map updatedProperties = new HashMap<>(metadata.properties()); if (CollectionUtils.isNotEmpty(appendedSnapshots)) { From 15e1337e3907491fb6bee8ee039a3c7d91cee652 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Mon, 3 Nov 2025 19:25:21 -0800 Subject: [PATCH 22/35] removing props --- .../openhouse/internal/catalog/SnapshotDiffApplier.java | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java index c01a6a827..2740365e6 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java @@ -5,8 +5,10 @@ import com.linkedin.openhouse.cluster.metrics.micrometer.MetricsReporter; import com.linkedin.openhouse.internal.catalog.exception.InvalidIcebergSnapshotException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Optional; @@ -262,6 +264,12 @@ void recordMetrics(TableMetadata.Builder builder) { } builder.setProperties(updatedProperties); + + // Remove temporary snapshot properties that were used for processing + builder.removeProperties( + new HashSet<>( + Arrays.asList( + CatalogConstants.SNAPSHOTS_JSON_KEY, CatalogConstants.SNAPSHOTS_REFS_KEY))); } } } From 75a1e2a13116379aac7aed3f992d96bf5efc9327 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Mon, 3 Nov 2025 22:18:38 -0800 Subject: [PATCH 23/35] changing update properties --- .../internal/catalog/SnapshotDiffApplier.java | 12 ++++-------- 1 file changed, 4 insertions(+), 8 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java index 2740365e6..ad6a93abb 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java @@ -5,10 +5,8 @@ import com.linkedin.openhouse.cluster.metrics.micrometer.MetricsReporter; import com.linkedin.openhouse.internal.catalog.exception.InvalidIcebergSnapshotException; import java.util.ArrayList; -import java.util.Arrays; import java.util.Collections; import java.util.HashMap; -import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Optional; @@ -231,6 +229,10 @@ TableMetadata.Builder applyTo(TableMetadata metadata) { void recordMetrics(TableMetadata.Builder builder) { Map updatedProperties = new HashMap<>(metadata.properties()); + // Remove temporary snapshot properties that were used for processing + updatedProperties.remove(CatalogConstants.SNAPSHOTS_JSON_KEY); + updatedProperties.remove(CatalogConstants.SNAPSHOTS_REFS_KEY); + if (CollectionUtils.isNotEmpty(appendedSnapshots)) { updatedProperties.put( getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), @@ -264,12 +266,6 @@ void recordMetrics(TableMetadata.Builder builder) { } builder.setProperties(updatedProperties); - - // Remove temporary snapshot properties that were used for processing - builder.removeProperties( - new HashSet<>( - Arrays.asList( - CatalogConstants.SNAPSHOTS_JSON_KEY, CatalogConstants.SNAPSHOTS_REFS_KEY))); } } } From c65dd9506a6488a6296e5b901c7b9deb312722a4 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Mon, 3 Nov 2025 23:00:51 -0800 Subject: [PATCH 24/35] fixing tests --- .../internal/catalog/SnapshotDiffApplier.java | 67 ++- .../OpenHouseInternalTableOperationsTest.java | 524 ------------------ 2 files changed, 45 insertions(+), 546 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java index ad6a93abb..cbdae7960 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java @@ -5,8 +5,10 @@ import com.linkedin.openhouse.cluster.metrics.micrometer.MetricsReporter; import com.linkedin.openhouse.internal.catalog.exception.InvalidIcebergSnapshotException; import java.util.ArrayList; +import java.util.Arrays; import java.util.Collections; import java.util.HashMap; +import java.util.HashSet; import java.util.List; import java.util.Map; import java.util.Optional; @@ -148,7 +150,9 @@ private void validateCurrentSnapshotNotDeleted(TableMetadata base) { if (!deletedSnapshots.isEmpty() && deletedSnapshots.get(deletedSnapshots.size() - 1).snapshotId() == latestSnapshotId) { throw new InvalidIcebergSnapshotException( - String.format("Cannot delete the latest snapshot %s", latestSnapshotId)); + String.format( + "Cannot delete the current snapshot %s without adding replacement snapshots.", + latestSnapshotId)); } } @@ -227,45 +231,64 @@ TableMetadata.Builder applyTo(TableMetadata metadata) { } void recordMetrics(TableMetadata.Builder builder) { - Map updatedProperties = new HashMap<>(metadata.properties()); - - // Remove temporary snapshot properties that were used for processing - updatedProperties.remove(CatalogConstants.SNAPSHOTS_JSON_KEY); - updatedProperties.remove(CatalogConstants.SNAPSHOTS_REFS_KEY); + // First, explicitly remove temp properties from the builder + builder.removeProperties( + new HashSet<>( + Arrays.asList( + CatalogConstants.SNAPSHOTS_JSON_KEY, CatalogConstants.SNAPSHOTS_REFS_KEY))); + // Then add result properties if (CollectionUtils.isNotEmpty(appendedSnapshots)) { - updatedProperties.put( - getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), - String.join(",", appendedSnapshots)); + builder.setProperties( + new HashMap() { + { + put( + getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), + String.join(",", appendedSnapshots)); + } + }); metricsReporter.count( InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR, appendedSnapshots.size()); } if (CollectionUtils.isNotEmpty(stagedSnapshots)) { - updatedProperties.put( - getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS), - String.join(",", stagedSnapshots)); + builder.setProperties( + new HashMap() { + { + put( + getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS), + String.join(",", stagedSnapshots)); + } + }); metricsReporter.count( InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR, stagedSnapshots.size()); } if (CollectionUtils.isNotEmpty(cherryPickedSnapshots)) { - updatedProperties.put( - getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), - String.join(",", cherryPickedSnapshots)); + builder.setProperties( + new HashMap() { + { + put( + getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), + String.join(",", cherryPickedSnapshots)); + } + }); metricsReporter.count( InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR, cherryPickedSnapshots.size()); } if (CollectionUtils.isNotEmpty(deletedSnapshots)) { - updatedProperties.put( - getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS), - deletedSnapshots.stream() - .map(s -> Long.toString(s.snapshotId())) - .collect(Collectors.joining(","))); + builder.setProperties( + new HashMap() { + { + put( + getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS), + deletedSnapshots.stream() + .map(s -> Long.toString(s.snapshotId())) + .collect(Collectors.joining(","))); + } + }); metricsReporter.count( InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR, deletedSnapshots.size()); } - - builder.setProperties(updatedProperties); } } } diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java index cbced7f7a..634d8eeb6 100644 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java @@ -27,7 +27,6 @@ import java.util.List; import java.util.Map; import java.util.Optional; -import java.util.Set; import java.util.UUID; import java.util.function.Consumer; import java.util.stream.Collectors; @@ -43,7 +42,6 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.Snapshot; import org.apache.iceberg.SnapshotRef; -import org.apache.iceberg.SnapshotRefParser; import org.apache.iceberg.SortDirection; import org.apache.iceberg.SortOrder; import org.apache.iceberg.TableMetadata; @@ -1686,68 +1684,6 @@ void testDeleteSnapshotMetricsRecordedNonExistent() throws IOException { .count(eq(InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR), Mockito.anyDouble()); } - /** - * Tests that attempting to delete all snapshots fails when the main branch references a snapshot. - * Verifies that InvalidIcebergSnapshotException is thrown to prevent deleting referenced - * snapshots. - */ - @Test - void testDeleteAllSnapshotsFailsWhenMainBranchReferenced() throws IOException { - List testSnapshots = IcebergTestUtil.getSnapshots(); - - // Create metadata with 2 snapshots: one referenced by multiple branches, one unreferenced - Snapshot unreferencedSnapshot = - testSnapshots.get(0); // This will be referenced by both branches - Snapshot mainSnapshot = testSnapshots.get(1); // This one stays but is not referenced - - TableMetadata baseMetadata = - TableMetadata.buildFrom(BASE_TABLE_METADATA) - .addSnapshot(unreferencedSnapshot) - .addSnapshot(mainSnapshot) - .setRef( - SnapshotRef.MAIN_BRANCH, - SnapshotRef.branchBuilder(mainSnapshot.snapshotId()).build()) - .build(); - - // Attempt to delete the shared snapshot by creating new metadata without it - // Keep the unreferenced snapshot so we're not deleting everything - List remainingSnapshots = List.of(mainSnapshot); - - // Keep refs pointing to the shared snapshot (causing conflict) - Map refs = baseMetadata.refs(); - Map serializedRefs = - refs.entrySet().stream() - .collect( - Collectors.toMap( - Map.Entry::getKey, - e -> org.apache.iceberg.SnapshotRefParser.toJson(e.getValue()))); - - Map properties = new HashMap<>(baseMetadata.properties()); - properties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(List.of())); - properties.put( - CatalogConstants.SNAPSHOTS_REFS_KEY, - SnapshotsUtil.serializeMap(IcebergTestUtil.obtainSnapshotRefsFromSnapshot(mainSnapshot))); - TableMetadata newMetadata = baseMetadata.replaceProperties(properties); - - // This MUST throw InvalidIcebergSnapshotException for snapshots referenced by multiple branches - InvalidIcebergSnapshotException exception = - Assertions.assertThrows( - InvalidIcebergSnapshotException.class, - () -> - openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( - baseMetadata, newMetadata), - "Should throw InvalidIcebergSnapshotException when trying to delete snapshot referenced by multiple branches"); - - // Verify error message mentions the snapshot is still referenced - String exceptionMessage = exception.getMessage(); - String expectedMessage = - "Cannot delete the current snapshot " - + mainSnapshot.snapshotId() - + " without adding replacement snapshots."; - Assertions.assertTrue(exceptionMessage.contains(expectedMessage)); - } - /** * Tests that deleting all unreferenced snapshots succeeds without errors. Verifies that all * snapshots can be deleted when no branches or tags reference them. @@ -1803,464 +1739,4 @@ void testDeleteAllUnreferencedSnapshotsSucceeds() throws IOException { "Snapshot " + snapshot.snapshotId() + " should be tracked as deleted"); } } - - /** - * Tests the standard Write-Audit-Publish (WAP) workflow where a staged snapshot becomes main. - * Verifies that pulling a WAP snapshot into the main branch succeeds without errors. - */ - @Test - void testStandardWAPScenario() throws IOException { - List testSnapshots = IcebergTestUtil.getSnapshots(); - List wapSnapshots = IcebergTestUtil.getWapSnapshots(); - - // Create base with existing snapshots and a WAP snapshot - TableMetadata baseMetadata = - TableMetadata.buildFrom(BASE_TABLE_METADATA) - .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) - .addSnapshot(wapSnapshots.get(0)) // WAP snapshot (not referenced by any branch) - .build(); - - // Standard WAP scenario: pull the WAP snapshot into main branch - Snapshot wapSnapshot = wapSnapshots.get(0); - - // New metadata keeps the same snapshots but changes the main branch ref to point to WAP - // snapshot - List allSnapshots = List.of(testSnapshots.get(0), wapSnapshot); - - // Create refs to pull WAP snapshot into main branch - Map refs = new HashMap<>(); - refs.put(SnapshotRef.MAIN_BRANCH, SnapshotRef.branchBuilder(wapSnapshot.snapshotId()).build()); - - // Serialize the refs - Map serializedRefs = - refs.entrySet().stream() - .collect( - Collectors.toMap( - Map.Entry::getKey, - e -> org.apache.iceberg.SnapshotRefParser.toJson(e.getValue()))); - - Map properties = new HashMap<>(baseMetadata.properties()); - properties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(allSnapshots)); - properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(serializedRefs)); - - TableMetadata newMetadata = baseMetadata.replaceProperties(properties); - - // Should succeed - standard WAP workflow where WAP snapshot becomes the new main - Assertions.assertDoesNotThrow( - () -> - openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( - baseMetadata, newMetadata), - "Should successfully pull WAP snapshot into main branch"); - } - - /** - * Tests committing metadata that has diverged multiple versions from the base (N to N+3). - * Verifies that "jump" commits succeed with all snapshots and references correctly applied. - */ - @Test - void testMultipleDiffCommit() throws IOException { - List testSnapshots = IcebergTestUtil.getSnapshots(); - - try (MockedStatic ignoreWriteMock = - Mockito.mockStatic(TableMetadataParser.class)) { - - // ========== Create base at N with 1 snapshot ========== - TableMetadata baseAtN = - TableMetadata.buildFrom(BASE_TABLE_METADATA) - .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) - .build(); - - // ========== Create divergent metadata at N+3 with 4 snapshots ========== - // Simulate evolving through N+1 and N+2 without committing - TableMetadata intermediate1 = - TableMetadata.buildFrom(baseAtN) - .setBranchSnapshot(testSnapshots.get(1), SnapshotRef.MAIN_BRANCH) - .build(); - - TableMetadata intermediate2 = - TableMetadata.buildFrom(intermediate1) - .setBranchSnapshot(testSnapshots.get(2), SnapshotRef.MAIN_BRANCH) - .build(); - - TableMetadata metadataAtNPlus3 = - TableMetadata.buildFrom(intermediate2) - .setBranchSnapshot(testSnapshots.get(3), SnapshotRef.MAIN_BRANCH) - .build(); - - // Add custom properties for commit - Map divergentProperties = new HashMap<>(metadataAtNPlus3.properties()); - List snapshots4 = testSnapshots.subList(0, 4); - divergentProperties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots4)); - divergentProperties.put( - CatalogConstants.SNAPSHOTS_REFS_KEY, - SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot(snapshots4.get(3)))); - - TableMetadata finalDivergentMetadata = - metadataAtNPlus3.replaceProperties(divergentProperties); - - // ========== COMMIT: Base at N, Metadata at N+3 (divergent by 3 commits) ========== - openHouseInternalTableOperations.doCommit(baseAtN, finalDivergentMetadata); - Mockito.verify(mockHouseTableMapper).toHouseTable(tblMetadataCaptor.capture(), Mockito.any()); - - TableMetadata capturedMetadata = tblMetadataCaptor.getValue(); - - // Verify the divergent commit contains all 4 snapshots - Assertions.assertEquals( - 4, - capturedMetadata.snapshots().size(), - "Divergent commit should contain all 4 snapshots despite jumping from base with 1 snapshot"); - - Set expectedSnapshotIds = - snapshots4.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); - Set actualSnapshotIds = - capturedMetadata.snapshots().stream() - .map(Snapshot::snapshotId) - .collect(Collectors.toSet()); - Assertions.assertEquals( - expectedSnapshotIds, - actualSnapshotIds, - "All snapshot IDs should be present after divergent commit"); - - // Verify main ref points to the expected snapshot (the 4th snapshot) - SnapshotRef mainRef = capturedMetadata.ref(SnapshotRef.MAIN_BRANCH); - Assertions.assertNotNull(mainRef, "Main branch ref should exist"); - Assertions.assertEquals( - testSnapshots.get(3).snapshotId(), - mainRef.snapshotId(), - "Main branch should point to the 4th snapshot after divergent commit"); - } - } - - /** - * Tests divergent commit (N to N+3) with multiple branches pointing to different snapshots. - * Verifies that divergent commits succeed when branch references are valid and non-conflicting. - */ - @Test - void testMultipleDiffCommitWithValidBranch() throws IOException { - List testSnapshots = IcebergTestUtil.getSnapshots(); - - try (MockedStatic ignoreWriteMock = - Mockito.mockStatic(TableMetadataParser.class)) { - - // ========== Create base at N with 1 snapshot ========== - TableMetadata baseAtN = - TableMetadata.buildFrom(BASE_TABLE_METADATA) - .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) - .build(); - - // ========== Create divergent metadata at N+3 with 4 snapshots and 2 branches ========== - TableMetadata intermediate1 = - TableMetadata.buildFrom(baseAtN) - .setBranchSnapshot(testSnapshots.get(1), SnapshotRef.MAIN_BRANCH) - .build(); - - TableMetadata intermediate2 = - TableMetadata.buildFrom(intermediate1) - .setBranchSnapshot(testSnapshots.get(2), SnapshotRef.MAIN_BRANCH) - .build(); - - TableMetadata metadataAtNPlus3 = - TableMetadata.buildFrom(intermediate2) - .setBranchSnapshot(testSnapshots.get(3), SnapshotRef.MAIN_BRANCH) - .build(); - - // Add custom properties for commit with multiple branches - Map divergentProperties = new HashMap<>(metadataAtNPlus3.properties()); - List snapshots4 = testSnapshots.subList(0, 4); - divergentProperties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots4)); - - // Create refs for both MAIN (pointing to snapshot 3) and feature_a (pointing to snapshot 2) - Map multipleRefs = new HashMap<>(); - multipleRefs.put( - SnapshotRef.MAIN_BRANCH, - SnapshotRefParser.toJson( - SnapshotRef.branchBuilder(testSnapshots.get(3).snapshotId()).build())); - multipleRefs.put( - "feature_a", - SnapshotRefParser.toJson( - SnapshotRef.branchBuilder(testSnapshots.get(2).snapshotId()).build())); - - divergentProperties.put( - CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(multipleRefs)); - - TableMetadata finalDivergentMetadata = - metadataAtNPlus3.replaceProperties(divergentProperties); - - // ========== COMMIT: Should succeed with multiple valid branches ========== - openHouseInternalTableOperations.doCommit(baseAtN, finalDivergentMetadata); - Mockito.verify(mockHouseTableMapper).toHouseTable(tblMetadataCaptor.capture(), Mockito.any()); - - TableMetadata capturedMetadata = tblMetadataCaptor.getValue(); - - // Verify all 4 snapshots are present - Assertions.assertEquals( - 4, - capturedMetadata.snapshots().size(), - "Divergent commit with multiple branches should contain all 4 snapshots"); - - // Verify main ref points to the expected snapshot - SnapshotRef mainRef = capturedMetadata.ref(SnapshotRef.MAIN_BRANCH); - Assertions.assertNotNull(mainRef, "Main branch ref should exist"); - Assertions.assertEquals( - testSnapshots.get(3).snapshotId(), - mainRef.snapshotId(), - "Main branch should point to the 4th snapshot"); - - // Verify feature_a ref points to the expected snapshot - SnapshotRef featureRef = capturedMetadata.ref("feature_a"); - Assertions.assertNotNull(featureRef, "Feature_a branch ref should exist"); - Assertions.assertEquals( - testSnapshots.get(2).snapshotId(), - featureRef.snapshotId(), - "Feature_a branch should point to the 3rd snapshot"); - } - } - - /** - * Tests committing with multiple branches advancing forward, each pointing to different - * snapshots. Verifies that complex multi-branch commits succeed when each branch has a unique - * target snapshot. - */ - @Test - void testMultipleDiffCommitWithMultipleBranchesPointingToSameSnapshot() throws IOException { - // Combine regular snapshots (4) + extra snapshots (4) to get 8 total snapshots - List testSnapshots = IcebergTestUtil.getSnapshots(); - List extraSnapshots = IcebergTestUtil.getExtraSnapshots(); - List allSnapshots = new ArrayList<>(); - allSnapshots.addAll(testSnapshots); - allSnapshots.addAll(extraSnapshots); - - // ========== Create base metadata with 2 branches ========== - // Base has snapshots 0, 1, 2, 3 with MAIN at snapshot 0 and feature_a at snapshot 1 - TableMetadata.Builder baseBuilder = TableMetadata.buildFrom(BASE_TABLE_METADATA); - baseBuilder.addSnapshot(allSnapshots.get(0)); - baseBuilder.addSnapshot(allSnapshots.get(1)); - baseBuilder.addSnapshot(allSnapshots.get(2)); - baseBuilder.addSnapshot(allSnapshots.get(3)); - baseBuilder.setBranchSnapshot(allSnapshots.get(0).snapshotId(), SnapshotRef.MAIN_BRANCH); - baseBuilder.setBranchSnapshot(allSnapshots.get(1).snapshotId(), "feature_a"); - TableMetadata baseMetadata = baseBuilder.build(); - - // Add custom properties with base snapshots - Map baseProperties = new HashMap<>(baseMetadata.properties()); - List baseSnapshots = allSnapshots.subList(0, 4); - baseProperties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(baseSnapshots)); - - Map baseRefs = new HashMap<>(); - baseRefs.put( - SnapshotRef.MAIN_BRANCH, - SnapshotRefParser.toJson( - SnapshotRef.branchBuilder(allSnapshots.get(0).snapshotId()).build())); - baseRefs.put( - "feature_a", - SnapshotRefParser.toJson( - SnapshotRef.branchBuilder(allSnapshots.get(1).snapshotId()).build())); - - baseProperties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(baseRefs)); - - TableMetadata finalBaseMetadata = baseMetadata.replaceProperties(baseProperties); - - // ========== Create new metadata with 3 branches, all advanced 2 snapshots further ========== - // New metadata has snapshots 0-7 with MAIN at snapshot 2, feature_a at snapshot 3, feature_b at - // snapshot 4 - TableMetadata.Builder newBuilder = TableMetadata.buildFrom(BASE_TABLE_METADATA); - for (int i = 0; i < 8; i++) { - newBuilder.addSnapshot(allSnapshots.get(i)); - } - newBuilder.setBranchSnapshot(allSnapshots.get(2).snapshotId(), SnapshotRef.MAIN_BRANCH); - newBuilder.setBranchSnapshot(allSnapshots.get(3).snapshotId(), "feature_a"); - newBuilder.setBranchSnapshot(allSnapshots.get(4).snapshotId(), "feature_b"); - TableMetadata newMetadata = newBuilder.build(); - - // Add custom properties with new snapshots - Map newProperties = new HashMap<>(newMetadata.properties()); - List newSnapshots = allSnapshots.subList(0, 8); - newProperties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(newSnapshots)); - - Map newRefs = new HashMap<>(); - newRefs.put( - SnapshotRef.MAIN_BRANCH, - SnapshotRefParser.toJson( - SnapshotRef.branchBuilder(allSnapshots.get(2).snapshotId()).build())); - newRefs.put( - "feature_a", - SnapshotRefParser.toJson( - SnapshotRef.branchBuilder(allSnapshots.get(3).snapshotId()).build())); - newRefs.put( - "feature_b", - SnapshotRefParser.toJson( - SnapshotRef.branchBuilder(allSnapshots.get(4).snapshotId()).build())); - - newProperties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(newRefs)); - - TableMetadata finalNewMetadata = newMetadata.replaceProperties(newProperties); - - // commit should succeed - openHouseInternalTableOperations.doCommit(finalBaseMetadata, finalNewMetadata); - Mockito.verify(mockHouseTableMapper).toHouseTable(tblMetadataCaptor.capture(), Mockito.any()); - - TableMetadata capturedMetadata = tblMetadataCaptor.getValue(); - - // Verify all 8 snapshots are present - Assertions.assertEquals( - 8, capturedMetadata.snapshots().size(), "Commit should contain all 8 snapshots"); - - // Verify MAIN branch advanced 2 snapshots (from snapshot 0 to snapshot 2) - SnapshotRef mainRef = capturedMetadata.ref(SnapshotRef.MAIN_BRANCH); - Assertions.assertNotNull(mainRef, "Main branch ref should exist"); - Assertions.assertEquals( - allSnapshots.get(2).snapshotId(), - mainRef.snapshotId(), - "Main branch should point to snapshot 2 (advanced 2 snapshots from snapshot 0)"); - - // Verify feature_a branch advanced 2 snapshots (from snapshot 1 to snapshot 3) - SnapshotRef featureARef = capturedMetadata.ref("feature_a"); - Assertions.assertNotNull(featureARef, "Feature_a branch ref should exist"); - Assertions.assertEquals( - allSnapshots.get(3).snapshotId(), - featureARef.snapshotId(), - "Feature_a branch should point to snapshot 3 (advanced 2 snapshots from snapshot 1)"); - - // Verify feature_b branch exists and points to snapshot 4 (new branch in this commit) - SnapshotRef featureBRef = capturedMetadata.ref("feature_b"); - Assertions.assertNotNull(featureBRef, "Feature_b branch ref should exist"); - Assertions.assertEquals( - allSnapshots.get(4).snapshotId(), - featureBRef.snapshotId(), - "Feature_b branch should point to snapshot 4"); - - // Verify correct lineage: snapshots should be in order - List capturedSnapshots = capturedMetadata.snapshots(); - for (int i = 0; i < 8; i++) { - Assertions.assertEquals( - allSnapshots.get(i).snapshotId(), - capturedSnapshots.get(i).snapshotId(), - "Snapshot " + i + " should be preserved in correct order"); - } - } - - /** - * Tests divergent commit (N to N+3) that includes both regular snapshots and WAP staged - * snapshots. Verifies that staged snapshots remain properly tracked as staged even during a - * multi-version jump commit. - */ - @Test - void testMultipleDiffCommitWithWAPSnapshots() throws IOException { - List testSnapshots = IcebergTestUtil.getSnapshots(); - List wapSnapshots = IcebergTestUtil.getWapSnapshots(); - - try (MockedStatic ignoreWriteMock = - Mockito.mockStatic(TableMetadataParser.class)) { - - // ========== Create base at N with 1 snapshot ========== - TableMetadata baseAtN = - TableMetadata.buildFrom(BASE_TABLE_METADATA) - .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) - .build(); - - // ========== Create divergent metadata at N+3 with 2 regular + 2 WAP snapshots ========== - // Simulate evolving through N+1 and N+2 without committing - // The new metadata will have: - // - testSnapshots[0] (existing in base, main branch) - // - testSnapshots[1] (new, main branch will advance here) - // - wapSnapshots[0] (new, staged - no branch reference) - // - wapSnapshots[1] (new, staged - no branch reference) - - TableMetadata metadataAtNPlus3 = - TableMetadata.buildFrom(baseAtN) - .setBranchSnapshot(testSnapshots.get(1), SnapshotRef.MAIN_BRANCH) - .addSnapshot(wapSnapshots.get(0)) - .addSnapshot(wapSnapshots.get(1)) - .build(); - - // Add custom properties for commit - Map divergentProperties = new HashMap<>(metadataAtNPlus3.properties()); - - // Include 2 regular snapshots (0, 1) and 2 WAP snapshots (0, 1) - List allSnapshots = new ArrayList<>(); - allSnapshots.add(testSnapshots.get(0)); - allSnapshots.add(testSnapshots.get(1)); - allSnapshots.add(wapSnapshots.get(0)); - allSnapshots.add(wapSnapshots.get(1)); - - divergentProperties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(allSnapshots)); - - // Only main branch ref pointing to testSnapshots[1], WAP snapshots have no refs - divergentProperties.put( - CatalogConstants.SNAPSHOTS_REFS_KEY, - SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot(testSnapshots.get(1)))); - divergentProperties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); - - TableMetadata finalDivergentMetadata = - metadataAtNPlus3.replaceProperties(divergentProperties); - - // ========== COMMIT: Base at N, Metadata at N+3 (divergent by 3 commits) ========== - openHouseInternalTableOperations.doCommit(baseAtN, finalDivergentMetadata); - Mockito.verify(mockHouseTableMapper).toHouseTable(tblMetadataCaptor.capture(), Mockito.any()); - - TableMetadata capturedMetadata = tblMetadataCaptor.getValue(); - Map updatedProperties = capturedMetadata.properties(); - - // Verify the divergent commit contains all 4 snapshots - Assertions.assertEquals( - 4, - capturedMetadata.snapshots().size(), - "Divergent commit should contain all 4 snapshots (2 regular + 2 WAP)"); - - Set expectedSnapshotIds = - allSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); - Set actualSnapshotIds = - capturedMetadata.snapshots().stream() - .map(Snapshot::snapshotId) - .collect(Collectors.toSet()); - Assertions.assertEquals( - expectedSnapshotIds, - actualSnapshotIds, - "All snapshot IDs (regular + WAP) should be present after divergent commit"); - - // Verify main ref points to the expected snapshot (testSnapshots[1]) - SnapshotRef mainRef = capturedMetadata.ref(SnapshotRef.MAIN_BRANCH); - Assertions.assertNotNull(mainRef, "Main branch ref should exist"); - Assertions.assertEquals( - testSnapshots.get(1).snapshotId(), - mainRef.snapshotId(), - "Main branch should point to testSnapshots[1] after divergent commit"); - - // Verify WAP snapshots are tracked as staged - String stagedSnapshots = updatedProperties.get(getCanonicalFieldName("staged_snapshots")); - Assertions.assertNotNull(stagedSnapshots, "Staged snapshots should be tracked"); - Set stagedSnapshotIds = Set.of(stagedSnapshots.split(",")); - Assertions.assertTrue( - stagedSnapshotIds.contains(Long.toString(wapSnapshots.get(0).snapshotId())), - "WAP snapshot 0 should be tracked as staged"); - Assertions.assertTrue( - stagedSnapshotIds.contains(Long.toString(wapSnapshots.get(1).snapshotId())), - "WAP snapshot 1 should be tracked as staged"); - - // Verify regular snapshot is tracked as appended (not testSnapshots[0] since it was in base) - String appendedSnapshots = updatedProperties.get(getCanonicalFieldName("appended_snapshots")); - Assertions.assertNotNull(appendedSnapshots, "Appended snapshots should be tracked"); - Assertions.assertEquals( - Long.toString(testSnapshots.get(1).snapshotId()), - appendedSnapshots, - "testSnapshots[1] should be tracked as appended"); - - Assertions.assertNull( - updatedProperties.get(getCanonicalFieldName("cherry_picked_snapshots")), - "No snapshots should be cherry-picked in this scenario"); - Assertions.assertNull( - updatedProperties.get(getCanonicalFieldName("deleted_snapshots")), - "No snapshots should be deleted in this scenario"); - - Mockito.verify(mockHouseTableRepository, Mockito.times(1)).save(Mockito.eq(mockHouseTable)); - } - } } From 5d3d03fe276d00d9bca680deafe1f3cc25e6b8a8 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Mon, 3 Nov 2025 23:09:32 -0800 Subject: [PATCH 25/35] fixing --- .../internal/catalog/SnapshotDiffApplier.java | 74 +++++++++---------- 1 file changed, 33 insertions(+), 41 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java index cbdae7960..e9f204f43 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java @@ -231,64 +231,56 @@ TableMetadata.Builder applyTo(TableMetadata metadata) { } void recordMetrics(TableMetadata.Builder builder) { - // First, explicitly remove temp properties from the builder - builder.removeProperties( - new HashSet<>( - Arrays.asList( - CatalogConstants.SNAPSHOTS_JSON_KEY, CatalogConstants.SNAPSHOTS_REFS_KEY))); - - // Then add result properties if (CollectionUtils.isNotEmpty(appendedSnapshots)) { - builder.setProperties( - new HashMap() { - { - put( - getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), - String.join(",", appendedSnapshots)); - } - }); metricsReporter.count( InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR, appendedSnapshots.size()); } if (CollectionUtils.isNotEmpty(stagedSnapshots)) { - builder.setProperties( - new HashMap() { - { - put( - getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS), - String.join(",", stagedSnapshots)); - } - }); metricsReporter.count( InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR, stagedSnapshots.size()); } if (CollectionUtils.isNotEmpty(cherryPickedSnapshots)) { - builder.setProperties( - new HashMap() { - { - put( - getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), - String.join(",", cherryPickedSnapshots)); - } - }); metricsReporter.count( InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR, cherryPickedSnapshots.size()); } if (CollectionUtils.isNotEmpty(deletedSnapshots)) { - builder.setProperties( - new HashMap() { - { - put( - getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS), - deletedSnapshots.stream() - .map(s -> Long.toString(s.snapshotId())) - .collect(Collectors.joining(","))); - } - }); metricsReporter.count( InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR, deletedSnapshots.size()); } + + // Record snapshot IDs in properties + if (CollectionUtils.isNotEmpty(appendedSnapshots)) { + builder.setProperties( + Collections.singletonMap( + getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), + String.join(",", appendedSnapshots))); + } + if (CollectionUtils.isNotEmpty(stagedSnapshots)) { + builder.setProperties( + Collections.singletonMap( + getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS), + String.join(",", stagedSnapshots))); + } + if (CollectionUtils.isNotEmpty(cherryPickedSnapshots)) { + builder.setProperties( + Collections.singletonMap( + getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), + String.join(",", cherryPickedSnapshots))); + } + if (CollectionUtils.isNotEmpty(deletedSnapshots)) { + builder.setProperties( + Collections.singletonMap( + getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS), + deletedSnapshots.stream() + .map(s -> Long.toString(s.snapshotId())) + .collect(Collectors.joining(",")))); + } + + builder.removeProperties( + new HashSet<>( + Arrays.asList( + CatalogConstants.SNAPSHOTS_JSON_KEY, CatalogConstants.SNAPSHOTS_REFS_KEY))); } } } From 088de1c00e3cdf0117f25d21e037d1f3293dd33b Mon Sep 17 00:00:00 2001 From: cbb330 Date: Mon, 3 Nov 2025 23:19:40 -0800 Subject: [PATCH 26/35] fixing tests --- .../catalog/SnapshotDiffApplierTest.java | 301 +++++++----------- 1 file changed, 121 insertions(+), 180 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java index 4fa913b4d..08fc48a52 100644 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java @@ -26,6 +26,10 @@ import org.junit.jupiter.api.Test; import org.mockito.Mockito; +/** + * Unit tests for {@link SnapshotDiffApplier}. Tests the refactored snapshot logic that was + * extracted from OpenHouseInternalTableOperations. + */ public class SnapshotDiffApplierTest { private SnapshotDiffApplier snapshotDiffApplier; @@ -57,6 +61,64 @@ void setup() { new HashMap<>()); } + // ========== Helper Methods ========== + + /** + * Creates metadata with snapshots and refs properties for testing. + * + * @param base Base metadata to start from + * @param snapshots Snapshots to include + * @param refs Snapshot refs to include (nullable) + * @return Metadata with properties set + */ + private TableMetadata createMetadataWithSnapshots( + TableMetadata base, List snapshots, Map refs) { + Map properties = new HashMap<>(base.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); + if (refs != null) { + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(refs)); + } + return base.replaceProperties(properties); + } + + /** + * Creates metadata with snapshots pointing to the last snapshot as main branch. + * + * @param base Base metadata to start from + * @param snapshots Snapshots to include + * @return Metadata with snapshots and main branch ref + */ + private TableMetadata createMetadataWithSnapshotsAndMainRef( + TableMetadata base, List snapshots) { + Map refs = + IcebergTestUtil.obtainSnapshotRefsFromSnapshot(snapshots.get(snapshots.size() - 1)); + return createMetadataWithSnapshots(base, snapshots, refs); + } + + /** + * Adds snapshots to metadata and sets main branch to the last snapshot. + * + * @param metadata Base metadata + * @param snapshots Snapshots to add + * @return Updated metadata + */ + private TableMetadata addSnapshotsToMetadata(TableMetadata metadata, List snapshots) { + TableMetadata.Builder builder = TableMetadata.buildFrom(metadata); + for (Snapshot snapshot : snapshots) { + builder.addSnapshot(snapshot); + } + if (!snapshots.isEmpty()) { + Snapshot lastSnapshot = snapshots.get(snapshots.size() - 1); + SnapshotRef ref = SnapshotRef.branchBuilder(lastSnapshot.snapshotId()).build(); + builder.setRef(SnapshotRef.MAIN_BRANCH, ref); + } + return builder.build(); + } + + // ========== Edge Case Tests ========== + + /** Verifies that when no snapshot JSON is provided, metadata is returned unmodified. */ @Test void testApplySnapshots_noSnapshotsJson_returnsUnmodified() { TableMetadata result = snapshotDiffApplier.applySnapshots(null, baseMetadata); @@ -65,24 +127,21 @@ void testApplySnapshots_noSnapshotsJson_returnsUnmodified() { verifyNoInteractions(mockMetricsReporter); } + /** Verifies that table creation (null base) is handled correctly. */ @Test void testApplySnapshots_nullBase_handlesTableCreation() throws IOException { List snapshots = IcebergTestUtil.getSnapshots(); - Map properties = new HashMap<>(baseMetadata.properties()); - properties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); - properties.put( - CatalogConstants.SNAPSHOTS_REFS_KEY, - SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot(snapshots.get(snapshots.size() - 1)))); + TableMetadata newMetadata = createMetadataWithSnapshotsAndMainRef(baseMetadata, snapshots); - TableMetadata newMetadata = baseMetadata.replaceProperties(properties); TableMetadata result = snapshotDiffApplier.applySnapshots(null, newMetadata); assertNotNull(result); assertEquals(snapshots.size(), result.snapshots().size()); } + // ========== Basic Functionality Tests ========== + + /** Verifies that new snapshots are added correctly. */ @Test void testApplySnapshots_addNewSnapshots_success() throws IOException { List initialSnapshots = IcebergTestUtil.getSnapshots(); @@ -90,109 +149,73 @@ void testApplySnapshots_addNewSnapshots_success() throws IOException { List allSnapshots = new ArrayList<>(initialSnapshots); allSnapshots.addAll(IcebergTestUtil.getExtraSnapshots()); + TableMetadata newMetadata = + createMetadataWithSnapshotsAndMainRef(baseWithSnapshots, allSnapshots); - Map properties = new HashMap<>(baseWithSnapshots.properties()); - properties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(allSnapshots)); - properties.put( - CatalogConstants.SNAPSHOTS_REFS_KEY, - SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot( - allSnapshots.get(allSnapshots.size() - 1)))); - - TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); assertNotNull(result); assertTrue(result.snapshots().size() > baseWithSnapshots.snapshots().size()); - verify(mockMetricsReporter, atLeastOnce()).count(anyString(), anyDouble()); } + /** Verifies that deleting snapshots works correctly and updates main branch. */ @Test - void testValidateCurrentSnapshotNotDeleted_whenCurrentDeleted_throwsException() - throws IOException { + void testApplySnapshots_deleteSnapshots_success() throws IOException { List snapshots = IcebergTestUtil.getSnapshots(); TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); - Map properties = new HashMap<>(baseWithSnapshots.properties()); - properties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, - SnapshotsUtil.serializedSnapshots(Collections.emptyList())); - properties.put( - CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(new HashMap<>())); - - TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); + List remainingSnapshots = snapshots.subList(1, snapshots.size()); + TableMetadata newMetadata = + createMetadataWithSnapshotsAndMainRef(baseWithSnapshots, remainingSnapshots); - InvalidIcebergSnapshotException exception = - assertThrows( - InvalidIcebergSnapshotException.class, - () -> snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata)); + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); - assertTrue(exception.getMessage().contains("Cannot delete the current snapshot")); + assertNotNull(result); + assertEquals(remainingSnapshots.size(), result.snapshots().size()); } + /** Verifies that updating branch references works correctly. */ @Test - void testValidateNoAmbiguousCommits_whenSnapshotReferencedByMultipleBranches_throwsException() - throws IOException { + void testApplySnapshots_branchUpdates_success() throws IOException { List snapshots = IcebergTestUtil.getSnapshots(); TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); - Snapshot targetSnapshot = snapshots.get(0); - - Map snapshotRefs = new HashMap<>(); - SnapshotRef ref = SnapshotRef.branchBuilder(targetSnapshot.snapshotId()).build(); - snapshotRefs.put("branch1", org.apache.iceberg.SnapshotRefParser.toJson(ref)); - snapshotRefs.put("branch2", org.apache.iceberg.SnapshotRefParser.toJson(ref)); - - Map properties = new HashMap<>(baseWithSnapshots.properties()); - properties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); - properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(snapshotRefs)); - - TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); + Snapshot newBranchTarget = snapshots.get(1); + Map refs = IcebergTestUtil.obtainSnapshotRefsFromSnapshot(newBranchTarget); + TableMetadata newMetadata = createMetadataWithSnapshots(baseWithSnapshots, snapshots, refs); - InvalidIcebergSnapshotException exception = - assertThrows( - InvalidIcebergSnapshotException.class, - () -> snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata)); + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); - assertTrue(exception.getMessage().contains("Ambiguous commit")); - assertTrue(exception.getMessage().contains("referenced by multiple branches")); + assertNotNull(result); + assertNotNull(result.currentSnapshot()); + assertEquals(newBranchTarget.snapshotId(), result.currentSnapshot().snapshotId()); } + // ========== Validation Tests ========== + + /** Verifies that deleting the current snapshot without replacements throws an exception. */ @Test - void - testValidateDeletedSnapshotsNotReferenced_whenDeletedSnapshotStillReferenced_throwsException() - throws IOException { + void testValidation_deletingCurrentSnapshotWithoutReplacement_throwsException() + throws IOException { List snapshots = IcebergTestUtil.getSnapshots(); TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); - Snapshot snapshotToDelete = snapshots.get(0); - List remainingSnapshots = snapshots.subList(1, snapshots.size()); - - Map snapshotRefs = new HashMap<>(); - SnapshotRef ref = SnapshotRef.branchBuilder(snapshotToDelete.snapshotId()).build(); - snapshotRefs.put(SnapshotRef.MAIN_BRANCH, org.apache.iceberg.SnapshotRefParser.toJson(ref)); - - Map properties = new HashMap<>(baseWithSnapshots.properties()); - properties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(remainingSnapshots)); - properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(snapshotRefs)); - - TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); + TableMetadata newMetadata = + createMetadataWithSnapshots(baseWithSnapshots, Collections.emptyList(), new HashMap<>()); InvalidIcebergSnapshotException exception = assertThrows( InvalidIcebergSnapshotException.class, () -> snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata)); - assertTrue(exception.getMessage().contains("Cannot delete snapshots")); - assertTrue(exception.getMessage().contains("still referenced")); + assertTrue(exception.getMessage().contains("Cannot delete the current snapshot")); } + // ========== Metrics Tests ========== + /** Verifies that WAP (staged) snapshots trigger the correct metrics. */ @Test - void testApplySnapshots_withWapSnapshots_recordsCorrectMetrics() throws IOException { + void testMetrics_wapSnapshots_recordsStagedCounter() throws IOException { List baseSnapshots = IcebergTestUtil.getSnapshots(); TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, baseSnapshots); @@ -200,160 +223,78 @@ void testApplySnapshots_withWapSnapshots_recordsCorrectMetrics() throws IOExcept List allSnapshots = new ArrayList<>(baseSnapshots); allSnapshots.addAll(wapSnapshots); - Map properties = new HashMap<>(baseWithSnapshots.properties()); - properties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(allSnapshots)); - properties.put( - CatalogConstants.SNAPSHOTS_REFS_KEY, - SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot( - baseSnapshots.get(baseSnapshots.size() - 1)))); + Map refs = + IcebergTestUtil.obtainSnapshotRefsFromSnapshot(baseSnapshots.get(baseSnapshots.size() - 1)); + TableMetadata newMetadata = createMetadataWithSnapshots(baseWithSnapshots, allSnapshots, refs); - TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); assertNotNull(result); - verify(mockMetricsReporter) .count(eq(InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR), anyDouble()); } + /** Verifies that deleting snapshots triggers the correct metrics. */ @Test - void testApplySnapshots_deleteSnapshots_recordsCorrectMetrics() throws IOException { + void testMetrics_deleteSnapshots_recordsDeletedCounter() throws IOException { List snapshots = IcebergTestUtil.getSnapshots(); TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); List remainingSnapshots = snapshots.subList(1, snapshots.size()); + TableMetadata newMetadata = + createMetadataWithSnapshotsAndMainRef(baseWithSnapshots, remainingSnapshots); - Map properties = new HashMap<>(baseWithSnapshots.properties()); - properties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(remainingSnapshots)); - properties.put( - CatalogConstants.SNAPSHOTS_REFS_KEY, - SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot( - remainingSnapshots.get(remainingSnapshots.size() - 1)))); - - TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); assertNotNull(result); assertEquals(remainingSnapshots.size(), result.snapshots().size()); - verify(mockMetricsReporter) .count(eq(InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR), eq(1.0)); } + // ========== Property Management Tests ========== + + /** Verifies that appended snapshot IDs are recorded in properties. */ @Test - void testApplySnapshots_recordsSnapshotIdsInProperties() throws IOException { + void testProperties_appendedSnapshots_recordedCorrectly() throws IOException { List baseSnapshots = IcebergTestUtil.getSnapshots(); TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, baseSnapshots); List newSnapshotsList = IcebergTestUtil.getExtraSnapshots(); List allSnapshots = new ArrayList<>(baseSnapshots); allSnapshots.addAll(newSnapshotsList); + TableMetadata newMetadata = + createMetadataWithSnapshotsAndMainRef(baseWithSnapshots, allSnapshots); - Map properties = new HashMap<>(baseWithSnapshots.properties()); - properties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(allSnapshots)); - properties.put( - CatalogConstants.SNAPSHOTS_REFS_KEY, - SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot( - allSnapshots.get(allSnapshots.size() - 1)))); - - TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); assertNotNull(result); - String appendedSnapshots = result.properties().get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS)); assertNotNull(appendedSnapshots, "Appended snapshots should be recorded in properties"); - assertTrue(appendedSnapshots.contains(",") || !appendedSnapshots.isEmpty()); + // Verify actual snapshot IDs are present + for (Snapshot newSnapshot : newSnapshotsList) { + assertTrue( + appendedSnapshots.contains(String.valueOf(newSnapshot.snapshotId())), + "Snapshot ID " + newSnapshot.snapshotId() + " should be in appended_snapshots"); + } } + /** Verifies that temporary snapshot processing keys are removed from final properties. */ @Test - void testApplySnapshots_removesSnapshotKeysFromProperties() throws IOException { + void testProperties_tempKeysRemoved_success() throws IOException { List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata newMetadata = createMetadataWithSnapshotsAndMainRef(baseMetadata, snapshots); - Map properties = new HashMap<>(baseMetadata.properties()); - properties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); - properties.put( - CatalogConstants.SNAPSHOTS_REFS_KEY, - SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot(snapshots.get(snapshots.size() - 1)))); - - TableMetadata newMetadata = baseMetadata.replaceProperties(properties); TableMetadata result = snapshotDiffApplier.applySnapshots(null, newMetadata); assertNotNull(result); - assertFalse( result.properties().containsKey(CatalogConstants.SNAPSHOTS_JSON_KEY), - "Snapshots JSON key should be removed from final properties"); + "Temp snapshots JSON key should be removed"); assertFalse( result.properties().containsKey(CatalogConstants.SNAPSHOTS_REFS_KEY), - "Snapshots refs key should be removed from final properties"); - } - - @Test - void testApplySnapshots_branchUpdates_appliesCorrectly() throws IOException { - List snapshots = IcebergTestUtil.getSnapshots(); - TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); - - Snapshot newBranchTarget = snapshots.get(1); - Map snapshotRefs = - IcebergTestUtil.obtainSnapshotRefsFromSnapshot(newBranchTarget); - - Map properties = new HashMap<>(baseWithSnapshots.properties()); - properties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); - properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(snapshotRefs)); - - TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); - TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); - - assertNotNull(result); - assertNotNull(result.currentSnapshot()); - assertEquals(newBranchTarget.snapshotId(), result.currentSnapshot().snapshotId()); - } - - @Test - void testApplySnapshots_multipleBranchUpdates_success() throws IOException { - List snapshots = IcebergTestUtil.getSnapshots(); - TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); - - Map snapshotRefs = new HashMap<>(); - SnapshotRef mainRef = SnapshotRef.branchBuilder(snapshots.get(0).snapshotId()).build(); - SnapshotRef devRef = SnapshotRef.branchBuilder(snapshots.get(1).snapshotId()).build(); - snapshotRefs.put(SnapshotRef.MAIN_BRANCH, org.apache.iceberg.SnapshotRefParser.toJson(mainRef)); - snapshotRefs.put("dev", org.apache.iceberg.SnapshotRefParser.toJson(devRef)); - - Map properties = new HashMap<>(baseWithSnapshots.properties()); - properties.put( - CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); - properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(snapshotRefs)); - - TableMetadata newMetadata = baseWithSnapshots.replaceProperties(properties); - TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); - - assertNotNull(result); - assertEquals(2, result.refs().size()); - } - - private TableMetadata addSnapshotsToMetadata(TableMetadata metadata, List snapshots) { - TableMetadata.Builder builder = TableMetadata.buildFrom(metadata); - for (Snapshot snapshot : snapshots) { - builder.addSnapshot(snapshot); - } - if (!snapshots.isEmpty()) { - Snapshot lastSnapshot = snapshots.get(snapshots.size() - 1); - SnapshotRef ref = SnapshotRef.branchBuilder(lastSnapshot.snapshotId()).build(); - builder.setRef(SnapshotRef.MAIN_BRANCH, ref); - } - return builder.build(); + "Temp snapshots refs key should be removed"); } } From 3d1a758b0f87dcba7f0ca8ffbbf03b87559c4fa1 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Tue, 4 Nov 2025 12:20:10 -0800 Subject: [PATCH 27/35] small refactor --- .../internal/catalog/SnapshotDiffApplier.java | 109 +++++++++--------- 1 file changed, 56 insertions(+), 53 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java index e9f204f43..33db2fabb 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java @@ -88,10 +88,10 @@ private class SnapshotDiff { private final List newSnapshots; private final List deletedSnapshots; - // Categorized snapshots (computed during applyTo) - private List appendedSnapshots; - private List stagedSnapshots; - private List cherryPickedSnapshots; + // Categorized snapshots + private final List stagedSnapshots; + private final List regularSnapshots; + private final List cherryPickedSnapshots; SnapshotDiff( List providedSnapshots, @@ -118,6 +118,23 @@ private class SnapshotDiff { existingSnapshots.stream() .filter(s -> !providedSnapshotByIds.containsKey(s.snapshotId())) .collect(Collectors.toList()); + + // Categorize snapshots (simple logic for PR1 - just check summary properties) + this.stagedSnapshots = + newSnapshots.stream() + .filter(s -> s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) + .collect(Collectors.toList()); + this.cherryPickedSnapshots = + newSnapshots.stream() + .filter(s -> s.summary().containsKey(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP)) + .collect(Collectors.toList()); + this.regularSnapshots = + newSnapshots.stream() + .filter( + s -> + !s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP) + && !s.summary().containsKey(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP)) + .collect(Collectors.toList()); } /** @@ -158,9 +175,6 @@ private void validateCurrentSnapshotNotDeleted(TableMetadata base) { TableMetadata.Builder applyTo(TableMetadata metadata) { TableMetadata.Builder metadataBuilder = TableMetadata.buildFrom(metadata); - this.appendedSnapshots = new ArrayList<>(); - this.stagedSnapshots = new ArrayList<>(); - this.cherryPickedSnapshots = new ArrayList<>(); // Validate only MAIN branch for (Map.Entry entry : providedRefs.entrySet()) { @@ -170,53 +184,32 @@ TableMetadata.Builder applyTo(TableMetadata metadata) { } /** - * First check if there are new snapshots to be appended to current TableMetadata. If yes, - * following are the cases to be handled: - * - *

[1] A regular (non-wap) snapshot is being added to the MAIN branch. + * Apply categorized snapshots to metadata: * - *

[2] A staged (wap) snapshot is being created on top of current snapshot as its base. - * Recognized by STAGED_WAP_ID_PROP. + *

[1] Staged (WAP) snapshots - added without branch reference * - *

[3] A staged (wap) snapshot is being cherry picked to the MAIN branch wherein current - * snapshot in the MAIN branch is not the same as the base snapshot the staged (wap) snapshot - * was created on. Recognized by SOURCE_SNAPSHOT_ID_PROP. This case is called non-fast forward - * cherry pick. + *

[2] Cherry-picked snapshots - set as main branch snapshot * - *

In case no new snapshots are to be appended to current TableMetadata, there could be a - * cherrypick of a staged (wap) snapshot on top of the current snapshot in the MAIN branch - * which is the same as the base snapshot the staged (wap) snapshot was created on. This case - * is called fast forward cherry pick. + *

[3] Regular snapshots - set as main branch snapshot */ - if (CollectionUtils.isNotEmpty(newSnapshots)) { - for (Snapshot snapshot : newSnapshots) { - if (snapshot.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) { - // a stage only snapshot using wap.id - metadataBuilder.addSnapshot(snapshot); - stagedSnapshots.add(String.valueOf(snapshot.snapshotId())); - } else if (snapshot.summary().containsKey(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP)) { - // a snapshot created on a non fast-forward cherry-pick snapshot - metadataBuilder.setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH); - appendedSnapshots.add(String.valueOf(snapshot.snapshotId())); - cherryPickedSnapshots.add( - String.valueOf(snapshot.summary().get(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP))); - } else { - // a regular snapshot - metadataBuilder.setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH); - appendedSnapshots.add(String.valueOf(snapshot.snapshotId())); - } - } - } else if (MapUtils.isNotEmpty(providedRefs)) { - // Updated ref in the main branch with no new snapshot means this is a - // fast-forward cherry-pick or rollback operation. + for (Snapshot snapshot : stagedSnapshots) { + metadataBuilder.addSnapshot(snapshot); + } + + for (Snapshot snapshot : cherryPickedSnapshots) { + metadataBuilder.setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH); + } + + for (Snapshot snapshot : regularSnapshots) { + metadataBuilder.setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH); + } + + // Handle fast-forward cherry-pick (ref update without new snapshot) + if (newSnapshots.isEmpty() && MapUtils.isNotEmpty(providedRefs)) { long newSnapshotId = providedRefs.get(SnapshotRef.MAIN_BRANCH).snapshotId(); - // Either the current snapshot is null or the current snapshot is not equal - // to the new snapshot indicates an update. The first case happens when the - // stage/wap snapshot being cherry-picked is the first snapshot. if (MapUtils.isEmpty(metadata.refs()) || metadata.refs().get(SnapshotRef.MAIN_BRANCH).snapshotId() != newSnapshotId) { metadataBuilder.setBranchSnapshot(newSnapshotId, SnapshotRef.MAIN_BRANCH); - cherryPickedSnapshots.add(String.valueOf(newSnapshotId)); } } @@ -231,9 +224,11 @@ TableMetadata.Builder applyTo(TableMetadata metadata) { } void recordMetrics(TableMetadata.Builder builder) { - if (CollectionUtils.isNotEmpty(appendedSnapshots)) { - metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR, appendedSnapshots.size()); + // Compute appended snapshots (regular + cherry-picked) + int appendedCount = regularSnapshots.size() + cherryPickedSnapshots.size(); + + if (appendedCount > 0) { + metricsReporter.count(InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR, appendedCount); } if (CollectionUtils.isNotEmpty(stagedSnapshots)) { metricsReporter.count( @@ -250,23 +245,31 @@ void recordMetrics(TableMetadata.Builder builder) { } // Record snapshot IDs in properties - if (CollectionUtils.isNotEmpty(appendedSnapshots)) { + if (appendedCount > 0) { + List appendedSnapshots = new ArrayList<>(regularSnapshots); + appendedSnapshots.addAll(cherryPickedSnapshots); builder.setProperties( Collections.singletonMap( getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), - String.join(",", appendedSnapshots))); + appendedSnapshots.stream() + .map(s -> Long.toString(s.snapshotId())) + .collect(Collectors.joining(",")))); } if (CollectionUtils.isNotEmpty(stagedSnapshots)) { builder.setProperties( Collections.singletonMap( getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS), - String.join(",", stagedSnapshots))); + stagedSnapshots.stream() + .map(s -> Long.toString(s.snapshotId())) + .collect(Collectors.joining(",")))); } if (CollectionUtils.isNotEmpty(cherryPickedSnapshots)) { builder.setProperties( Collections.singletonMap( getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), - String.join(",", cherryPickedSnapshots))); + cherryPickedSnapshots.stream() + .map(s -> s.summary().get(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP)) + .collect(Collectors.joining(",")))); } if (CollectionUtils.isNotEmpty(deletedSnapshots)) { builder.setProperties( From 65666d23f19f4ab10894649db284380bd4004e92 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Tue, 4 Nov 2025 16:08:37 -0800 Subject: [PATCH 28/35] updating containers --- .../internal/catalog/SnapshotDiffApplier.java | 76 ++++++++++++++----- 1 file changed, 56 insertions(+), 20 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java index 33db2fabb..ece059b27 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java @@ -16,8 +16,6 @@ import java.util.stream.Collectors; import lombok.AllArgsConstructor; import lombok.extern.slf4j.Slf4j; -import org.apache.commons.collections.CollectionUtils; -import org.apache.commons.collections.MapUtils; import org.apache.iceberg.Snapshot; import org.apache.iceberg.SnapshotRef; import org.apache.iceberg.SnapshotSummary; @@ -58,10 +56,12 @@ public TableMetadata applySnapshots(TableMetadata base, TableMetadata metadata) .orElse(new HashMap<>()); List existingSnapshots = base != null ? base.snapshots() : Collections.emptyList(); + Map existingRefs = base != null ? base.refs() : Collections.emptyMap(); // Compute diff (all maps created once in constructor) SnapshotDiff diff = - new SnapshotDiff(providedSnapshots, existingSnapshots, metadata, providedRefs); + new SnapshotDiff( + providedSnapshots, existingSnapshots, metadata, providedRefs, existingRefs); // Validate, apply, record metrics, build diff.validate(base); @@ -81,10 +81,13 @@ private class SnapshotDiff { private final List existingSnapshots; private final TableMetadata metadata; private final Map providedRefs; + private final Map existingRefs; // Computed maps (created once) private final Map providedSnapshotByIds; private final Map existingSnapshotByIds; + private final Set existingBranchRefIds; + private final Set providedBranchRefIds; private final List newSnapshots; private final List deletedSnapshots; @@ -97,17 +100,27 @@ private class SnapshotDiff { List providedSnapshots, List existingSnapshots, TableMetadata metadata, - Map providedRefs) { + Map providedRefs, + Map existingRefs) { this.providedSnapshots = providedSnapshots; this.existingSnapshots = existingSnapshots; this.metadata = metadata; this.providedRefs = providedRefs; + this.existingRefs = existingRefs; // Compute all maps once this.providedSnapshotByIds = providedSnapshots.stream().collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); this.existingSnapshotByIds = existingSnapshots.stream().collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); + this.existingBranchRefIds = + existingRefs.values().stream() + .map(SnapshotRef::snapshotId) + .collect(Collectors.toSet()); + this.providedBranchRefIds = + providedRefs.values().stream() + .map(SnapshotRef::snapshotId) + .collect(Collectors.toSet()); // Compute changes this.newSnapshots = @@ -125,8 +138,24 @@ private class SnapshotDiff { .filter(s -> s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) .collect(Collectors.toList()); this.cherryPickedSnapshots = - newSnapshots.stream() - .filter(s -> s.summary().containsKey(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP)) + providedSnapshots.stream() + .filter( + s -> { + // New snapshot with SOURCE_SNAPSHOT_ID_PROP (actual cherry-pick) + if (!existingSnapshotByIds.containsKey(s.snapshotId()) + && s.summary().containsKey(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP)) { + return true; + } + // WAP snapshot being published (staged → branch transition) + // For new snapshots: WAP created and immediately published + // For existing snapshots: existing WAP being published (fast-forward) + boolean hasWapId = + s.summary() != null + && s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP); + boolean wasStaged = !existingBranchRefIds.contains(s.snapshotId()); + boolean isNowOnBranch = providedBranchRefIds.contains(s.snapshotId()); + return hasWapId && wasStaged && isNowOnBranch; + }) .collect(Collectors.toList()); this.regularSnapshots = newSnapshots.stream() @@ -196,8 +225,12 @@ TableMetadata.Builder applyTo(TableMetadata metadata) { metadataBuilder.addSnapshot(snapshot); } + // Only apply NEW cherry-picked snapshots + // Existing cherry-picked snapshots are handled by fast-forward block below for (Snapshot snapshot : cherryPickedSnapshots) { - metadataBuilder.setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH); + if (newSnapshots.contains(snapshot)) { + metadataBuilder.setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH); + } } for (Snapshot snapshot : regularSnapshots) { @@ -205,16 +238,16 @@ TableMetadata.Builder applyTo(TableMetadata metadata) { } // Handle fast-forward cherry-pick (ref update without new snapshot) - if (newSnapshots.isEmpty() && MapUtils.isNotEmpty(providedRefs)) { + if (newSnapshots.isEmpty() && !providedRefs.isEmpty()) { long newSnapshotId = providedRefs.get(SnapshotRef.MAIN_BRANCH).snapshotId(); - if (MapUtils.isEmpty(metadata.refs()) + if (metadata.refs().isEmpty() || metadata.refs().get(SnapshotRef.MAIN_BRANCH).snapshotId() != newSnapshotId) { metadataBuilder.setBranchSnapshot(newSnapshotId, SnapshotRef.MAIN_BRANCH); } } // Delete snapshots - if (CollectionUtils.isNotEmpty(deletedSnapshots)) { + if (!deletedSnapshots.isEmpty()) { Set snapshotIds = deletedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); metadataBuilder.removeSnapshots(snapshotIds); @@ -224,22 +257,25 @@ TableMetadata.Builder applyTo(TableMetadata metadata) { } void recordMetrics(TableMetadata.Builder builder) { - // Compute appended snapshots (regular + cherry-picked) - int appendedCount = regularSnapshots.size() + cherryPickedSnapshots.size(); + // Compute appended snapshots (regular + NEW cherry-picked only) + // Existing cherry-picked snapshots (fast-forward) are not appended + List newCherryPicked = + cherryPickedSnapshots.stream().filter(newSnapshots::contains).collect(Collectors.toList()); + int appendedCount = regularSnapshots.size() + newCherryPicked.size(); if (appendedCount > 0) { metricsReporter.count(InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR, appendedCount); } - if (CollectionUtils.isNotEmpty(stagedSnapshots)) { + if (!stagedSnapshots.isEmpty()) { metricsReporter.count( InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR, stagedSnapshots.size()); } - if (CollectionUtils.isNotEmpty(cherryPickedSnapshots)) { + if (!cherryPickedSnapshots.isEmpty()) { metricsReporter.count( InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR, cherryPickedSnapshots.size()); } - if (CollectionUtils.isNotEmpty(deletedSnapshots)) { + if (!deletedSnapshots.isEmpty()) { metricsReporter.count( InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR, deletedSnapshots.size()); } @@ -247,7 +283,7 @@ void recordMetrics(TableMetadata.Builder builder) { // Record snapshot IDs in properties if (appendedCount > 0) { List appendedSnapshots = new ArrayList<>(regularSnapshots); - appendedSnapshots.addAll(cherryPickedSnapshots); + appendedSnapshots.addAll(newCherryPicked); builder.setProperties( Collections.singletonMap( getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), @@ -255,7 +291,7 @@ void recordMetrics(TableMetadata.Builder builder) { .map(s -> Long.toString(s.snapshotId())) .collect(Collectors.joining(",")))); } - if (CollectionUtils.isNotEmpty(stagedSnapshots)) { + if (!stagedSnapshots.isEmpty()) { builder.setProperties( Collections.singletonMap( getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS), @@ -263,15 +299,15 @@ void recordMetrics(TableMetadata.Builder builder) { .map(s -> Long.toString(s.snapshotId())) .collect(Collectors.joining(",")))); } - if (CollectionUtils.isNotEmpty(cherryPickedSnapshots)) { + if (!cherryPickedSnapshots.isEmpty()) { builder.setProperties( Collections.singletonMap( getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), cherryPickedSnapshots.stream() - .map(s -> s.summary().get(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP)) + .map(s -> Long.toString(s.snapshotId())) .collect(Collectors.joining(",")))); } - if (CollectionUtils.isNotEmpty(deletedSnapshots)) { + if (!deletedSnapshots.isEmpty()) { builder.setProperties( Collections.singletonMap( getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS), From 062f38628ff8497774f6fa127c193f5dfb72d7f7 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Tue, 4 Nov 2025 16:09:25 -0800 Subject: [PATCH 29/35] updating containers --- .../internal/catalog/SnapshotDiffApplier.java | 12 +++++------- 1 file changed, 5 insertions(+), 7 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java index ece059b27..bfd3438d9 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java @@ -114,13 +114,9 @@ private class SnapshotDiff { this.existingSnapshotByIds = existingSnapshots.stream().collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); this.existingBranchRefIds = - existingRefs.values().stream() - .map(SnapshotRef::snapshotId) - .collect(Collectors.toSet()); + existingRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet()); this.providedBranchRefIds = - providedRefs.values().stream() - .map(SnapshotRef::snapshotId) - .collect(Collectors.toSet()); + providedRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet()); // Compute changes this.newSnapshots = @@ -260,7 +256,9 @@ void recordMetrics(TableMetadata.Builder builder) { // Compute appended snapshots (regular + NEW cherry-picked only) // Existing cherry-picked snapshots (fast-forward) are not appended List newCherryPicked = - cherryPickedSnapshots.stream().filter(newSnapshots::contains).collect(Collectors.toList()); + cherryPickedSnapshots.stream() + .filter(newSnapshots::contains) + .collect(Collectors.toList()); int appendedCount = regularSnapshots.size() + newCherryPicked.size(); if (appendedCount > 0) { From 85d8696b9054b066744581b60aa4aeb391e030bc Mon Sep 17 00:00:00 2001 From: cbb330 Date: Tue, 4 Nov 2025 22:17:30 -0800 Subject: [PATCH 30/35] fixing tests --- .../internal/catalog/SnapshotDiffApplier.java | 67 ++++++++++--------- 1 file changed, 35 insertions(+), 32 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java index bfd3438d9..de5112442 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java @@ -4,7 +4,6 @@ import com.linkedin.openhouse.cluster.metrics.micrometer.MetricsReporter; import com.linkedin.openhouse.internal.catalog.exception.InvalidIcebergSnapshotException; -import java.util.ArrayList; import java.util.Arrays; import java.util.Collections; import java.util.HashMap; @@ -133,32 +132,47 @@ private class SnapshotDiff { newSnapshots.stream() .filter(s -> s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) .collect(Collectors.toList()); + + // Compute source IDs for cherry-pick operations (from ForReference.java) + Set cherryPickSourceIds = + providedSnapshots.stream() + .filter( + s -> + s.summary() != null + && s.summary().containsKey(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP)) + .map(s -> Long.parseLong(s.summary().get(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP))) + .collect(Collectors.toSet()); + this.cherryPickedSnapshots = providedSnapshots.stream() .filter( - s -> { - // New snapshot with SOURCE_SNAPSHOT_ID_PROP (actual cherry-pick) - if (!existingSnapshotByIds.containsKey(s.snapshotId()) - && s.summary().containsKey(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP)) { + provided -> { + // Only consider EXISTING snapshots as cherry-picked (from ForReference.java) + Snapshot existing = existingSnapshotByIds.get(provided.snapshotId()); + if (existing == null) { + return false; + } + + // Is source of cherry-pick (from ForReference.java) + if (cherryPickSourceIds.contains(provided.snapshotId())) { return true; } + // WAP snapshot being published (staged → branch transition) - // For new snapshots: WAP created and immediately published - // For existing snapshots: existing WAP being published (fast-forward) boolean hasWapId = - s.summary() != null - && s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP); - boolean wasStaged = !existingBranchRefIds.contains(s.snapshotId()); - boolean isNowOnBranch = providedBranchRefIds.contains(s.snapshotId()); + provided.summary() != null + && provided.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP); + boolean wasStaged = !existingBranchRefIds.contains(provided.snapshotId()); + boolean isNowOnBranch = providedBranchRefIds.contains(provided.snapshotId()); return hasWapId && wasStaged && isNowOnBranch; }) .collect(Collectors.toList()); + // Regular snapshots = all new snapshots that are not staged WAP + // (From ForReference.java: everything that's not cherry-picked and not WAP) + // Note: NEW snapshots with SOURCE_SNAPSHOT_ID_PROP are regular (new commits being appended) this.regularSnapshots = newSnapshots.stream() - .filter( - s -> - !s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP) - && !s.summary().containsKey(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP)) + .filter(s -> !s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) .collect(Collectors.toList()); } @@ -221,13 +235,8 @@ TableMetadata.Builder applyTo(TableMetadata metadata) { metadataBuilder.addSnapshot(snapshot); } - // Only apply NEW cherry-picked snapshots - // Existing cherry-picked snapshots are handled by fast-forward block below - for (Snapshot snapshot : cherryPickedSnapshots) { - if (newSnapshots.contains(snapshot)) { - metadataBuilder.setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH); - } - } + // Cherry-picked snapshots are all existing, handled by fast-forward block below + // (No need to apply them here) for (Snapshot snapshot : regularSnapshots) { metadataBuilder.setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH); @@ -253,13 +262,9 @@ TableMetadata.Builder applyTo(TableMetadata metadata) { } void recordMetrics(TableMetadata.Builder builder) { - // Compute appended snapshots (regular + NEW cherry-picked only) - // Existing cherry-picked snapshots (fast-forward) are not appended - List newCherryPicked = - cherryPickedSnapshots.stream() - .filter(newSnapshots::contains) - .collect(Collectors.toList()); - int appendedCount = regularSnapshots.size() + newCherryPicked.size(); + // Compute appended snapshots (only regular snapshots) + // Cherry-picked snapshots are all existing, not appended + int appendedCount = regularSnapshots.size(); if (appendedCount > 0) { metricsReporter.count(InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR, appendedCount); @@ -280,12 +285,10 @@ void recordMetrics(TableMetadata.Builder builder) { // Record snapshot IDs in properties if (appendedCount > 0) { - List appendedSnapshots = new ArrayList<>(regularSnapshots); - appendedSnapshots.addAll(newCherryPicked); builder.setProperties( Collections.singletonMap( getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), - appendedSnapshots.stream() + regularSnapshots.stream() .map(s -> Long.toString(s.snapshotId())) .collect(Collectors.joining(",")))); } From d64f57b00f2a9c33d3fa7c84218029cf2fd2723b Mon Sep 17 00:00:00 2001 From: cbb330 Date: Wed, 5 Nov 2025 15:11:03 -0800 Subject: [PATCH 31/35] cleaning up practices --- .../internal/catalog/SnapshotDiffApplier.java | 257 +++++++++++------- 1 file changed, 166 insertions(+), 91 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java index de5112442..16c26d1d1 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java @@ -23,9 +23,8 @@ /** * Service responsible for applying snapshot changes to Iceberg table metadata. * - *

This class extracts snapshot logic from OpenHouseInternalTableOperations while maintaining the - * same behavior. The main entry point applySnapshots() has a clear flow: parse input → compute diff - * → validate → apply. + *

The main entry point applySnapshots() has a clear flow: parse input → compute diff → validate + * → apply. */ @AllArgsConstructor @Slf4j @@ -37,36 +36,44 @@ public class SnapshotDiffApplier { * Applies snapshot updates from metadata properties. Simple and clear: parse input, compute diff, * validate, apply, record metrics, build. * - * @param base The base table metadata (may be null for table creation) - * @param metadata The new metadata with properties containing snapshot updates + * @param existingMetadata The existing table metadata (may be null for table creation) + * @param providedMetadata The new metadata with properties containing snapshot updates * @return Updated metadata with snapshots applied */ - public TableMetadata applySnapshots(TableMetadata base, TableMetadata metadata) { - String snapshotsJson = metadata.properties().get(CatalogConstants.SNAPSHOTS_JSON_KEY); + public TableMetadata applySnapshots( + TableMetadata existingMetadata, TableMetadata providedMetadata) { + String snapshotsJson = providedMetadata.properties().get(CatalogConstants.SNAPSHOTS_JSON_KEY); if (snapshotsJson == null) { - return metadata; + return providedMetadata; } // Parse input List providedSnapshots = SnapshotsUtil.parseSnapshots(null, snapshotsJson); Map providedRefs = - Optional.ofNullable(metadata.properties().get(CatalogConstants.SNAPSHOTS_REFS_KEY)) + Optional.ofNullable(providedMetadata.properties().get(CatalogConstants.SNAPSHOTS_REFS_KEY)) .map(SnapshotsUtil::parseSnapshotRefs) .orElse(new HashMap<>()); - List existingSnapshots = base != null ? base.snapshots() : Collections.emptyList(); - Map existingRefs = base != null ? base.refs() : Collections.emptyMap(); + List existingSnapshots = + existingMetadata != null ? existingMetadata.snapshots() : Collections.emptyList(); + Map existingRefs = + existingMetadata != null ? existingMetadata.refs() : Collections.emptyMap(); - // Compute diff (all maps created once in constructor) + // Compute diff (all maps created once in factory method) SnapshotDiff diff = - new SnapshotDiff( - providedSnapshots, existingSnapshots, metadata, providedRefs, existingRefs); - - // Validate, apply, record metrics, build - diff.validate(base); - TableMetadata.Builder builder = diff.applyTo(metadata); - diff.recordMetrics(builder); - return builder.build(); + SnapshotDiff.create( + metricsReporter, + existingMetadata, + providedSnapshots, + existingSnapshots, + providedMetadata, + providedRefs, + existingRefs); + + // Validate, apply, record metrics + diff.validate(); + diff.recordMetrics(); + return diff.applyTo(); } /** @@ -74,11 +81,15 @@ public TableMetadata applySnapshots(TableMetadata base, TableMetadata metadata) * constructor to avoid redundant operations. Provides clear methods for validation and * application. */ - private class SnapshotDiff { + private static class SnapshotDiff { + // Injected dependency + private final MetricsReporter metricsReporter; + // Input state + private final TableMetadata existingMetadata; private final List providedSnapshots; private final List existingSnapshots; - private final TableMetadata metadata; + private final TableMetadata providedMetadata; private final Map providedRefs; private final Map existingRefs; @@ -94,46 +105,56 @@ private class SnapshotDiff { private final List stagedSnapshots; private final List regularSnapshots; private final List cherryPickedSnapshots; + private final int appendedCount; - SnapshotDiff( + /** + * Creates a SnapshotDiff by computing all snapshot analysis from the provided inputs. + * + * @param metricsReporter Metrics reporter for recording snapshot operations + * @param existingMetadata The existing table metadata (may be null for table creation) + * @param providedSnapshots Snapshots provided in the update + * @param existingSnapshots Snapshots currently in the table + * @param providedMetadata The new metadata with properties containing snapshot updates + * @param providedRefs Snapshot refs provided in the update + * @param existingRefs Snapshot refs currently in the table + * @return A new SnapshotDiff with all analysis computed + */ + static SnapshotDiff create( + MetricsReporter metricsReporter, + TableMetadata existingMetadata, List providedSnapshots, List existingSnapshots, - TableMetadata metadata, + TableMetadata providedMetadata, Map providedRefs, Map existingRefs) { - this.providedSnapshots = providedSnapshots; - this.existingSnapshots = existingSnapshots; - this.metadata = metadata; - this.providedRefs = providedRefs; - this.existingRefs = existingRefs; - // Compute all maps once - this.providedSnapshotByIds = + // Compute all index maps once + Map providedSnapshotByIds = providedSnapshots.stream().collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); - this.existingSnapshotByIds = + Map existingSnapshotByIds = existingSnapshots.stream().collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); - this.existingBranchRefIds = + Set existingBranchRefIds = existingRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet()); - this.providedBranchRefIds = + Set providedBranchRefIds = providedRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet()); // Compute changes - this.newSnapshots = + List newSnapshots = providedSnapshots.stream() .filter(s -> !existingSnapshotByIds.containsKey(s.snapshotId())) .collect(Collectors.toList()); - this.deletedSnapshots = + List deletedSnapshots = existingSnapshots.stream() .filter(s -> !providedSnapshotByIds.containsKey(s.snapshotId())) .collect(Collectors.toList()); - // Categorize snapshots (simple logic for PR1 - just check summary properties) - this.stagedSnapshots = + // Categorize snapshots + List stagedSnapshots = newSnapshots.stream() .filter(s -> s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) .collect(Collectors.toList()); - // Compute source IDs for cherry-pick operations (from ForReference.java) + // Compute source IDs for cherry-pick operations Set cherryPickSourceIds = providedSnapshots.stream() .filter( @@ -143,17 +164,17 @@ private class SnapshotDiff { .map(s -> Long.parseLong(s.summary().get(SnapshotSummary.SOURCE_SNAPSHOT_ID_PROP))) .collect(Collectors.toSet()); - this.cherryPickedSnapshots = + List cherryPickedSnapshots = providedSnapshots.stream() .filter( provided -> { - // Only consider EXISTING snapshots as cherry-picked (from ForReference.java) + // Only consider EXISTING snapshots as cherry-picked Snapshot existing = existingSnapshotByIds.get(provided.snapshotId()); if (existing == null) { return false; } - // Is source of cherry-pick (from ForReference.java) + // Is source of cherry-pick if (cherryPickSourceIds.contains(provided.snapshotId())) { return true; } @@ -167,42 +188,97 @@ private class SnapshotDiff { return hasWapId && wasStaged && isNowOnBranch; }) .collect(Collectors.toList()); + // Regular snapshots = all new snapshots that are not staged WAP - // (From ForReference.java: everything that's not cherry-picked and not WAP) - // Note: NEW snapshots with SOURCE_SNAPSHOT_ID_PROP are regular (new commits being appended) - this.regularSnapshots = + List regularSnapshots = newSnapshots.stream() .filter(s -> !s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) .collect(Collectors.toList()); + + // Compute appended count (only regular snapshots, not cherry-picked) + int appendedCount = regularSnapshots.size(); + + return new SnapshotDiff( + metricsReporter, + existingMetadata, + providedSnapshots, + existingSnapshots, + providedMetadata, + providedRefs, + existingRefs, + providedSnapshotByIds, + existingSnapshotByIds, + existingBranchRefIds, + providedBranchRefIds, + newSnapshots, + deletedSnapshots, + stagedSnapshots, + regularSnapshots, + cherryPickedSnapshots, + appendedCount); + } + + /** Private constructor that accepts all pre-computed values. Use {@link #create} instead. */ + private SnapshotDiff( + MetricsReporter metricsReporter, + TableMetadata existingMetadata, + List providedSnapshots, + List existingSnapshots, + TableMetadata providedMetadata, + Map providedRefs, + Map existingRefs, + Map providedSnapshotByIds, + Map existingSnapshotByIds, + Set existingBranchRefIds, + Set providedBranchRefIds, + List newSnapshots, + List deletedSnapshots, + List stagedSnapshots, + List regularSnapshots, + List cherryPickedSnapshots, + int appendedCount) { + this.metricsReporter = metricsReporter; + this.existingMetadata = existingMetadata; + this.providedSnapshots = providedSnapshots; + this.existingSnapshots = existingSnapshots; + this.providedMetadata = providedMetadata; + this.providedRefs = providedRefs; + this.existingRefs = existingRefs; + this.providedSnapshotByIds = providedSnapshotByIds; + this.existingSnapshotByIds = existingSnapshotByIds; + this.existingBranchRefIds = existingBranchRefIds; + this.providedBranchRefIds = providedBranchRefIds; + this.newSnapshots = newSnapshots; + this.deletedSnapshots = deletedSnapshots; + this.stagedSnapshots = stagedSnapshots; + this.regularSnapshots = regularSnapshots; + this.cherryPickedSnapshots = cherryPickedSnapshots; + this.appendedCount = appendedCount; } /** * Validates all snapshot changes before applying them to table metadata. * - * @param base The base table metadata to validate against (may be null for table creation) * @throws InvalidIcebergSnapshotException if any validation check fails */ - void validate(TableMetadata base) { - validateCurrentSnapshotNotDeleted(base); + void validate() { + validateCurrentSnapshotNotDeleted(); } /** * Validates that the current snapshot is not deleted without providing replacement snapshots. - * This is the same validation logic from SnapshotInspector.validateSnapshotsUpdate(). * - * @param base The base table metadata containing the current snapshot (may be null for table - * creation) * @throws InvalidIcebergSnapshotException if the current snapshot is being deleted without * replacements */ - private void validateCurrentSnapshotNotDeleted(TableMetadata base) { - if (base == null || base.currentSnapshot() == null) { + private void validateCurrentSnapshotNotDeleted() { + if (this.existingMetadata == null || this.existingMetadata.currentSnapshot() == null) { return; } if (!newSnapshots.isEmpty()) { return; } - long latestSnapshotId = base.currentSnapshot().snapshotId(); + long latestSnapshotId = this.existingMetadata.currentSnapshot().snapshotId(); if (!deletedSnapshots.isEmpty() && deletedSnapshots.get(deletedSnapshots.size() - 1).snapshotId() == latestSnapshotId) { throw new InvalidIcebergSnapshotException( @@ -212,8 +288,8 @@ private void validateCurrentSnapshotNotDeleted(TableMetadata base) { } } - TableMetadata.Builder applyTo(TableMetadata metadata) { - TableMetadata.Builder metadataBuilder = TableMetadata.buildFrom(metadata); + TableMetadata applyTo() { + TableMetadata.Builder metadataBuilder = TableMetadata.buildFrom(this.providedMetadata); // Validate only MAIN branch for (Map.Entry entry : providedRefs.entrySet()) { @@ -245,8 +321,9 @@ TableMetadata.Builder applyTo(TableMetadata metadata) { // Handle fast-forward cherry-pick (ref update without new snapshot) if (newSnapshots.isEmpty() && !providedRefs.isEmpty()) { long newSnapshotId = providedRefs.get(SnapshotRef.MAIN_BRANCH).snapshotId(); - if (metadata.refs().isEmpty() - || metadata.refs().get(SnapshotRef.MAIN_BRANCH).snapshotId() != newSnapshotId) { + if (this.providedMetadata.refs().isEmpty() + || this.providedMetadata.refs().get(SnapshotRef.MAIN_BRANCH).snapshotId() + != newSnapshotId) { metadataBuilder.setBranchSnapshot(newSnapshotId, SnapshotRef.MAIN_BRANCH); } } @@ -258,34 +335,9 @@ TableMetadata.Builder applyTo(TableMetadata metadata) { metadataBuilder.removeSnapshots(snapshotIds); } - return metadataBuilder; - } - - void recordMetrics(TableMetadata.Builder builder) { - // Compute appended snapshots (only regular snapshots) - // Cherry-picked snapshots are all existing, not appended - int appendedCount = regularSnapshots.size(); - - if (appendedCount > 0) { - metricsReporter.count(InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR, appendedCount); - } - if (!stagedSnapshots.isEmpty()) { - metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR, stagedSnapshots.size()); - } - if (!cherryPickedSnapshots.isEmpty()) { - metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR, - cherryPickedSnapshots.size()); - } - if (!deletedSnapshots.isEmpty()) { - metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR, deletedSnapshots.size()); - } - - // Record snapshot IDs in properties - if (appendedCount > 0) { - builder.setProperties( + // Record snapshot IDs in properties and cleanup input properties + if (this.appendedCount > 0) { + metadataBuilder.setProperties( Collections.singletonMap( getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), regularSnapshots.stream() @@ -293,7 +345,7 @@ void recordMetrics(TableMetadata.Builder builder) { .collect(Collectors.joining(",")))); } if (!stagedSnapshots.isEmpty()) { - builder.setProperties( + metadataBuilder.setProperties( Collections.singletonMap( getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS), stagedSnapshots.stream() @@ -301,7 +353,7 @@ void recordMetrics(TableMetadata.Builder builder) { .collect(Collectors.joining(",")))); } if (!cherryPickedSnapshots.isEmpty()) { - builder.setProperties( + metadataBuilder.setProperties( Collections.singletonMap( getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), cherryPickedSnapshots.stream() @@ -309,18 +361,41 @@ void recordMetrics(TableMetadata.Builder builder) { .collect(Collectors.joining(",")))); } if (!deletedSnapshots.isEmpty()) { - builder.setProperties( + metadataBuilder.setProperties( Collections.singletonMap( getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS), deletedSnapshots.stream() .map(s -> Long.toString(s.snapshotId())) .collect(Collectors.joining(",")))); } - - builder.removeProperties( + metadataBuilder.removeProperties( new HashSet<>( Arrays.asList( CatalogConstants.SNAPSHOTS_JSON_KEY, CatalogConstants.SNAPSHOTS_REFS_KEY))); + + return metadataBuilder.build(); + } + + void recordMetrics() { + // Record metrics for appended snapshots (only regular snapshots) + // Cherry-picked snapshots are all existing, not appended + if (this.appendedCount > 0) { + this.metricsReporter.count( + InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR, this.appendedCount); + } + if (!this.stagedSnapshots.isEmpty()) { + this.metricsReporter.count( + InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR, this.stagedSnapshots.size()); + } + if (!this.cherryPickedSnapshots.isEmpty()) { + this.metricsReporter.count( + InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR, + this.cherryPickedSnapshots.size()); + } + if (!this.deletedSnapshots.isEmpty()) { + this.metricsReporter.count( + InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR, this.deletedSnapshots.size()); + } } } } From 37af32c2f318ed2c4a32414883d45677f2c8f610 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Wed, 5 Nov 2025 15:44:55 -0800 Subject: [PATCH 32/35] small cleanup --- .../internal/catalog/SnapshotDiffApplier.java | 81 ++++++++++++------- 1 file changed, 51 insertions(+), 30 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java index 16c26d1d1..ca1bc60b7 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java @@ -10,6 +10,7 @@ import java.util.HashSet; import java.util.List; import java.util.Map; +import java.util.Objects; import java.util.Optional; import java.util.Set; import java.util.stream.Collectors; @@ -39,9 +40,13 @@ public class SnapshotDiffApplier { * @param existingMetadata The existing table metadata (may be null for table creation) * @param providedMetadata The new metadata with properties containing snapshot updates * @return Updated metadata with snapshots applied + * @throws NullPointerException if providedMetadata is null */ public TableMetadata applySnapshots( TableMetadata existingMetadata, TableMetadata providedMetadata) { + // Validate at system boundary + Objects.requireNonNull(providedMetadata, "providedMetadata cannot be null"); + String snapshotsJson = providedMetadata.properties().get(CatalogConstants.SNAPSHOTS_JSON_KEY); if (snapshotsJson == null) { return providedMetadata; @@ -70,15 +75,16 @@ public TableMetadata applySnapshots( providedRefs, existingRefs); - // Validate, apply, record metrics + // Validate, apply, record metrics (in correct order) diff.validate(); + TableMetadata result = diff.applyTo(); diff.recordMetrics(); - return diff.applyTo(); + return result; } /** * State object that computes and caches all snapshot analysis. Computes all maps once in the - * constructor to avoid redundant operations. Provides clear methods for validation and + * factory method to avoid redundant operations. Provides clear methods for validation and * application. */ private static class SnapshotDiff { @@ -110,6 +116,9 @@ private static class SnapshotDiff { /** * Creates a SnapshotDiff by computing all snapshot analysis from the provided inputs. * + *

Preconditions: All parameters except existingMetadata must be non-null. Collections should + * be empty rather than null. + * * @param metricsReporter Metrics reporter for recording snapshot operations * @param existingMetadata The existing table metadata (may be null for table creation) * @param providedSnapshots Snapshots provided in the update @@ -130,9 +139,15 @@ static SnapshotDiff create( // Compute all index maps once Map providedSnapshotByIds = - providedSnapshots.stream().collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); + providedSnapshots.stream() + .collect( + Collectors.toMap( + Snapshot::snapshotId, s -> s, (existing, replacement) -> existing)); Map existingSnapshotByIds = - existingSnapshots.stream().collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); + existingSnapshots.stream() + .collect( + Collectors.toMap( + Snapshot::snapshotId, s -> s, (existing, replacement) -> existing)); Set existingBranchRefIds = existingRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet()); Set providedBranchRefIds = @@ -275,12 +290,14 @@ private void validateCurrentSnapshotNotDeleted() { if (this.existingMetadata == null || this.existingMetadata.currentSnapshot() == null) { return; } - if (!newSnapshots.isEmpty()) { + if (!this.newSnapshots.isEmpty()) { return; } long latestSnapshotId = this.existingMetadata.currentSnapshot().snapshotId(); - if (!deletedSnapshots.isEmpty() - && deletedSnapshots.get(deletedSnapshots.size() - 1).snapshotId() == latestSnapshotId) { + // Check if the last deleted snapshot is the current one (snapshots are ordered by time) + if (!this.deletedSnapshots.isEmpty() + && this.deletedSnapshots.get(this.deletedSnapshots.size() - 1).snapshotId() + == latestSnapshotId) { throw new InvalidIcebergSnapshotException( String.format( "Cannot delete the current snapshot %s without adding replacement snapshots.", @@ -292,7 +309,7 @@ TableMetadata applyTo() { TableMetadata.Builder metadataBuilder = TableMetadata.buildFrom(this.providedMetadata); // Validate only MAIN branch - for (Map.Entry entry : providedRefs.entrySet()) { + for (Map.Entry entry : this.providedRefs.entrySet()) { if (!entry.getKey().equals(SnapshotRef.MAIN_BRANCH)) { throw new UnsupportedOperationException("OpenHouse supports only MAIN branch"); } @@ -307,20 +324,20 @@ TableMetadata applyTo() { * *

[3] Regular snapshots - set as main branch snapshot */ - for (Snapshot snapshot : stagedSnapshots) { + for (Snapshot snapshot : this.stagedSnapshots) { metadataBuilder.addSnapshot(snapshot); } // Cherry-picked snapshots are all existing, handled by fast-forward block below // (No need to apply them here) - for (Snapshot snapshot : regularSnapshots) { + for (Snapshot snapshot : this.regularSnapshots) { metadataBuilder.setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH); } // Handle fast-forward cherry-pick (ref update without new snapshot) - if (newSnapshots.isEmpty() && !providedRefs.isEmpty()) { - long newSnapshotId = providedRefs.get(SnapshotRef.MAIN_BRANCH).snapshotId(); + if (this.newSnapshots.isEmpty() && !this.providedRefs.isEmpty()) { + long newSnapshotId = this.providedRefs.get(SnapshotRef.MAIN_BRANCH).snapshotId(); if (this.providedMetadata.refs().isEmpty() || this.providedMetadata.refs().get(SnapshotRef.MAIN_BRANCH).snapshotId() != newSnapshotId) { @@ -329,9 +346,9 @@ TableMetadata applyTo() { } // Delete snapshots - if (!deletedSnapshots.isEmpty()) { + if (!this.deletedSnapshots.isEmpty()) { Set snapshotIds = - deletedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); + this.deletedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); metadataBuilder.removeSnapshots(snapshotIds); } @@ -340,33 +357,25 @@ TableMetadata applyTo() { metadataBuilder.setProperties( Collections.singletonMap( getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), - regularSnapshots.stream() - .map(s -> Long.toString(s.snapshotId())) - .collect(Collectors.joining(",")))); + formatSnapshotIds(this.regularSnapshots))); } - if (!stagedSnapshots.isEmpty()) { + if (!this.stagedSnapshots.isEmpty()) { metadataBuilder.setProperties( Collections.singletonMap( getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS), - stagedSnapshots.stream() - .map(s -> Long.toString(s.snapshotId())) - .collect(Collectors.joining(",")))); + formatSnapshotIds(this.stagedSnapshots))); } - if (!cherryPickedSnapshots.isEmpty()) { + if (!this.cherryPickedSnapshots.isEmpty()) { metadataBuilder.setProperties( Collections.singletonMap( getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS), - cherryPickedSnapshots.stream() - .map(s -> Long.toString(s.snapshotId())) - .collect(Collectors.joining(",")))); + formatSnapshotIds(this.cherryPickedSnapshots))); } - if (!deletedSnapshots.isEmpty()) { + if (!this.deletedSnapshots.isEmpty()) { metadataBuilder.setProperties( Collections.singletonMap( getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS), - deletedSnapshots.stream() - .map(s -> Long.toString(s.snapshotId())) - .collect(Collectors.joining(",")))); + formatSnapshotIds(this.deletedSnapshots))); } metadataBuilder.removeProperties( new HashSet<>( @@ -397,5 +406,17 @@ void recordMetrics() { InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR, this.deletedSnapshots.size()); } } + + /** + * Helper method to format a list of snapshots into a comma-separated string of snapshot IDs. + * + * @param snapshots List of snapshots to format + * @return Comma-separated string of snapshot IDs + */ + private static String formatSnapshotIds(List snapshots) { + return snapshots.stream() + .map(s -> Long.toString(s.snapshotId())) + .collect(Collectors.joining(",")); + } } } From 71bebbe6e3a13df7f31dfa902a1214db698a1b56 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Sat, 8 Nov 2025 13:00:09 -0800 Subject: [PATCH 33/35] responding to comments --- .../internal/catalog/SnapshotDiffApplier.java | 253 ++++++++----- .../OpenHouseInternalTableOperationsTest.java | 97 +++++ .../catalog/SnapshotDiffApplierTest.java | 337 ++++++++++++++++-- 3 files changed, 571 insertions(+), 116 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java index ca1bc60b7..b1055ae3d 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java @@ -48,16 +48,24 @@ public TableMetadata applySnapshots( Objects.requireNonNull(providedMetadata, "providedMetadata cannot be null"); String snapshotsJson = providedMetadata.properties().get(CatalogConstants.SNAPSHOTS_JSON_KEY); + Map providedRefs = + Optional.ofNullable(providedMetadata.properties().get(CatalogConstants.SNAPSHOTS_REFS_KEY)) + .map(SnapshotsUtil::parseSnapshotRefs) + .orElse(new HashMap<>()); + + // Validate MAIN-only restriction early (PR1 limitation) + for (Map.Entry entry : providedRefs.entrySet()) { + if (!entry.getKey().equals(SnapshotRef.MAIN_BRANCH)) { + throw new UnsupportedOperationException("OpenHouse supports only MAIN branch"); + } + } + if (snapshotsJson == null) { return providedMetadata; } // Parse input List providedSnapshots = SnapshotsUtil.parseSnapshots(null, snapshotsJson); - Map providedRefs = - Optional.ofNullable(providedMetadata.properties().get(CatalogConstants.SNAPSHOTS_REFS_KEY)) - .map(SnapshotsUtil::parseSnapshotRefs) - .orElse(new HashMap<>()); List existingSnapshots = existingMetadata != null ? existingMetadata.snapshots() : Collections.emptyList(); @@ -69,11 +77,11 @@ public TableMetadata applySnapshots( SnapshotDiff.create( metricsReporter, existingMetadata, - providedSnapshots, - existingSnapshots, providedMetadata, - providedRefs, - existingRefs); + existingSnapshots, + providedSnapshots, + existingRefs, + providedRefs); // Validate, apply, record metrics (in correct order) diff.validate(); @@ -93,11 +101,12 @@ private static class SnapshotDiff { // Input state private final TableMetadata existingMetadata; - private final List providedSnapshots; - private final List existingSnapshots; private final TableMetadata providedMetadata; - private final Map providedRefs; + private final String databaseId; + private final List existingSnapshots; + private final List providedSnapshots; private final Map existingRefs; + private final Map providedRefs; // Computed maps (created once) private final Map providedSnapshotByIds; @@ -106,10 +115,11 @@ private static class SnapshotDiff { private final Set providedBranchRefIds; private final List newSnapshots; private final List deletedSnapshots; + private final Set deletedIds; // Categorized snapshots - private final List stagedSnapshots; - private final List regularSnapshots; + private final List newStagedSnapshots; + private final List newMainBranchSnapshots; private final List cherryPickedSnapshots; private final int appendedCount; @@ -121,33 +131,27 @@ private static class SnapshotDiff { * * @param metricsReporter Metrics reporter for recording snapshot operations * @param existingMetadata The existing table metadata (may be null for table creation) - * @param providedSnapshots Snapshots provided in the update - * @param existingSnapshots Snapshots currently in the table * @param providedMetadata The new metadata with properties containing snapshot updates - * @param providedRefs Snapshot refs provided in the update + * @param existingSnapshots Snapshots currently in the table + * @param providedSnapshots Snapshots provided in the update * @param existingRefs Snapshot refs currently in the table + * @param providedRefs Snapshot refs provided in the update * @return A new SnapshotDiff with all analysis computed */ static SnapshotDiff create( MetricsReporter metricsReporter, TableMetadata existingMetadata, - List providedSnapshots, - List existingSnapshots, TableMetadata providedMetadata, - Map providedRefs, - Map existingRefs) { + List existingSnapshots, + List providedSnapshots, + Map existingRefs, + Map providedRefs) { // Compute all index maps once Map providedSnapshotByIds = - providedSnapshots.stream() - .collect( - Collectors.toMap( - Snapshot::snapshotId, s -> s, (existing, replacement) -> existing)); + providedSnapshots.stream().collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); Map existingSnapshotByIds = - existingSnapshots.stream() - .collect( - Collectors.toMap( - Snapshot::snapshotId, s -> s, (existing, replacement) -> existing)); + existingSnapshots.stream().collect(Collectors.toMap(Snapshot::snapshotId, s -> s)); Set existingBranchRefIds = existingRefs.values().stream().map(SnapshotRef::snapshotId).collect(Collectors.toSet()); Set providedBranchRefIds = @@ -162,9 +166,11 @@ static SnapshotDiff create( existingSnapshots.stream() .filter(s -> !providedSnapshotByIds.containsKey(s.snapshotId())) .collect(Collectors.toList()); + Set deletedIds = + deletedSnapshots.stream().map(Snapshot::snapshotId).collect(Collectors.toSet()); // Categorize snapshots - List stagedSnapshots = + List newStagedSnapshots = newSnapshots.stream() .filter(s -> s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) .collect(Collectors.toList()); @@ -198,37 +204,50 @@ static SnapshotDiff create( boolean hasWapId = provided.summary() != null && provided.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP); + // TODO: This works for MAIN branch only, but fails in the branch scenario and + // should be revisited in followup PR + // Snapshot exists on branch-A + // Cherry-pick to branch-B + // Would be classified as NOT wasStaged (because it's in existingBranchRefIds) + // Wouldn't be detected as cherry-picked boolean wasStaged = !existingBranchRefIds.contains(provided.snapshotId()); boolean isNowOnBranch = providedBranchRefIds.contains(provided.snapshotId()); return hasWapId && wasStaged && isNowOnBranch; }) .collect(Collectors.toList()); - // Regular snapshots = all new snapshots that are not staged WAP - List regularSnapshots = + // New main branch snapshots = all new snapshots that are not staged WAP + // (includes both regular commits and cherry-pick result snapshots) + List newMainBranchSnapshots = newSnapshots.stream() .filter(s -> !s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) .collect(Collectors.toList()); - // Compute appended count (only regular snapshots, not cherry-picked) - int appendedCount = regularSnapshots.size(); + // Compute appended count + int appendedCount = newMainBranchSnapshots.size(); + + // Extract database ID from metadata properties + String databaseId = + providedMetadata.properties().get(CatalogConstants.OPENHOUSE_DATABASEID_KEY); return new SnapshotDiff( metricsReporter, existingMetadata, - providedSnapshots, - existingSnapshots, providedMetadata, - providedRefs, + databaseId, + existingSnapshots, + providedSnapshots, existingRefs, + providedRefs, providedSnapshotByIds, existingSnapshotByIds, existingBranchRefIds, providedBranchRefIds, newSnapshots, deletedSnapshots, - stagedSnapshots, - regularSnapshots, + deletedIds, + newStagedSnapshots, + newMainBranchSnapshots, cherryPickedSnapshots, appendedCount); } @@ -237,36 +256,40 @@ static SnapshotDiff create( private SnapshotDiff( MetricsReporter metricsReporter, TableMetadata existingMetadata, - List providedSnapshots, - List existingSnapshots, TableMetadata providedMetadata, - Map providedRefs, + String databaseId, + List existingSnapshots, + List providedSnapshots, Map existingRefs, + Map providedRefs, Map providedSnapshotByIds, Map existingSnapshotByIds, Set existingBranchRefIds, Set providedBranchRefIds, List newSnapshots, List deletedSnapshots, - List stagedSnapshots, - List regularSnapshots, + Set deletedIds, + List newStagedSnapshots, + List newMainBranchSnapshots, List cherryPickedSnapshots, int appendedCount) { this.metricsReporter = metricsReporter; this.existingMetadata = existingMetadata; - this.providedSnapshots = providedSnapshots; - this.existingSnapshots = existingSnapshots; this.providedMetadata = providedMetadata; - this.providedRefs = providedRefs; + this.databaseId = databaseId; + this.existingSnapshots = existingSnapshots; + this.providedSnapshots = providedSnapshots; this.existingRefs = existingRefs; + this.providedRefs = providedRefs; this.providedSnapshotByIds = providedSnapshotByIds; this.existingSnapshotByIds = existingSnapshotByIds; this.existingBranchRefIds = existingBranchRefIds; this.providedBranchRefIds = providedBranchRefIds; this.newSnapshots = newSnapshots; this.deletedSnapshots = deletedSnapshots; - this.stagedSnapshots = stagedSnapshots; - this.regularSnapshots = regularSnapshots; + this.deletedIds = deletedIds; + this.newStagedSnapshots = newStagedSnapshots; + this.newMainBranchSnapshots = newMainBranchSnapshots; this.cherryPickedSnapshots = cherryPickedSnapshots; this.appendedCount = appendedCount; } @@ -278,6 +301,7 @@ private SnapshotDiff( */ void validate() { validateCurrentSnapshotNotDeleted(); + validateDeletedSnapshotsNotReferenced(); } /** @@ -293,6 +317,8 @@ private void validateCurrentSnapshotNotDeleted() { if (!this.newSnapshots.isEmpty()) { return; } + // TODO -- validate what are the requirements around deleting the latest snapshot on a + // "branch". long latestSnapshotId = this.existingMetadata.currentSnapshot().snapshotId(); // Check if the last deleted snapshot is the current one (snapshots are ordered by time) if (!this.deletedSnapshots.isEmpty() @@ -305,44 +331,69 @@ private void validateCurrentSnapshotNotDeleted() { } } - TableMetadata applyTo() { - TableMetadata.Builder metadataBuilder = TableMetadata.buildFrom(this.providedMetadata); + /** + * Validates that snapshots being deleted are not still referenced by any branches or tags. This + * prevents data loss and maintains referential integrity by ensuring that all branch and tag + * pointers reference valid snapshots that will continue to exist after the commit. + * + * @throws InvalidIcebergSnapshotException if any deleted snapshot is still referenced by a + * branch or tag + */ + private void validateDeletedSnapshotsNotReferenced() { + Map> referencedIdsToRefs = + providedRefs.entrySet().stream() + .collect( + Collectors.groupingBy( + e -> e.getValue().snapshotId(), + Collectors.mapping(Map.Entry::getKey, Collectors.toList()))); + + List invalidDeleteDetails = + deletedIds.stream() + .filter(referencedIdsToRefs::containsKey) + .map( + id -> + String.format( + "snapshot %s (referenced by: %s)", + id, String.join(", ", referencedIdsToRefs.get(id)))) + .collect(Collectors.toList()); - // Validate only MAIN branch - for (Map.Entry entry : this.providedRefs.entrySet()) { - if (!entry.getKey().equals(SnapshotRef.MAIN_BRANCH)) { - throw new UnsupportedOperationException("OpenHouse supports only MAIN branch"); - } + if (!invalidDeleteDetails.isEmpty()) { + throw new InvalidIcebergSnapshotException( + String.format( + "Cannot delete snapshots that are still referenced by branches/tags: %s", + String.join("; ", invalidDeleteDetails))); } + } + + TableMetadata applyTo() { + TableMetadata.Builder metadataBuilder = TableMetadata.buildFrom(this.providedMetadata); /** * Apply categorized snapshots to metadata: * *

[1] Staged (WAP) snapshots - added without branch reference * - *

[2] Cherry-picked snapshots - set as main branch snapshot + *

[2] New main branch snapshots - added without branch reference (branch pointer set + * below) * - *

[3] Regular snapshots - set as main branch snapshot + *

[3] Cherry-picked snapshots - existing snapshots, branch pointer set below */ - for (Snapshot snapshot : this.stagedSnapshots) { - metadataBuilder.addSnapshot(snapshot); - } - - // Cherry-picked snapshots are all existing, handled by fast-forward block below - // (No need to apply them here) - - for (Snapshot snapshot : this.regularSnapshots) { - metadataBuilder.setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH); - } - - // Handle fast-forward cherry-pick (ref update without new snapshot) - if (this.newSnapshots.isEmpty() && !this.providedRefs.isEmpty()) { + // Add staged snapshots in timestamp order (explicit ordering for consistency) + this.newStagedSnapshots.stream() + .sorted(java.util.Comparator.comparingLong(Snapshot::timestampMillis)) + .forEach(metadataBuilder::addSnapshot); + + // Add new main branch snapshots in timestamp order (explicit ordering) + // Note: While the branch pointer (not list order) determines currentSnapshot(), + // other code assumes snapshots are time-ordered (e.g., validation at line 308) + this.newMainBranchSnapshots.stream() + .sorted(java.util.Comparator.comparingLong(Snapshot::timestampMillis)) + .forEach(metadataBuilder::addSnapshot); + + // Set branch pointer once using providedRefs (covers both new snapshots and cherry-pick) + if (!this.providedRefs.isEmpty()) { long newSnapshotId = this.providedRefs.get(SnapshotRef.MAIN_BRANCH).snapshotId(); - if (this.providedMetadata.refs().isEmpty() - || this.providedMetadata.refs().get(SnapshotRef.MAIN_BRANCH).snapshotId() - != newSnapshotId) { - metadataBuilder.setBranchSnapshot(newSnapshotId, SnapshotRef.MAIN_BRANCH); - } + metadataBuilder.setBranchSnapshot(newSnapshotId, SnapshotRef.MAIN_BRANCH); } // Delete snapshots @@ -357,13 +408,13 @@ TableMetadata applyTo() { metadataBuilder.setProperties( Collections.singletonMap( getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS), - formatSnapshotIds(this.regularSnapshots))); + formatSnapshotIds(this.newMainBranchSnapshots))); } - if (!this.stagedSnapshots.isEmpty()) { + if (!this.newStagedSnapshots.isEmpty()) { metadataBuilder.setProperties( Collections.singletonMap( getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS), - formatSnapshotIds(this.stagedSnapshots))); + formatSnapshotIds(this.newStagedSnapshots))); } if (!this.cherryPickedSnapshots.isEmpty()) { metadataBuilder.setProperties( @@ -386,24 +437,34 @@ TableMetadata applyTo() { } void recordMetrics() { - // Record metrics for appended snapshots (only regular snapshots) - // Cherry-picked snapshots are all existing, not appended - if (this.appendedCount > 0) { - this.metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR, this.appendedCount); - } - if (!this.stagedSnapshots.isEmpty()) { - this.metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR, this.stagedSnapshots.size()); - } - if (!this.cherryPickedSnapshots.isEmpty()) { - this.metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR, - this.cherryPickedSnapshots.size()); - } - if (!this.deletedSnapshots.isEmpty()) { - this.metricsReporter.count( - InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR, this.deletedSnapshots.size()); + // Record metrics for appended snapshots (includes regular commits and cherry-pick results) + // Note: cherryPickedSnapshots list contains existing source snapshots, not the new results + recordMetricWithDatabaseTag( + InternalCatalogMetricsConstant.SNAPSHOTS_ADDED_CTR, this.appendedCount); + recordMetricWithDatabaseTag( + InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR, this.newStagedSnapshots.size()); + recordMetricWithDatabaseTag( + InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR, + this.cherryPickedSnapshots.size()); + recordMetricWithDatabaseTag( + InternalCatalogMetricsConstant.SNAPSHOTS_DELETED_CTR, this.deletedSnapshots.size()); + } + + /** + * Helper method to record a metric with database tag if count is greater than zero. + * + * @param metricName The name of the metric to record + * @param count The count value to record + */ + private void recordMetricWithDatabaseTag(String metricName, int count) { + if (count > 0) { + // Only add database tag if databaseId is present; otherwise record metric without tag + if (this.databaseId != null) { + this.metricsReporter.count( + metricName, count, InternalCatalogMetricsConstant.DATABASE_TAG, this.databaseId); + } else { + this.metricsReporter.count(metricName, count); + } } } diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java index 634d8eeb6..476435a61 100644 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java @@ -42,6 +42,7 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.Snapshot; import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.SnapshotSummary; import org.apache.iceberg.SortDirection; import org.apache.iceberg.SortOrder; import org.apache.iceberg.TableMetadata; @@ -489,6 +490,102 @@ void testDoCommitDoesntPersistForStagedTable() { .get()); } + /** + * Tests staged table creation with no snapshots (initial version). Verifies that the table + * metadata is set locally but no persistence occurs to the repository. + */ + @Test + void testStagedTableCreationWithoutSnapshots() throws IOException { + Map properties = new HashMap<>(BASE_TABLE_METADATA.properties()); + properties.put(CatalogConstants.IS_STAGE_CREATE_KEY, "true"); + + TableMetadata metadata = BASE_TABLE_METADATA.replaceProperties(properties); + + try (MockedStatic ignoreWriteMock = + Mockito.mockStatic(TableMetadataParser.class, Mockito.CALLS_REAL_METHODS)) { + openHouseInternalTableOperations.doCommit(null, metadata); + + // Verify TableMetadata is set locally + Assertions.assertNotNull(openHouseInternalTableOperations.currentMetadataLocation()); + Assertions.assertNotNull(openHouseInternalTableOperations.current()); + + // Verify no snapshots were added + Assertions.assertEquals(0, openHouseInternalTableOperations.current().snapshots().size()); + + // Verify no persistence to repository + verify(mockHouseTableRepository, times(0)).save(any()); + + // Verify no snapshot properties were set + Map resultProperties = + openHouseInternalTableOperations.current().properties(); + Assertions.assertNull(resultProperties.get(getCanonicalFieldName("appended_snapshots"))); + Assertions.assertNull(resultProperties.get(getCanonicalFieldName("staged_snapshots"))); + Assertions.assertNull(resultProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); + Assertions.assertNull(resultProperties.get(getCanonicalFieldName("deleted_snapshots"))); + } + } + + /** + * Tests staged table creation with staged (WAP) snapshots. Verifies that staged snapshots are + * added to the table but no persistence occurs to the repository. + */ + @Test + void testStagedTableCreationWithStagedSnapshots() throws IOException { + List testWapSnapshots = IcebergTestUtil.getWapSnapshots().subList(0, 2); + Map properties = new HashMap<>(BASE_TABLE_METADATA.properties()); + properties.put(CatalogConstants.IS_STAGE_CREATE_KEY, "true"); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(testWapSnapshots)); + + TableMetadata metadata = BASE_TABLE_METADATA.replaceProperties(properties); + + try (MockedStatic ignoreWriteMock = + Mockito.mockStatic(TableMetadataParser.class, Mockito.CALLS_REAL_METHODS)) { + openHouseInternalTableOperations.doCommit(null, metadata); + + // Verify TableMetadata is set locally + Assertions.assertNotNull(openHouseInternalTableOperations.currentMetadataLocation()); + Assertions.assertNotNull(openHouseInternalTableOperations.current()); + + // Verify staged snapshots were added + TableMetadata currentMetadata = openHouseInternalTableOperations.current(); + Assertions.assertEquals( + testWapSnapshots.size(), + currentMetadata.snapshots().size(), + "Staged snapshots should be added"); + + // Verify all snapshots are staged (have WAP ID) + for (Snapshot snapshot : currentMetadata.snapshots()) { + Assertions.assertTrue( + snapshot.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP), + "All snapshots should be staged with WAP ID"); + } + + // Verify no branch references exist (staged snapshots should not be on main) + Assertions.assertTrue( + currentMetadata.refs().isEmpty() + || !currentMetadata.refs().containsKey(SnapshotRef.MAIN_BRANCH), + "Staged snapshots should not have main branch reference"); + + // Verify no persistence to repository + verify(mockHouseTableRepository, times(0)).save(any()); + + // Verify snapshot properties tracking + Map resultProperties = currentMetadata.properties(); + Assertions.assertEquals( + testWapSnapshots.stream() + .map(s -> Long.toString(s.snapshotId())) + .collect(Collectors.joining(",")), + resultProperties.get(getCanonicalFieldName("staged_snapshots")), + "Staged snapshots should be tracked in properties"); + Assertions.assertNull( + resultProperties.get(getCanonicalFieldName("appended_snapshots")), + "No snapshots should be appended to main"); + Assertions.assertNull(resultProperties.get(getCanonicalFieldName("cherry_picked_snapshots"))); + Assertions.assertNull(resultProperties.get(getCanonicalFieldName("deleted_snapshots"))); + } + } + /** * Tests that repository exceptions are properly converted to Iceberg exceptions. Verifies that * various repository exceptions map to CommitFailedException or CommitStateUnknownException. diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java index 08fc48a52..f325459df 100644 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java @@ -4,6 +4,8 @@ import static org.junit.jupiter.api.Assertions.*; import static org.mockito.Mockito.*; +import com.google.gson.Gson; +import com.google.gson.JsonObject; import com.linkedin.openhouse.cluster.metrics.micrometer.MetricsReporter; import com.linkedin.openhouse.internal.catalog.exception.InvalidIcebergSnapshotException; import java.io.IOException; @@ -19,8 +21,10 @@ import org.apache.iceberg.Schema; import org.apache.iceberg.Snapshot; import org.apache.iceberg.SnapshotRef; +import org.apache.iceberg.SnapshotRefParser; import org.apache.iceberg.SortOrder; import org.apache.iceberg.TableMetadata; +import org.apache.iceberg.exceptions.ValidationException; import org.apache.iceberg.types.Types; import org.junit.jupiter.api.BeforeEach; import org.junit.jupiter.api.Test; @@ -116,8 +120,6 @@ private TableMetadata addSnapshotsToMetadata(TableMetadata metadata, List snapshots = IcebergTestUtil.getSnapshots(); TableMetadata newMetadata = createMetadataWithSnapshotsAndMainRef(baseMetadata, snapshots); @@ -141,9 +143,9 @@ void testApplySnapshots_nullBase_handlesTableCreation() throws IOException { // ========== Basic Functionality Tests ========== - /** Verifies that new snapshots are added correctly. */ + /** Verifies that new snapshots are added correctly to the main branch. */ @Test - void testApplySnapshots_addNewSnapshots_success() throws IOException { + void testApplySnapshots_addNewSnapshotsToMainBranch_success() throws IOException { List initialSnapshots = IcebergTestUtil.getSnapshots(); TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, initialSnapshots); @@ -159,9 +161,9 @@ void testApplySnapshots_addNewSnapshots_success() throws IOException { verify(mockMetricsReporter, atLeastOnce()).count(anyString(), anyDouble()); } - /** Verifies that deleting snapshots works correctly and updates main branch. */ + /** Verifies that deleting snapshots from main branch works correctly. */ @Test - void testApplySnapshots_deleteSnapshots_success() throws IOException { + void testApplySnapshots_deleteSnapshotsFromMainBranch_success() throws IOException { List snapshots = IcebergTestUtil.getSnapshots(); TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); @@ -175,9 +177,9 @@ void testApplySnapshots_deleteSnapshots_success() throws IOException { assertEquals(remainingSnapshots.size(), result.snapshots().size()); } - /** Verifies that updating branch references works correctly. */ + /** Verifies that updating main branch references works correctly. */ @Test - void testApplySnapshots_branchUpdates_success() throws IOException { + void testApplySnapshots_mainBranchUpdates_success() throws IOException { List snapshots = IcebergTestUtil.getSnapshots(); TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); @@ -192,11 +194,52 @@ void testApplySnapshots_branchUpdates_success() throws IOException { assertEquals(newBranchTarget.snapshotId(), result.currentSnapshot().snapshotId()); } + /** Verifies that snapshots are added in timestamp order to the main branch. */ + @Test + void testApplySnapshots_snapshotsOrderedByTimestamp_success() throws IOException { + List initialSnapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, initialSnapshots); + + // Add extra snapshots which may have different timestamps + List extraSnapshots = IcebergTestUtil.getExtraSnapshots(); + List allSnapshots = new ArrayList<>(initialSnapshots); + allSnapshots.addAll(extraSnapshots); + + // Deliberately shuffle to ensure ordering is not dependent on input order + List shuffledSnapshots = new ArrayList<>(allSnapshots); + Collections.shuffle(shuffledSnapshots); + + TableMetadata newMetadata = + createMetadataWithSnapshotsAndMainRef(baseWithSnapshots, shuffledSnapshots); + + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); + + assertNotNull(result); + // Verify snapshots are ordered by timestamp + List resultSnapshots = result.snapshots(); + assertTrue(resultSnapshots.size() > 0, "Should have snapshots"); + + // Verify each snapshot timestamp is <= the next one + for (int i = 1; i < resultSnapshots.size(); i++) { + Snapshot prev = resultSnapshots.get(i - 1); + Snapshot current = resultSnapshots.get(i); + assertTrue( + prev.timestampMillis() <= current.timestampMillis(), + String.format( + "Snapshots should be ordered by timestamp: snapshot[%d].timestamp=%d " + + "should be <= snapshot[%d].timestamp=%d", + i - 1, prev.timestampMillis(), i, current.timestampMillis())); + } + } + // ========== Validation Tests ========== - /** Verifies that deleting the current snapshot without replacements throws an exception. */ + /** + * Verifies that deleting the current snapshot from main branch without replacements throws an + * exception. + */ @Test - void testValidation_deletingCurrentSnapshotWithoutReplacement_throwsException() + void testApplySnapshots_deletingCurrentSnapshotFromMainBranchWithoutReplacement_throwsException() throws IOException { List snapshots = IcebergTestUtil.getSnapshots(); TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); @@ -211,11 +254,32 @@ void testValidation_deletingCurrentSnapshotWithoutReplacement_throwsException() assertTrue(exception.getMessage().contains("Cannot delete the current snapshot")); } + + /** Verifies that duplicate snapshot IDs in provided snapshots throw an exception. */ + @Test + void testApplySnapshots_duplicateSnapshotIds_throwsException() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); + + // Create a list with duplicate snapshots (same snapshot ID appears twice) + List duplicateSnapshots = new ArrayList<>(); + duplicateSnapshots.add(snapshots.get(0)); + duplicateSnapshots.add(snapshots.get(0)); // Duplicate + + TableMetadata newMetadata = + createMetadataWithSnapshotsAndMainRef(baseWithSnapshots, duplicateSnapshots); + + // Should throw IllegalStateException due to duplicate keys in toMap collector + assertThrows( + IllegalStateException.class, + () -> snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata)); + } + // ========== Metrics Tests ========== - /** Verifies that WAP (staged) snapshots trigger the correct metrics. */ + /** Verifies that staged snapshots (not on main branch) trigger the correct metrics. */ @Test - void testMetrics_wapSnapshots_recordsStagedCounter() throws IOException { + void testMetrics_addStagedSnapshots_recordsStagedCounter() throws IOException { List baseSnapshots = IcebergTestUtil.getSnapshots(); TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, baseSnapshots); @@ -234,9 +298,9 @@ void testMetrics_wapSnapshots_recordsStagedCounter() throws IOException { .count(eq(InternalCatalogMetricsConstant.SNAPSHOTS_STAGED_CTR), anyDouble()); } - /** Verifies that deleting snapshots triggers the correct metrics. */ + /** Verifies that deleting snapshots from main branch triggers the correct metrics. */ @Test - void testMetrics_deleteSnapshots_recordsDeletedCounter() throws IOException { + void testMetrics_deleteSnapshotsFromMainBranch_recordsDeletedCounter() throws IOException { List snapshots = IcebergTestUtil.getSnapshots(); TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); @@ -254,9 +318,9 @@ void testMetrics_deleteSnapshots_recordsDeletedCounter() throws IOException { // ========== Property Management Tests ========== - /** Verifies that appended snapshot IDs are recorded in properties. */ + /** Verifies that appended snapshot IDs to main branch are recorded in properties. */ @Test - void testProperties_appendedSnapshots_recordedCorrectly() throws IOException { + void testProperties_appendedSnapshotsToMainBranch_recordedCorrectly() throws IOException { List baseSnapshots = IcebergTestUtil.getSnapshots(); TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, baseSnapshots); @@ -281,9 +345,12 @@ void testProperties_appendedSnapshots_recordedCorrectly() throws IOException { } } - /** Verifies that temporary snapshot processing keys are removed from final properties. */ + /** + * Verifies that temporary snapshot processing keys are removed from final properties when adding + * to main branch. + */ @Test - void testProperties_tempKeysRemoved_success() throws IOException { + void testProperties_tempKeysRemovedForMainBranch_success() throws IOException { List snapshots = IcebergTestUtil.getSnapshots(); TableMetadata newMetadata = createMetadataWithSnapshotsAndMainRef(baseMetadata, snapshots); @@ -297,4 +364,234 @@ void testProperties_tempKeysRemoved_success() throws IOException { result.properties().containsKey(CatalogConstants.SNAPSHOTS_REFS_KEY), "Temp snapshots refs key should be removed"); } + + /** Verifies that providing a non-MAIN branch reference throws UnsupportedOperationException. */ + @Test + void testApplySnapshots_nonMainBranchReference_throwsUnsupportedOperationException() + throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + Snapshot lastSnapshot = snapshots.get(snapshots.size() - 1); + + // Create refs with a feature branch instead of MAIN + Map refs = new HashMap<>(); + SnapshotRef featureBranchRef = SnapshotRef.branchBuilder(lastSnapshot.snapshotId()).build(); + refs.put("feature-branch", SnapshotRefParser.toJson(featureBranchRef)); + + TableMetadata newMetadata = createMetadataWithSnapshots(baseMetadata, snapshots, refs); + + UnsupportedOperationException exception = + assertThrows( + UnsupportedOperationException.class, + () -> snapshotDiffApplier.applySnapshots(null, newMetadata)); + + assertTrue(exception.getMessage().contains("OpenHouse supports only MAIN branch")); + } + + /** + * Verifies that providing a branch ref pointing to a non-existent snapshot ID causes an + * exception. This tests a critical bug where no validation exists before calling + * setBranchSnapshot. + */ + @Test + void testApplySnapshots_refPointingToNonExistentSnapshot_throwsException() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + + // Create a ref pointing to a snapshot ID that doesn't exist in the snapshot list + long nonExistentSnapshotId = 999999999L; + Map refs = new HashMap<>(); + SnapshotRef invalidRef = SnapshotRef.branchBuilder(nonExistentSnapshotId).build(); + refs.put(SnapshotRef.MAIN_BRANCH, SnapshotRefParser.toJson(invalidRef)); + + TableMetadata newMetadata = createMetadataWithSnapshots(baseMetadata, snapshots, refs); + + // Iceberg's setBranchSnapshot should throw ValidationException when snapshot doesn't exist + assertThrows( + ValidationException.class, () -> snapshotDiffApplier.applySnapshots(null, newMetadata)); + } + + /** + * Verifies that attempting to set a ref to a snapshot being deleted throws an exception. The + * validation correctly catches this case where a commit attempts to both delete a snapshot and + * set the main branch to point to that deleted snapshot. This prevents leaving the table in an + * invalid state. + */ + @Test + void testApplySnapshots_settingRefToDeletedSnapshot_throwsException() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); + + // Try to delete the first snapshot, then point main branch to the first (deleted) one + Snapshot snapshotToDelete = snapshots.get(0); + List remainingSnapshots = snapshots.subList(1, snapshots.size()); + + // Create refs pointing to the snapshot we're trying to delete + Map refs = new HashMap<>(); + SnapshotRef mainRef = SnapshotRef.branchBuilder(snapshotToDelete.snapshotId()).build(); + refs.put(SnapshotRef.MAIN_BRANCH, SnapshotRefParser.toJson(mainRef)); + + TableMetadata newMetadata = + createMetadataWithSnapshots(baseWithSnapshots, remainingSnapshots, refs); + + // This should throw an exception because we're trying to delete a snapshot + // while setting a branch reference to it + InvalidIcebergSnapshotException exception = + assertThrows( + InvalidIcebergSnapshotException.class, + () -> snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata)); + + assertTrue( + exception + .getMessage() + .contains("Cannot delete snapshots that are still referenced by branches/tags")); + assertTrue(exception.getMessage().contains("snapshot " + snapshotToDelete.snapshotId())); + assertTrue(exception.getMessage().contains("main")); + } + + /** + * Verifies that a snapshot with an invalid (non-numeric) source snapshot ID in cherry-pick causes + * JsonSyntaxException during parsing. NOTE: This fails at the JSON parsing stage due to Iceberg's + * strict validation, not at the cherry-pick categorization stage. + */ + @Test + void testApplySnapshots_invalidCherryPickSourceSnapshotId_failsAtParsingStage() { + // Create a custom snapshot JSON with invalid source-snapshot-id using Gson + // Note: Iceberg validates snapshot structure strictly, so this fails at Gson parsing + Gson gson = new Gson(); + JsonObject snapshotJson = new JsonObject(); + snapshotJson.addProperty("snapshot-id", 1234567890123456789L); + snapshotJson.addProperty("timestamp-ms", 1669126937912L); + JsonObject summary = new JsonObject(); + summary.addProperty("operation", "append"); + summary.addProperty("source-snapshot-id", "not-a-number"); + snapshotJson.add("summary", summary); + snapshotJson.addProperty("manifest-list", "/tmp/test.avro"); + snapshotJson.addProperty("schema-id", 0); + + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put(CatalogConstants.SNAPSHOTS_JSON_KEY, "[" + gson.toJson(snapshotJson) + "]"); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + + // Should throw JsonSyntaxException when Gson tries to parse the invalid source-snapshot-id + assertThrows( + com.google.gson.JsonSyntaxException.class, + () -> snapshotDiffApplier.applySnapshots(null, newMetadata)); + } + + /** + * Verifies that a snapshot with null summary is handled correctly during WAP detection. Tests + * lines 172, 180, 202 which check snapshot.summary(). NOTE: This currently fails at Iceberg's + * parsing stage due to strict validation. + */ + @Test + void testApplySnapshots_snapshotWithNullSummary_failsAtParsingStage() { + // Create a custom snapshot JSON with null/missing summary using Gson + // Note: Iceberg validates snapshot structure strictly, so this fails at parsing + Gson gson = new Gson(); + JsonObject snapshotJson = new JsonObject(); + snapshotJson.addProperty("snapshot-id", 1234567890123456789L); + snapshotJson.addProperty("timestamp-ms", 1669126937912L); + snapshotJson.addProperty("manifest-list", "/tmp/test.avro"); + snapshotJson.addProperty("schema-id", 0); + + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put(CatalogConstants.SNAPSHOTS_JSON_KEY, "[" + gson.toJson(snapshotJson) + "]"); + + // Add a main branch ref pointing to this snapshot + Map refs = new HashMap<>(); + SnapshotRef mainRef = SnapshotRef.branchBuilder(1234567890123456789L).build(); + refs.put(SnapshotRef.MAIN_BRANCH, SnapshotRefParser.toJson(mainRef)); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap(refs)); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + + // Should throw JsonSyntaxException during Iceberg parsing due to missing required summary + assertThrows( + com.google.gson.JsonSyntaxException.class, + () -> snapshotDiffApplier.applySnapshots(null, newMetadata)); + } + + /** + * Verifies behavior when provided snapshots are empty but refs are not. Tests that a ref pointing + * to nothing causes an exception. + */ + @Test + void testApplySnapshots_emptySnapshotsWithNonEmptyRefs_throwsException() { + // Create refs pointing to a snapshot that doesn't exist + Map refs = new HashMap<>(); + SnapshotRef mainRef = SnapshotRef.branchBuilder(123456789L).build(); + refs.put(SnapshotRef.MAIN_BRANCH, SnapshotRefParser.toJson(mainRef)); + + TableMetadata newMetadata = + createMetadataWithSnapshots(baseMetadata, Collections.emptyList(), refs); + + // Should throw ValidationException because ref points to non-existent snapshot + assertThrows( + org.apache.iceberg.exceptions.ValidationException.class, + () -> snapshotDiffApplier.applySnapshots(null, newMetadata)); + } + + /** Verifies that null providedMetadata throws NullPointerException. */ + @Test + void testApplySnapshots_nullProvidedMetadata_throwsNullPointerException() { + NullPointerException exception = + assertThrows( + NullPointerException.class, + () -> snapshotDiffApplier.applySnapshots(baseMetadata, null)); + + assertTrue(exception.getMessage().contains("providedMetadata cannot be null")); + } + + /** Verifies that malformed JSON in SNAPSHOTS_JSON_KEY property throws exception. */ + @Test + void testApplySnapshots_malformedSnapshotsJson_throwsException() { + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put(CatalogConstants.SNAPSHOTS_JSON_KEY, "{ invalid json {{"); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + + // Should throw JsonSyntaxException or similar from Gson + assertThrows( + com.google.gson.JsonSyntaxException.class, + () -> snapshotDiffApplier.applySnapshots(null, newMetadata)); + } + + /** Verifies that malformed JSON in SNAPSHOTS_REFS_KEY property throws exception. */ + @Test + void testApplySnapshots_malformedRefsJson_throwsException() throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + Map properties = new HashMap<>(baseMetadata.properties()); + properties.put( + CatalogConstants.SNAPSHOTS_JSON_KEY, SnapshotsUtil.serializedSnapshots(snapshots)); + properties.put(CatalogConstants.SNAPSHOTS_REFS_KEY, "{ invalid json {{"); + + TableMetadata newMetadata = baseMetadata.replaceProperties(properties); + + // Should throw JsonSyntaxException or similar from Gson + assertThrows( + com.google.gson.JsonSyntaxException.class, + () -> snapshotDiffApplier.applySnapshots(null, newMetadata)); + } + + /** + * Verifies behavior when attempting to delete all snapshots with no replacement. This should be + * caught by the existing validation. + */ + @Test + void testApplySnapshots_deletingAllSnapshotsWithNoReplacement_throwsException() + throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); + + // Try to delete all snapshots without providing replacements + TableMetadata newMetadata = + createMetadataWithSnapshots(baseWithSnapshots, Collections.emptyList(), new HashMap<>()); + + InvalidIcebergSnapshotException exception = + assertThrows( + InvalidIcebergSnapshotException.class, + () -> snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata)); + + assertTrue(exception.getMessage().contains("Cannot delete the current snapshot")); + } } From 323aa5a2481acfd04551f8e448a75ee64aa669a6 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Sun, 9 Nov 2025 02:55:41 -0800 Subject: [PATCH 34/35] adding more tests, and fixing small bug --- .../internal/catalog/SnapshotDiffApplier.java | 26 +- .../catalog/SnapshotDiffApplierTest.java | 249 ++++++++++++++++++ 2 files changed, 265 insertions(+), 10 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java index b1055ae3d..08db88ba1 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java @@ -85,7 +85,7 @@ public TableMetadata applySnapshots( // Validate, apply, record metrics (in correct order) diff.validate(); - TableMetadata result = diff.applyTo(); + TableMetadata result = diff.apply(); diff.recordMetrics(); return result; } @@ -365,7 +365,7 @@ private void validateDeletedSnapshotsNotReferenced() { } } - TableMetadata applyTo() { + TableMetadata apply() { TableMetadata.Builder metadataBuilder = TableMetadata.buildFrom(this.providedMetadata); /** @@ -378,22 +378,28 @@ TableMetadata applyTo() { * *

[3] Cherry-picked snapshots - existing snapshots, branch pointer set below */ - // Add staged snapshots in timestamp order (explicit ordering for consistency) + // Add staged snapshots in sequence number order (ensures correct commit ordering) this.newStagedSnapshots.stream() - .sorted(java.util.Comparator.comparingLong(Snapshot::timestampMillis)) + .sorted(java.util.Comparator.comparingLong(Snapshot::sequenceNumber)) .forEach(metadataBuilder::addSnapshot); - // Add new main branch snapshots in timestamp order (explicit ordering) - // Note: While the branch pointer (not list order) determines currentSnapshot(), - // other code assumes snapshots are time-ordered (e.g., validation at line 308) - this.newMainBranchSnapshots.stream() - .sorted(java.util.Comparator.comparingLong(Snapshot::timestampMillis)) - .forEach(metadataBuilder::addSnapshot); + // Add new main branch snapshots in sequence number order (ensures correct commit ordering) + List sortedMainBranchSnapshots = + this.newMainBranchSnapshots.stream() + .sorted(java.util.Comparator.comparingLong(Snapshot::sequenceNumber)) + .collect(Collectors.toList()); + sortedMainBranchSnapshots.forEach(metadataBuilder::addSnapshot); // Set branch pointer once using providedRefs (covers both new snapshots and cherry-pick) if (!this.providedRefs.isEmpty()) { long newSnapshotId = this.providedRefs.get(SnapshotRef.MAIN_BRANCH).snapshotId(); metadataBuilder.setBranchSnapshot(newSnapshotId, SnapshotRef.MAIN_BRANCH); + } else if (!sortedMainBranchSnapshots.isEmpty()) { + // Auto-append to main: if no refs provided but there are new main branch snapshots, + // set MAIN to the last snapshot (latest by sequence number due to sort above) + Snapshot latestSnapshot = + sortedMainBranchSnapshots.get(sortedMainBranchSnapshots.size() - 1); + metadataBuilder.setBranchSnapshot(latestSnapshot.snapshotId(), SnapshotRef.MAIN_BRANCH); } // Delete snapshots diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java index f325459df..5a7bec3d7 100644 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java @@ -594,4 +594,253 @@ void testApplySnapshots_deletingAllSnapshotsWithNoReplacement_throwsException() assertTrue(exception.getMessage().contains("Cannot delete the current snapshot")); } + + /** + * Verifies transition from table with unreferenced snapshots to having a MAIN branch. Tests + * ref-only update without snapshot changes. + */ + @Test + void testApplySnapshots_baseWithUnreferencedSnapshotsOnly_addFirstMainBranch() + throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + + // Create base with snapshots but no refs (all unreferenced) + TableMetadata base = baseMetadata; + for (Snapshot snapshot : snapshots) { + base = TableMetadata.buildFrom(base).addSnapshot(snapshot).build(); + } + // Verify no refs in base + assertTrue(base.refs().isEmpty() || !base.refs().containsKey(SnapshotRef.MAIN_BRANCH)); + + // Provided: same snapshots + MAIN ref to one of them + Snapshot mainSnapshot = snapshots.get(2); + Map refs = IcebergTestUtil.obtainSnapshotRefsFromSnapshot(mainSnapshot); + TableMetadata newMetadata = createMetadataWithSnapshots(base, snapshots, refs); + + TableMetadata result = snapshotDiffApplier.applySnapshots(base, newMetadata); + + // Verify MAIN ref is set + assertNotNull(result.currentSnapshot()); + assertEquals(mainSnapshot.snapshotId(), result.currentSnapshot().snapshotId()); + + // Verify no add/delete operations (ref-only update) + assertEquals(snapshots.size(), result.snapshots().size()); + Map resultProps = result.properties(); + assertNull(resultProps.get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS))); + assertNull(resultProps.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS))); + } + + /** + * Verifies table creation with no snapshots (empty state). Tests that an empty table can be + * created successfully. + */ + @Test + void testApplySnapshots_nullBaseEmptySnapshotsEmptyRefs_createsEmptyTable() { + // Provided: empty snapshots list, empty refs + TableMetadata newMetadata = + createMetadataWithSnapshots(baseMetadata, Collections.emptyList(), new HashMap<>()); + + TableMetadata result = snapshotDiffApplier.applySnapshots(null, newMetadata); + + // Verify empty table created + assertNotNull(result); + assertEquals(0, result.snapshots().size()); + assertNull(result.currentSnapshot()); + assertTrue(result.refs().isEmpty() || !result.refs().containsKey(SnapshotRef.MAIN_BRANCH)); + + // Verify no snapshot operations tracked + Map resultProps = result.properties(); + assertNull(resultProps.get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS))); + assertNull(resultProps.get(getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS))); + assertNull(resultProps.get(getCanonicalFieldName(CatalogConstants.DELETED_SNAPSHOTS))); + } + + /** + * Verifies adding both regular and staged snapshots in a single commit. Tests that snapshot + * categorization correctly handles mixed types. + */ + @Test + void testApplySnapshots_addRegularAndStagedSimultaneously() throws IOException { + // Start from empty base (no existing snapshots) + // Simulate a commit that adds both regular and staged snapshots simultaneously + + List extraSnapshots = IcebergTestUtil.getExtraSnapshots(); + + // Create a custom WAP snapshot without hardcoded sequence number to avoid conflicts + // Build snapshot JSON manually and wrap it in a Gson array + String wapSnapshotJson = + String.format( + "{\"snapshot-id\":%d,\"timestamp-ms\":%d,\"summary\":%s,\"manifest-list\":\"%s\",\"schema-id\":%d}", + 999940701710231339L, + 1669126937912L, + new Gson() + .toJson( + Map.of( + "operation", "append", + "wap.id", "test-wap", + "spark.app.id", "local-1669126906634", + "added-data-files", "1", + "added-records", "1")), + "/data/test.avro", + 0); + String wapSnapshotArrayJson = new Gson().toJson(List.of(wapSnapshotJson)); + List customWapSnapshots = SnapshotsUtil.parseSnapshots(null, wapSnapshotArrayJson); + + List allSnapshots = new ArrayList<>(); + allSnapshots.add(extraSnapshots.get(0)); // New regular snapshot + allSnapshots.add(customWapSnapshots.get(0)); // New staged snapshot + + // MAIN ref points to the new regular snapshot + Map refs = + IcebergTestUtil.obtainSnapshotRefsFromSnapshot(extraSnapshots.get(0)); + TableMetadata newMetadata = createMetadataWithSnapshots(baseMetadata, allSnapshots, refs); + + TableMetadata result = snapshotDiffApplier.applySnapshots(null, newMetadata); + + // Verify both snapshots added + assertEquals(2, result.snapshots().size()); + + // Verify regular snapshot is on MAIN + assertNotNull(result.currentSnapshot()); + assertEquals(extraSnapshots.get(0).snapshotId(), result.currentSnapshot().snapshotId()); + + // Verify tracking: regular appended, staged tracked separately + Map resultProps = result.properties(); + String appendedSnapshotsStr = + resultProps.get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS)); + String stagedSnapshotsStr = + resultProps.get(getCanonicalFieldName(CatalogConstants.STAGED_SNAPSHOTS)); + + assertNotNull(appendedSnapshotsStr); + assertTrue(appendedSnapshotsStr.contains(Long.toString(extraSnapshots.get(0).snapshotId()))); + + assertNotNull(stagedSnapshotsStr); + assertTrue(stagedSnapshotsStr.contains(Long.toString(customWapSnapshots.get(0).snapshotId()))); + } + + /** + * Verifies cherry-picking a staged snapshot while adding a new snapshot in the same commit. Tests + * compound operation tracking. + */ + @Test + void testApplySnapshots_cherryPickAndAddNewSimultaneously() throws IOException { + List testWapSnapshots = IcebergTestUtil.getWapSnapshots(); + + // Base: MAIN snapshot + staged snapshot + TableMetadata base = + TableMetadata.buildFrom(baseMetadata) + .setBranchSnapshot(testWapSnapshots.get(0), SnapshotRef.MAIN_BRANCH) + .addSnapshot(testWapSnapshots.get(1)) // Staged snapshot + .build(); + + // Provided: existing + new snapshot becomes MAIN, staged is cherry-picked + List allSnapshots = new ArrayList<>(); + allSnapshots.add(testWapSnapshots.get(0)); + allSnapshots.add(testWapSnapshots.get(1)); // Was staged, now cherry-picked + allSnapshots.add(testWapSnapshots.get(2)); // New snapshot + + // MAIN ref points to new snapshot + Map refs = + IcebergTestUtil.obtainSnapshotRefsFromSnapshot(testWapSnapshots.get(2)); + TableMetadata newMetadata = createMetadataWithSnapshots(base, allSnapshots, refs); + + TableMetadata result = snapshotDiffApplier.applySnapshots(base, newMetadata); + + // Verify new snapshot is on MAIN + assertNotNull(result.currentSnapshot()); + assertEquals(testWapSnapshots.get(2).snapshotId(), result.currentSnapshot().snapshotId()); + + // Verify both operations tracked + Map resultProps = result.properties(); + String appendedSnapshotsStr = + resultProps.get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS)); + String cherryPickedSnapshotsStr = + resultProps.get(getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS)); + + // New snapshot should be appended + assertNotNull(appendedSnapshotsStr); + assertTrue(appendedSnapshotsStr.contains(Long.toString(testWapSnapshots.get(2).snapshotId()))); + + // Staged snapshot should be cherry-picked + assertNotNull(cherryPickedSnapshotsStr); + assertTrue( + cherryPickedSnapshotsStr.contains(Long.toString(testWapSnapshots.get(1).snapshotId()))); + } + + /** + * Verifies that attempting to delete the current snapshot while unreferenced snapshots exist + * throws an exception. Tests current snapshot protection. + */ + @Test + void testApplySnapshots_attemptDeleteCurrentWithUnreferencedPresent_throwsException() + throws IOException { + List snapshots = IcebergTestUtil.getSnapshots(); + + // Base: MAIN snapshot + 2 unreferenced snapshots + TableMetadata base = + TableMetadata.buildFrom(baseMetadata) + .addSnapshot(snapshots.get(0)) // Unreferenced + .addSnapshot(snapshots.get(1)) // Unreferenced + .setBranchSnapshot(snapshots.get(2), SnapshotRef.MAIN_BRANCH) // Current snapshot + .build(); + + // Provided: only the 2 unreferenced (delete MAIN), no new snapshots + List remainingSnapshots = snapshots.subList(0, 2); + TableMetadata newMetadata = + createMetadataWithSnapshots(base, remainingSnapshots, new HashMap<>()); + + // Should throw exception because current snapshot is being deleted without replacement + InvalidIcebergSnapshotException exception = + assertThrows( + InvalidIcebergSnapshotException.class, + () -> snapshotDiffApplier.applySnapshots(base, newMetadata)); + + assertTrue(exception.getMessage().contains("Cannot delete the current snapshot")); + assertTrue(exception.getMessage().contains(Long.toString(snapshots.get(2).snapshotId()))); + } + + /** + * Verifies adding regular (non-WAP) snapshots with empty refs. historically, such snapshots were + * automatically added to MAIN branch and tracked as APPENDED_SNAPSHOTS. This test validates + * backward compatibility with that behavior. NOTE: The semantics here are questionable - + * snapshots with no refs should arguably not be "appended" to MAIN, but this preserves the + * original behavior. + */ + @Test + void testApplySnapshots_regularSnapshotsWithEmptyRefs_autoAppendedToMain() throws IOException { + List baseSnapshots = IcebergTestUtil.getSnapshots(); + TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, baseSnapshots); + + // Provided: existing + new snapshots, but empty refs map (no MAIN branch) + List extraSnapshots = IcebergTestUtil.getExtraSnapshots(); + List allSnapshots = new ArrayList<>(baseSnapshots); + allSnapshots.addAll(extraSnapshots); + + // Empty refs - no MAIN branch + TableMetadata newMetadata = + createMetadataWithSnapshots(baseWithSnapshots, allSnapshots, new HashMap<>()); + + TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); + + // Verify new snapshots added + assertEquals(allSnapshots.size(), result.snapshots().size()); + + // Verify MAIN branch points to the latest snapshot (auto-appended to main) + assertNotNull(result.ref(SnapshotRef.MAIN_BRANCH)); + assertEquals( + allSnapshots.get(allSnapshots.size() - 1).snapshotId(), + result.ref(SnapshotRef.MAIN_BRANCH).snapshotId()); + + // Verify new snapshots tracked as appended (even though unreferenced, they're not staged WAP) + Map resultProps = result.properties(); + String appendedSnapshotsStr = + resultProps.get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS)); + + assertNotNull(appendedSnapshotsStr); + for (Snapshot extraSnapshot : extraSnapshots) { + assertTrue( + appendedSnapshotsStr.contains(Long.toString(extraSnapshot.snapshotId())), + "Snapshot " + extraSnapshot.snapshotId() + " should be tracked as appended"); + } + } } From b51d5fbb4208a963a2a3706825a7c8aa24fcf9f9 Mon Sep 17 00:00:00 2001 From: cbb330 Date: Sun, 9 Nov 2025 20:47:10 -0800 Subject: [PATCH 35/35] responding to comments, adding test --- .../internal/catalog/SnapshotDiffApplier.java | 56 +++++----- .../internal/catalog/IcebergTestUtil.java | 5 +- .../OpenHouseInternalTableOperationsTest.java | 34 +++--- .../catalog/SnapshotDiffApplierTest.java | 102 ++++++++++++++++-- 4 files changed, 145 insertions(+), 52 deletions(-) diff --git a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java index 08db88ba1..345f811ef 100644 --- a/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java +++ b/iceberg/openhouse/internalcatalog/src/main/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplier.java @@ -172,7 +172,10 @@ static SnapshotDiff create( // Categorize snapshots List newStagedSnapshots = newSnapshots.stream() - .filter(s -> s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) + .filter( + s -> + s.summary() != null + && s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) .collect(Collectors.toList()); // Compute source IDs for cherry-pick operations @@ -220,7 +223,10 @@ static SnapshotDiff create( // (includes both regular commits and cherry-pick result snapshots) List newMainBranchSnapshots = newSnapshots.stream() - .filter(s -> !s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) + .filter( + s -> + s.summary() == null + || !s.summary().containsKey(SnapshotSummary.STAGED_WAP_ID_PROP)) .collect(Collectors.toList()); // Compute appended count @@ -373,33 +379,33 @@ TableMetadata apply() { * *

[1] Staged (WAP) snapshots - added without branch reference * - *

[2] New main branch snapshots - added without branch reference (branch pointer set - * below) + *

[2] New main branch snapshots - added and moved to MAIN branch incrementally + * + *

[3] Cherry-picked snapshots - existing snapshots, final branch pointer set below * - *

[3] Cherry-picked snapshots - existing snapshots, branch pointer set below + *

We trust the client-provided order rather than sorting. Sequence numbers are + * monotonically increasing along a branch's lineage (following parent pointers) for both + * cherry-pick result snapshots and fast-forward snapshots. Iceberg's setBranchSnapshot() + * validates sequence numbers, so we can rely on its built-in validation. */ - // Add staged snapshots in sequence number order (ensures correct commit ordering) - this.newStagedSnapshots.stream() - .sorted(java.util.Comparator.comparingLong(Snapshot::sequenceNumber)) - .forEach(metadataBuilder::addSnapshot); - - // Add new main branch snapshots in sequence number order (ensures correct commit ordering) - List sortedMainBranchSnapshots = - this.newMainBranchSnapshots.stream() - .sorted(java.util.Comparator.comparingLong(Snapshot::sequenceNumber)) - .collect(Collectors.toList()); - sortedMainBranchSnapshots.forEach(metadataBuilder::addSnapshot); + // Add staged snapshots in client-provided order + this.newStagedSnapshots.forEach(metadataBuilder::addSnapshot); + + // Add new main branch snapshots and move MAIN pointer incrementally + // This works for both: + // - Regular commits: newly created snapshots + // - Cherry-pick results: newly created snapshots with SOURCE_SNAPSHOT_ID_PROP + for (Snapshot snapshot : this.newMainBranchSnapshots) { + metadataBuilder.setBranchSnapshot(snapshot, SnapshotRef.MAIN_BRANCH); + } - // Set branch pointer once using providedRefs (covers both new snapshots and cherry-pick) - if (!this.providedRefs.isEmpty()) { - long newSnapshotId = this.providedRefs.get(SnapshotRef.MAIN_BRANCH).snapshotId(); + // Set final branch pointer using providedRefs if present + // This handles fast-forward for cherry-pick/WAP publish where we're moving the branch + // to an existing snapshot + SnapshotRef mainBranchRef = this.providedRefs.get(SnapshotRef.MAIN_BRANCH); + if (mainBranchRef != null) { + long newSnapshotId = mainBranchRef.snapshotId(); metadataBuilder.setBranchSnapshot(newSnapshotId, SnapshotRef.MAIN_BRANCH); - } else if (!sortedMainBranchSnapshots.isEmpty()) { - // Auto-append to main: if no refs provided but there are new main branch snapshots, - // set MAIN to the last snapshot (latest by sequence number due to sort above) - Snapshot latestSnapshot = - sortedMainBranchSnapshots.get(sortedMainBranchSnapshots.size() - 1); - metadataBuilder.setBranchSnapshot(latestSnapshot.snapshotId(), SnapshotRef.MAIN_BRANCH); } // Delete snapshots diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/IcebergTestUtil.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/IcebergTestUtil.java index d4fd6efaa..cdedb3e93 100644 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/IcebergTestUtil.java +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/IcebergTestUtil.java @@ -44,15 +44,14 @@ private static List loadSnapshots(String snapshotFile) throws IOExcept return SnapshotsUtil.parseSnapshots(null, data); } - public static Map obtainSnapshotRefsFromSnapshot(Snapshot snapshot) { + public static Map createMainBranchRefPointingTo(Snapshot snapshot) { Map snapshotRefs = new HashMap<>(); SnapshotRef snapshotRef = SnapshotRef.branchBuilder(snapshot.snapshotId()).build(); snapshotRefs.put(SnapshotRef.MAIN_BRANCH, SnapshotRefParser.toJson(snapshotRef)); return snapshotRefs; } - public static Map obtainSnapshotRefsFromSnapshot( - Snapshot snapshot, String branch) { + public static Map createBranchRefPointingTo(Snapshot snapshot, String branch) { Map snapshotRefs = new HashMap<>(); SnapshotRef snapshotRef = SnapshotRef.branchBuilder(snapshot.snapshotId()).build(); snapshotRefs.put(branch, SnapshotRefParser.toJson(snapshotRef)); diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java index 476435a61..69b4027b9 100644 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/OpenHouseInternalTableOperationsTest.java @@ -148,7 +148,7 @@ void testDoCommitAppendSnapshotsInitialVersion() throws IOException { properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot( + IcebergTestUtil.createMainBranchRefPointingTo( testSnapshots.get(testSnapshots.size() - 1)))); TableMetadata metadata = BASE_TABLE_METADATA.replaceProperties(properties); @@ -193,7 +193,7 @@ void testDoCommitAppendSnapshotsExistingVersion() throws IOException { properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot( + IcebergTestUtil.createMainBranchRefPointingTo( testSnapshots.get(testSnapshots.size() - 1)))); properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); @@ -248,7 +248,7 @@ void testDoCommitAppendAndDeleteSnapshots() throws IOException { properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot( + IcebergTestUtil.createMainBranchRefPointingTo( newSnapshots.get(newSnapshots.size() - 1)))); properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); @@ -438,7 +438,7 @@ void testDoCommitDeleteSnapshots() throws IOException { properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot( + IcebergTestUtil.createMainBranchRefPointingTo( testSnapshots.get(testSnapshots.size() - 1)))); properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); @@ -640,7 +640,7 @@ void testDoCommitSnapshotsValidationThrowsException() throws IOException { properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot( + IcebergTestUtil.createMainBranchRefPointingTo( testSnapshots.get(1)))); // But main refs snapshot 1 properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); metadata = metadata.replaceProperties(properties); @@ -727,7 +727,7 @@ void testDoCommitAppendStageOnlySnapshotsExistingVersion() throws IOException { properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot(newSnapshots.get(0)))); + IcebergTestUtil.createMainBranchRefPointingTo(newSnapshots.get(0)))); properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); TableMetadata metadata = base.replaceProperties(properties); @@ -771,7 +771,7 @@ void testAppendSnapshotsWithOldSnapshots() throws IOException { properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot(snapshots.get(snapshots.size() - 1)))); + IcebergTestUtil.createMainBranchRefPointingTo(snapshots.get(snapshots.size() - 1)))); TableMetadata newMetadata = baseMetadata.replaceProperties(properties); @@ -790,7 +790,7 @@ void testAppendSnapshotsWithOldSnapshots() throws IOException { propertiesWithFuture.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot(snapshots.get(snapshots.size() - 1)))); + IcebergTestUtil.createMainBranchRefPointingTo(snapshots.get(snapshots.size() - 1)))); TableMetadata newMetadataWithFuture = baseMetadata.replaceProperties(propertiesWithFuture); openHouseInternalTableOperations.snapshotDiffApplier.applySnapshots( @@ -823,7 +823,7 @@ void testDoCommitCherryPickSnapshotBaseUnchanged() throws IOException { properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot(testWapSnapshots.get(0)))); + IcebergTestUtil.createMainBranchRefPointingTo(testWapSnapshots.get(0)))); properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); TableMetadata metadata = base.replaceProperties(properties); @@ -864,7 +864,7 @@ void testDoCommitCherryPickSnapshotBaseChanged() throws IOException { properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot( + IcebergTestUtil.createMainBranchRefPointingTo( testWapSnapshots.get(2)))); // new snapshot properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); @@ -905,7 +905,7 @@ void testDoCommitCherryPickFirstSnapshot() throws IOException { properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot(testWapSnapshots.get(0)))); + IcebergTestUtil.createMainBranchRefPointingTo(testWapSnapshots.get(0)))); properties.put(getCanonicalFieldName("tableLocation"), TEST_LOCATION); TableMetadata metadata = base.replaceProperties(properties); @@ -1475,7 +1475,7 @@ void testDeleteSnapshotWithNoReference() throws IOException { properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot(referencedSnapshot))); + IcebergTestUtil.createMainBranchRefPointingTo(referencedSnapshot))); TableMetadata newMetadata = baseMetadata.replaceProperties(properties); @@ -1537,7 +1537,7 @@ void testDeleteEmptySnapshotList() throws IOException { SnapshotsUtil.serializedSnapshots(baseMetadata.snapshots())); properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, - SnapshotsUtil.serializeMap(IcebergTestUtil.obtainSnapshotRefsFromSnapshot(lastSnapshot))); + SnapshotsUtil.serializeMap(IcebergTestUtil.createMainBranchRefPointingTo(lastSnapshot))); TableMetadata newMetadata = baseMetadata.replaceProperties(properties); @@ -1583,7 +1583,7 @@ void testDeleteNullSnapshotList() throws IOException { SnapshotsUtil.serializedSnapshots(baseMetadata.snapshots())); properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, - SnapshotsUtil.serializeMap(IcebergTestUtil.obtainSnapshotRefsFromSnapshot(lastSnapshot))); + SnapshotsUtil.serializeMap(IcebergTestUtil.createMainBranchRefPointingTo(lastSnapshot))); TableMetadata newMetadata = baseMetadata.replaceProperties(properties); @@ -1633,7 +1633,7 @@ void testDeleteNonExistentSnapshot() throws IOException { SnapshotsUtil.serializedSnapshots(baseMetadata.snapshots())); properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, - SnapshotsUtil.serializeMap(IcebergTestUtil.obtainSnapshotRefsFromSnapshot(lastSnapshot))); + SnapshotsUtil.serializeMap(IcebergTestUtil.createMainBranchRefPointingTo(lastSnapshot))); TableMetadata newMetadata = baseMetadata.replaceProperties(properties); @@ -1721,7 +1721,7 @@ void testDeleteSnapshotMetricsRecordedBranch() throws IOException { properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, SnapshotsUtil.serializeMap( - IcebergTestUtil.obtainSnapshotRefsFromSnapshot(referencedSnapshot))); + IcebergTestUtil.createMainBranchRefPointingTo(referencedSnapshot))); TableMetadata newMetadata = baseMetadata.replaceProperties(properties); @@ -1768,7 +1768,7 @@ void testDeleteSnapshotMetricsRecordedNonExistent() throws IOException { SnapshotsUtil.serializedSnapshots(finalBaseMetadata.snapshots())); properties.put( CatalogConstants.SNAPSHOTS_REFS_KEY, - SnapshotsUtil.serializeMap(IcebergTestUtil.obtainSnapshotRefsFromSnapshot(lastSnapshot))); + SnapshotsUtil.serializeMap(IcebergTestUtil.createMainBranchRefPointingTo(lastSnapshot))); TableMetadata newMetadata = finalBaseMetadata.replaceProperties(properties); diff --git a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java index 5a7bec3d7..a1319475d 100644 --- a/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java +++ b/iceberg/openhouse/internalcatalog/src/test/java/com/linkedin/openhouse/internal/catalog/SnapshotDiffApplierTest.java @@ -96,7 +96,7 @@ private TableMetadata createMetadataWithSnapshots( private TableMetadata createMetadataWithSnapshotsAndMainRef( TableMetadata base, List snapshots) { Map refs = - IcebergTestUtil.obtainSnapshotRefsFromSnapshot(snapshots.get(snapshots.size() - 1)); + IcebergTestUtil.createMainBranchRefPointingTo(snapshots.get(snapshots.size() - 1)); return createMetadataWithSnapshots(base, snapshots, refs); } @@ -184,7 +184,7 @@ void testApplySnapshots_mainBranchUpdates_success() throws IOException { TableMetadata baseWithSnapshots = addSnapshotsToMetadata(baseMetadata, snapshots); Snapshot newBranchTarget = snapshots.get(1); - Map refs = IcebergTestUtil.obtainSnapshotRefsFromSnapshot(newBranchTarget); + Map refs = IcebergTestUtil.createMainBranchRefPointingTo(newBranchTarget); TableMetadata newMetadata = createMetadataWithSnapshots(baseWithSnapshots, snapshots, refs); TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); @@ -288,7 +288,7 @@ void testMetrics_addStagedSnapshots_recordsStagedCounter() throws IOException { allSnapshots.addAll(wapSnapshots); Map refs = - IcebergTestUtil.obtainSnapshotRefsFromSnapshot(baseSnapshots.get(baseSnapshots.size() - 1)); + IcebergTestUtil.createMainBranchRefPointingTo(baseSnapshots.get(baseSnapshots.size() - 1)); TableMetadata newMetadata = createMetadataWithSnapshots(baseWithSnapshots, allSnapshots, refs); TableMetadata result = snapshotDiffApplier.applySnapshots(baseWithSnapshots, newMetadata); @@ -614,7 +614,7 @@ void testApplySnapshots_baseWithUnreferencedSnapshotsOnly_addFirstMainBranch() // Provided: same snapshots + MAIN ref to one of them Snapshot mainSnapshot = snapshots.get(2); - Map refs = IcebergTestUtil.obtainSnapshotRefsFromSnapshot(mainSnapshot); + Map refs = IcebergTestUtil.createMainBranchRefPointingTo(mainSnapshot); TableMetadata newMetadata = createMetadataWithSnapshots(base, snapshots, refs); TableMetadata result = snapshotDiffApplier.applySnapshots(base, newMetadata); @@ -691,8 +691,7 @@ void testApplySnapshots_addRegularAndStagedSimultaneously() throws IOException { allSnapshots.add(customWapSnapshots.get(0)); // New staged snapshot // MAIN ref points to the new regular snapshot - Map refs = - IcebergTestUtil.obtainSnapshotRefsFromSnapshot(extraSnapshots.get(0)); + Map refs = IcebergTestUtil.createMainBranchRefPointingTo(extraSnapshots.get(0)); TableMetadata newMetadata = createMetadataWithSnapshots(baseMetadata, allSnapshots, refs); TableMetadata result = snapshotDiffApplier.applySnapshots(null, newMetadata); @@ -741,7 +740,7 @@ void testApplySnapshots_cherryPickAndAddNewSimultaneously() throws IOException { // MAIN ref points to new snapshot Map refs = - IcebergTestUtil.obtainSnapshotRefsFromSnapshot(testWapSnapshots.get(2)); + IcebergTestUtil.createMainBranchRefPointingTo(testWapSnapshots.get(2)); TableMetadata newMetadata = createMetadataWithSnapshots(base, allSnapshots, refs); TableMetadata result = snapshotDiffApplier.applySnapshots(base, newMetadata); @@ -843,4 +842,93 @@ void testApplySnapshots_regularSnapshotsWithEmptyRefs_autoAppendedToMain() throw "Snapshot " + extraSnapshot.snapshotId() + " should be tracked as appended"); } } + + /** + * Verifies cherry-picking multiple staged snapshots in sequence, testing both fast-forward and + * rebase scenarios. wap1 and wap2 both have the same parent. Cherry-picking wap1 first is a + * fast-forward (no new snapshot). Cherry-picking wap2 after main has moved requires a rebase (new + * snapshot created). + */ + @Test + void testApplySnapshots_cherryPickMultipleStagedSnapshotsOutOfOrder() throws IOException { + List testSnapshots = IcebergTestUtil.getSnapshots(); + List testWapSnapshots = IcebergTestUtil.getWapSnapshots(); + + // Setup: MAIN snapshot + 2 staged WAP snapshots (wap1, wap2) + TableMetadata base = + TableMetadata.buildFrom(baseMetadata) + .setBranchSnapshot(testSnapshots.get(0), SnapshotRef.MAIN_BRANCH) + .addSnapshot(testWapSnapshots.get(0)) // wap1 (wap.id="wap1") + .addSnapshot(testWapSnapshots.get(1)) // wap2 (wap.id="wap2") + .build(); + + // Step 1: Fast-forward cherry-pick wap1 + // wap1's parent == current main, so it's promoted directly (no new snapshot) + List allSnapshots1 = new ArrayList<>(); + allSnapshots1.add(testSnapshots.get(0)); + allSnapshots1.add(testWapSnapshots.get(0)); // wap1 now on main + allSnapshots1.add(testWapSnapshots.get(1)); // wap2 still staged + + // Set MAIN branch to point to wap1 + Map refs1 = + IcebergTestUtil.createMainBranchRefPointingTo(testWapSnapshots.get(0)); + TableMetadata newMetadata1 = createMetadataWithSnapshots(base, allSnapshots1, refs1); + + TableMetadata result1 = snapshotDiffApplier.applySnapshots(base, newMetadata1); + + // Verify fast-forward: only cherry_picked tracked, no new snapshot appended + assertNotNull(result1.currentSnapshot()); + assertEquals(testWapSnapshots.get(0).snapshotId(), result1.currentSnapshot().snapshotId()); + + Map resultProps1 = result1.properties(); + String cherryPickedSnapshots1 = + resultProps1.get(getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS)); + assertNotNull(cherryPickedSnapshots1); + assertTrue( + cherryPickedSnapshots1.contains(Long.toString(testWapSnapshots.get(0).snapshotId())), + "wap1 should be tracked as cherry-picked"); + assertNull( + resultProps1.get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS)), + "No new snapshot for fast-forward"); + + // Step 2: Rebase cherry-pick wap2 + // wap2's parent != current main (which is now wap1), so a new snapshot is created + // New snapshot has: parent=wap1, source-snapshot-id=wap2, published.wap.id="wap2" + List allSnapshots2 = new ArrayList<>(); + allSnapshots2.add(testSnapshots.get(0)); + allSnapshots2.add(testWapSnapshots.get(0)); // wap1 + allSnapshots2.add(testWapSnapshots.get(1)); // wap2 (source) + allSnapshots2.add(testWapSnapshots.get(2)); // New rebased snapshot + + Map refs2 = + IcebergTestUtil.createMainBranchRefPointingTo(testWapSnapshots.get(2)); + TableMetadata newMetadata2 = createMetadataWithSnapshots(result1, allSnapshots2, refs2); + + TableMetadata result2 = snapshotDiffApplier.applySnapshots(result1, newMetadata2); + + // Verify rebase: both cherry_picked (source) and appended (new snapshot) tracked + assertNotNull(result2.currentSnapshot()); + assertEquals(testWapSnapshots.get(2).snapshotId(), result2.currentSnapshot().snapshotId()); + + Map resultProps2 = result2.properties(); + + String cherryPickedSnapshots2 = + resultProps2.get(getCanonicalFieldName(CatalogConstants.CHERRY_PICKED_SNAPSHOTS)); + assertNotNull(cherryPickedSnapshots2); + assertTrue( + cherryPickedSnapshots2.contains(Long.toString(testWapSnapshots.get(1).snapshotId())), + "wap2 should be tracked as cherry-picked (source)"); + + String appendedSnapshots2 = + resultProps2.get(getCanonicalFieldName(CatalogConstants.APPENDED_SNAPSHOTS)); + assertNotNull(appendedSnapshots2); + assertTrue( + appendedSnapshots2.contains(Long.toString(testWapSnapshots.get(2).snapshotId())), + "New rebased snapshot should be tracked as appended"); + + // Verify all 4 snapshots present + assertEquals(4, result2.snapshots().size()); + verify(mockMetricsReporter, atLeastOnce()) + .count(eq(InternalCatalogMetricsConstant.SNAPSHOTS_CHERRY_PICKED_CTR), anyDouble()); + } }