From 18b6d52d19066cba282999acc78e785eec2a6402 Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister <bblanken@broadinstitute.org>
Date: Tue, 25 Nov 2025 20:42:54 -0500
Subject: [PATCH 01/17] Integration test

---
 .github/workflows/unit-tests.yml              |   1 -
 pyproject.toml                                |   3 +
 v03_pipeline/bin/pipeline_worker_test.py      | 187 ++++++++-
 v03_pipeline/lib/misc/clickhouse.py           |   2 +
 .../tasks/load_complete_run_to_clickhouse.py  |  23 +-
 ...annotations_table_with_new_samples_test.py |   2 -
 .../var/test/test_clickhouse_schema.sql       | 378 ++++++++++++++++++
 7 files changed, 581 insertions(+), 15 deletions(-)
 create mode 100644 v03_pipeline/var/test/test_clickhouse_schema.sql

diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml
index f5267dc46..3c0327a5c 100644
--- a/.github/workflows/unit-tests.yml
+++ b/.github/workflows/unit-tests.yml
@@ -70,6 +70,5 @@ jobs:
           export CLICKHOUSE_OPTIMIZE_TABLE_WAIT_S=1
           export PYSPARK_SUBMIT_ARGS='--driver-memory 8G pyspark-shell'
           export CLICKHOUSE_DATABASE=test
-          export LOCAL_DISK_MOUNT_PATH=.
           uv run nosetests --with-coverage --cover-package v03_pipeline v03_pipeline
           uv run coverage report --omit '*test*' --fail-under=90
diff --git a/pyproject.toml b/pyproject.toml
index 568175864..cb704996a 100644
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -102,6 +102,9 @@ inline-quotes = "single"
 '*clickhouse*' = [
     'S608' # unsafe sql
 ]
+'*pipeline_worker*' = [
+    'S608' # unsafe sql
+]
 
 [tool.ruff.pylint]
 max-args = 6
diff --git a/v03_pipeline/bin/pipeline_worker_test.py b/v03_pipeline/bin/pipeline_worker_test.py
index 73cb5511a..96c7008d6 100644
--- a/v03_pipeline/bin/pipeline_worker_test.py
+++ b/v03_pipeline/bin/pipeline_worker_test.py
@@ -1,18 +1,34 @@
 import json
 import os
-from unittest.mock import patch
+import shutil
+from unittest.mock import Mock, patch
 
+import hail as hl
 import luigi
+import luigi.worker
 
 from v03_pipeline.bin.pipeline_worker import process_queue
 from v03_pipeline.lib.core import DatasetType, ReferenceGenome, SampleType
+from v03_pipeline.lib.core.environment import Env
+from v03_pipeline.lib.misc.clickhouse import (
+    STAGING_CLICKHOUSE_DATABASE,
+    get_clickhouse_client,
+)
 from v03_pipeline.lib.paths import (
+    db_id_to_gene_id_path,
     loading_pipeline_deadletter_queue_dir,
     loading_pipeline_queue_dir,
+    clickhouse_load_success_file_path,
+)
+from v03_pipeline.lib.test.misc import copy_project_pedigree_to_mocked_dir
+from v03_pipeline.lib.test.mocked_reference_datasets_testcase import (
+    MockedReferenceDatasetsTestCase,
 )
-from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask
-from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase
+from v03_pipeline.var.test.vep.mock_vep_data import MOCK_38_VEP_DATA
 
+TEST_DB_ID_TO_GENE_ID = 'v03_pipeline/var/test/db_id_to_gene_id.csv.gz'
+TEST_PEDIGREE_3_REMAP = 'v03_pipeline/var/test/pedigrees/test_pedigree_3_remap.tsv'
+TEST_SCHEMA = 'v03_pipeline/var/test/test_clickhouse_schema.sql'
 TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf'
 
 
@@ -25,25 +41,142 @@ def output(self):
         return luigi.LocalTarget('output.txt')
 
 
-class PipelineWorkerTest(MockedDatarootTestCase):
+class PipelineWorkerTest(MockedReferenceDatasetsTestCase):
+    def setUp(self):
+        super().setUp()
+        client = get_clickhouse_client()
+        client.execute(
+            f"""
+            DROP DATABASE IF EXISTS {STAGING_CLICKHOUSE_DATABASE};
+            """,
+        )
+        client.execute(
+            f"""
+            DROP DATABASE IF EXISTS {Env.CLICKHOUSE_DATABASE};
+            """,
+        )
+        client.execute(
+            f"""
+            CREATE DATABASE {Env.CLICKHOUSE_DATABASE};
+        """,
+        )
+        client = get_clickhouse_client(database=Env.CLICKHOUSE_DATABASE)
+        client.execute(
+            """
+            CREATE DICTIONARY seqrdb_affected_status_dict
+            (
+                `family_guid` String,
+                `sampleId` String,
+                `affected` String
+            )
+            PRIMARY KEY family_guid, sampleId
+            SOURCE(NULL())
+            LIFETIME(0)
+            LAYOUT(COMPLEX_KEY_HASHED())
+            """,
+        )
+        client.execute(
+            """
+            CREATE DICTIONARY `GRCh38/SNV_INDEL/project_partitions_dict`
+            (
+                `project_guid` String,
+                `n_partitions` UInt32
+            )
+            PRIMARY KEY project_guid
+            SOURCE(NULL())
+            LIFETIME(0)
+            LAYOUT(COMPLEX_KEY_HASHED())
+            """,
+        )
+        with open(TEST_SCHEMA) as f:
+            sql = f.read()
+        commands = [cmd.strip() for cmd in sql.split(';') if cmd.strip()]
+        for cmd in commands:
+            client.execute(cmd)
+        client.execute(
+            f"""
+            CREATE DICTIONARY `GRCh38/SNV_INDEL/gt_stats_dict`
+            (
+                `key` UInt32,
+                `ac_wes` UInt64,
+                `ac_wgs` UInt64,
+                `ac_affected` UInt64,
+                `hom_wes` UInt64,
+                `hom_wgs` UInt64,
+                `hom_affected` UInt64
+            )
+            PRIMARY KEY key
+            SOURCE(
+                CLICKHOUSE(
+                    USER {Env.CLICKHOUSE_WRITER_USER} PASSWORD {Env.CLICKHOUSE_WRITER_PASSWORD}
+                    DB {Env.CLICKHOUSE_DATABASE} TABLE `GRCh38/SNV_INDEL/gt_stats`
+                )
+            )
+            LIFETIME(0)
+            LAYOUT(FLAT(MAX_ARRAY_SIZE 1000000000))
+            """,
+        )
+        os.makedirs(
+            self.mock_env.LOADING_DATASETS_DIR,
+            exist_ok=True,
+        )
+        shutil.copy2(
+            TEST_DB_ID_TO_GENE_ID,
+            db_id_to_gene_id_path(),
+        )
+
+    def tearDown(self):
+        super().tearDown()
+        client = get_clickhouse_client()
+        client.execute(
+            f"""
+           DROP DATABASE IF EXISTS {STAGING_CLICKHOUSE_DATABASE};
+           """,
+        )
+        client.execute(
+            f"""
+           DROP DATABASE IF EXISTS {Env.CLICKHOUSE_DATABASE};
+           """,
+        )
+
+    @patch(
+        'v03_pipeline.lib.tasks.write_new_variants_table.load_gencode_ensembl_to_refseq_id',
+    )
+    @patch(
+        'v03_pipeline.lib.tasks.update_new_variants_with_caids.register_alleles_in_chunks',
+    )
+    @patch('v03_pipeline.lib.vep.hl.vep')
     @patch('v03_pipeline.lib.misc.slack._safe_post_to_slack')
-    @patch('v03_pipeline.api.request_handlers.WriteClickhouseLoadSuccessFileTask')
     @patch('v03_pipeline.bin.pipeline_worker.logger')
-    def test_process_queue(
+    def test_process_queue_integration_test(
         self,
         mock_logger,
-        mock_write_clickhouse_load_success_file_task,
         mock_safe_post_to_slack,
+        mock_vep: Mock,
+        mock_register_alleles: Mock,
+        mock_load_gencode_ensembl_to_refseq_id: Mock,
     ):
+        mock_load_gencode_ensembl_to_refseq_id.return_value = hl.dict(
+            {'ENST00000327044': 'NM_015658.4'},
+        )
+        mock_register_alleles.side_effect = None
+        mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_38_VEP_DATA)
+        copy_project_pedigree_to_mocked_dir(
+            TEST_PEDIGREE_3_REMAP,
+            ReferenceGenome.GRCh38,
+            DatasetType.SNV_INDEL,
+            SampleType.WGS,
+            'R0113_test_project',
+        )
         raw_request = {
             'request_type': 'LoadingPipelineRequest',
             'callset_path': TEST_VCF,
-            'projects_to_run': ['project_a'],
+            'projects_to_run': ['R0113_test_project'],
             'sample_type': SampleType.WGS.value,
             'reference_genome': ReferenceGenome.GRCh38.value,
             'dataset_type': DatasetType.SNV_INDEL.value,
+            'skip_validation': True,
         }
-        mock_write_clickhouse_load_success_file_task.return_value = MockCompleteTask()
         os.makedirs(
             loading_pipeline_queue_dir(),
             exist_ok=True,
@@ -58,8 +191,42 @@ def test_process_queue(
             json.dump(raw_request, f)
         process_queue(local_scheduler=True)
         mock_safe_post_to_slack.assert_called_once_with(
-            ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n    "attempt_id": 0,\n    "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n    "dataset_type": "SNV_INDEL",\n    "project_guids": [\n        "project_a"\n    ],\n    "reference_genome": "GRCh38",\n    "request_type": "LoadingPipelineRequest",\n    "sample_type": "WGS",\n    "skip_check_sex_and_relatedness": false,\n    "skip_expect_tdr_metrics": false,\n    "skip_validation": false\n}```',
+            ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n    "attempt_id": 0,\n    "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n    "dataset_type": "SNV_INDEL",\n    "project_guids": [\n        "R0113_test_project"\n    ],\n    "reference_genome": "GRCh38",\n    "request_type": "LoadingPipelineRequest",\n    "sample_type": "WGS",\n    "skip_check_sex_and_relatedness": false,\n    "skip_expect_tdr_metrics": false,\n    "skip_validation": true\n}```',
         )
+        with open(
+            clickhouse_load_success_file(
+                ReferenceGenome.GRCh38,
+                DatasetType.SNV_INDEL,
+                'request_20250916-200704-123456',
+            )
+        ) as f:
+            self.assertEqual(f.read(), '')
+
+        client = get_clickhouse_client()
+        annotations_count = client.execute(
+            f"""
+            SELECT COUNT(*)
+            FROM
+            {Env.CLICKHOUSE_DATABASE}.`GRCh38/SNV_INDEL/annotations_memory`
+            """,
+        )[0][0]
+        self.assertEqual(annotations_count, 30)
+        entries_count = client.execute(
+            f"""
+            SELECT COUNT(*)
+            FROM
+            {Env.CLICKHOUSE_DATABASE}.`GRCh38/SNV_INDEL/entries`
+            """,
+        )[0][0]
+        self.assertEqual(entries_count, 16)
+        ac_wgs = client.execute(
+            f"""
+            SELECT sum(ac_wgs)
+            FROM
+            {Env.CLICKHOUSE_DATABASE}.`GRCh38/SNV_INDEL/gt_stats_dict`
+            """,
+        )[0][0]
+        self.assertEqual(ac_wgs, 69)
 
     @patch('v03_pipeline.lib.misc.slack._safe_post_to_slack')
     @patch('v03_pipeline.api.request_handlers.WriteClickhouseLoadSuccessFileTask')
diff --git a/v03_pipeline/lib/misc/clickhouse.py b/v03_pipeline/lib/misc/clickhouse.py
index 751c9dafb..6aabe5c44 100644
--- a/v03_pipeline/lib/misc/clickhouse.py
+++ b/v03_pipeline/lib/misc/clickhouse.py
@@ -965,12 +965,14 @@ def rebuild_gt_stats(
 
 def get_clickhouse_client(
     timeout: int | None = None,
+    database: str | None = None,
 ) -> Client:
     return Client(
         host=Env.CLICKHOUSE_SERVICE_HOSTNAME,
         port=Env.CLICKHOUSE_SERVICE_PORT,
         user=Env.CLICKHOUSE_WRITER_USER,
         password=Env.CLICKHOUSE_WRITER_PASSWORD,
+        **{'database': database} if database else {},
         **{'send_receive_timeout': timeout} if timeout else {},
         **{
             'settings': {
diff --git a/v03_pipeline/lib/tasks/load_complete_run_to_clickhouse.py b/v03_pipeline/lib/tasks/load_complete_run_to_clickhouse.py
index cc4140b3a..ca1af75dc 100644
--- a/v03_pipeline/lib/tasks/load_complete_run_to_clickhouse.py
+++ b/v03_pipeline/lib/tasks/load_complete_run_to_clickhouse.py
@@ -10,7 +10,7 @@
     load_complete_run,
     logged_query,
 )
-from v03_pipeline.lib.paths import metadata_for_run_path
+from v03_pipeline.lib.paths import metadata_for_run_path, pipeline_run_success_file_path
 from v03_pipeline.lib.tasks.base.base_loading_run_params import (
     BaseLoadingRunParams,
 )
@@ -25,6 +25,14 @@ def requires(self) -> luigi.Task:
         return [self.clone(WriteSuccessFileTask)]
 
     def complete(self):
+        if not hfs.exists(
+            pipeline_run_success_file_path(
+                self.reference_genome,
+                self.dataset_type,
+                self.run_id,
+            ),
+        ):
+            return False
         table_name_builder = TableNameBuilder(
             self.reference_genome,
             self.dataset_type,
@@ -35,7 +43,7 @@ def complete(self):
             SELECT max(key) FROM {table_name_builder.src_table(ClickHouseTable.ANNOTATIONS_MEMORY)}
             """,
         )[0][0]
-        return logged_query(
+        exists_in_annotations = logged_query(
             f"""
             SELECT EXISTS (
                 SELECT 1
@@ -45,6 +53,17 @@ def complete(self):
             """,
             {'max_key_src': max_key_src},
         )[0][0]
+        exists_in_gt_stats = logged_query(
+            f"""
+            SELECT EXISTS (
+                SELECT 1
+                FROM {table_name_builder.dst_table(ClickHouseTable.GT_STATS)}
+                WHERE key = %(max_key_src)s
+            );
+            """,
+            {'max_key_src': max_key_src},
+        )[0][0]
+        return exists_in_annotations and exists_in_gt_stats
 
     def run(self):
         with hfs.open(
diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
index c6140448c..9a03d75cc 100644
--- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
+++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
@@ -207,7 +207,6 @@ def test_multiple_update_vat(
                 [],
             ),  # for the second call, there are no new variants, return empty iterator
         ]
-
         mock_standard_contigs.return_value = {'chr1'}
         # This creates a mock validation table with 1 coding and 1 non-coding variant
         # explicitly chosen from the VCF.
@@ -248,7 +247,6 @@ def test_multiple_update_vat(
                 ),
             ),
         )
-
         coding_and_noncoding_variants_ht.write(
             valid_reference_dataset_path(
                 ReferenceGenome.GRCh38,
diff --git a/v03_pipeline/var/test/test_clickhouse_schema.sql b/v03_pipeline/var/test/test_clickhouse_schema.sql
new file mode 100644
index 000000000..ca165199b
--- /dev/null
+++ b/v03_pipeline/var/test/test_clickhouse_schema.sql
@@ -0,0 +1,378 @@
+CREATE TABLE `GRCh38/SNV_INDEL/annotations_disk`
+(
+    `key` UInt32 CODEC(Delta(8), ZSTD(1)),
+    `xpos` UInt64,
+    `chrom` Enum8('1' = 1, '2' = 2, '3' = 3, '4' = 4, '5' = 5, '6' = 6, '7' = 7, '8' = 8, '9' = 9, '10' = 10, '11' = 11, '12' = 12, '13' = 13, '14' = 14, '15' = 15, '16' = 16, '17' = 17, '18' = 18, '19' = 19, '20' = 20, '21' = 21, '22' = 22, 'X' = 23, 'Y' = 24, 'M' = 25),
+    `pos` UInt32,
+    `ref` String,
+    `alt` String,
+    `variantId` String,
+    `rsid` Nullable(String),
+    `CAID` Nullable(String),
+    `liftedOverChrom` Nullable(Enum8('1' = 1, '2' = 2, '3' = 3, '4' = 4, '5' = 5, '6' = 6, '7' = 7, '8' = 8, '9' = 9, '10' = 10, '11' = 11, '12' = 12, '13' = 13, '14' = 14, '15' = 15, '16' = 16, '17' = 17, '18' = 18, '19' = 19, '20' = 20, '21' = 21, '22' = 22, 'X' = 23, 'Y' = 24, 'M' = 25)),
+    `liftedOverPos` Nullable(UInt32),
+    `hgmd` Tuple(
+        accession Nullable(String),
+        classification Nullable(Enum8('DM' = 0, 'DM?' = 1, 'DP' = 2, 'DFP' = 3, 'FP' = 4, 'R' = 5))),
+    `screenRegionType` Nullable(Enum8('CTCF-bound' = 0, 'CTCF-only' = 1, 'DNase-H3K4me3' = 2, 'PLS' = 3, 'dELS' = 4, 'pELS' = 5, 'DNase-only' = 6, 'low-DNase' = 7)),
+    `predictions` Tuple(
+        cadd Nullable(Decimal(9, 5)),
+        eigen Nullable(Decimal(9, 5)),
+        fathmm Nullable(Decimal(9, 5)),
+        gnomad_noncoding Nullable(Decimal(9, 5)),
+        mpc Nullable(Decimal(9, 5)),
+        mut_pred Nullable(Decimal(9, 5)),
+        mut_taster Nullable(Enum8('D' = 0, 'A' = 1, 'N' = 2, 'P' = 3)),
+        polyphen Nullable(Decimal(9, 5)),
+        primate_ai Nullable(Decimal(9, 5)),
+        revel Nullable(Decimal(9, 5)),
+        sift Nullable(Decimal(9, 5)),
+        splice_ai Nullable(Decimal(9, 5)),
+        splice_ai_consequence Nullable(Enum8('Acceptor gain' = 0, 'Acceptor loss' = 1, 'Donor gain' = 2, 'Donor loss' = 3, 'No consequence' = 4)),
+        vest Nullable(Decimal(9, 5))),
+    `populations` Tuple(
+        exac Tuple(
+            ac UInt32,
+            af Decimal(9, 8),
+            an UInt32,
+            filter_af Decimal(9, 8),
+            hemi UInt32,
+            het UInt32,
+            hom UInt32),
+        gnomad_exomes Tuple(
+            ac UInt32,
+            af Decimal(9, 8),
+            an UInt32,
+            filter_af Decimal(9, 8),
+            hemi UInt32,
+            hom UInt32),
+        gnomad_genomes Tuple(
+            ac UInt32,
+            af Decimal(9, 8),
+            an UInt32,
+            filter_af Decimal(9, 8),
+            hemi UInt32,
+            hom UInt32),
+        topmed Tuple(
+            ac UInt32,
+            af Decimal(9, 8),
+            an UInt32,
+            het UInt32,
+            hom UInt32)),
+    `sortedTranscriptConsequences` Nested(alphamissensePathogenicity Nullable(Decimal(9, 5)), canonical Nullable(UInt8), consequenceTerms Array(Nullable(Enum8('transcript_ablation' = 1, 'splice_acceptor_variant' = 2, 'splice_donor_variant' = 3, 'stop_gained' = 4, 'frameshift_variant' = 5, 'stop_lost' = 6, 'start_lost' = 7, 'inframe_insertion' = 8, 'inframe_deletion' = 9, 'missense_variant' = 10, 'protein_altering_variant' = 11, 'splice_donor_5th_base_variant' = 12, 'splice_region_variant' = 13, 'splice_donor_region_variant' = 14, 'splice_polypyrimidine_tract_variant' = 15, 'incomplete_terminal_codon_variant' = 16, 'start_retained_variant' = 17, 'stop_retained_variant' = 18, 'synonymous_variant' = 19, 'coding_sequence_variant' = 20, 'mature_miRNA_variant' = 21, '5_prime_UTR_variant' = 22, '3_prime_UTR_variant' = 23, 'non_coding_transcript_exon_variant' = 24, 'intron_variant' = 25, 'NMD_transcript_variant' = 26, 'non_coding_transcript_variant' = 27, 'coding_transcript_variant' = 28, 'upstream_gene_variant' = 29, 'downstream_gene_variant' = 30, 'intergenic_variant' = 31, 'sequence_variant' = 32))), extendedIntronicSpliceRegionVariant Nullable(Bool), fiveutrConsequence Nullable(Enum8('5_prime_UTR_premature_start_codon_gain_variant' = 1, '5_prime_UTR_premature_start_codon_loss_variant' = 2, '5_prime_UTR_stop_codon_gain_variant' = 3, '5_prime_UTR_stop_codon_loss_variant' = 4, '5_prime_UTR_uORF_frameshift_variant' = 5)), geneId Nullable(String)),
+    `sortedMotifFeatureConsequences` Nested(consequenceTerms Array(Nullable(Enum8('TFBS_ablation' = 0, 'TFBS_amplification' = 1, 'TF_binding_site_variant' = 2, 'TFBS_fusion' = 3, 'TFBS_translocation' = 4))), motifFeatureId Nullable(String)),
+    `sortedRegulatoryFeatureConsequences` Nested(biotype Nullable(Enum8('enhancer' = 0, 'promoter' = 1, 'CTCF_binding_site' = 2, 'TF_binding_site' = 3, 'open_chromatin_region' = 4)), consequenceTerms Array(Nullable(Enum8('regulatory_region_ablation' = 0, 'regulatory_region_amplification' = 1, 'regulatory_region_variant' = 2, 'regulatory_region_fusion' = 3))), regulatoryFeatureId Nullable(String))
+)
+ENGINE = EmbeddedRocksDB()
+PRIMARY KEY key;
+
+CREATE TABLE `GRCh38/SNV_INDEL/annotations_memory`
+(
+    `key` UInt32 CODEC(Delta(8), ZSTD(1)),
+    `xpos` UInt64,
+    `chrom` Enum8('1' = 1, '2' = 2, '3' = 3, '4' = 4, '5' = 5, '6' = 6, '7' = 7, '8' = 8, '9' = 9, '10' = 10, '11' = 11, '12' = 12, '13' = 13, '14' = 14, '15' = 15, '16' = 16, '17' = 17, '18' = 18, '19' = 19, '20' = 20, '21' = 21, '22' = 22, 'X' = 23, 'Y' = 24, 'M' = 25),
+    `pos` UInt32,
+    `ref` String,
+    `alt` String,
+    `variantId` String,
+    `rsid` Nullable(String),
+    `CAID` Nullable(String),
+    `liftedOverChrom` Nullable(Enum8('1' = 1, '2' = 2, '3' = 3, '4' = 4, '5' = 5, '6' = 6, '7' = 7, '8' = 8, '9' = 9, '10' = 10, '11' = 11, '12' = 12, '13' = 13, '14' = 14, '15' = 15, '16' = 16, '17' = 17, '18' = 18, '19' = 19, '20' = 20, '21' = 21, '22' = 22, 'X' = 23, 'Y' = 24, 'M' = 25)),
+    `liftedOverPos` Nullable(UInt32),
+    `hgmd` Tuple(
+        accession Nullable(String),
+        classification Nullable(Enum8('DM' = 0, 'DM?' = 1, 'DP' = 2, 'DFP' = 3, 'FP' = 4, 'R' = 5))),
+    `screenRegionType` Nullable(Enum8('CTCF-bound' = 0, 'CTCF-only' = 1, 'DNase-H3K4me3' = 2, 'PLS' = 3, 'dELS' = 4, 'pELS' = 5, 'DNase-only' = 6, 'low-DNase' = 7)),
+    `predictions` Tuple(
+        cadd Nullable(Decimal(9, 5)),
+        eigen Nullable(Decimal(9, 5)),
+        fathmm Nullable(Decimal(9, 5)),
+        gnomad_noncoding Nullable(Decimal(9, 5)),
+        mpc Nullable(Decimal(9, 5)),
+        mut_pred Nullable(Decimal(9, 5)),
+        mut_taster Nullable(Enum8('D' = 0, 'A' = 1, 'N' = 2, 'P' = 3)),
+        polyphen Nullable(Decimal(9, 5)),
+        primate_ai Nullable(Decimal(9, 5)),
+        revel Nullable(Decimal(9, 5)),
+        sift Nullable(Decimal(9, 5)),
+        splice_ai Nullable(Decimal(9, 5)),
+        splice_ai_consequence Nullable(Enum8('Acceptor gain' = 0, 'Acceptor loss' = 1, 'Donor gain' = 2, 'Donor loss' = 3, 'No consequence' = 4)),
+        vest Nullable(Decimal(9, 5))),
+    `populations` Tuple(
+        exac Tuple(
+            ac UInt32,
+            af Decimal(9, 8),
+            an UInt32,
+            filter_af Decimal(9, 8),
+            hemi UInt32,
+            het UInt32,
+            hom UInt32),
+        gnomad_exomes Tuple(
+            ac UInt32,
+            af Decimal(9, 8),
+            an UInt32,
+            filter_af Decimal(9, 8),
+            hemi UInt32,
+            hom UInt32),
+        gnomad_genomes Tuple(
+            ac UInt32,
+            af Decimal(9, 8),
+            an UInt32,
+            filter_af Decimal(9, 8),
+            hemi UInt32,
+            hom UInt32),
+        topmed Tuple(
+            ac UInt32,
+            af Decimal(9, 8),
+            an UInt32,
+            het UInt32,
+            hom UInt32)),
+    `sortedTranscriptConsequences` Nested(alphamissensePathogenicity Nullable(Decimal(9, 5)), canonical Nullable(UInt8), consequenceTerms Array(Nullable(Enum8('transcript_ablation' = 1, 'splice_acceptor_variant' = 2, 'splice_donor_variant' = 3, 'stop_gained' = 4, 'frameshift_variant' = 5, 'stop_lost' = 6, 'start_lost' = 7, 'inframe_insertion' = 8, 'inframe_deletion' = 9, 'missense_variant' = 10, 'protein_altering_variant' = 11, 'splice_donor_5th_base_variant' = 12, 'splice_region_variant' = 13, 'splice_donor_region_variant' = 14, 'splice_polypyrimidine_tract_variant' = 15, 'incomplete_terminal_codon_variant' = 16, 'start_retained_variant' = 17, 'stop_retained_variant' = 18, 'synonymous_variant' = 19, 'coding_sequence_variant' = 20, 'mature_miRNA_variant' = 21, '5_prime_UTR_variant' = 22, '3_prime_UTR_variant' = 23, 'non_coding_transcript_exon_variant' = 24, 'intron_variant' = 25, 'NMD_transcript_variant' = 26, 'non_coding_transcript_variant' = 27, 'coding_transcript_variant' = 28, 'upstream_gene_variant' = 29, 'downstream_gene_variant' = 30, 'intergenic_variant' = 31, 'sequence_variant' = 32))), extendedIntronicSpliceRegionVariant Nullable(Bool), fiveutrConsequence Nullable(Enum8('5_prime_UTR_premature_start_codon_gain_variant' = 1, '5_prime_UTR_premature_start_codon_loss_variant' = 2, '5_prime_UTR_stop_codon_gain_variant' = 3, '5_prime_UTR_stop_codon_loss_variant' = 4, '5_prime_UTR_uORF_frameshift_variant' = 5)), geneId Nullable(String)),
+    `sortedMotifFeatureConsequences` Nested(consequenceTerms Array(Nullable(Enum8('TFBS_ablation' = 0, 'TFBS_amplification' = 1, 'TF_binding_site_variant' = 2, 'TFBS_fusion' = 3, 'TFBS_translocation' = 4))), motifFeatureId Nullable(String)),
+    `sortedRegulatoryFeatureConsequences` Nested(biotype Nullable(Enum8('enhancer' = 0, 'promoter' = 1, 'CTCF_binding_site' = 2, 'TF_binding_site' = 3, 'open_chromatin_region' = 4)), consequenceTerms Array(Nullable(Enum8('regulatory_region_ablation' = 0, 'regulatory_region_amplification' = 1, 'regulatory_region_variant' = 2, 'regulatory_region_fusion' = 3))), regulatoryFeatureId Nullable(String))
+)
+ENGINE = EmbeddedRocksDB()
+PRIMARY KEY key;
+
+CREATE TABLE `GRCh38/SNV_INDEL/transcripts`
+(
+    `key` UInt32 CODEC(Delta(8), ZSTD(1)),
+    `transcripts` Nested(alphamissense Tuple(
+        pathogenicity Nullable(Decimal(9, 5))), aminoAcids Nullable(String), biotype Nullable(String), canonical Nullable(UInt8), codons Nullable(String), consequenceTerms Array(Nullable(Enum8('transcript_ablation' = 1, 'splice_acceptor_variant' = 2, 'splice_donor_variant' = 3, 'stop_gained' = 4, 'frameshift_variant' = 5, 'stop_lost' = 6, 'start_lost' = 7, 'inframe_insertion' = 8, 'inframe_deletion' = 9, 'missense_variant' = 10, 'protein_altering_variant' = 11, 'splice_donor_5th_base_variant' = 12, 'splice_region_variant' = 13, 'splice_donor_region_variant' = 14, 'splice_polypyrimidine_tract_variant' = 15, 'incomplete_terminal_codon_variant' = 16, 'start_retained_variant' = 17, 'stop_retained_variant' = 18, 'synonymous_variant' = 19, 'coding_sequence_variant' = 20, 'mature_miRNA_variant' = 21, '5_prime_UTR_variant' = 22, '3_prime_UTR_variant' = 23, 'non_coding_transcript_exon_variant' = 24, 'intron_variant' = 25, 'NMD_transcript_variant' = 26, 'non_coding_transcript_variant' = 27, 'coding_transcript_variant' = 28, 'upstream_gene_variant' = 29, 'downstream_gene_variant' = 30, 'intergenic_variant' = 31, 'sequence_variant' = 32))), exon Tuple(
+        index Nullable(Int32),
+        total Nullable(Int32)), geneId Nullable(String), hgvsc Nullable(String), hgvsp Nullable(String), intron Tuple(
+        index Nullable(Int32),
+        total Nullable(Int32)), loftee Tuple(
+        isLofNagnag Nullable(Bool),
+        lofFilters Array(Nullable(String))), majorConsequence Nullable(Enum8('transcript_ablation' = 1, 'splice_acceptor_variant' = 2, 'splice_donor_variant' = 3, 'stop_gained' = 4, 'frameshift_variant' = 5, 'stop_lost' = 6, 'start_lost' = 7, 'inframe_insertion' = 8, 'inframe_deletion' = 9, 'missense_variant' = 10, 'protein_altering_variant' = 11, 'splice_donor_5th_base_variant' = 12, 'splice_region_variant' = 13, 'splice_donor_region_variant' = 14, 'splice_polypyrimidine_tract_variant' = 15, 'incomplete_terminal_codon_variant' = 16, 'start_retained_variant' = 17, 'stop_retained_variant' = 18, 'synonymous_variant' = 19, 'coding_sequence_variant' = 20, 'mature_miRNA_variant' = 21, '5_prime_UTR_variant' = 22, '3_prime_UTR_variant' = 23, 'non_coding_transcript_exon_variant' = 24, 'intron_variant' = 25, 'NMD_transcript_variant' = 26, 'non_coding_transcript_variant' = 27, 'coding_transcript_variant' = 28, 'upstream_gene_variant' = 29, 'downstream_gene_variant' = 30, 'intergenic_variant' = 31, 'sequence_variant' = 32)), manePlusClinical Nullable(String), maneSelect Nullable(String), refseqTranscriptId Nullable(String), spliceregion Tuple(
+        extended_intronic_splice_region_variant Nullable(Bool)), transcriptId String, transcriptRank UInt8, utrannotator Tuple(
+        existingInframeOorfs Nullable(Int32),
+        existingOutofframeOorfs Nullable(Int32),
+        existingUorfs Nullable(Int32),
+        fiveutrAnnotation Tuple(
+            AltStop Nullable(String),
+            AltStopDistanceToCDS Nullable(Int32),
+            CapDistanceToStart Nullable(Int32),
+            DistanceToCDS Nullable(Int32),
+            DistanceToStop Nullable(Int32),
+            Evidence Nullable(Bool),
+            FrameWithCDS Nullable(String),
+            KozakContext Nullable(String),
+            KozakStrength Nullable(String),
+            StartDistanceToCDS Nullable(Int32),
+            alt_type Nullable(String),
+            alt_type_length Nullable(Int32),
+            newSTOPDistanceToCDS Nullable(Int32),
+            ref_StartDistanceToCDS Nullable(Int32),
+            ref_type Nullable(String),
+            ref_type_length Nullable(Int32),
+            type Nullable(String)),
+        fiveutrConsequence Nullable(String)))
+)
+ENGINE = EmbeddedRocksDB()
+PRIMARY KEY key;
+
+CREATE TABLE `GRCh38/SNV_INDEL/entries`
+(
+    `key` UInt32 CODEC(Delta(8), ZSTD(1)),
+    `project_guid` LowCardinality(String),
+    `family_guid` String,
+    `sample_type` Enum8('WES' = 1, 'WGS' = 2),
+    `xpos` UInt64 CODEC(Delta(8), ZSTD(1)),
+    `is_gnomad_gt_5_percent` Bool,
+    `is_annotated_in_any_gene` Bool,
+    `geneId_ids` Array(UInt32),
+    `filters` Array(LowCardinality(String)),
+    `calls` Array(Tuple(
+        sampleId String,
+        gt Nullable(Enum8('REF' = 0, 'HET' = 1, 'HOM' = 2)),
+        gq Nullable(UInt8),
+        ab Nullable(Decimal(9, 5)),
+        dp Nullable(UInt16))),
+    `sign` Int8,
+    `n_partitions` UInt8 MATERIALIZED dictGetOrDefault('GRCh38/SNV_INDEL/project_partitions_dict', 'n_partitions', project_guid, 1),
+    `partition_id` UInt8 MATERIALIZED farmHash64(family_guid) % n_partitions,
+    PROJECTION xpos_projection
+    (
+        SELECT *
+        ORDER BY 
+            is_gnomad_gt_5_percent,
+            is_annotated_in_any_gene,
+            xpos
+    )
+)
+ENGINE = CollapsingMergeTree(sign)
+PARTITION BY (project_guid, partition_id)
+ORDER BY (project_guid, family_guid, sample_type, is_gnomad_gt_5_percent, is_annotated_in_any_gene, key)
+SETTINGS deduplicate_merge_projection_mode = 'rebuild', index_granularity = 8192;
+
+CREATE TABLE `GRCh38/SNV_INDEL/gt_stats`
+(
+    `key` UInt32 CODEC(Delta(8), ZSTD(1)),
+    `ac_wes` UInt32,
+    `ac_wgs` UInt32,
+    `ac_affected` UInt32 DEFAULT 0,
+    `hom_wes` UInt32,
+    `hom_wgs` UInt32,
+    `hom_affected` UInt32 DEFAULT 0
+)
+ENGINE = SummingMergeTree
+ORDER BY key
+SETTINGS index_granularity = 8192;
+
+CREATE TABLE `GRCh38/SNV_INDEL/key_lookup`
+(
+    `variantId` String,
+    `key` UInt32 CODEC(Delta(8), ZSTD(1))
+)
+ENGINE = EmbeddedRocksDB()
+PRIMARY KEY variantId;
+
+CREATE TABLE `GRCh38/SNV_INDEL/project_gt_stats`
+(
+    `project_guid` LowCardinality(String),
+    `key` UInt32 CODEC(Delta(8), ZSTD(1)),
+    `sample_type` Enum8('WES' = 1, 'WGS' = 2),
+    `affected` Enum8('A' = 1, 'N' = 2, 'U' = 3) DEFAULT 'U',
+    `ref_samples` UInt32,
+    `het_samples` UInt32,
+    `hom_samples` UInt32
+)
+ENGINE = SummingMergeTree
+PARTITION BY project_guid
+ORDER BY (project_guid, key, sample_type)
+SETTINGS index_granularity = 8192;
+
+CREATE TABLE `GRCh38/SNV_INDEL/reference_data/clinvar`
+(
+    `key` UInt32 CODEC(Delta(8), ZSTD(1)),
+    `alleleId` Nullable(UInt32),
+    `conflictingPathogenicities` Nested(pathogenicity Enum8('Pathogenic' = 0, 'Pathogenic/Likely_pathogenic' = 1, 'Pathogenic/Likely_pathogenic/Established_risk_allele' = 2, 'Pathogenic/Likely_pathogenic/Likely_risk_allele' = 3, 'Pathogenic/Likely_risk_allele' = 4, 'Likely_pathogenic' = 5, 'Likely_pathogenic/Likely_risk_allele' = 6, 'Established_risk_allele' = 7, 'Likely_risk_allele' = 8, 'Conflicting_classifications_of_pathogenicity' = 9, 'Uncertain_risk_allele' = 10, 'Uncertain_significance/Uncertain_risk_allele' = 11, 'Uncertain_significance' = 12, 'No_pathogenic_assertion' = 13, 'Likely_benign' = 14, 'Benign/Likely_benign' = 15, 'Benign' = 16), count UInt16),
+    `goldStars` Nullable(UInt8),
+    `submitters` Array(String),
+    `conditions` Array(String),
+    `assertions` Array(Enum8('Affects' = 0, 'association' = 1, 'association_not_found' = 2, 'confers_sensitivity' = 3, 'drug_response' = 4, 'low_penetrance' = 5, 'not_provided' = 6, 'other' = 7, 'protective' = 8, 'risk_factor' = 9, 'no_classification_for_the_single_variant' = 10, 'no_classifications_from_unflagged_records' = 11)),
+    `pathogenicity` Enum8('Pathogenic' = 0, 'Pathogenic/Likely_pathogenic' = 1, 'Pathogenic/Likely_pathogenic/Established_risk_allele' = 2, 'Pathogenic/Likely_pathogenic/Likely_risk_allele' = 3, 'Pathogenic/Likely_risk_allele' = 4, 'Likely_pathogenic' = 5, 'Likely_pathogenic/Likely_risk_allele' = 6, 'Established_risk_allele' = 7, 'Likely_risk_allele' = 8, 'Conflicting_classifications_of_pathogenicity' = 9, 'Uncertain_risk_allele' = 10, 'Uncertain_significance/Uncertain_risk_allele' = 11, 'Uncertain_significance' = 12, 'No_pathogenic_assertion' = 13, 'Likely_benign' = 14, 'Benign/Likely_benign' = 15, 'Benign' = 16)
+)
+ENGINE = Join(`ALL`, LEFT, key)
+SETTINGS join_use_nulls = 1;
+
+CREATE TABLE `GRCh38/SNV_INDEL/reference_data/clinvar/all_variants`
+(
+    `version` Date,
+    `variantId` String,
+    `alleleId` Nullable(UInt32),
+    `conflictingPathogenicities` Nested(pathogenicity Enum8('Pathogenic' = 0, 'Pathogenic/Likely_pathogenic' = 1, 'Pathogenic/Likely_pathogenic/Established_risk_allele' = 2, 'Pathogenic/Likely_pathogenic/Likely_risk_allele' = 3, 'Pathogenic/Likely_risk_allele' = 4, 'Likely_pathogenic' = 5, 'Likely_pathogenic/Likely_risk_allele' = 6, 'Established_risk_allele' = 7, 'Likely_risk_allele' = 8, 'Conflicting_classifications_of_pathogenicity' = 9, 'Uncertain_risk_allele' = 10, 'Uncertain_significance/Uncertain_risk_allele' = 11, 'Uncertain_significance' = 12, 'No_pathogenic_assertion' = 13, 'Likely_benign' = 14, 'Benign/Likely_benign' = 15, 'Benign' = 16), count UInt16),
+    `goldStars` Nullable(UInt8),
+    `submitters` Array(String),
+    `conditions` Array(String),
+    `assertions` Array(Enum8('Affects' = 0, 'association' = 1, 'association_not_found' = 2, 'confers_sensitivity' = 3, 'drug_response' = 4, 'low_penetrance' = 5, 'not_provided' = 6, 'other' = 7, 'protective' = 8, 'risk_factor' = 9, 'no_classification_for_the_single_variant' = 10, 'no_classifications_from_unflagged_records' = 11)),
+    `pathogenicity` Enum8('Pathogenic' = 0, 'Pathogenic/Likely_pathogenic' = 1, 'Pathogenic/Likely_pathogenic/Established_risk_allele' = 2, 'Pathogenic/Likely_pathogenic/Likely_risk_allele' = 3, 'Pathogenic/Likely_risk_allele' = 4, 'Likely_pathogenic' = 5, 'Likely_pathogenic/Likely_risk_allele' = 6, 'Established_risk_allele' = 7, 'Likely_risk_allele' = 8, 'Conflicting_classifications_of_pathogenicity' = 9, 'Uncertain_risk_allele' = 10, 'Uncertain_significance/Uncertain_risk_allele' = 11, 'Uncertain_significance' = 12, 'No_pathogenic_assertion' = 13, 'Likely_benign' = 14, 'Benign/Likely_benign' = 15, 'Benign' = 16)
+)
+ENGINE = MergeTree
+PARTITION BY version
+PRIMARY KEY (version, variantId)
+ORDER BY (version, variantId)
+SETTINGS index_granularity = 8192;
+
+CREATE TABLE `GRCh38/SNV_INDEL/reference_data/clinvar/seqr_variants`
+(
+    `key` UInt32 CODEC(Delta(8), ZSTD(1)),
+    `alleleId` Nullable(UInt32),
+    `conflictingPathogenicities` Nested(pathogenicity Enum8('Pathogenic' = 0, 'Pathogenic/Likely_pathogenic' = 1, 'Pathogenic/Likely_pathogenic/Established_risk_allele' = 2, 'Pathogenic/Likely_pathogenic/Likely_risk_allele' = 3, 'Pathogenic/Likely_risk_allele' = 4, 'Likely_pathogenic' = 5, 'Likely_pathogenic/Likely_risk_allele' = 6, 'Established_risk_allele' = 7, 'Likely_risk_allele' = 8, 'Conflicting_classifications_of_pathogenicity' = 9, 'Uncertain_risk_allele' = 10, 'Uncertain_significance/Uncertain_risk_allele' = 11, 'Uncertain_significance' = 12, 'No_pathogenic_assertion' = 13, 'Likely_benign' = 14, 'Benign/Likely_benign' = 15, 'Benign' = 16), count UInt16),
+    `goldStars` Nullable(UInt8),
+    `submitters` Array(String),
+    `conditions` Array(String),
+    `assertions` Array(Enum8('Affects' = 0, 'association' = 1, 'association_not_found' = 2, 'confers_sensitivity' = 3, 'drug_response' = 4, 'low_penetrance' = 5, 'not_provided' = 6, 'other' = 7, 'protective' = 8, 'risk_factor' = 9, 'no_classification_for_the_single_variant' = 10, 'no_classifications_from_unflagged_records' = 11)),
+    `pathogenicity` Enum8('Pathogenic' = 0, 'Pathogenic/Likely_pathogenic' = 1, 'Pathogenic/Likely_pathogenic/Established_risk_allele' = 2, 'Pathogenic/Likely_pathogenic/Likely_risk_allele' = 3, 'Pathogenic/Likely_risk_allele' = 4, 'Likely_pathogenic' = 5, 'Likely_pathogenic/Likely_risk_allele' = 6, 'Established_risk_allele' = 7, 'Likely_risk_allele' = 8, 'Conflicting_classifications_of_pathogenicity' = 9, 'Uncertain_risk_allele' = 10, 'Uncertain_significance/Uncertain_risk_allele' = 11, 'Uncertain_significance' = 12, 'No_pathogenic_assertion' = 13, 'Likely_benign' = 14, 'Benign/Likely_benign' = 15, 'Benign' = 16)
+)
+ENGINE = MergeTree
+PRIMARY KEY key
+ORDER BY key
+SETTINGS index_granularity = 8192;
+
+CREATE MATERIALIZED VIEW `GRCh38/SNV_INDEL/entries_to_project_gt_stats_mv` TO `GRCh38/SNV_INDEL/project_gt_stats`
+(
+    `project_guid` LowCardinality(String),
+    `key` UInt32,
+    `sample_type` Enum8('WES' = 1, 'WGS' = 2),
+    `affected` String,
+    `ref_samples` Int64,
+    `het_samples` Int64,
+    `hom_samples` Int64
+)
+AS SELECT
+    project_guid,
+    key,
+    sample_type,
+    dictGetOrDefault('seqrdb_affected_status_dict', 'affected', (family_guid, calls.sampleId), 'U') AS affected,
+    sumIf(sign, calls.gt = 'REF') AS ref_samples,
+    sumIf(sign, calls.gt = 'HET') AS het_samples,
+    sumIf(sign, calls.gt = 'HOM') AS hom_samples
+FROM `GRCh38/SNV_INDEL/entries`
+ARRAY JOIN calls
+GROUP BY
+    project_guid,
+    key,
+    sample_type,
+    affected;
+
+CREATE MATERIALIZED VIEW `GRCh38/SNV_INDEL/project_gt_stats_to_gt_stats_mv`
+REFRESH EVERY 10 YEAR TO `GRCh38/SNV_INDEL/gt_stats`
+(
+    `key` UInt32,
+    `ac_wes` UInt64,
+    `ac_wgs` UInt64,
+    `ac_affected` UInt64,
+    `hom_wes` UInt64,
+    `hom_wgs` UInt64,
+    `hom_affected` UInt64
+)
+AS SELECT
+    key,
+    sumIf((het_samples * 1) + (hom_samples * 2), sample_type = 'WES') AS ac_wes,
+    sumIf((het_samples * 1) + (hom_samples * 2), sample_type = 'WGS') AS ac_wgs,
+    sumIf((het_samples * 1) + (hom_samples * 2), affected = 'A') AS ac_affected,
+    sumIf(hom_samples, sample_type = 'WES') AS hom_wes,
+    sumIf(hom_samples, sample_type = 'WGS') AS hom_wgs,
+    sumIf(hom_samples, affected = 'A') AS hom_affected
+FROM `GRCh38/SNV_INDEL/project_gt_stats`
+WHERE project_guid NOT IN ['R0555_seqr_demo', 'R0607_gregor_training_project_', 'R0608_gregor_training_project_', 'R0801_gregor_training_project_', 'R0811_gregor_training_project_', 'R0812_gregor_training_project_', 'R0813_gregor_training_project_', 'R0814_gregor_training_project_', 'R0815_gregor_training_project_', 'R0816_gregor_training_project_']
+GROUP BY key;
+
+CREATE MATERIALIZED VIEW `GRCh38/SNV_INDEL/reference_data/clinvar/all_variants_to_seqr_variants_mv`
+REFRESH EVERY 10 YEAR TO `GRCh38/SNV_INDEL/reference_data/clinvar/seqr_variants`
+(
+    `key` UInt32,
+    `alleleId` Nullable(UInt32),
+    `conflictingPathogenicities` Nested(pathogenicity Enum8('Pathogenic' = 0, 'Pathogenic/Likely_pathogenic' = 1, 'Pathogenic/Likely_pathogenic/Established_risk_allele' = 2, 'Pathogenic/Likely_pathogenic/Likely_risk_allele' = 3, 'Pathogenic/Likely_risk_allele' = 4, 'Likely_pathogenic' = 5, 'Likely_pathogenic/Likely_risk_allele' = 6, 'Established_risk_allele' = 7, 'Likely_risk_allele' = 8, 'Conflicting_classifications_of_pathogenicity' = 9, 'Uncertain_risk_allele' = 10, 'Uncertain_significance/Uncertain_risk_allele' = 11, 'Uncertain_significance' = 12, 'No_pathogenic_assertion' = 13, 'Likely_benign' = 14, 'Benign/Likely_benign' = 15, 'Benign' = 16), count UInt16),
+    `goldStars` Nullable(UInt8),
+    `submitters` Array(String),
+    `conditions` Array(String),
+    `assertions` Array(Enum8('Affects' = 0, 'association' = 1, 'association_not_found' = 2, 'confers_sensitivity' = 3, 'drug_response' = 4, 'low_penetrance' = 5, 'not_provided' = 6, 'other' = 7, 'protective' = 8, 'risk_factor' = 9, 'no_classification_for_the_single_variant' = 10, 'no_classifications_from_unflagged_records' = 11)),
+    `pathogenicity` Enum8('Pathogenic' = 0, 'Pathogenic/Likely_pathogenic' = 1, 'Pathogenic/Likely_pathogenic/Established_risk_allele' = 2, 'Pathogenic/Likely_pathogenic/Likely_risk_allele' = 3, 'Pathogenic/Likely_risk_allele' = 4, 'Likely_pathogenic' = 5, 'Likely_pathogenic/Likely_risk_allele' = 6, 'Established_risk_allele' = 7, 'Likely_risk_allele' = 8, 'Conflicting_classifications_of_pathogenicity' = 9, 'Uncertain_risk_allele' = 10, 'Uncertain_significance/Uncertain_risk_allele' = 11, 'Uncertain_significance' = 12, 'No_pathogenic_assertion' = 13, 'Likely_benign' = 14, 'Benign/Likely_benign' = 15, 'Benign' = 16)
+)
+AS SELECT
+    key,
+    COLUMNS('.*') EXCEPT (version, variantId, key)
+FROM `GRCh38/SNV_INDEL/reference_data/clinvar/all_variants` AS src
+INNER JOIN `GRCh38/SNV_INDEL/key_lookup` AS dst ON assumeNotNull(src.variantId) = dst.variantId
+LIMIT 1 BY key;
+
+
+CREATE MATERIALIZED VIEW `GRCh38/SNV_INDEL/reference_data/clinvar/seqr_variants_to_search_mv`
+REFRESH EVERY 10 YEAR TO `GRCh38/SNV_INDEL/reference_data/clinvar`
+(
+    `key` UInt32,
+    `alleleId` Nullable(UInt32),
+    `conflictingPathogenicities` Nested(pathogenicity Enum8('Pathogenic' = 0, 'Pathogenic/Likely_pathogenic' = 1, 'Pathogenic/Likely_pathogenic/Established_risk_allele' = 2, 'Pathogenic/Likely_pathogenic/Likely_risk_allele' = 3, 'Pathogenic/Likely_risk_allele' = 4, 'Likely_pathogenic' = 5, 'Likely_pathogenic/Likely_risk_allele' = 6, 'Established_risk_allele' = 7, 'Likely_risk_allele' = 8, 'Conflicting_classifications_of_pathogenicity' = 9, 'Uncertain_risk_allele' = 10, 'Uncertain_significance/Uncertain_risk_allele' = 11, 'Uncertain_significance' = 12, 'No_pathogenic_assertion' = 13, 'Likely_benign' = 14, 'Benign/Likely_benign' = 15, 'Benign' = 16), count UInt16),
+    `goldStars` Nullable(UInt8),
+    `submitters` Array(String),
+    `conditions` Array(String),
+    `assertions` Array(Enum8('Affects' = 0, 'association' = 1, 'association_not_found' = 2, 'confers_sensitivity' = 3, 'drug_response' = 4, 'low_penetrance' = 5, 'not_provided' = 6, 'other' = 7, 'protective' = 8, 'risk_factor' = 9, 'no_classification_for_the_single_variant' = 10, 'no_classifications_from_unflagged_records' = 11)),
+    `pathogenicity` Enum8('Pathogenic' = 0, 'Pathogenic/Likely_pathogenic' = 1, 'Pathogenic/Likely_pathogenic/Established_risk_allele' = 2, 'Pathogenic/Likely_pathogenic/Likely_risk_allele' = 3, 'Pathogenic/Likely_risk_allele' = 4, 'Likely_pathogenic' = 5, 'Likely_pathogenic/Likely_risk_allele' = 6, 'Established_risk_allele' = 7, 'Likely_risk_allele' = 8, 'Conflicting_classifications_of_pathogenicity' = 9, 'Uncertain_risk_allele' = 10, 'Uncertain_significance/Uncertain_risk_allele' = 11, 'Uncertain_significance' = 12, 'No_pathogenic_assertion' = 13, 'Likely_benign' = 14, 'Benign/Likely_benign' = 15, 'Benign' = 16)
+)
+AS SELECT *
+FROM `GRCh38/SNV_INDEL/reference_data/clinvar/seqr_variants`
+LIMIT 1 BY key;
+

From b0cf8b4203df0b339a9d960edf04284242632bd0 Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister <bblanken@broadinstitute.org>
Date: Tue, 25 Nov 2025 20:51:37 -0500
Subject: [PATCH 02/17] works

---
 v03_pipeline/bin/pipeline_worker_test.py | 11 ++++++-----
 1 file changed, 6 insertions(+), 5 deletions(-)

diff --git a/v03_pipeline/bin/pipeline_worker_test.py b/v03_pipeline/bin/pipeline_worker_test.py
index 96c7008d6..961d439a2 100644
--- a/v03_pipeline/bin/pipeline_worker_test.py
+++ b/v03_pipeline/bin/pipeline_worker_test.py
@@ -4,6 +4,7 @@
 from unittest.mock import Mock, patch
 
 import hail as hl
+import hailtop.fs as hfs
 import luigi
 import luigi.worker
 
@@ -15,10 +16,10 @@
     get_clickhouse_client,
 )
 from v03_pipeline.lib.paths import (
+    clickhouse_load_success_file_path,
     db_id_to_gene_id_path,
     loading_pipeline_deadletter_queue_dir,
     loading_pipeline_queue_dir,
-    clickhouse_load_success_file_path,
 )
 from v03_pipeline.lib.test.misc import copy_project_pedigree_to_mocked_dir
 from v03_pipeline.lib.test.mocked_reference_datasets_testcase import (
@@ -193,12 +194,12 @@ def test_process_queue_integration_test(
         mock_safe_post_to_slack.assert_called_once_with(
             ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n    "attempt_id": 0,\n    "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n    "dataset_type": "SNV_INDEL",\n    "project_guids": [\n        "R0113_test_project"\n    ],\n    "reference_genome": "GRCh38",\n    "request_type": "LoadingPipelineRequest",\n    "sample_type": "WGS",\n    "skip_check_sex_and_relatedness": false,\n    "skip_expect_tdr_metrics": false,\n    "skip_validation": true\n}```',
         )
-        with open(
-            clickhouse_load_success_file(
+        with hfs.open(
+            clickhouse_load_success_file_path(
                 ReferenceGenome.GRCh38,
                 DatasetType.SNV_INDEL,
-                'request_20250916-200704-123456',
-            )
+                '20250916-200704-123456',
+            ),
         ) as f:
             self.assertEqual(f.read(), '')
 

From add7a16bd7f3ec8ec1e5f0663bf6db0a3107b451 Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister <bblanken@broadinstitute.org>
Date: Tue, 25 Nov 2025 22:08:19 -0500
Subject: [PATCH 03/17] feat: skip single validation at a time

---
 v03_pipeline/api/model.py                     | 35 +++++++++++-
 v03_pipeline/api/model_test.py                | 36 +++++++++++++
 v03_pipeline/bin/pipeline_worker_test.py      |  5 +-
 v03_pipeline/lib/misc/io.py                   |  4 +-
 v03_pipeline/lib/misc/validation.py           | 10 ++++
 .../lib/tasks/base/base_loading_run_params.py |  5 +-
 v03_pipeline/lib/tasks/dataproc/misc_test.py  |  4 +-
 .../exports/write_new_entries_parquet_test.py |  9 ++--
 .../write_new_transcripts_parquet_test.py     |  5 +-
 .../write_new_variants_parquet_test.py        | 11 ++--
 ...annotations_table_with_new_samples_test.py | 25 +++++----
 v03_pipeline/lib/tasks/validate_callset.py    | 54 +++++++++++--------
 .../lib/tasks/write_imported_callset.py       |  5 +-
 .../lib/tasks/write_metadata_for_run_test.py  |  3 +-
 ...ite_remapped_and_subsetted_callset_test.py |  9 ++--
 .../lib/tasks/write_sample_qc_json_test.py    |  3 +-
 .../write_variant_annotations_vcf_test.py     |  3 +-
 17 files changed, 161 insertions(+), 65 deletions(-)

diff --git a/v03_pipeline/api/model.py b/v03_pipeline/api/model.py
index 57f62036e..9d3e8848c 100644
--- a/v03_pipeline/api/model.py
+++ b/v03_pipeline/api/model.py
@@ -1,9 +1,20 @@
+from typing import Literal
+
 import hailtop.fs as hfs
-from pydantic import AliasChoices, BaseModel, Field, conint, field_validator
+from pydantic import (
+    AliasChoices,
+    BaseModel,
+    Field,
+    conint,
+    field_validator,
+    root_validator,
+)
 
 from v03_pipeline.lib.core import DatasetType, ReferenceGenome, SampleType
+from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS, SKIPPABLE_VALIDATIONS
 
 MAX_LOADING_PIPELINE_ATTEMPTS = 3
+STRINGIFIED_SKIPPABLE_VALIDATIONS = [f.__name__ for f in SKIPPABLE_VALIDATIONS]
 VALID_FILE_TYPES = ['vcf', 'vcf.gz', 'vcf.bgz', 'mt']
 
 
@@ -26,9 +37,9 @@ class LoadingPipelineRequest(PipelineRunnerRequest):
     sample_type: SampleType
     reference_genome: ReferenceGenome
     dataset_type: DatasetType
-    skip_validation: bool = False
     skip_check_sex_and_relatedness: bool = False
     skip_expect_tdr_metrics: bool = False
+    validations_to_skip: list[Literal[*STRINGIFIED_SKIPPABLE_VALIDATIONS]] = []
 
     def incr_attempt(self):
         if self.attempt_id == (MAX_LOADING_PIPELINE_ATTEMPTS - 1):
@@ -36,6 +47,26 @@ def incr_attempt(self):
         self.attempt_id += 1
         return True
 
+    @field_validator('validations_to_skip')
+    @classmethod
+    def must_be_known_validation(cls, validations_to_skip):
+        for v in validations_to_skip:
+            if v not in set(STRINGIFIED_SKIPPABLE_VALIDATIONS):
+                msg = f'{v} is not a valid validator'
+                raise ValueError(msg)
+        return validations_to_skip
+
+    @root_validator(
+        pre=True,
+    )  # the root validator runs before Pydantic parses or coerces field values.
+    @classmethod
+    def override_all_validations(cls, values):
+        if values.get('validations_to_skip') == [
+            ALL_VALIDATIONS,
+        ]:
+            values['validations_to_skip'] = STRINGIFIED_SKIPPABLE_VALIDATIONS
+        return values
+
     @field_validator('callset_path')
     @classmethod
     def check_valid_callset_path(cls, callset_path: str) -> str:
diff --git a/v03_pipeline/api/model_test.py b/v03_pipeline/api/model_test.py
index e48331aaa..7334b5120 100644
--- a/v03_pipeline/api/model_test.py
+++ b/v03_pipeline/api/model_test.py
@@ -46,6 +46,42 @@ def test_invalid_loading_pipeline_requests(self) -> None:
             ),
         )
 
+    def test_validations_to_skip(self) -> None:
+        shared_params = {
+            'callset_path': CALLSET_PATH,
+            'projects_to_run': ['project_a'],
+            'sample_type': SampleType.WGS.value,
+            'reference_genome': ReferenceGenome.GRCh38.value,
+            'dataset_type': DatasetType.SNV_INDEL.value,
+        }
+        raw_request = {
+            **shared_params,
+            'skip_validation': True,
+        }
+        lpr = LoadingPipelineRequest.model_validate(raw_request)
+        self.assertGreater(len(lpr.validations_to_skip), 1)
+
+        raw_request = {
+            **shared_params,
+            'validations_to_skip': ['all'],
+        }
+        lpr = LoadingPipelineRequest.model_validate(raw_request)
+        self.assertGreater(len(lpr.validations_to_skip), 1)
+
+        raw_request = {
+            **shared_params,
+            'validations_to_skip': ['validate_sample_type'],
+        }
+        lpr = LoadingPipelineRequest.model_validate(raw_request)
+        self.assertEqual(len(lpr.validations_to_skip), 1)
+
+        raw_request = {
+            **shared_params,
+            'validations_to_skip': ['validate_blended_exome'],
+        }
+        with self.assertRaises(ValueError):
+            LoadingPipelineRequest.model_validate(raw_request)
+
     def test_delete_families_request(self) -> None:
         raw_request = {'project_guid': 'project_a', 'family_guids': []}
         with self.assertRaises(ValueError):
diff --git a/v03_pipeline/bin/pipeline_worker_test.py b/v03_pipeline/bin/pipeline_worker_test.py
index 961d439a2..9e41c5aec 100644
--- a/v03_pipeline/bin/pipeline_worker_test.py
+++ b/v03_pipeline/bin/pipeline_worker_test.py
@@ -176,7 +176,6 @@ def test_process_queue_integration_test(
             'sample_type': SampleType.WGS.value,
             'reference_genome': ReferenceGenome.GRCh38.value,
             'dataset_type': DatasetType.SNV_INDEL.value,
-            'skip_validation': True,
         }
         os.makedirs(
             loading_pipeline_queue_dir(),
@@ -192,7 +191,7 @@ def test_process_queue_integration_test(
             json.dump(raw_request, f)
         process_queue(local_scheduler=True)
         mock_safe_post_to_slack.assert_called_once_with(
-            ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n    "attempt_id": 0,\n    "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n    "dataset_type": "SNV_INDEL",\n    "project_guids": [\n        "R0113_test_project"\n    ],\n    "reference_genome": "GRCh38",\n    "request_type": "LoadingPipelineRequest",\n    "sample_type": "WGS",\n    "skip_check_sex_and_relatedness": false,\n    "skip_expect_tdr_metrics": false,\n    "skip_validation": true\n}```',
+            ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n    "attempt_id": 0,\n    "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n    "dataset_type": "SNV_INDEL",\n    "project_guids": [\n        "R0113_test_project"\n    ],\n    "reference_genome": "GRCh38",\n    "request_type": "LoadingPipelineRequest",\n    "sample_type": "WGS",\n    "skip_check_sex_and_relatedness": false,\n    "skip_expect_tdr_metrics": false,\n    "validations_to_skip": []\n}```',
         )
         with hfs.open(
             clickhouse_load_success_file_path(
@@ -263,7 +262,7 @@ def test_process_failure(
         process_queue(local_scheduler=True)
         process_queue(local_scheduler=True)
         mock_safe_post_to_slack.assert_called_once_with(
-            ':failed: Pipeline Runner Request Failed :failed:\nRun ID: 20250918-200704-123456\n```{\n    "attempt_id": 2,\n    "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n    "dataset_type": "SNV_INDEL",\n    "project_guids": [\n        "project_a"\n    ],\n    "reference_genome": "GRCh38",\n    "request_type": "LoadingPipelineRequest",\n    "sample_type": "WGS",\n    "skip_check_sex_and_relatedness": false,\n    "skip_expect_tdr_metrics": false,\n    "skip_validation": false\n}```\nReason: there were failed tasks',
+            ':failed: Pipeline Runner Request Failed :failed:\nRun ID: 20250918-200704-123456\n```{\n    "attempt_id": 2,\n    "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n    "dataset_type": "SNV_INDEL",\n    "project_guids": [\n        "project_a"\n    ],\n    "reference_genome": "GRCh38",\n    "request_type": "LoadingPipelineRequest",\n    "sample_type": "WGS",\n    "skip_check_sex_and_relatedness": false,\n    "skip_expect_tdr_metrics": false,\n    "validations_to_skip": []\n}```\nReason: there were failed tasks',
         )
         self.assertEqual(len(os.listdir(loading_pipeline_queue_dir())), 0)
         with open(
diff --git a/v03_pipeline/lib/misc/io.py b/v03_pipeline/lib/misc/io.py
index d0a7e044e..f6d901ce9 100644
--- a/v03_pipeline/lib/misc/io.py
+++ b/v03_pipeline/lib/misc/io.py
@@ -81,7 +81,7 @@ def compute_hail_n_partitions(file_size_b: int) -> int:
 )
 def split_multi_hts(
     mt: hl.MatrixTable,
-    skip_validation: bool,
+    skip_validate_no_duplicate_variants: bool,
     max_samples_split_multi_shuffle=MAX_SAMPLES_SPLIT_MULTI_SHUFFLE,
 ) -> hl.MatrixTable:
     bi = mt.filter_rows(hl.len(mt.alleles) == BIALLELIC)
@@ -98,7 +98,7 @@ def split_multi_hts(
     mt = split.union_rows(bi)
     # If we've disabled validation (which is expected to throw an exception
     # for duplicate variants, we would like to distinc )
-    if skip_validation:
+    if skip_validate_no_duplicate_variants:
         return mt.distinct_by_row()
     return mt
 
diff --git a/v03_pipeline/lib/misc/validation.py b/v03_pipeline/lib/misc/validation.py
index 280525462..1c480c20c 100644
--- a/v03_pipeline/lib/misc/validation.py
+++ b/v03_pipeline/lib/misc/validation.py
@@ -9,6 +9,7 @@
     SampleType,
 )
 
+ALL_VALIDATIONS = 'all'
 AMBIGUOUS_THRESHOLD_PERC: float = 0.01  # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown.
 MIN_ROWS_PER_CONTIG = 100
 SAMPLE_TYPE_MATCH_THRESHOLD = 0.3
@@ -191,3 +192,12 @@ def validate_sample_type(
     if has_noncoding and has_coding and sample_type != SampleType.WGS:
         msg = 'Sample type validation error: dataset sample-type is specified as WES but appears to be WGS because it contains many common non-coding variants'
         raise SeqrValidationError(msg)
+
+
+SKIPPABLE_VALIDATIONS = [
+    validate_allele_depth_length,
+    validate_allele_type,
+    validate_expected_contig_frequency,
+    validate_no_duplicate_variants,
+    validate_sample_type,
+]
diff --git a/v03_pipeline/lib/tasks/base/base_loading_run_params.py b/v03_pipeline/lib/tasks/base/base_loading_run_params.py
index 5cba7d46f..3cd447cc5 100644
--- a/v03_pipeline/lib/tasks/base/base_loading_run_params.py
+++ b/v03_pipeline/lib/tasks/base/base_loading_run_params.py
@@ -28,10 +28,7 @@ class BaseLoadingRunParams(luigi.Task):
         default=False,
         parsing=luigi.BoolParameter.EXPLICIT_PARSING,
     )
-    skip_validation = luigi.BoolParameter(
-        default=False,
-        parsing=luigi.BoolParameter.EXPLICIT_PARSING,
-    )
+    validations_to_skip = luigi.ListParameter(default=[])
     is_new_gcnv_joint_call = luigi.BoolParameter(
         default=False,
         parsing=luigi.BoolParameter.EXPLICIT_PARSING,
diff --git a/v03_pipeline/lib/tasks/dataproc/misc_test.py b/v03_pipeline/lib/tasks/dataproc/misc_test.py
index 347e2e0b9..983275c6e 100644
--- a/v03_pipeline/lib/tasks/dataproc/misc_test.py
+++ b/v03_pipeline/lib/tasks/dataproc/misc_test.py
@@ -39,8 +39,8 @@ def test_to_kebab_str_args(self, _: Mock):
                 'False',
                 '--skip-expect-tdr-metrics',
                 'False',
-                '--skip-validation',
-                'False',
+                '--validations-to-skip',
+                '[]',
                 '--is-new-gcnv-joint-call',
                 'False',
                 '--run-id',
diff --git a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py
index 90d8d6dfa..43fa38a14 100644
--- a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py
+++ b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py
@@ -10,6 +10,7 @@
     ReferenceGenome,
     SampleType,
 )
+from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS
 from v03_pipeline.lib.paths import (
     db_id_to_gene_id_path,
     new_entries_parquet_path,
@@ -114,7 +115,7 @@ def test_write_new_entries_parquet(self):
             sample_type=SampleType.WGS,
             callset_path=TEST_SNV_INDEL_VCF,
             project_guids=['R0113_test_project', 'R0114_project4'],
-            skip_validation=True,
+            validations_to_skip=[ALL_VALIDATIONS],
             run_id=TEST_RUN_ID,
         )
         worker.add(task)
@@ -214,7 +215,7 @@ def test_mito_write_new_entries_parquet(self):
             sample_type=SampleType.WGS,
             callset_path=TEST_MITO_CALLSET,
             project_guids=['R0116_test_project3'],
-            skip_validation=True,
+            validations_to_skip=[ALL_VALIDATIONS],
             run_id=TEST_RUN_ID,
         )
         worker.add(task)
@@ -269,7 +270,7 @@ def test_sv_write_new_entries_parquet(self):
             sample_type=SampleType.WGS,
             callset_path=TEST_SV_VCF_2,
             project_guids=['R0115_test_project2'],
-            skip_validation=True,
+            validations_to_skip=[ALL_VALIDATIONS],
             run_id=TEST_RUN_ID,
         )
         worker.add(task)
@@ -352,7 +353,7 @@ def test_gcnv_write_new_entries_parquet(self):
             sample_type=SampleType.WES,
             callset_path=TEST_GCNV_BED_FILE,
             project_guids=['R0115_test_project2'],
-            skip_validation=True,
+            validations_to_skip=[ALL_VALIDATIONS],
             run_id=TEST_RUN_ID,
         )
         worker.add(task)
diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py
index 5af285095..35d2d94e2 100644
--- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py
+++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py
@@ -9,6 +9,7 @@
     ReferenceGenome,
     SampleType,
 )
+from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS
 from v03_pipeline.lib.paths import (
     new_transcripts_parquet_path,
     new_variants_table_path,
@@ -87,7 +88,7 @@ def test_write_new_transcripts_parquet_test(
             project_guids=[
                 'fake_project',
             ],
-            skip_validation=True,
+            validations_to_skip=[ALL_VALIDATIONS],
             run_id=TEST_RUN_ID,
         )
         worker.add(task)
@@ -187,7 +188,7 @@ def test_grch37_write_new_transcripts_parquet_test(
             project_guids=[
                 'fake_project',
             ],
-            skip_validation=True,
+            validations_to_skip=[ALL_VALIDATIONS],
             run_id=TEST_RUN_ID,
         )
         worker.add(task)
diff --git a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py
index a3c7fc431..69b0c5b36 100644
--- a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py
+++ b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py
@@ -10,6 +10,7 @@
     ReferenceGenome,
     SampleType,
 )
+from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS
 from v03_pipeline.lib.paths import (
     new_variants_parquet_path,
     new_variants_table_path,
@@ -107,7 +108,7 @@ def test_write_new_variants_parquet_test(
             project_guids=[
                 'fake_project',
             ],
-            skip_validation=True,
+            validations_to_skip=[ALL_VALIDATIONS],
             run_id=TEST_RUN_ID,
         )
         worker.add(task)
@@ -231,7 +232,7 @@ def test_grch37_write_new_variants_parquet_test(
             project_guids=[
                 'fake_project',
             ],
-            skip_validation=True,
+            validations_to_skip=[ALL_VALIDATIONS],
             run_id=TEST_RUN_ID,
         )
         worker.add(task)
@@ -342,7 +343,7 @@ def test_mito_write_new_variants_parquet_test(
             project_guids=[
                 'fake_project',
             ],
-            skip_validation=True,
+            validations_to_skip=[ALL_VALIDATIONS],
             run_id=TEST_RUN_ID,
         )
         worker.add(task)
@@ -436,7 +437,7 @@ def test_sv_write_new_variants_parquet_test(
             project_guids=[
                 'fake_project',
             ],
-            skip_validation=True,
+            validations_to_skip=[ALL_VALIDATIONS],
             run_id=TEST_RUN_ID,
         )
         worker.add(task)
@@ -513,7 +514,7 @@ def test_gcnv_write_new_variants_parquet_test(
             project_guids=[
                 'fake_project',
             ],
-            skip_validation=True,
+            validations_to_skip=[ALL_VALIDATIONS],
             run_id=TEST_RUN_ID,
         )
         worker.add(task)
diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
index 9a03d75cc..50da39fc6 100644
--- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
+++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
@@ -24,7 +24,10 @@
     SampleType,
 )
 from v03_pipeline.lib.misc.io import remap_pedigree_hash
-from v03_pipeline.lib.misc.validation import validate_expected_contig_frequency
+from v03_pipeline.lib.misc.validation import (
+    ALL_VALIDATIONS,
+    validate_expected_contig_frequency,
+)
 from v03_pipeline.lib.paths import (
     valid_reference_dataset_path,
 )
@@ -86,7 +89,7 @@ def test_missing_pedigree(
             sample_type=SampleType.WGS,
             callset_path=TEST_SNV_INDEL_VCF,
             project_guids=['R0113_test_project'],
-            skip_validation=True,
+            validations_to_skip=[ALL_VALIDATIONS],
             run_id=TEST_RUN_ID,
         )
         worker = luigi.worker.Worker()
@@ -116,7 +119,7 @@ def test_missing_interval_reference_dataset(
             sample_type=SampleType.WGS,
             callset_path=TEST_SNV_INDEL_VCF,
             project_guids=['R0113_test_project'],
-            skip_validation=True,
+            validations_to_skip=[ALL_VALIDATIONS],
             run_id=TEST_RUN_ID,
         )
         worker = luigi.worker.Worker()
@@ -527,7 +530,7 @@ def test_update_vat_grch37(
             sample_type=SampleType.WGS,
             callset_path=TEST_SNV_INDEL_VCF,
             project_guids=['R0113_test_project'],
-            skip_validation=True,
+            validations_to_skip=[ALL_VALIDATIONS],
             run_id=TEST_RUN_ID,
         )
         worker.add(uvatwns_task)
@@ -683,7 +686,7 @@ def test_update_vat_without_accessing_private_datasets(
             sample_type=SampleType.WGS,
             callset_path=TEST_SNV_INDEL_VCF,
             project_guids=['R0113_test_project'],
-            skip_validation=True,
+            validations_to_skip=[ALL_VALIDATIONS],
             run_id=TEST_RUN_ID,
         )
         worker.add(uvatwns_task)
@@ -726,7 +729,7 @@ def test_mito_update_vat(
                 sample_type=SampleType.WGS,
                 callset_path=TEST_MITO_MT,
                 project_guids=['R0115_test_project2'],
-                skip_validation=True,
+                validations_to_skip=[ALL_VALIDATIONS],
                 run_id=TEST_RUN_ID,
             )
         )
@@ -834,7 +837,7 @@ def test_sv_multiple_vcf_update_vat(
                 sample_type=SampleType.WGS,
                 callset_path=TEST_SV_VCF,
                 project_guids=['R0115_test_project2'],
-                skip_validation=True,
+                validations_to_skip=[ALL_VALIDATIONS],
                 run_id=TEST_RUN_ID,
             )
         )
@@ -1294,7 +1297,7 @@ def test_sv_multiple_vcf_update_vat(
                 sample_type=SampleType.WGS,
                 callset_path=TEST_SV_VCF_2,
                 project_guids=['R0115_test_project2'],
-                skip_validation=True,
+                validations_to_skip=[ALL_VALIDATIONS],
                 run_id='second_run_id',
             )
         )
@@ -1414,7 +1417,7 @@ def test_sv_multiple_project_single_vcf(
                 sample_type=SampleType.WGS,
                 callset_path=TEST_SV_VCF_2,
                 project_guids=['R0116_test_project3', 'R0117_test_project4'],
-                skip_validation=True,
+                validations_to_skip=[ALL_VALIDATIONS],
                 run_id='run_id',
             )
         )
@@ -1446,7 +1449,7 @@ def test_gcnv_update_vat_multiple(
                 sample_type=SampleType.WES,
                 callset_path=TEST_GCNV_BED_FILE,
                 project_guids=['R0115_test_project2'],
-                skip_validation=True,
+                validations_to_skip=[ALL_VALIDATIONS],
                 run_id=TEST_RUN_ID,
             )
         )
@@ -1585,7 +1588,7 @@ def test_gcnv_update_vat_multiple(
                 sample_type=SampleType.WES,
                 callset_path=TEST_GCNV_BED_FILE_2,
                 project_guids=['R0115_test_project2'],
-                skip_validation=True,
+                validations_to_skip=[ALL_VALIDATIONS],
                 run_id='second_run_id',
             )
         )
diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py
index e8f5d6f34..b13142bfe 100644
--- a/v03_pipeline/lib/tasks/validate_callset.py
+++ b/v03_pipeline/lib/tasks/validate_callset.py
@@ -3,12 +3,9 @@
 import luigi.util
 
 from v03_pipeline.lib.misc.validation import (
+    ALL_VALIDATIONS,
+    SKIPPABLE_VALIDATIONS,
     SeqrValidationError,
-    validate_allele_depth_length,
-    validate_allele_type,
-    validate_expected_contig_frequency,
-    validate_no_duplicate_variants,
-    validate_sample_type,
 )
 from v03_pipeline.lib.paths import (
     imported_callset_path,
@@ -31,6 +28,22 @@
 
 @luigi.util.inherits(BaseLoadingRunParams)
 class ValidateCallsetTask(BaseUpdateTask):
+    @property
+    def validation_dependencies(self) -> dict[str, hl.Table]:
+        deps = {}
+        if (
+            ALL_VALIDATIONS not in self.validations_to_skip
+            and 'validate_sample_type' not in self.validations_to_skip
+            and self.dataset_type.can_run_validation
+        ):
+            deps['coding_and_noncoding_variants_ht'] = hl.read_table(
+                valid_reference_dataset_path(
+                    self.reference_genome,
+                    ReferenceDataset.gnomad_coding_and_noncoding,
+                ),
+            )
+        return deps
+
     def complete(self) -> luigi.Target:
         if super().complete():
             mt = hl.read_matrix_table(self.output().path)
@@ -50,7 +63,11 @@ def output(self) -> luigi.Target:
 
     def requires(self) -> list[luigi.Task]:
         requirements = [self.clone(WriteImportedCallsetTask)]
-        if not self.skip_validation and self.dataset_type.can_run_validation:
+        if (
+            ALL_VALIDATIONS not in self.validations_to_skip
+            and 'validate_sample_type' not in self.validations_to_skip
+            and self.dataset_type.can_run_validation
+        ):
             requirements = [
                 *requirements,
                 (
@@ -87,29 +104,22 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
                 ),
             )
 
-        validation_exceptions = []
-        if self.skip_validation or not self.dataset_type.can_run_validation:
+        if (
+            ALL_VALIDATIONS in self.validations_to_skip
+            or not self.dataset_type.can_run_validation
+        ):
             return mt.select_globals(
                 callset_path=self.callset_path,
                 validated_sample_type=self.sample_type.value,
             )
-        coding_and_noncoding_variants_ht = hl.read_table(
-            valid_reference_dataset_path(
-                self.reference_genome,
-                ReferenceDataset.gnomad_coding_and_noncoding,
-            ),
-        )
-        for validation_f in [
-            validate_allele_depth_length,
-            validate_allele_type,
-            validate_no_duplicate_variants,
-            validate_expected_contig_frequency,
-            validate_sample_type,
-        ]:
+        validation_exceptions = []
+        for validation_f in SKIPPABLE_VALIDATIONS:
             try:
+                if validation_f in self.validations_to_skip:
+                    continue
                 validation_f(
                     mt,
-                    coding_and_noncoding_variants_ht=coding_and_noncoding_variants_ht,
+                    **self.validation_dependencies,
                     **self.param_kwargs,
                 )
             except SeqrValidationError as e:
diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py
index e4df70899..6a3230197 100644
--- a/v03_pipeline/lib/tasks/write_imported_callset.py
+++ b/v03_pipeline/lib/tasks/write_imported_callset.py
@@ -85,7 +85,10 @@ def create_table(self) -> hl.MatrixTable:
         )
         if self.dataset_type.has_multi_allelic_variants:
             # NB: throws SeqrValidationError
-            mt = split_multi_hts(mt, self.skip_validation)
+            mt = split_multi_hts(
+                mt,
+                'validate_no_duplicate_variants' in self.validations_to_skip,
+            )
         if self.dataset_type.re_key_by_seqr_internal_truth_vid and hasattr(
             mt,
             'info.SEQR_INTERNAL_TRUTH_VID',
diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py
index e8318f043..e0bec8f14 100644
--- a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py
+++ b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py
@@ -5,6 +5,7 @@
 import luigi.worker
 
 from v03_pipeline.lib.core import DatasetType, ReferenceGenome, SampleType
+from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS
 from v03_pipeline.lib.paths import relatedness_check_tsv_path
 from v03_pipeline.lib.tasks.write_metadata_for_run import WriteMetadataForRunTask
 from v03_pipeline.lib.test.misc import copy_project_pedigree_to_mocked_dir
@@ -54,7 +55,7 @@ def test_write_metadata_for_run_task(
             sample_type=SampleType.WGS,
             callset_path=TEST_VCF,
             project_guids=['R0113_test_project', 'R0114_project4'],
-            skip_validation=True,
+            validations_to_skip=[ALL_VALIDATIONS],
             run_id='run_123456',
         )
         worker.add(write_metadata_for_run_task)
diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py
index 305150699..9504965c4 100644
--- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py
+++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py
@@ -7,6 +7,7 @@
 
 from v03_pipeline.lib.core import DatasetType, ReferenceGenome, SampleType
 from v03_pipeline.lib.misc.io import remap_pedigree_hash
+from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS
 from v03_pipeline.lib.paths import (
     relatedness_check_table_path,
     sex_check_table_path,
@@ -101,7 +102,7 @@ def test_write_remapped_and_subsetted_callset_task(
             callset_path=TEST_VCF,
             project_guids=['R0113_test_project'],
             project_i=0,
-            skip_validation=True,
+            validations_to_skip=[ALL_VALIDATIONS],
             skip_expect_tdr_metrics=True,
         )
         worker.add(wrsc_task)
@@ -151,7 +152,7 @@ def test_write_remapped_and_subsetted_callset_task_failed_some_family_checks(
             callset_path=TEST_VCF,
             project_guids=['R0114_project4'],
             project_i=0,
-            skip_validation=True,
+            validations_to_skip=[ALL_VALIDATIONS],
             skip_expect_tdr_metrics=True,
         )
         worker.add(wrsc_task)
@@ -244,7 +245,7 @@ def test_write_remapped_and_subsetted_callset_task_all_families_failed(
             callset_path=TEST_VCF,
             project_guids=['R0114_project4'],
             project_i=0,
-            skip_validation=True,
+            validations_to_skip=[ALL_VALIDATIONS],
             skip_expect_tdr_metrics=True,
         )
         worker.add(wrsc_task)
@@ -256,7 +257,7 @@ def test_write_remapped_and_subsetted_callset_task_all_families_failed(
             sample_type=SampleType.WES,
             callset_path=TEST_VCF,
             project_guids=['R0114_project4'],
-            skip_validation=True,
+            validations_to_skip=[ALL_VALIDATIONS],
             run_id=TEST_RUN_ID,
         )
         self.assertTrue(write_validation_errors_task.complete())
diff --git a/v03_pipeline/lib/tasks/write_sample_qc_json_test.py b/v03_pipeline/lib/tasks/write_sample_qc_json_test.py
index 8d2bb699e..d1b0a3643 100644
--- a/v03_pipeline/lib/tasks/write_sample_qc_json_test.py
+++ b/v03_pipeline/lib/tasks/write_sample_qc_json_test.py
@@ -8,6 +8,7 @@
 import luigi.worker
 
 from v03_pipeline.lib.core import DatasetType, ReferenceGenome, SampleType
+from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS
 from v03_pipeline.lib.paths import ancestry_model_rf_path
 from v03_pipeline.lib.tasks.write_sample_qc_json import WriteSampleQCJsonTask
 from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask
@@ -113,7 +114,7 @@ def test_call_sample_qc(
             sample_type=SampleType.WGS,
             callset_path=TEST_VCF,
             project_guids=['R0113_test_project'],
-            skip_validation=True,
+            validations_to_skip=[ALL_VALIDATIONS],
         )
         worker.add(task)
         worker.run()
diff --git a/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py b/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py
index 913de418c..8f3a00107 100644
--- a/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py
+++ b/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py
@@ -5,6 +5,7 @@
 import luigi.worker
 
 from v03_pipeline.lib.core import DatasetType, ReferenceGenome, SampleType
+from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS
 from v03_pipeline.lib.tasks.update_variant_annotations_table_with_new_samples import (
     UpdateVariantAnnotationsTableWithNewSamplesTask,
 )
@@ -64,7 +65,7 @@ def test_sv_export_vcf(
                 sample_type=SampleType.WGS,
                 callset_path=TEST_SV_VCF,
                 project_guids=['R0115_test_project2'],
-                skip_validation=True,
+                validations_to_skip=[ALL_VALIDATIONS],
                 skip_expect_tdr_metrics=True,
             )
         )

From d952e214580d7892f1394af5c5c503453391543f Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister <bblanken@broadinstitute.org>
Date: Tue, 25 Nov 2025 22:57:15 -0500
Subject: [PATCH 04/17] fix a few

---
 v03_pipeline/bin/pipeline_worker_test.py   |  3 ++-
 v03_pipeline/lib/tasks/validate_callset.py | 14 ++++----------
 2 files changed, 6 insertions(+), 11 deletions(-)

diff --git a/v03_pipeline/bin/pipeline_worker_test.py b/v03_pipeline/bin/pipeline_worker_test.py
index 9e41c5aec..b488e3d58 100644
--- a/v03_pipeline/bin/pipeline_worker_test.py
+++ b/v03_pipeline/bin/pipeline_worker_test.py
@@ -176,6 +176,7 @@ def test_process_queue_integration_test(
             'sample_type': SampleType.WGS.value,
             'reference_genome': ReferenceGenome.GRCh38.value,
             'dataset_type': DatasetType.SNV_INDEL.value,
+            'validations_to_skip': ['all'],
         }
         os.makedirs(
             loading_pipeline_queue_dir(),
@@ -191,7 +192,7 @@ def test_process_queue_integration_test(
             json.dump(raw_request, f)
         process_queue(local_scheduler=True)
         mock_safe_post_to_slack.assert_called_once_with(
-            ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n    "attempt_id": 0,\n    "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n    "dataset_type": "SNV_INDEL",\n    "project_guids": [\n        "R0113_test_project"\n    ],\n    "reference_genome": "GRCh38",\n    "request_type": "LoadingPipelineRequest",\n    "sample_type": "WGS",\n    "skip_check_sex_and_relatedness": false,\n    "skip_expect_tdr_metrics": false,\n    "validations_to_skip": []\n}```',
+            ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n    "attempt_id": 0,\n    "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n    "dataset_type": "SNV_INDEL",\n    "project_guids": [\n        "R0113_test_project"\n    ],\n    "reference_genome": "GRCh38",\n    "request_type": "LoadingPipelineRequest",\n    "sample_type": "WGS",\n    "skip_check_sex_and_relatedness": false,\n    "skip_expect_tdr_metrics": false,\n    "validations_to_skip": [\n        "validate_allele_depth_length",\n        "validate_allele_type",\n        "validate_expected_contig_frequency",\n        "validate_no_duplicate_variants",\n        "validate_sample_type"\n    ]\n}```'
         )
         with hfs.open(
             clickhouse_load_success_file_path(
diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py
index b13142bfe..8ee302b2e 100644
--- a/v03_pipeline/lib/tasks/validate_callset.py
+++ b/v03_pipeline/lib/tasks/validate_callset.py
@@ -3,7 +3,6 @@
 import luigi.util
 
 from v03_pipeline.lib.misc.validation import (
-    ALL_VALIDATIONS,
     SKIPPABLE_VALIDATIONS,
     SeqrValidationError,
 )
@@ -32,8 +31,7 @@ class ValidateCallsetTask(BaseUpdateTask):
     def validation_dependencies(self) -> dict[str, hl.Table]:
         deps = {}
         if (
-            ALL_VALIDATIONS not in self.validations_to_skip
-            and 'validate_sample_type' not in self.validations_to_skip
+            'validate_sample_type' not in self.validations_to_skip
             and self.dataset_type.can_run_validation
         ):
             deps['coding_and_noncoding_variants_ht'] = hl.read_table(
@@ -64,8 +62,7 @@ def output(self) -> luigi.Target:
     def requires(self) -> list[luigi.Task]:
         requirements = [self.clone(WriteImportedCallsetTask)]
         if (
-            ALL_VALIDATIONS not in self.validations_to_skip
-            and 'validate_sample_type' not in self.validations_to_skip
+            'validate_sample_type' not in self.validations_to_skip
             and self.dataset_type.can_run_validation
         ):
             requirements = [
@@ -104,10 +101,7 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
                 ),
             )
 
-        if (
-            ALL_VALIDATIONS in self.validations_to_skip
-            or not self.dataset_type.can_run_validation
-        ):
+        if not self.dataset_type.can_run_validation:
             return mt.select_globals(
                 callset_path=self.callset_path,
                 validated_sample_type=self.sample_type.value,
@@ -115,7 +109,7 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
         validation_exceptions = []
         for validation_f in SKIPPABLE_VALIDATIONS:
             try:
-                if validation_f in self.validations_to_skip:
+                if str(validation_f.__name__) in self.validations_to_skip:
                     continue
                 validation_f(
                     mt,

From 5ac896103c2762b3a401b6209bffd9d07b5ed766 Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister <bblanken@broadinstitute.org>
Date: Tue, 25 Nov 2025 23:03:36 -0500
Subject: [PATCH 05/17] bump

---
 v03_pipeline/lib/misc/pedigree_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/v03_pipeline/lib/misc/pedigree_test.py b/v03_pipeline/lib/misc/pedigree_test.py
index 6071849d0..2142ea215 100644
--- a/v03_pipeline/lib/misc/pedigree_test.py
+++ b/v03_pipeline/lib/misc/pedigree_test.py
@@ -13,7 +13,7 @@
 
 class PedigreesTest(unittest.TestCase):
     def test_empty_pedigree(self) -> None:
-        with self.assertRaises(ValueError):
+        with self.assertRaises(Exception):
             _ = import_pedigree(TEST_PEDIGREE_1)
 
     def test_parse_lineage(self) -> None:

From ce9223583a4c7d7c4003904290269e8a4251ccc2 Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister <bblanken@broadinstitute.org>
Date: Tue, 25 Nov 2025 23:11:06 -0500
Subject: [PATCH 06/17] pedigree

---
 v03_pipeline/lib/misc/pedigree_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/v03_pipeline/lib/misc/pedigree_test.py b/v03_pipeline/lib/misc/pedigree_test.py
index 2142ea215..f70d168ff 100644
--- a/v03_pipeline/lib/misc/pedigree_test.py
+++ b/v03_pipeline/lib/misc/pedigree_test.py
@@ -13,7 +13,7 @@
 
 class PedigreesTest(unittest.TestCase):
     def test_empty_pedigree(self) -> None:
-        with self.assertRaises(Exception):
+        with self.assertRaises(Exception):  # noqa: B017
             _ = import_pedigree(TEST_PEDIGREE_1)
 
     def test_parse_lineage(self) -> None:

From d54c4de98f4cd3d45d057c58ba490256d7e51216 Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister <bblanken@broadinstitute.org>
Date: Tue, 25 Nov 2025 23:21:49 -0500
Subject: [PATCH 07/17] validations to skip

---
 v03_pipeline/api/app_test.py   | 24 +++++++++++++++++++++++-
 v03_pipeline/api/model_test.py |  9 +--------
 2 files changed, 24 insertions(+), 9 deletions(-)

diff --git a/v03_pipeline/api/app_test.py b/v03_pipeline/api/app_test.py
index f3a68776e..2b09e3814 100644
--- a/v03_pipeline/api/app_test.py
+++ b/v03_pipeline/api/app_test.py
@@ -67,6 +67,28 @@ async def test_loading_pipeline_invalid_requests(self):
                     'callset_path must point to a file that exists' in log.output[0],
                 )
 
+        body = {
+            'callset_path': CALLSET_PATH,
+            'project_guids': ['project_a'],
+            'sample_type': SampleType.WGS.value,
+            'reference_genome': ReferenceGenome.GRCh38.value,
+            'dataset_type': DatasetType.SNV_INDEL.value,
+            'validations_to_skip': ['bad_validation']
+        }
+        with self.assertLogs(level='ERROR') as log:
+            async with self.client.request(
+                'POST',
+                '/loading_pipeline_enqueue',
+                json=body,
+            ) as resp:
+                self.assertEqual(
+                    resp.status,
+                    web_exceptions.HTTPBadRequest.status_code,
+                )
+                self.assertTrue(
+                    "input_value='bad_validation" in log.output[0],
+                )
+
     async def test_loading_pipeline_enqueue(self):
         body = {
             'callset_path': CALLSET_PATH,
@@ -96,9 +118,9 @@ async def test_loading_pipeline_enqueue(self):
                     'reference_genome': 'GRCh38',
                     'sample_type': 'WGS',
                     'skip_check_sex_and_relatedness': False,
-                    'skip_validation': False,
                     'skip_expect_tdr_metrics': False,
                     'attempt_id': 0,
+                    'validations_to_skip': []
                 },
             },
         )
diff --git a/v03_pipeline/api/model_test.py b/v03_pipeline/api/model_test.py
index 7334b5120..e7dd6cd19 100644
--- a/v03_pipeline/api/model_test.py
+++ b/v03_pipeline/api/model_test.py
@@ -54,19 +54,12 @@ def test_validations_to_skip(self) -> None:
             'reference_genome': ReferenceGenome.GRCh38.value,
             'dataset_type': DatasetType.SNV_INDEL.value,
         }
-        raw_request = {
-            **shared_params,
-            'skip_validation': True,
-        }
-        lpr = LoadingPipelineRequest.model_validate(raw_request)
-        self.assertGreater(len(lpr.validations_to_skip), 1)
-
         raw_request = {
             **shared_params,
             'validations_to_skip': ['all'],
         }
         lpr = LoadingPipelineRequest.model_validate(raw_request)
-        self.assertGreater(len(lpr.validations_to_skip), 1)
+        self.assertGreater(len(lpr.validations_to_skip), 2)
 
         raw_request = {
             **shared_params,

From b455ded337c7d7aaf476d73c17a76f7744fabf20 Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister <bblanken@broadinstitute.org>
Date: Tue, 25 Nov 2025 23:25:42 -0500
Subject: [PATCH 08/17] ruff

---
 v03_pipeline/api/app_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/v03_pipeline/api/app_test.py b/v03_pipeline/api/app_test.py
index 2b09e3814..0ae059130 100644
--- a/v03_pipeline/api/app_test.py
+++ b/v03_pipeline/api/app_test.py
@@ -73,7 +73,7 @@ async def test_loading_pipeline_invalid_requests(self):
             'sample_type': SampleType.WGS.value,
             'reference_genome': ReferenceGenome.GRCh38.value,
             'dataset_type': DatasetType.SNV_INDEL.value,
-            'validations_to_skip': ['bad_validation']
+            'validations_to_skip': ['bad_validation'],
         }
         with self.assertLogs(level='ERROR') as log:
             async with self.client.request(
@@ -120,7 +120,7 @@ async def test_loading_pipeline_enqueue(self):
                     'skip_check_sex_and_relatedness': False,
                     'skip_expect_tdr_metrics': False,
                     'attempt_id': 0,
-                    'validations_to_skip': []
+                    'validations_to_skip': [],
                 },
             },
         )

From 9300a615fb1537ea9bd2c5e906426e69d8453147 Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister <bblanken@broadinstitute.org>
Date: Tue, 25 Nov 2025 23:28:01 -0500
Subject: [PATCH 09/17] ruff

---
 v03_pipeline/bin/pipeline_worker_test.py | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/v03_pipeline/bin/pipeline_worker_test.py b/v03_pipeline/bin/pipeline_worker_test.py
index b488e3d58..7e3341549 100644
--- a/v03_pipeline/bin/pipeline_worker_test.py
+++ b/v03_pipeline/bin/pipeline_worker_test.py
@@ -192,7 +192,7 @@ def test_process_queue_integration_test(
             json.dump(raw_request, f)
         process_queue(local_scheduler=True)
         mock_safe_post_to_slack.assert_called_once_with(
-            ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n    "attempt_id": 0,\n    "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n    "dataset_type": "SNV_INDEL",\n    "project_guids": [\n        "R0113_test_project"\n    ],\n    "reference_genome": "GRCh38",\n    "request_type": "LoadingPipelineRequest",\n    "sample_type": "WGS",\n    "skip_check_sex_and_relatedness": false,\n    "skip_expect_tdr_metrics": false,\n    "validations_to_skip": [\n        "validate_allele_depth_length",\n        "validate_allele_type",\n        "validate_expected_contig_frequency",\n        "validate_no_duplicate_variants",\n        "validate_sample_type"\n    ]\n}```'
+            ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n    "attempt_id": 0,\n    "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n    "dataset_type": "SNV_INDEL",\n    "project_guids": [\n        "R0113_test_project"\n    ],\n    "reference_genome": "GRCh38",\n    "request_type": "LoadingPipelineRequest",\n    "sample_type": "WGS",\n    "skip_check_sex_and_relatedness": false,\n    "skip_expect_tdr_metrics": false,\n    "validations_to_skip": [\n        "validate_allele_depth_length",\n        "validate_allele_type",\n        "validate_expected_contig_frequency",\n        "validate_no_duplicate_variants",\n        "validate_sample_type"\n    ]\n}```',
         )
         with hfs.open(
             clickhouse_load_success_file_path(

From 7989e593212fe647a0b6a82ac604d837cdcabcaa Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister <bblanken@broadinstitute.org>
Date: Wed, 26 Nov 2025 00:34:41 -0500
Subject: [PATCH 10/17] fix arg

---
 .../update_variant_annotations_table_with_new_samples_test.py | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
index 50da39fc6..4755730dd 100644
--- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
+++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
@@ -271,7 +271,7 @@ def test_multiple_update_vat(
             sample_type=SampleType.WGS,
             callset_path=TEST_SNV_INDEL_VCF,
             project_guids=['R0113_test_project'],
-            skip_validation=False,
+            validations_to_skip=[],
             run_id=TEST_RUN_ID,
         )
         worker.add(uvatwns_task_3)
@@ -332,7 +332,7 @@ def test_multiple_update_vat(
             sample_type=SampleType.WGS,
             callset_path=TEST_SNV_INDEL_VCF,
             project_guids=['R0114_project4'],
-            skip_validation=False,
+            validations_to_skip=[],
             run_id=TEST_RUN_ID + '-another-run',
         )
         worker.add(uvatwns_task_4)

From 1d876e3d951714da2f71a6b00625a960adbd64a2 Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister <bblanken@broadinstitute.org>
Date: Wed, 26 Nov 2025 01:03:34 -0500
Subject: [PATCH 11/17] all validations

---
 v03_pipeline/lib/tasks/validate_callset.py | 12 +++++++++---
 1 file changed, 9 insertions(+), 3 deletions(-)

diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py
index 8ee302b2e..4d378fa46 100644
--- a/v03_pipeline/lib/tasks/validate_callset.py
+++ b/v03_pipeline/lib/tasks/validate_callset.py
@@ -3,6 +3,7 @@
 import luigi.util
 
 from v03_pipeline.lib.misc.validation import (
+    ALL_VALIDATIONS,
     SKIPPABLE_VALIDATIONS,
     SeqrValidationError,
 )
@@ -31,7 +32,8 @@ class ValidateCallsetTask(BaseUpdateTask):
     def validation_dependencies(self) -> dict[str, hl.Table]:
         deps = {}
         if (
-            'validate_sample_type' not in self.validations_to_skip
+            ALL_VALIDATIONS not in self.validations_to_skip
+            and 'validate_sample_type' not in self.validations_to_skip
             and self.dataset_type.can_run_validation
         ):
             deps['coding_and_noncoding_variants_ht'] = hl.read_table(
@@ -62,7 +64,8 @@ def output(self) -> luigi.Target:
     def requires(self) -> list[luigi.Task]:
         requirements = [self.clone(WriteImportedCallsetTask)]
         if (
-            'validate_sample_type' not in self.validations_to_skip
+            ALL_VALIDATIONS not in self.validations_to_skip
+            and 'validate_sample_type' not in self.validations_to_skip
             and self.dataset_type.can_run_validation
         ):
             requirements = [
@@ -101,7 +104,10 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
                 ),
             )
 
-        if not self.dataset_type.can_run_validation:
+        if (
+            ALL_VALIDATIONS in self.validations_to_skip
+            or not self.dataset_type.can_run_validation
+        ):
             return mt.select_globals(
                 callset_path=self.callset_path,
                 validated_sample_type=self.sample_type.value,

From 2d23d2ce414789b6dfb4ae1e75100862f32b221c Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister <bblanken@broadinstitute.org>
Date: Wed, 26 Nov 2025 01:30:39 -0500
Subject: [PATCH 12/17] ruff

---
 v03_pipeline/lib/tasks/validate_callset_test.py | 4 +---
 1 file changed, 1 insertion(+), 3 deletions(-)

diff --git a/v03_pipeline/lib/tasks/validate_callset_test.py b/v03_pipeline/lib/tasks/validate_callset_test.py
index 6340dfc6a..895ec1bb0 100644
--- a/v03_pipeline/lib/tasks/validate_callset_test.py
+++ b/v03_pipeline/lib/tasks/validate_callset_test.py
@@ -55,7 +55,6 @@ def test_validate_callset_multiple_exceptions(
             # all contigs but chr1, and contains non-coding variants.
             callset_path=MULTIPLE_VALIDATION_EXCEPTIONS_VCF,
             project_guids=['project_a'],
-            skip_validation=False,
             run_id=TEST_RUN_ID,
         )
         worker.add(validate_callset_task)
@@ -68,7 +67,6 @@ def test_validate_callset_multiple_exceptions(
             sample_type=SampleType.WES,
             callset_path=MULTIPLE_VALIDATION_EXCEPTIONS_VCF,
             project_guids=['project_a'],
-            skip_validation=False,
             run_id=TEST_RUN_ID,
         )
         self.assertTrue(write_validation_errors_task.complete())
@@ -79,8 +77,8 @@ def test_validate_callset_multiple_exceptions(
                     'project_guids': ['project_a'],
                     'error_messages': [
                         'Alleles with invalid allele <NON_REF> are present in the callset.  This appears to be a GVCF containing records for sites with no variants.',
-                        "Variants are present multiple times in the callset: ['1-902088-G-A']",
                         'Missing the following expected contigs:chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr20, chr21, chr22, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX',
+                        "Variants are present multiple times in the callset: ['1-902088-G-A']",
                         'Sample type validation error: dataset sample-type is specified as WES but appears to be WGS because it contains many common non-coding variants',
                     ],
                 },

From 48777c873bb71d48ac720f9a0e04360456125a53 Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister <bblanken@broadinstitute.org>
Date: Fri, 28 Nov 2025 13:21:08 -0500
Subject: [PATCH 13/17] ruff

---
 v03_pipeline/bin/pipeline_worker_test.py | 1 -
 1 file changed, 1 deletion(-)

diff --git a/v03_pipeline/bin/pipeline_worker_test.py b/v03_pipeline/bin/pipeline_worker_test.py
index c8ddf32a7..7e3341549 100644
--- a/v03_pipeline/bin/pipeline_worker_test.py
+++ b/v03_pipeline/bin/pipeline_worker_test.py
@@ -193,7 +193,6 @@ def test_process_queue_integration_test(
         process_queue(local_scheduler=True)
         mock_safe_post_to_slack.assert_called_once_with(
             ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n    "attempt_id": 0,\n    "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n    "dataset_type": "SNV_INDEL",\n    "project_guids": [\n        "R0113_test_project"\n    ],\n    "reference_genome": "GRCh38",\n    "request_type": "LoadingPipelineRequest",\n    "sample_type": "WGS",\n    "skip_check_sex_and_relatedness": false,\n    "skip_expect_tdr_metrics": false,\n    "validations_to_skip": [\n        "validate_allele_depth_length",\n        "validate_allele_type",\n        "validate_expected_contig_frequency",\n        "validate_no_duplicate_variants",\n        "validate_sample_type"\n    ]\n}```',
-
         )
         with hfs.open(
             clickhouse_load_success_file_path(

From 530c2126ff06147f1ee8319a7b6ffd56d72baa3b Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister <bblanken@broadinstitute.org>
Date: Fri, 28 Nov 2025 16:38:15 -0500
Subject: [PATCH 14/17] skippable validations

---
 ..._variant_annotations_table_with_new_samples_test.py | 10 ++++++++--
 1 file changed, 8 insertions(+), 2 deletions(-)

diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
index 4755730dd..01652f94f 100644
--- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
+++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
@@ -18,6 +18,7 @@
     SV_TYPES,
     TRANSCRIPT_CONSEQUENCE_TERMS,
 )
+from v03_pipeline.lib.misc.validation import SKIPPABLE_VALIDATIONS
 from v03_pipeline.lib.core import (
     DatasetType,
     ReferenceGenome,
@@ -132,8 +133,13 @@ def test_missing_interval_reference_dataset(
     )
     @patch('v03_pipeline.lib.tasks.update_new_variants_with_caids.Env')
     @patch(
-        'v03_pipeline.lib.tasks.validate_callset.validate_expected_contig_frequency',
-        partial(validate_expected_contig_frequency, min_rows_per_contig=25),
+        'v03_pipeline.lib.tasks.validate_callset.SKIPPABLE_VALIDATIONS',
+        [
+            x
+            for x in SKIPPABLE_VALIDATIONS
+            if x.__str__ != 'validate_expected_contig_frequency'
+        ]
+        + [partial(validate_expected_contig_frequency, min_rows_per_contig=25)],
     )
     @patch.object(ReferenceGenome, 'standard_contigs', new_callable=PropertyMock)
     @patch('v03_pipeline.lib.vep.hl.vep')

From 2093286920b6dd8b21db58bb60f1d1bb65da637c Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister <bblanken@broadinstitute.org>
Date: Fri, 28 Nov 2025 16:57:36 -0500
Subject: [PATCH 15/17] ruff

---
 .../update_variant_annotations_table_with_new_samples_test.py   | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
index 01652f94f..4e08c6869 100644
--- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
+++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
@@ -18,7 +18,6 @@
     SV_TYPES,
     TRANSCRIPT_CONSEQUENCE_TERMS,
 )
-from v03_pipeline.lib.misc.validation import SKIPPABLE_VALIDATIONS
 from v03_pipeline.lib.core import (
     DatasetType,
     ReferenceGenome,
@@ -27,6 +26,7 @@
 from v03_pipeline.lib.misc.io import remap_pedigree_hash
 from v03_pipeline.lib.misc.validation import (
     ALL_VALIDATIONS,
+    SKIPPABLE_VALIDATIONS,
     validate_expected_contig_frequency,
 )
 from v03_pipeline.lib.paths import (

From a26b1b908adacce5dae05f1faf6bb9fabdaae12c Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister <bblanken@broadinstitute.org>
Date: Mon, 1 Dec 2025 13:53:50 -0500
Subject: [PATCH 16/17] functools

---
 ...ariant_annotations_table_with_new_samples_test.py | 12 ++++++++++--
 1 file changed, 10 insertions(+), 2 deletions(-)

diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
index 4e08c6869..344aaac83 100644
--- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
+++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
@@ -1,5 +1,5 @@
+import functools
 import shutil
-from functools import partial
 from unittest.mock import Mock, PropertyMock, patch
 
 import hail as hl
@@ -139,7 +139,15 @@ def test_missing_interval_reference_dataset(
             for x in SKIPPABLE_VALIDATIONS
             if x.__str__ != 'validate_expected_contig_frequency'
         ]
-        + [partial(validate_expected_contig_frequency, min_rows_per_contig=25)],
+        + [
+            functools.update_wrapper(
+                functools.partial(
+                    validate_expected_contig_frequency,
+                    min_rows_per_contig=25,
+                ),
+                validate_expected_contig_frequency,
+            ),
+        ],
     )
     @patch.object(ReferenceGenome, 'standard_contigs', new_callable=PropertyMock)
     @patch('v03_pipeline.lib.vep.hl.vep')

From e5d0c4c3517a2fe6b783d82179311766e02a6e13 Mon Sep 17 00:00:00 2001
From: Benjamin Blankenmeister <bblanken@broadinstitute.org>
Date: Mon, 1 Dec 2025 14:55:03 -0500
Subject: [PATCH 17/17] cleaner mocks

---
 .../update_variant_annotations_table_with_new_samples_test.py   | 2 +-
 v03_pipeline/lib/tasks/validate_callset.py                      | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
index 344aaac83..c2b253bdb 100644
--- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
+++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py
@@ -137,7 +137,7 @@ def test_missing_interval_reference_dataset(
         [
             x
             for x in SKIPPABLE_VALIDATIONS
-            if x.__str__ != 'validate_expected_contig_frequency'
+            if x.__name__ != 'validate_expected_contig_frequency'
         ]
         + [
             functools.update_wrapper(
diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py
index 4d378fa46..2e91ec2a6 100644
--- a/v03_pipeline/lib/tasks/validate_callset.py
+++ b/v03_pipeline/lib/tasks/validate_callset.py
@@ -115,7 +115,7 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable:
         validation_exceptions = []
         for validation_f in SKIPPABLE_VALIDATIONS:
             try:
-                if str(validation_f.__name__) in self.validations_to_skip:
+                if validation_f.__name__ in self.validations_to_skip:
                     continue
                 validation_f(
                     mt,