From 18b6d52d19066cba282999acc78e785eec2a6402 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 25 Nov 2025 20:42:54 -0500 Subject: [PATCH 01/17] Integration test --- .github/workflows/unit-tests.yml | 1 - pyproject.toml | 3 + v03_pipeline/bin/pipeline_worker_test.py | 187 ++++++++- v03_pipeline/lib/misc/clickhouse.py | 2 + .../tasks/load_complete_run_to_clickhouse.py | 23 +- ...annotations_table_with_new_samples_test.py | 2 - .../var/test/test_clickhouse_schema.sql | 378 ++++++++++++++++++ 7 files changed, 581 insertions(+), 15 deletions(-) create mode 100644 v03_pipeline/var/test/test_clickhouse_schema.sql diff --git a/.github/workflows/unit-tests.yml b/.github/workflows/unit-tests.yml index f5267dc46..3c0327a5c 100644 --- a/.github/workflows/unit-tests.yml +++ b/.github/workflows/unit-tests.yml @@ -70,6 +70,5 @@ jobs: export CLICKHOUSE_OPTIMIZE_TABLE_WAIT_S=1 export PYSPARK_SUBMIT_ARGS='--driver-memory 8G pyspark-shell' export CLICKHOUSE_DATABASE=test - export LOCAL_DISK_MOUNT_PATH=. uv run nosetests --with-coverage --cover-package v03_pipeline v03_pipeline uv run coverage report --omit '*test*' --fail-under=90 diff --git a/pyproject.toml b/pyproject.toml index 568175864..cb704996a 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -102,6 +102,9 @@ inline-quotes = "single" '*clickhouse*' = [ 'S608' # unsafe sql ] +'*pipeline_worker*' = [ + 'S608' # unsafe sql +] [tool.ruff.pylint] max-args = 6 diff --git a/v03_pipeline/bin/pipeline_worker_test.py b/v03_pipeline/bin/pipeline_worker_test.py index 73cb5511a..96c7008d6 100644 --- a/v03_pipeline/bin/pipeline_worker_test.py +++ b/v03_pipeline/bin/pipeline_worker_test.py @@ -1,18 +1,34 @@ import json import os -from unittest.mock import patch +import shutil +from unittest.mock import Mock, patch +import hail as hl import luigi +import luigi.worker from v03_pipeline.bin.pipeline_worker import process_queue from v03_pipeline.lib.core import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.core.environment import Env +from v03_pipeline.lib.misc.clickhouse import ( + STAGING_CLICKHOUSE_DATABASE, + get_clickhouse_client, +) from v03_pipeline.lib.paths import ( + db_id_to_gene_id_path, loading_pipeline_deadletter_queue_dir, loading_pipeline_queue_dir, + clickhouse_load_success_file_path, +) +from v03_pipeline.lib.test.misc import copy_project_pedigree_to_mocked_dir +from v03_pipeline.lib.test.mocked_reference_datasets_testcase import ( + MockedReferenceDatasetsTestCase, ) -from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask -from v03_pipeline.lib.test.mocked_dataroot_testcase import MockedDatarootTestCase +from v03_pipeline.var.test.vep.mock_vep_data import MOCK_38_VEP_DATA +TEST_DB_ID_TO_GENE_ID = 'v03_pipeline/var/test/db_id_to_gene_id.csv.gz' +TEST_PEDIGREE_3_REMAP = 'v03_pipeline/var/test/pedigrees/test_pedigree_3_remap.tsv' +TEST_SCHEMA = 'v03_pipeline/var/test/test_clickhouse_schema.sql' TEST_VCF = 'v03_pipeline/var/test/callsets/1kg_30variants.vcf' @@ -25,25 +41,142 @@ def output(self): return luigi.LocalTarget('output.txt') -class PipelineWorkerTest(MockedDatarootTestCase): +class PipelineWorkerTest(MockedReferenceDatasetsTestCase): + def setUp(self): + super().setUp() + client = get_clickhouse_client() + client.execute( + f""" + DROP DATABASE IF EXISTS {STAGING_CLICKHOUSE_DATABASE}; + """, + ) + client.execute( + f""" + DROP DATABASE IF EXISTS {Env.CLICKHOUSE_DATABASE}; + """, + ) + client.execute( + f""" + CREATE DATABASE {Env.CLICKHOUSE_DATABASE}; + """, + ) + client = get_clickhouse_client(database=Env.CLICKHOUSE_DATABASE) + client.execute( + """ + CREATE DICTIONARY seqrdb_affected_status_dict + ( + `family_guid` String, + `sampleId` String, + `affected` String + ) + PRIMARY KEY family_guid, sampleId + SOURCE(NULL()) + LIFETIME(0) + LAYOUT(COMPLEX_KEY_HASHED()) + """, + ) + client.execute( + """ + CREATE DICTIONARY `GRCh38/SNV_INDEL/project_partitions_dict` + ( + `project_guid` String, + `n_partitions` UInt32 + ) + PRIMARY KEY project_guid + SOURCE(NULL()) + LIFETIME(0) + LAYOUT(COMPLEX_KEY_HASHED()) + """, + ) + with open(TEST_SCHEMA) as f: + sql = f.read() + commands = [cmd.strip() for cmd in sql.split(';') if cmd.strip()] + for cmd in commands: + client.execute(cmd) + client.execute( + f""" + CREATE DICTIONARY `GRCh38/SNV_INDEL/gt_stats_dict` + ( + `key` UInt32, + `ac_wes` UInt64, + `ac_wgs` UInt64, + `ac_affected` UInt64, + `hom_wes` UInt64, + `hom_wgs` UInt64, + `hom_affected` UInt64 + ) + PRIMARY KEY key + SOURCE( + CLICKHOUSE( + USER {Env.CLICKHOUSE_WRITER_USER} PASSWORD {Env.CLICKHOUSE_WRITER_PASSWORD} + DB {Env.CLICKHOUSE_DATABASE} TABLE `GRCh38/SNV_INDEL/gt_stats` + ) + ) + LIFETIME(0) + LAYOUT(FLAT(MAX_ARRAY_SIZE 1000000000)) + """, + ) + os.makedirs( + self.mock_env.LOADING_DATASETS_DIR, + exist_ok=True, + ) + shutil.copy2( + TEST_DB_ID_TO_GENE_ID, + db_id_to_gene_id_path(), + ) + + def tearDown(self): + super().tearDown() + client = get_clickhouse_client() + client.execute( + f""" + DROP DATABASE IF EXISTS {STAGING_CLICKHOUSE_DATABASE}; + """, + ) + client.execute( + f""" + DROP DATABASE IF EXISTS {Env.CLICKHOUSE_DATABASE}; + """, + ) + + @patch( + 'v03_pipeline.lib.tasks.write_new_variants_table.load_gencode_ensembl_to_refseq_id', + ) + @patch( + 'v03_pipeline.lib.tasks.update_new_variants_with_caids.register_alleles_in_chunks', + ) + @patch('v03_pipeline.lib.vep.hl.vep') @patch('v03_pipeline.lib.misc.slack._safe_post_to_slack') - @patch('v03_pipeline.api.request_handlers.WriteClickhouseLoadSuccessFileTask') @patch('v03_pipeline.bin.pipeline_worker.logger') - def test_process_queue( + def test_process_queue_integration_test( self, mock_logger, - mock_write_clickhouse_load_success_file_task, mock_safe_post_to_slack, + mock_vep: Mock, + mock_register_alleles: Mock, + mock_load_gencode_ensembl_to_refseq_id: Mock, ): + mock_load_gencode_ensembl_to_refseq_id.return_value = hl.dict( + {'ENST00000327044': 'NM_015658.4'}, + ) + mock_register_alleles.side_effect = None + mock_vep.side_effect = lambda ht, **_: ht.annotate(vep=MOCK_38_VEP_DATA) + copy_project_pedigree_to_mocked_dir( + TEST_PEDIGREE_3_REMAP, + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + SampleType.WGS, + 'R0113_test_project', + ) raw_request = { 'request_type': 'LoadingPipelineRequest', 'callset_path': TEST_VCF, - 'projects_to_run': ['project_a'], + 'projects_to_run': ['R0113_test_project'], 'sample_type': SampleType.WGS.value, 'reference_genome': ReferenceGenome.GRCh38.value, 'dataset_type': DatasetType.SNV_INDEL.value, + 'skip_validation': True, } - mock_write_clickhouse_load_success_file_task.return_value = MockCompleteTask() os.makedirs( loading_pipeline_queue_dir(), exist_ok=True, @@ -58,8 +191,42 @@ def test_process_queue( json.dump(raw_request, f) process_queue(local_scheduler=True) mock_safe_post_to_slack.assert_called_once_with( - ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n "attempt_id": 0,\n "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n "dataset_type": "SNV_INDEL",\n "project_guids": [\n "project_a"\n ],\n "reference_genome": "GRCh38",\n "request_type": "LoadingPipelineRequest",\n "sample_type": "WGS",\n "skip_check_sex_and_relatedness": false,\n "skip_expect_tdr_metrics": false,\n "skip_validation": false\n}```', + ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n "attempt_id": 0,\n "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n "dataset_type": "SNV_INDEL",\n "project_guids": [\n "R0113_test_project"\n ],\n "reference_genome": "GRCh38",\n "request_type": "LoadingPipelineRequest",\n "sample_type": "WGS",\n "skip_check_sex_and_relatedness": false,\n "skip_expect_tdr_metrics": false,\n "skip_validation": true\n}```', ) + with open( + clickhouse_load_success_file( + ReferenceGenome.GRCh38, + DatasetType.SNV_INDEL, + 'request_20250916-200704-123456', + ) + ) as f: + self.assertEqual(f.read(), '') + + client = get_clickhouse_client() + annotations_count = client.execute( + f""" + SELECT COUNT(*) + FROM + {Env.CLICKHOUSE_DATABASE}.`GRCh38/SNV_INDEL/annotations_memory` + """, + )[0][0] + self.assertEqual(annotations_count, 30) + entries_count = client.execute( + f""" + SELECT COUNT(*) + FROM + {Env.CLICKHOUSE_DATABASE}.`GRCh38/SNV_INDEL/entries` + """, + )[0][0] + self.assertEqual(entries_count, 16) + ac_wgs = client.execute( + f""" + SELECT sum(ac_wgs) + FROM + {Env.CLICKHOUSE_DATABASE}.`GRCh38/SNV_INDEL/gt_stats_dict` + """, + )[0][0] + self.assertEqual(ac_wgs, 69) @patch('v03_pipeline.lib.misc.slack._safe_post_to_slack') @patch('v03_pipeline.api.request_handlers.WriteClickhouseLoadSuccessFileTask') diff --git a/v03_pipeline/lib/misc/clickhouse.py b/v03_pipeline/lib/misc/clickhouse.py index 751c9dafb..6aabe5c44 100644 --- a/v03_pipeline/lib/misc/clickhouse.py +++ b/v03_pipeline/lib/misc/clickhouse.py @@ -965,12 +965,14 @@ def rebuild_gt_stats( def get_clickhouse_client( timeout: int | None = None, + database: str | None = None, ) -> Client: return Client( host=Env.CLICKHOUSE_SERVICE_HOSTNAME, port=Env.CLICKHOUSE_SERVICE_PORT, user=Env.CLICKHOUSE_WRITER_USER, password=Env.CLICKHOUSE_WRITER_PASSWORD, + **{'database': database} if database else {}, **{'send_receive_timeout': timeout} if timeout else {}, **{ 'settings': { diff --git a/v03_pipeline/lib/tasks/load_complete_run_to_clickhouse.py b/v03_pipeline/lib/tasks/load_complete_run_to_clickhouse.py index cc4140b3a..ca1af75dc 100644 --- a/v03_pipeline/lib/tasks/load_complete_run_to_clickhouse.py +++ b/v03_pipeline/lib/tasks/load_complete_run_to_clickhouse.py @@ -10,7 +10,7 @@ load_complete_run, logged_query, ) -from v03_pipeline.lib.paths import metadata_for_run_path +from v03_pipeline.lib.paths import metadata_for_run_path, pipeline_run_success_file_path from v03_pipeline.lib.tasks.base.base_loading_run_params import ( BaseLoadingRunParams, ) @@ -25,6 +25,14 @@ def requires(self) -> luigi.Task: return [self.clone(WriteSuccessFileTask)] def complete(self): + if not hfs.exists( + pipeline_run_success_file_path( + self.reference_genome, + self.dataset_type, + self.run_id, + ), + ): + return False table_name_builder = TableNameBuilder( self.reference_genome, self.dataset_type, @@ -35,7 +43,7 @@ def complete(self): SELECT max(key) FROM {table_name_builder.src_table(ClickHouseTable.ANNOTATIONS_MEMORY)} """, )[0][0] - return logged_query( + exists_in_annotations = logged_query( f""" SELECT EXISTS ( SELECT 1 @@ -45,6 +53,17 @@ def complete(self): """, {'max_key_src': max_key_src}, )[0][0] + exists_in_gt_stats = logged_query( + f""" + SELECT EXISTS ( + SELECT 1 + FROM {table_name_builder.dst_table(ClickHouseTable.GT_STATS)} + WHERE key = %(max_key_src)s + ); + """, + {'max_key_src': max_key_src}, + )[0][0] + return exists_in_annotations and exists_in_gt_stats def run(self): with hfs.open( diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index c6140448c..9a03d75cc 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -207,7 +207,6 @@ def test_multiple_update_vat( [], ), # for the second call, there are no new variants, return empty iterator ] - mock_standard_contigs.return_value = {'chr1'} # This creates a mock validation table with 1 coding and 1 non-coding variant # explicitly chosen from the VCF. @@ -248,7 +247,6 @@ def test_multiple_update_vat( ), ), ) - coding_and_noncoding_variants_ht.write( valid_reference_dataset_path( ReferenceGenome.GRCh38, diff --git a/v03_pipeline/var/test/test_clickhouse_schema.sql b/v03_pipeline/var/test/test_clickhouse_schema.sql new file mode 100644 index 000000000..ca165199b --- /dev/null +++ b/v03_pipeline/var/test/test_clickhouse_schema.sql @@ -0,0 +1,378 @@ +CREATE TABLE `GRCh38/SNV_INDEL/annotations_disk` +( + `key` UInt32 CODEC(Delta(8), ZSTD(1)), + `xpos` UInt64, + `chrom` Enum8('1' = 1, '2' = 2, '3' = 3, '4' = 4, '5' = 5, '6' = 6, '7' = 7, '8' = 8, '9' = 9, '10' = 10, '11' = 11, '12' = 12, '13' = 13, '14' = 14, '15' = 15, '16' = 16, '17' = 17, '18' = 18, '19' = 19, '20' = 20, '21' = 21, '22' = 22, 'X' = 23, 'Y' = 24, 'M' = 25), + `pos` UInt32, + `ref` String, + `alt` String, + `variantId` String, + `rsid` Nullable(String), + `CAID` Nullable(String), + `liftedOverChrom` Nullable(Enum8('1' = 1, '2' = 2, '3' = 3, '4' = 4, '5' = 5, '6' = 6, '7' = 7, '8' = 8, '9' = 9, '10' = 10, '11' = 11, '12' = 12, '13' = 13, '14' = 14, '15' = 15, '16' = 16, '17' = 17, '18' = 18, '19' = 19, '20' = 20, '21' = 21, '22' = 22, 'X' = 23, 'Y' = 24, 'M' = 25)), + `liftedOverPos` Nullable(UInt32), + `hgmd` Tuple( + accession Nullable(String), + classification Nullable(Enum8('DM' = 0, 'DM?' = 1, 'DP' = 2, 'DFP' = 3, 'FP' = 4, 'R' = 5))), + `screenRegionType` Nullable(Enum8('CTCF-bound' = 0, 'CTCF-only' = 1, 'DNase-H3K4me3' = 2, 'PLS' = 3, 'dELS' = 4, 'pELS' = 5, 'DNase-only' = 6, 'low-DNase' = 7)), + `predictions` Tuple( + cadd Nullable(Decimal(9, 5)), + eigen Nullable(Decimal(9, 5)), + fathmm Nullable(Decimal(9, 5)), + gnomad_noncoding Nullable(Decimal(9, 5)), + mpc Nullable(Decimal(9, 5)), + mut_pred Nullable(Decimal(9, 5)), + mut_taster Nullable(Enum8('D' = 0, 'A' = 1, 'N' = 2, 'P' = 3)), + polyphen Nullable(Decimal(9, 5)), + primate_ai Nullable(Decimal(9, 5)), + revel Nullable(Decimal(9, 5)), + sift Nullable(Decimal(9, 5)), + splice_ai Nullable(Decimal(9, 5)), + splice_ai_consequence Nullable(Enum8('Acceptor gain' = 0, 'Acceptor loss' = 1, 'Donor gain' = 2, 'Donor loss' = 3, 'No consequence' = 4)), + vest Nullable(Decimal(9, 5))), + `populations` Tuple( + exac Tuple( + ac UInt32, + af Decimal(9, 8), + an UInt32, + filter_af Decimal(9, 8), + hemi UInt32, + het UInt32, + hom UInt32), + gnomad_exomes Tuple( + ac UInt32, + af Decimal(9, 8), + an UInt32, + filter_af Decimal(9, 8), + hemi UInt32, + hom UInt32), + gnomad_genomes Tuple( + ac UInt32, + af Decimal(9, 8), + an UInt32, + filter_af Decimal(9, 8), + hemi UInt32, + hom UInt32), + topmed Tuple( + ac UInt32, + af Decimal(9, 8), + an UInt32, + het UInt32, + hom UInt32)), + `sortedTranscriptConsequences` Nested(alphamissensePathogenicity Nullable(Decimal(9, 5)), canonical Nullable(UInt8), consequenceTerms Array(Nullable(Enum8('transcript_ablation' = 1, 'splice_acceptor_variant' = 2, 'splice_donor_variant' = 3, 'stop_gained' = 4, 'frameshift_variant' = 5, 'stop_lost' = 6, 'start_lost' = 7, 'inframe_insertion' = 8, 'inframe_deletion' = 9, 'missense_variant' = 10, 'protein_altering_variant' = 11, 'splice_donor_5th_base_variant' = 12, 'splice_region_variant' = 13, 'splice_donor_region_variant' = 14, 'splice_polypyrimidine_tract_variant' = 15, 'incomplete_terminal_codon_variant' = 16, 'start_retained_variant' = 17, 'stop_retained_variant' = 18, 'synonymous_variant' = 19, 'coding_sequence_variant' = 20, 'mature_miRNA_variant' = 21, '5_prime_UTR_variant' = 22, '3_prime_UTR_variant' = 23, 'non_coding_transcript_exon_variant' = 24, 'intron_variant' = 25, 'NMD_transcript_variant' = 26, 'non_coding_transcript_variant' = 27, 'coding_transcript_variant' = 28, 'upstream_gene_variant' = 29, 'downstream_gene_variant' = 30, 'intergenic_variant' = 31, 'sequence_variant' = 32))), extendedIntronicSpliceRegionVariant Nullable(Bool), fiveutrConsequence Nullable(Enum8('5_prime_UTR_premature_start_codon_gain_variant' = 1, '5_prime_UTR_premature_start_codon_loss_variant' = 2, '5_prime_UTR_stop_codon_gain_variant' = 3, '5_prime_UTR_stop_codon_loss_variant' = 4, '5_prime_UTR_uORF_frameshift_variant' = 5)), geneId Nullable(String)), + `sortedMotifFeatureConsequences` Nested(consequenceTerms Array(Nullable(Enum8('TFBS_ablation' = 0, 'TFBS_amplification' = 1, 'TF_binding_site_variant' = 2, 'TFBS_fusion' = 3, 'TFBS_translocation' = 4))), motifFeatureId Nullable(String)), + `sortedRegulatoryFeatureConsequences` Nested(biotype Nullable(Enum8('enhancer' = 0, 'promoter' = 1, 'CTCF_binding_site' = 2, 'TF_binding_site' = 3, 'open_chromatin_region' = 4)), consequenceTerms Array(Nullable(Enum8('regulatory_region_ablation' = 0, 'regulatory_region_amplification' = 1, 'regulatory_region_variant' = 2, 'regulatory_region_fusion' = 3))), regulatoryFeatureId Nullable(String)) +) +ENGINE = EmbeddedRocksDB() +PRIMARY KEY key; + +CREATE TABLE `GRCh38/SNV_INDEL/annotations_memory` +( + `key` UInt32 CODEC(Delta(8), ZSTD(1)), + `xpos` UInt64, + `chrom` Enum8('1' = 1, '2' = 2, '3' = 3, '4' = 4, '5' = 5, '6' = 6, '7' = 7, '8' = 8, '9' = 9, '10' = 10, '11' = 11, '12' = 12, '13' = 13, '14' = 14, '15' = 15, '16' = 16, '17' = 17, '18' = 18, '19' = 19, '20' = 20, '21' = 21, '22' = 22, 'X' = 23, 'Y' = 24, 'M' = 25), + `pos` UInt32, + `ref` String, + `alt` String, + `variantId` String, + `rsid` Nullable(String), + `CAID` Nullable(String), + `liftedOverChrom` Nullable(Enum8('1' = 1, '2' = 2, '3' = 3, '4' = 4, '5' = 5, '6' = 6, '7' = 7, '8' = 8, '9' = 9, '10' = 10, '11' = 11, '12' = 12, '13' = 13, '14' = 14, '15' = 15, '16' = 16, '17' = 17, '18' = 18, '19' = 19, '20' = 20, '21' = 21, '22' = 22, 'X' = 23, 'Y' = 24, 'M' = 25)), + `liftedOverPos` Nullable(UInt32), + `hgmd` Tuple( + accession Nullable(String), + classification Nullable(Enum8('DM' = 0, 'DM?' = 1, 'DP' = 2, 'DFP' = 3, 'FP' = 4, 'R' = 5))), + `screenRegionType` Nullable(Enum8('CTCF-bound' = 0, 'CTCF-only' = 1, 'DNase-H3K4me3' = 2, 'PLS' = 3, 'dELS' = 4, 'pELS' = 5, 'DNase-only' = 6, 'low-DNase' = 7)), + `predictions` Tuple( + cadd Nullable(Decimal(9, 5)), + eigen Nullable(Decimal(9, 5)), + fathmm Nullable(Decimal(9, 5)), + gnomad_noncoding Nullable(Decimal(9, 5)), + mpc Nullable(Decimal(9, 5)), + mut_pred Nullable(Decimal(9, 5)), + mut_taster Nullable(Enum8('D' = 0, 'A' = 1, 'N' = 2, 'P' = 3)), + polyphen Nullable(Decimal(9, 5)), + primate_ai Nullable(Decimal(9, 5)), + revel Nullable(Decimal(9, 5)), + sift Nullable(Decimal(9, 5)), + splice_ai Nullable(Decimal(9, 5)), + splice_ai_consequence Nullable(Enum8('Acceptor gain' = 0, 'Acceptor loss' = 1, 'Donor gain' = 2, 'Donor loss' = 3, 'No consequence' = 4)), + vest Nullable(Decimal(9, 5))), + `populations` Tuple( + exac Tuple( + ac UInt32, + af Decimal(9, 8), + an UInt32, + filter_af Decimal(9, 8), + hemi UInt32, + het UInt32, + hom UInt32), + gnomad_exomes Tuple( + ac UInt32, + af Decimal(9, 8), + an UInt32, + filter_af Decimal(9, 8), + hemi UInt32, + hom UInt32), + gnomad_genomes Tuple( + ac UInt32, + af Decimal(9, 8), + an UInt32, + filter_af Decimal(9, 8), + hemi UInt32, + hom UInt32), + topmed Tuple( + ac UInt32, + af Decimal(9, 8), + an UInt32, + het UInt32, + hom UInt32)), + `sortedTranscriptConsequences` Nested(alphamissensePathogenicity Nullable(Decimal(9, 5)), canonical Nullable(UInt8), consequenceTerms Array(Nullable(Enum8('transcript_ablation' = 1, 'splice_acceptor_variant' = 2, 'splice_donor_variant' = 3, 'stop_gained' = 4, 'frameshift_variant' = 5, 'stop_lost' = 6, 'start_lost' = 7, 'inframe_insertion' = 8, 'inframe_deletion' = 9, 'missense_variant' = 10, 'protein_altering_variant' = 11, 'splice_donor_5th_base_variant' = 12, 'splice_region_variant' = 13, 'splice_donor_region_variant' = 14, 'splice_polypyrimidine_tract_variant' = 15, 'incomplete_terminal_codon_variant' = 16, 'start_retained_variant' = 17, 'stop_retained_variant' = 18, 'synonymous_variant' = 19, 'coding_sequence_variant' = 20, 'mature_miRNA_variant' = 21, '5_prime_UTR_variant' = 22, '3_prime_UTR_variant' = 23, 'non_coding_transcript_exon_variant' = 24, 'intron_variant' = 25, 'NMD_transcript_variant' = 26, 'non_coding_transcript_variant' = 27, 'coding_transcript_variant' = 28, 'upstream_gene_variant' = 29, 'downstream_gene_variant' = 30, 'intergenic_variant' = 31, 'sequence_variant' = 32))), extendedIntronicSpliceRegionVariant Nullable(Bool), fiveutrConsequence Nullable(Enum8('5_prime_UTR_premature_start_codon_gain_variant' = 1, '5_prime_UTR_premature_start_codon_loss_variant' = 2, '5_prime_UTR_stop_codon_gain_variant' = 3, '5_prime_UTR_stop_codon_loss_variant' = 4, '5_prime_UTR_uORF_frameshift_variant' = 5)), geneId Nullable(String)), + `sortedMotifFeatureConsequences` Nested(consequenceTerms Array(Nullable(Enum8('TFBS_ablation' = 0, 'TFBS_amplification' = 1, 'TF_binding_site_variant' = 2, 'TFBS_fusion' = 3, 'TFBS_translocation' = 4))), motifFeatureId Nullable(String)), + `sortedRegulatoryFeatureConsequences` Nested(biotype Nullable(Enum8('enhancer' = 0, 'promoter' = 1, 'CTCF_binding_site' = 2, 'TF_binding_site' = 3, 'open_chromatin_region' = 4)), consequenceTerms Array(Nullable(Enum8('regulatory_region_ablation' = 0, 'regulatory_region_amplification' = 1, 'regulatory_region_variant' = 2, 'regulatory_region_fusion' = 3))), regulatoryFeatureId Nullable(String)) +) +ENGINE = EmbeddedRocksDB() +PRIMARY KEY key; + +CREATE TABLE `GRCh38/SNV_INDEL/transcripts` +( + `key` UInt32 CODEC(Delta(8), ZSTD(1)), + `transcripts` Nested(alphamissense Tuple( + pathogenicity Nullable(Decimal(9, 5))), aminoAcids Nullable(String), biotype Nullable(String), canonical Nullable(UInt8), codons Nullable(String), consequenceTerms Array(Nullable(Enum8('transcript_ablation' = 1, 'splice_acceptor_variant' = 2, 'splice_donor_variant' = 3, 'stop_gained' = 4, 'frameshift_variant' = 5, 'stop_lost' = 6, 'start_lost' = 7, 'inframe_insertion' = 8, 'inframe_deletion' = 9, 'missense_variant' = 10, 'protein_altering_variant' = 11, 'splice_donor_5th_base_variant' = 12, 'splice_region_variant' = 13, 'splice_donor_region_variant' = 14, 'splice_polypyrimidine_tract_variant' = 15, 'incomplete_terminal_codon_variant' = 16, 'start_retained_variant' = 17, 'stop_retained_variant' = 18, 'synonymous_variant' = 19, 'coding_sequence_variant' = 20, 'mature_miRNA_variant' = 21, '5_prime_UTR_variant' = 22, '3_prime_UTR_variant' = 23, 'non_coding_transcript_exon_variant' = 24, 'intron_variant' = 25, 'NMD_transcript_variant' = 26, 'non_coding_transcript_variant' = 27, 'coding_transcript_variant' = 28, 'upstream_gene_variant' = 29, 'downstream_gene_variant' = 30, 'intergenic_variant' = 31, 'sequence_variant' = 32))), exon Tuple( + index Nullable(Int32), + total Nullable(Int32)), geneId Nullable(String), hgvsc Nullable(String), hgvsp Nullable(String), intron Tuple( + index Nullable(Int32), + total Nullable(Int32)), loftee Tuple( + isLofNagnag Nullable(Bool), + lofFilters Array(Nullable(String))), majorConsequence Nullable(Enum8('transcript_ablation' = 1, 'splice_acceptor_variant' = 2, 'splice_donor_variant' = 3, 'stop_gained' = 4, 'frameshift_variant' = 5, 'stop_lost' = 6, 'start_lost' = 7, 'inframe_insertion' = 8, 'inframe_deletion' = 9, 'missense_variant' = 10, 'protein_altering_variant' = 11, 'splice_donor_5th_base_variant' = 12, 'splice_region_variant' = 13, 'splice_donor_region_variant' = 14, 'splice_polypyrimidine_tract_variant' = 15, 'incomplete_terminal_codon_variant' = 16, 'start_retained_variant' = 17, 'stop_retained_variant' = 18, 'synonymous_variant' = 19, 'coding_sequence_variant' = 20, 'mature_miRNA_variant' = 21, '5_prime_UTR_variant' = 22, '3_prime_UTR_variant' = 23, 'non_coding_transcript_exon_variant' = 24, 'intron_variant' = 25, 'NMD_transcript_variant' = 26, 'non_coding_transcript_variant' = 27, 'coding_transcript_variant' = 28, 'upstream_gene_variant' = 29, 'downstream_gene_variant' = 30, 'intergenic_variant' = 31, 'sequence_variant' = 32)), manePlusClinical Nullable(String), maneSelect Nullable(String), refseqTranscriptId Nullable(String), spliceregion Tuple( + extended_intronic_splice_region_variant Nullable(Bool)), transcriptId String, transcriptRank UInt8, utrannotator Tuple( + existingInframeOorfs Nullable(Int32), + existingOutofframeOorfs Nullable(Int32), + existingUorfs Nullable(Int32), + fiveutrAnnotation Tuple( + AltStop Nullable(String), + AltStopDistanceToCDS Nullable(Int32), + CapDistanceToStart Nullable(Int32), + DistanceToCDS Nullable(Int32), + DistanceToStop Nullable(Int32), + Evidence Nullable(Bool), + FrameWithCDS Nullable(String), + KozakContext Nullable(String), + KozakStrength Nullable(String), + StartDistanceToCDS Nullable(Int32), + alt_type Nullable(String), + alt_type_length Nullable(Int32), + newSTOPDistanceToCDS Nullable(Int32), + ref_StartDistanceToCDS Nullable(Int32), + ref_type Nullable(String), + ref_type_length Nullable(Int32), + type Nullable(String)), + fiveutrConsequence Nullable(String))) +) +ENGINE = EmbeddedRocksDB() +PRIMARY KEY key; + +CREATE TABLE `GRCh38/SNV_INDEL/entries` +( + `key` UInt32 CODEC(Delta(8), ZSTD(1)), + `project_guid` LowCardinality(String), + `family_guid` String, + `sample_type` Enum8('WES' = 1, 'WGS' = 2), + `xpos` UInt64 CODEC(Delta(8), ZSTD(1)), + `is_gnomad_gt_5_percent` Bool, + `is_annotated_in_any_gene` Bool, + `geneId_ids` Array(UInt32), + `filters` Array(LowCardinality(String)), + `calls` Array(Tuple( + sampleId String, + gt Nullable(Enum8('REF' = 0, 'HET' = 1, 'HOM' = 2)), + gq Nullable(UInt8), + ab Nullable(Decimal(9, 5)), + dp Nullable(UInt16))), + `sign` Int8, + `n_partitions` UInt8 MATERIALIZED dictGetOrDefault('GRCh38/SNV_INDEL/project_partitions_dict', 'n_partitions', project_guid, 1), + `partition_id` UInt8 MATERIALIZED farmHash64(family_guid) % n_partitions, + PROJECTION xpos_projection + ( + SELECT * + ORDER BY + is_gnomad_gt_5_percent, + is_annotated_in_any_gene, + xpos + ) +) +ENGINE = CollapsingMergeTree(sign) +PARTITION BY (project_guid, partition_id) +ORDER BY (project_guid, family_guid, sample_type, is_gnomad_gt_5_percent, is_annotated_in_any_gene, key) +SETTINGS deduplicate_merge_projection_mode = 'rebuild', index_granularity = 8192; + +CREATE TABLE `GRCh38/SNV_INDEL/gt_stats` +( + `key` UInt32 CODEC(Delta(8), ZSTD(1)), + `ac_wes` UInt32, + `ac_wgs` UInt32, + `ac_affected` UInt32 DEFAULT 0, + `hom_wes` UInt32, + `hom_wgs` UInt32, + `hom_affected` UInt32 DEFAULT 0 +) +ENGINE = SummingMergeTree +ORDER BY key +SETTINGS index_granularity = 8192; + +CREATE TABLE `GRCh38/SNV_INDEL/key_lookup` +( + `variantId` String, + `key` UInt32 CODEC(Delta(8), ZSTD(1)) +) +ENGINE = EmbeddedRocksDB() +PRIMARY KEY variantId; + +CREATE TABLE `GRCh38/SNV_INDEL/project_gt_stats` +( + `project_guid` LowCardinality(String), + `key` UInt32 CODEC(Delta(8), ZSTD(1)), + `sample_type` Enum8('WES' = 1, 'WGS' = 2), + `affected` Enum8('A' = 1, 'N' = 2, 'U' = 3) DEFAULT 'U', + `ref_samples` UInt32, + `het_samples` UInt32, + `hom_samples` UInt32 +) +ENGINE = SummingMergeTree +PARTITION BY project_guid +ORDER BY (project_guid, key, sample_type) +SETTINGS index_granularity = 8192; + +CREATE TABLE `GRCh38/SNV_INDEL/reference_data/clinvar` +( + `key` UInt32 CODEC(Delta(8), ZSTD(1)), + `alleleId` Nullable(UInt32), + `conflictingPathogenicities` Nested(pathogenicity Enum8('Pathogenic' = 0, 'Pathogenic/Likely_pathogenic' = 1, 'Pathogenic/Likely_pathogenic/Established_risk_allele' = 2, 'Pathogenic/Likely_pathogenic/Likely_risk_allele' = 3, 'Pathogenic/Likely_risk_allele' = 4, 'Likely_pathogenic' = 5, 'Likely_pathogenic/Likely_risk_allele' = 6, 'Established_risk_allele' = 7, 'Likely_risk_allele' = 8, 'Conflicting_classifications_of_pathogenicity' = 9, 'Uncertain_risk_allele' = 10, 'Uncertain_significance/Uncertain_risk_allele' = 11, 'Uncertain_significance' = 12, 'No_pathogenic_assertion' = 13, 'Likely_benign' = 14, 'Benign/Likely_benign' = 15, 'Benign' = 16), count UInt16), + `goldStars` Nullable(UInt8), + `submitters` Array(String), + `conditions` Array(String), + `assertions` Array(Enum8('Affects' = 0, 'association' = 1, 'association_not_found' = 2, 'confers_sensitivity' = 3, 'drug_response' = 4, 'low_penetrance' = 5, 'not_provided' = 6, 'other' = 7, 'protective' = 8, 'risk_factor' = 9, 'no_classification_for_the_single_variant' = 10, 'no_classifications_from_unflagged_records' = 11)), + `pathogenicity` Enum8('Pathogenic' = 0, 'Pathogenic/Likely_pathogenic' = 1, 'Pathogenic/Likely_pathogenic/Established_risk_allele' = 2, 'Pathogenic/Likely_pathogenic/Likely_risk_allele' = 3, 'Pathogenic/Likely_risk_allele' = 4, 'Likely_pathogenic' = 5, 'Likely_pathogenic/Likely_risk_allele' = 6, 'Established_risk_allele' = 7, 'Likely_risk_allele' = 8, 'Conflicting_classifications_of_pathogenicity' = 9, 'Uncertain_risk_allele' = 10, 'Uncertain_significance/Uncertain_risk_allele' = 11, 'Uncertain_significance' = 12, 'No_pathogenic_assertion' = 13, 'Likely_benign' = 14, 'Benign/Likely_benign' = 15, 'Benign' = 16) +) +ENGINE = Join(`ALL`, LEFT, key) +SETTINGS join_use_nulls = 1; + +CREATE TABLE `GRCh38/SNV_INDEL/reference_data/clinvar/all_variants` +( + `version` Date, + `variantId` String, + `alleleId` Nullable(UInt32), + `conflictingPathogenicities` Nested(pathogenicity Enum8('Pathogenic' = 0, 'Pathogenic/Likely_pathogenic' = 1, 'Pathogenic/Likely_pathogenic/Established_risk_allele' = 2, 'Pathogenic/Likely_pathogenic/Likely_risk_allele' = 3, 'Pathogenic/Likely_risk_allele' = 4, 'Likely_pathogenic' = 5, 'Likely_pathogenic/Likely_risk_allele' = 6, 'Established_risk_allele' = 7, 'Likely_risk_allele' = 8, 'Conflicting_classifications_of_pathogenicity' = 9, 'Uncertain_risk_allele' = 10, 'Uncertain_significance/Uncertain_risk_allele' = 11, 'Uncertain_significance' = 12, 'No_pathogenic_assertion' = 13, 'Likely_benign' = 14, 'Benign/Likely_benign' = 15, 'Benign' = 16), count UInt16), + `goldStars` Nullable(UInt8), + `submitters` Array(String), + `conditions` Array(String), + `assertions` Array(Enum8('Affects' = 0, 'association' = 1, 'association_not_found' = 2, 'confers_sensitivity' = 3, 'drug_response' = 4, 'low_penetrance' = 5, 'not_provided' = 6, 'other' = 7, 'protective' = 8, 'risk_factor' = 9, 'no_classification_for_the_single_variant' = 10, 'no_classifications_from_unflagged_records' = 11)), + `pathogenicity` Enum8('Pathogenic' = 0, 'Pathogenic/Likely_pathogenic' = 1, 'Pathogenic/Likely_pathogenic/Established_risk_allele' = 2, 'Pathogenic/Likely_pathogenic/Likely_risk_allele' = 3, 'Pathogenic/Likely_risk_allele' = 4, 'Likely_pathogenic' = 5, 'Likely_pathogenic/Likely_risk_allele' = 6, 'Established_risk_allele' = 7, 'Likely_risk_allele' = 8, 'Conflicting_classifications_of_pathogenicity' = 9, 'Uncertain_risk_allele' = 10, 'Uncertain_significance/Uncertain_risk_allele' = 11, 'Uncertain_significance' = 12, 'No_pathogenic_assertion' = 13, 'Likely_benign' = 14, 'Benign/Likely_benign' = 15, 'Benign' = 16) +) +ENGINE = MergeTree +PARTITION BY version +PRIMARY KEY (version, variantId) +ORDER BY (version, variantId) +SETTINGS index_granularity = 8192; + +CREATE TABLE `GRCh38/SNV_INDEL/reference_data/clinvar/seqr_variants` +( + `key` UInt32 CODEC(Delta(8), ZSTD(1)), + `alleleId` Nullable(UInt32), + `conflictingPathogenicities` Nested(pathogenicity Enum8('Pathogenic' = 0, 'Pathogenic/Likely_pathogenic' = 1, 'Pathogenic/Likely_pathogenic/Established_risk_allele' = 2, 'Pathogenic/Likely_pathogenic/Likely_risk_allele' = 3, 'Pathogenic/Likely_risk_allele' = 4, 'Likely_pathogenic' = 5, 'Likely_pathogenic/Likely_risk_allele' = 6, 'Established_risk_allele' = 7, 'Likely_risk_allele' = 8, 'Conflicting_classifications_of_pathogenicity' = 9, 'Uncertain_risk_allele' = 10, 'Uncertain_significance/Uncertain_risk_allele' = 11, 'Uncertain_significance' = 12, 'No_pathogenic_assertion' = 13, 'Likely_benign' = 14, 'Benign/Likely_benign' = 15, 'Benign' = 16), count UInt16), + `goldStars` Nullable(UInt8), + `submitters` Array(String), + `conditions` Array(String), + `assertions` Array(Enum8('Affects' = 0, 'association' = 1, 'association_not_found' = 2, 'confers_sensitivity' = 3, 'drug_response' = 4, 'low_penetrance' = 5, 'not_provided' = 6, 'other' = 7, 'protective' = 8, 'risk_factor' = 9, 'no_classification_for_the_single_variant' = 10, 'no_classifications_from_unflagged_records' = 11)), + `pathogenicity` Enum8('Pathogenic' = 0, 'Pathogenic/Likely_pathogenic' = 1, 'Pathogenic/Likely_pathogenic/Established_risk_allele' = 2, 'Pathogenic/Likely_pathogenic/Likely_risk_allele' = 3, 'Pathogenic/Likely_risk_allele' = 4, 'Likely_pathogenic' = 5, 'Likely_pathogenic/Likely_risk_allele' = 6, 'Established_risk_allele' = 7, 'Likely_risk_allele' = 8, 'Conflicting_classifications_of_pathogenicity' = 9, 'Uncertain_risk_allele' = 10, 'Uncertain_significance/Uncertain_risk_allele' = 11, 'Uncertain_significance' = 12, 'No_pathogenic_assertion' = 13, 'Likely_benign' = 14, 'Benign/Likely_benign' = 15, 'Benign' = 16) +) +ENGINE = MergeTree +PRIMARY KEY key +ORDER BY key +SETTINGS index_granularity = 8192; + +CREATE MATERIALIZED VIEW `GRCh38/SNV_INDEL/entries_to_project_gt_stats_mv` TO `GRCh38/SNV_INDEL/project_gt_stats` +( + `project_guid` LowCardinality(String), + `key` UInt32, + `sample_type` Enum8('WES' = 1, 'WGS' = 2), + `affected` String, + `ref_samples` Int64, + `het_samples` Int64, + `hom_samples` Int64 +) +AS SELECT + project_guid, + key, + sample_type, + dictGetOrDefault('seqrdb_affected_status_dict', 'affected', (family_guid, calls.sampleId), 'U') AS affected, + sumIf(sign, calls.gt = 'REF') AS ref_samples, + sumIf(sign, calls.gt = 'HET') AS het_samples, + sumIf(sign, calls.gt = 'HOM') AS hom_samples +FROM `GRCh38/SNV_INDEL/entries` +ARRAY JOIN calls +GROUP BY + project_guid, + key, + sample_type, + affected; + +CREATE MATERIALIZED VIEW `GRCh38/SNV_INDEL/project_gt_stats_to_gt_stats_mv` +REFRESH EVERY 10 YEAR TO `GRCh38/SNV_INDEL/gt_stats` +( + `key` UInt32, + `ac_wes` UInt64, + `ac_wgs` UInt64, + `ac_affected` UInt64, + `hom_wes` UInt64, + `hom_wgs` UInt64, + `hom_affected` UInt64 +) +AS SELECT + key, + sumIf((het_samples * 1) + (hom_samples * 2), sample_type = 'WES') AS ac_wes, + sumIf((het_samples * 1) + (hom_samples * 2), sample_type = 'WGS') AS ac_wgs, + sumIf((het_samples * 1) + (hom_samples * 2), affected = 'A') AS ac_affected, + sumIf(hom_samples, sample_type = 'WES') AS hom_wes, + sumIf(hom_samples, sample_type = 'WGS') AS hom_wgs, + sumIf(hom_samples, affected = 'A') AS hom_affected +FROM `GRCh38/SNV_INDEL/project_gt_stats` +WHERE project_guid NOT IN ['R0555_seqr_demo', 'R0607_gregor_training_project_', 'R0608_gregor_training_project_', 'R0801_gregor_training_project_', 'R0811_gregor_training_project_', 'R0812_gregor_training_project_', 'R0813_gregor_training_project_', 'R0814_gregor_training_project_', 'R0815_gregor_training_project_', 'R0816_gregor_training_project_'] +GROUP BY key; + +CREATE MATERIALIZED VIEW `GRCh38/SNV_INDEL/reference_data/clinvar/all_variants_to_seqr_variants_mv` +REFRESH EVERY 10 YEAR TO `GRCh38/SNV_INDEL/reference_data/clinvar/seqr_variants` +( + `key` UInt32, + `alleleId` Nullable(UInt32), + `conflictingPathogenicities` Nested(pathogenicity Enum8('Pathogenic' = 0, 'Pathogenic/Likely_pathogenic' = 1, 'Pathogenic/Likely_pathogenic/Established_risk_allele' = 2, 'Pathogenic/Likely_pathogenic/Likely_risk_allele' = 3, 'Pathogenic/Likely_risk_allele' = 4, 'Likely_pathogenic' = 5, 'Likely_pathogenic/Likely_risk_allele' = 6, 'Established_risk_allele' = 7, 'Likely_risk_allele' = 8, 'Conflicting_classifications_of_pathogenicity' = 9, 'Uncertain_risk_allele' = 10, 'Uncertain_significance/Uncertain_risk_allele' = 11, 'Uncertain_significance' = 12, 'No_pathogenic_assertion' = 13, 'Likely_benign' = 14, 'Benign/Likely_benign' = 15, 'Benign' = 16), count UInt16), + `goldStars` Nullable(UInt8), + `submitters` Array(String), + `conditions` Array(String), + `assertions` Array(Enum8('Affects' = 0, 'association' = 1, 'association_not_found' = 2, 'confers_sensitivity' = 3, 'drug_response' = 4, 'low_penetrance' = 5, 'not_provided' = 6, 'other' = 7, 'protective' = 8, 'risk_factor' = 9, 'no_classification_for_the_single_variant' = 10, 'no_classifications_from_unflagged_records' = 11)), + `pathogenicity` Enum8('Pathogenic' = 0, 'Pathogenic/Likely_pathogenic' = 1, 'Pathogenic/Likely_pathogenic/Established_risk_allele' = 2, 'Pathogenic/Likely_pathogenic/Likely_risk_allele' = 3, 'Pathogenic/Likely_risk_allele' = 4, 'Likely_pathogenic' = 5, 'Likely_pathogenic/Likely_risk_allele' = 6, 'Established_risk_allele' = 7, 'Likely_risk_allele' = 8, 'Conflicting_classifications_of_pathogenicity' = 9, 'Uncertain_risk_allele' = 10, 'Uncertain_significance/Uncertain_risk_allele' = 11, 'Uncertain_significance' = 12, 'No_pathogenic_assertion' = 13, 'Likely_benign' = 14, 'Benign/Likely_benign' = 15, 'Benign' = 16) +) +AS SELECT + key, + COLUMNS('.*') EXCEPT (version, variantId, key) +FROM `GRCh38/SNV_INDEL/reference_data/clinvar/all_variants` AS src +INNER JOIN `GRCh38/SNV_INDEL/key_lookup` AS dst ON assumeNotNull(src.variantId) = dst.variantId +LIMIT 1 BY key; + + +CREATE MATERIALIZED VIEW `GRCh38/SNV_INDEL/reference_data/clinvar/seqr_variants_to_search_mv` +REFRESH EVERY 10 YEAR TO `GRCh38/SNV_INDEL/reference_data/clinvar` +( + `key` UInt32, + `alleleId` Nullable(UInt32), + `conflictingPathogenicities` Nested(pathogenicity Enum8('Pathogenic' = 0, 'Pathogenic/Likely_pathogenic' = 1, 'Pathogenic/Likely_pathogenic/Established_risk_allele' = 2, 'Pathogenic/Likely_pathogenic/Likely_risk_allele' = 3, 'Pathogenic/Likely_risk_allele' = 4, 'Likely_pathogenic' = 5, 'Likely_pathogenic/Likely_risk_allele' = 6, 'Established_risk_allele' = 7, 'Likely_risk_allele' = 8, 'Conflicting_classifications_of_pathogenicity' = 9, 'Uncertain_risk_allele' = 10, 'Uncertain_significance/Uncertain_risk_allele' = 11, 'Uncertain_significance' = 12, 'No_pathogenic_assertion' = 13, 'Likely_benign' = 14, 'Benign/Likely_benign' = 15, 'Benign' = 16), count UInt16), + `goldStars` Nullable(UInt8), + `submitters` Array(String), + `conditions` Array(String), + `assertions` Array(Enum8('Affects' = 0, 'association' = 1, 'association_not_found' = 2, 'confers_sensitivity' = 3, 'drug_response' = 4, 'low_penetrance' = 5, 'not_provided' = 6, 'other' = 7, 'protective' = 8, 'risk_factor' = 9, 'no_classification_for_the_single_variant' = 10, 'no_classifications_from_unflagged_records' = 11)), + `pathogenicity` Enum8('Pathogenic' = 0, 'Pathogenic/Likely_pathogenic' = 1, 'Pathogenic/Likely_pathogenic/Established_risk_allele' = 2, 'Pathogenic/Likely_pathogenic/Likely_risk_allele' = 3, 'Pathogenic/Likely_risk_allele' = 4, 'Likely_pathogenic' = 5, 'Likely_pathogenic/Likely_risk_allele' = 6, 'Established_risk_allele' = 7, 'Likely_risk_allele' = 8, 'Conflicting_classifications_of_pathogenicity' = 9, 'Uncertain_risk_allele' = 10, 'Uncertain_significance/Uncertain_risk_allele' = 11, 'Uncertain_significance' = 12, 'No_pathogenic_assertion' = 13, 'Likely_benign' = 14, 'Benign/Likely_benign' = 15, 'Benign' = 16) +) +AS SELECT * +FROM `GRCh38/SNV_INDEL/reference_data/clinvar/seqr_variants` +LIMIT 1 BY key; + From b0cf8b4203df0b339a9d960edf04284242632bd0 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 25 Nov 2025 20:51:37 -0500 Subject: [PATCH 02/17] works --- v03_pipeline/bin/pipeline_worker_test.py | 11 ++++++----- 1 file changed, 6 insertions(+), 5 deletions(-) diff --git a/v03_pipeline/bin/pipeline_worker_test.py b/v03_pipeline/bin/pipeline_worker_test.py index 96c7008d6..961d439a2 100644 --- a/v03_pipeline/bin/pipeline_worker_test.py +++ b/v03_pipeline/bin/pipeline_worker_test.py @@ -4,6 +4,7 @@ from unittest.mock import Mock, patch import hail as hl +import hailtop.fs as hfs import luigi import luigi.worker @@ -15,10 +16,10 @@ get_clickhouse_client, ) from v03_pipeline.lib.paths import ( + clickhouse_load_success_file_path, db_id_to_gene_id_path, loading_pipeline_deadletter_queue_dir, loading_pipeline_queue_dir, - clickhouse_load_success_file_path, ) from v03_pipeline.lib.test.misc import copy_project_pedigree_to_mocked_dir from v03_pipeline.lib.test.mocked_reference_datasets_testcase import ( @@ -193,12 +194,12 @@ def test_process_queue_integration_test( mock_safe_post_to_slack.assert_called_once_with( ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n "attempt_id": 0,\n "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n "dataset_type": "SNV_INDEL",\n "project_guids": [\n "R0113_test_project"\n ],\n "reference_genome": "GRCh38",\n "request_type": "LoadingPipelineRequest",\n "sample_type": "WGS",\n "skip_check_sex_and_relatedness": false,\n "skip_expect_tdr_metrics": false,\n "skip_validation": true\n}```', ) - with open( - clickhouse_load_success_file( + with hfs.open( + clickhouse_load_success_file_path( ReferenceGenome.GRCh38, DatasetType.SNV_INDEL, - 'request_20250916-200704-123456', - ) + '20250916-200704-123456', + ), ) as f: self.assertEqual(f.read(), '') From add7a16bd7f3ec8ec1e5f0663bf6db0a3107b451 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 25 Nov 2025 22:08:19 -0500 Subject: [PATCH 03/17] feat: skip single validation at a time --- v03_pipeline/api/model.py | 35 +++++++++++- v03_pipeline/api/model_test.py | 36 +++++++++++++ v03_pipeline/bin/pipeline_worker_test.py | 5 +- v03_pipeline/lib/misc/io.py | 4 +- v03_pipeline/lib/misc/validation.py | 10 ++++ .../lib/tasks/base/base_loading_run_params.py | 5 +- v03_pipeline/lib/tasks/dataproc/misc_test.py | 4 +- .../exports/write_new_entries_parquet_test.py | 9 ++-- .../write_new_transcripts_parquet_test.py | 5 +- .../write_new_variants_parquet_test.py | 11 ++-- ...annotations_table_with_new_samples_test.py | 25 +++++---- v03_pipeline/lib/tasks/validate_callset.py | 54 +++++++++++-------- .../lib/tasks/write_imported_callset.py | 5 +- .../lib/tasks/write_metadata_for_run_test.py | 3 +- ...ite_remapped_and_subsetted_callset_test.py | 9 ++-- .../lib/tasks/write_sample_qc_json_test.py | 3 +- .../write_variant_annotations_vcf_test.py | 3 +- 17 files changed, 161 insertions(+), 65 deletions(-) diff --git a/v03_pipeline/api/model.py b/v03_pipeline/api/model.py index 57f62036e..9d3e8848c 100644 --- a/v03_pipeline/api/model.py +++ b/v03_pipeline/api/model.py @@ -1,9 +1,20 @@ +from typing import Literal + import hailtop.fs as hfs -from pydantic import AliasChoices, BaseModel, Field, conint, field_validator +from pydantic import ( + AliasChoices, + BaseModel, + Field, + conint, + field_validator, + root_validator, +) from v03_pipeline.lib.core import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS, SKIPPABLE_VALIDATIONS MAX_LOADING_PIPELINE_ATTEMPTS = 3 +STRINGIFIED_SKIPPABLE_VALIDATIONS = [f.__name__ for f in SKIPPABLE_VALIDATIONS] VALID_FILE_TYPES = ['vcf', 'vcf.gz', 'vcf.bgz', 'mt'] @@ -26,9 +37,9 @@ class LoadingPipelineRequest(PipelineRunnerRequest): sample_type: SampleType reference_genome: ReferenceGenome dataset_type: DatasetType - skip_validation: bool = False skip_check_sex_and_relatedness: bool = False skip_expect_tdr_metrics: bool = False + validations_to_skip: list[Literal[*STRINGIFIED_SKIPPABLE_VALIDATIONS]] = [] def incr_attempt(self): if self.attempt_id == (MAX_LOADING_PIPELINE_ATTEMPTS - 1): @@ -36,6 +47,26 @@ def incr_attempt(self): self.attempt_id += 1 return True + @field_validator('validations_to_skip') + @classmethod + def must_be_known_validation(cls, validations_to_skip): + for v in validations_to_skip: + if v not in set(STRINGIFIED_SKIPPABLE_VALIDATIONS): + msg = f'{v} is not a valid validator' + raise ValueError(msg) + return validations_to_skip + + @root_validator( + pre=True, + ) # the root validator runs before Pydantic parses or coerces field values. + @classmethod + def override_all_validations(cls, values): + if values.get('validations_to_skip') == [ + ALL_VALIDATIONS, + ]: + values['validations_to_skip'] = STRINGIFIED_SKIPPABLE_VALIDATIONS + return values + @field_validator('callset_path') @classmethod def check_valid_callset_path(cls, callset_path: str) -> str: diff --git a/v03_pipeline/api/model_test.py b/v03_pipeline/api/model_test.py index e48331aaa..7334b5120 100644 --- a/v03_pipeline/api/model_test.py +++ b/v03_pipeline/api/model_test.py @@ -46,6 +46,42 @@ def test_invalid_loading_pipeline_requests(self) -> None: ), ) + def test_validations_to_skip(self) -> None: + shared_params = { + 'callset_path': CALLSET_PATH, + 'projects_to_run': ['project_a'], + 'sample_type': SampleType.WGS.value, + 'reference_genome': ReferenceGenome.GRCh38.value, + 'dataset_type': DatasetType.SNV_INDEL.value, + } + raw_request = { + **shared_params, + 'skip_validation': True, + } + lpr = LoadingPipelineRequest.model_validate(raw_request) + self.assertGreater(len(lpr.validations_to_skip), 1) + + raw_request = { + **shared_params, + 'validations_to_skip': ['all'], + } + lpr = LoadingPipelineRequest.model_validate(raw_request) + self.assertGreater(len(lpr.validations_to_skip), 1) + + raw_request = { + **shared_params, + 'validations_to_skip': ['validate_sample_type'], + } + lpr = LoadingPipelineRequest.model_validate(raw_request) + self.assertEqual(len(lpr.validations_to_skip), 1) + + raw_request = { + **shared_params, + 'validations_to_skip': ['validate_blended_exome'], + } + with self.assertRaises(ValueError): + LoadingPipelineRequest.model_validate(raw_request) + def test_delete_families_request(self) -> None: raw_request = {'project_guid': 'project_a', 'family_guids': []} with self.assertRaises(ValueError): diff --git a/v03_pipeline/bin/pipeline_worker_test.py b/v03_pipeline/bin/pipeline_worker_test.py index 961d439a2..9e41c5aec 100644 --- a/v03_pipeline/bin/pipeline_worker_test.py +++ b/v03_pipeline/bin/pipeline_worker_test.py @@ -176,7 +176,6 @@ def test_process_queue_integration_test( 'sample_type': SampleType.WGS.value, 'reference_genome': ReferenceGenome.GRCh38.value, 'dataset_type': DatasetType.SNV_INDEL.value, - 'skip_validation': True, } os.makedirs( loading_pipeline_queue_dir(), @@ -192,7 +191,7 @@ def test_process_queue_integration_test( json.dump(raw_request, f) process_queue(local_scheduler=True) mock_safe_post_to_slack.assert_called_once_with( - ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n "attempt_id": 0,\n "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n "dataset_type": "SNV_INDEL",\n "project_guids": [\n "R0113_test_project"\n ],\n "reference_genome": "GRCh38",\n "request_type": "LoadingPipelineRequest",\n "sample_type": "WGS",\n "skip_check_sex_and_relatedness": false,\n "skip_expect_tdr_metrics": false,\n "skip_validation": true\n}```', + ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n "attempt_id": 0,\n "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n "dataset_type": "SNV_INDEL",\n "project_guids": [\n "R0113_test_project"\n ],\n "reference_genome": "GRCh38",\n "request_type": "LoadingPipelineRequest",\n "sample_type": "WGS",\n "skip_check_sex_and_relatedness": false,\n "skip_expect_tdr_metrics": false,\n "validations_to_skip": []\n}```', ) with hfs.open( clickhouse_load_success_file_path( @@ -263,7 +262,7 @@ def test_process_failure( process_queue(local_scheduler=True) process_queue(local_scheduler=True) mock_safe_post_to_slack.assert_called_once_with( - ':failed: Pipeline Runner Request Failed :failed:\nRun ID: 20250918-200704-123456\n```{\n "attempt_id": 2,\n "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n "dataset_type": "SNV_INDEL",\n "project_guids": [\n "project_a"\n ],\n "reference_genome": "GRCh38",\n "request_type": "LoadingPipelineRequest",\n "sample_type": "WGS",\n "skip_check_sex_and_relatedness": false,\n "skip_expect_tdr_metrics": false,\n "skip_validation": false\n}```\nReason: there were failed tasks', + ':failed: Pipeline Runner Request Failed :failed:\nRun ID: 20250918-200704-123456\n```{\n "attempt_id": 2,\n "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n "dataset_type": "SNV_INDEL",\n "project_guids": [\n "project_a"\n ],\n "reference_genome": "GRCh38",\n "request_type": "LoadingPipelineRequest",\n "sample_type": "WGS",\n "skip_check_sex_and_relatedness": false,\n "skip_expect_tdr_metrics": false,\n "validations_to_skip": []\n}```\nReason: there were failed tasks', ) self.assertEqual(len(os.listdir(loading_pipeline_queue_dir())), 0) with open( diff --git a/v03_pipeline/lib/misc/io.py b/v03_pipeline/lib/misc/io.py index d0a7e044e..f6d901ce9 100644 --- a/v03_pipeline/lib/misc/io.py +++ b/v03_pipeline/lib/misc/io.py @@ -81,7 +81,7 @@ def compute_hail_n_partitions(file_size_b: int) -> int: ) def split_multi_hts( mt: hl.MatrixTable, - skip_validation: bool, + skip_validate_no_duplicate_variants: bool, max_samples_split_multi_shuffle=MAX_SAMPLES_SPLIT_MULTI_SHUFFLE, ) -> hl.MatrixTable: bi = mt.filter_rows(hl.len(mt.alleles) == BIALLELIC) @@ -98,7 +98,7 @@ def split_multi_hts( mt = split.union_rows(bi) # If we've disabled validation (which is expected to throw an exception # for duplicate variants, we would like to distinc ) - if skip_validation: + if skip_validate_no_duplicate_variants: return mt.distinct_by_row() return mt diff --git a/v03_pipeline/lib/misc/validation.py b/v03_pipeline/lib/misc/validation.py index 280525462..1c480c20c 100644 --- a/v03_pipeline/lib/misc/validation.py +++ b/v03_pipeline/lib/misc/validation.py @@ -9,6 +9,7 @@ SampleType, ) +ALL_VALIDATIONS = 'all' AMBIGUOUS_THRESHOLD_PERC: float = 0.01 # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown. MIN_ROWS_PER_CONTIG = 100 SAMPLE_TYPE_MATCH_THRESHOLD = 0.3 @@ -191,3 +192,12 @@ def validate_sample_type( if has_noncoding and has_coding and sample_type != SampleType.WGS: msg = 'Sample type validation error: dataset sample-type is specified as WES but appears to be WGS because it contains many common non-coding variants' raise SeqrValidationError(msg) + + +SKIPPABLE_VALIDATIONS = [ + validate_allele_depth_length, + validate_allele_type, + validate_expected_contig_frequency, + validate_no_duplicate_variants, + validate_sample_type, +] diff --git a/v03_pipeline/lib/tasks/base/base_loading_run_params.py b/v03_pipeline/lib/tasks/base/base_loading_run_params.py index 5cba7d46f..3cd447cc5 100644 --- a/v03_pipeline/lib/tasks/base/base_loading_run_params.py +++ b/v03_pipeline/lib/tasks/base/base_loading_run_params.py @@ -28,10 +28,7 @@ class BaseLoadingRunParams(luigi.Task): default=False, parsing=luigi.BoolParameter.EXPLICIT_PARSING, ) - skip_validation = luigi.BoolParameter( - default=False, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) + validations_to_skip = luigi.ListParameter(default=[]) is_new_gcnv_joint_call = luigi.BoolParameter( default=False, parsing=luigi.BoolParameter.EXPLICIT_PARSING, diff --git a/v03_pipeline/lib/tasks/dataproc/misc_test.py b/v03_pipeline/lib/tasks/dataproc/misc_test.py index 347e2e0b9..983275c6e 100644 --- a/v03_pipeline/lib/tasks/dataproc/misc_test.py +++ b/v03_pipeline/lib/tasks/dataproc/misc_test.py @@ -39,8 +39,8 @@ def test_to_kebab_str_args(self, _: Mock): 'False', '--skip-expect-tdr-metrics', 'False', - '--skip-validation', - 'False', + '--validations-to-skip', + '[]', '--is-new-gcnv-joint-call', 'False', '--run-id', diff --git a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py index 90d8d6dfa..43fa38a14 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py @@ -10,6 +10,7 @@ ReferenceGenome, SampleType, ) +from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS from v03_pipeline.lib.paths import ( db_id_to_gene_id_path, new_entries_parquet_path, @@ -114,7 +115,7 @@ def test_write_new_entries_parquet(self): sample_type=SampleType.WGS, callset_path=TEST_SNV_INDEL_VCF, project_guids=['R0113_test_project', 'R0114_project4'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) @@ -214,7 +215,7 @@ def test_mito_write_new_entries_parquet(self): sample_type=SampleType.WGS, callset_path=TEST_MITO_CALLSET, project_guids=['R0116_test_project3'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) @@ -269,7 +270,7 @@ def test_sv_write_new_entries_parquet(self): sample_type=SampleType.WGS, callset_path=TEST_SV_VCF_2, project_guids=['R0115_test_project2'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) @@ -352,7 +353,7 @@ def test_gcnv_write_new_entries_parquet(self): sample_type=SampleType.WES, callset_path=TEST_GCNV_BED_FILE, project_guids=['R0115_test_project2'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py index 5af285095..35d2d94e2 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py @@ -9,6 +9,7 @@ ReferenceGenome, SampleType, ) +from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS from v03_pipeline.lib.paths import ( new_transcripts_parquet_path, new_variants_table_path, @@ -87,7 +88,7 @@ def test_write_new_transcripts_parquet_test( project_guids=[ 'fake_project', ], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) @@ -187,7 +188,7 @@ def test_grch37_write_new_transcripts_parquet_test( project_guids=[ 'fake_project', ], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) diff --git a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py index a3c7fc431..69b0c5b36 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py @@ -10,6 +10,7 @@ ReferenceGenome, SampleType, ) +from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS from v03_pipeline.lib.paths import ( new_variants_parquet_path, new_variants_table_path, @@ -107,7 +108,7 @@ def test_write_new_variants_parquet_test( project_guids=[ 'fake_project', ], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) @@ -231,7 +232,7 @@ def test_grch37_write_new_variants_parquet_test( project_guids=[ 'fake_project', ], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) @@ -342,7 +343,7 @@ def test_mito_write_new_variants_parquet_test( project_guids=[ 'fake_project', ], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) @@ -436,7 +437,7 @@ def test_sv_write_new_variants_parquet_test( project_guids=[ 'fake_project', ], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) @@ -513,7 +514,7 @@ def test_gcnv_write_new_variants_parquet_test( project_guids=[ 'fake_project', ], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index 9a03d75cc..50da39fc6 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -24,7 +24,10 @@ SampleType, ) from v03_pipeline.lib.misc.io import remap_pedigree_hash -from v03_pipeline.lib.misc.validation import validate_expected_contig_frequency +from v03_pipeline.lib.misc.validation import ( + ALL_VALIDATIONS, + validate_expected_contig_frequency, +) from v03_pipeline.lib.paths import ( valid_reference_dataset_path, ) @@ -86,7 +89,7 @@ def test_missing_pedigree( sample_type=SampleType.WGS, callset_path=TEST_SNV_INDEL_VCF, project_guids=['R0113_test_project'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker = luigi.worker.Worker() @@ -116,7 +119,7 @@ def test_missing_interval_reference_dataset( sample_type=SampleType.WGS, callset_path=TEST_SNV_INDEL_VCF, project_guids=['R0113_test_project'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker = luigi.worker.Worker() @@ -527,7 +530,7 @@ def test_update_vat_grch37( sample_type=SampleType.WGS, callset_path=TEST_SNV_INDEL_VCF, project_guids=['R0113_test_project'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(uvatwns_task) @@ -683,7 +686,7 @@ def test_update_vat_without_accessing_private_datasets( sample_type=SampleType.WGS, callset_path=TEST_SNV_INDEL_VCF, project_guids=['R0113_test_project'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(uvatwns_task) @@ -726,7 +729,7 @@ def test_mito_update_vat( sample_type=SampleType.WGS, callset_path=TEST_MITO_MT, project_guids=['R0115_test_project2'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) ) @@ -834,7 +837,7 @@ def test_sv_multiple_vcf_update_vat( sample_type=SampleType.WGS, callset_path=TEST_SV_VCF, project_guids=['R0115_test_project2'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) ) @@ -1294,7 +1297,7 @@ def test_sv_multiple_vcf_update_vat( sample_type=SampleType.WGS, callset_path=TEST_SV_VCF_2, project_guids=['R0115_test_project2'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id='second_run_id', ) ) @@ -1414,7 +1417,7 @@ def test_sv_multiple_project_single_vcf( sample_type=SampleType.WGS, callset_path=TEST_SV_VCF_2, project_guids=['R0116_test_project3', 'R0117_test_project4'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id='run_id', ) ) @@ -1446,7 +1449,7 @@ def test_gcnv_update_vat_multiple( sample_type=SampleType.WES, callset_path=TEST_GCNV_BED_FILE, project_guids=['R0115_test_project2'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) ) @@ -1585,7 +1588,7 @@ def test_gcnv_update_vat_multiple( sample_type=SampleType.WES, callset_path=TEST_GCNV_BED_FILE_2, project_guids=['R0115_test_project2'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id='second_run_id', ) ) diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py index e8f5d6f34..b13142bfe 100644 --- a/v03_pipeline/lib/tasks/validate_callset.py +++ b/v03_pipeline/lib/tasks/validate_callset.py @@ -3,12 +3,9 @@ import luigi.util from v03_pipeline.lib.misc.validation import ( + ALL_VALIDATIONS, + SKIPPABLE_VALIDATIONS, SeqrValidationError, - validate_allele_depth_length, - validate_allele_type, - validate_expected_contig_frequency, - validate_no_duplicate_variants, - validate_sample_type, ) from v03_pipeline.lib.paths import ( imported_callset_path, @@ -31,6 +28,22 @@ @luigi.util.inherits(BaseLoadingRunParams) class ValidateCallsetTask(BaseUpdateTask): + @property + def validation_dependencies(self) -> dict[str, hl.Table]: + deps = {} + if ( + ALL_VALIDATIONS not in self.validations_to_skip + and 'validate_sample_type' not in self.validations_to_skip + and self.dataset_type.can_run_validation + ): + deps['coding_and_noncoding_variants_ht'] = hl.read_table( + valid_reference_dataset_path( + self.reference_genome, + ReferenceDataset.gnomad_coding_and_noncoding, + ), + ) + return deps + def complete(self) -> luigi.Target: if super().complete(): mt = hl.read_matrix_table(self.output().path) @@ -50,7 +63,11 @@ def output(self) -> luigi.Target: def requires(self) -> list[luigi.Task]: requirements = [self.clone(WriteImportedCallsetTask)] - if not self.skip_validation and self.dataset_type.can_run_validation: + if ( + ALL_VALIDATIONS not in self.validations_to_skip + and 'validate_sample_type' not in self.validations_to_skip + and self.dataset_type.can_run_validation + ): requirements = [ *requirements, ( @@ -87,29 +104,22 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable: ), ) - validation_exceptions = [] - if self.skip_validation or not self.dataset_type.can_run_validation: + if ( + ALL_VALIDATIONS in self.validations_to_skip + or not self.dataset_type.can_run_validation + ): return mt.select_globals( callset_path=self.callset_path, validated_sample_type=self.sample_type.value, ) - coding_and_noncoding_variants_ht = hl.read_table( - valid_reference_dataset_path( - self.reference_genome, - ReferenceDataset.gnomad_coding_and_noncoding, - ), - ) - for validation_f in [ - validate_allele_depth_length, - validate_allele_type, - validate_no_duplicate_variants, - validate_expected_contig_frequency, - validate_sample_type, - ]: + validation_exceptions = [] + for validation_f in SKIPPABLE_VALIDATIONS: try: + if validation_f in self.validations_to_skip: + continue validation_f( mt, - coding_and_noncoding_variants_ht=coding_and_noncoding_variants_ht, + **self.validation_dependencies, **self.param_kwargs, ) except SeqrValidationError as e: diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index e4df70899..6a3230197 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -85,7 +85,10 @@ def create_table(self) -> hl.MatrixTable: ) if self.dataset_type.has_multi_allelic_variants: # NB: throws SeqrValidationError - mt = split_multi_hts(mt, self.skip_validation) + mt = split_multi_hts( + mt, + 'validate_no_duplicate_variants' in self.validations_to_skip, + ) if self.dataset_type.re_key_by_seqr_internal_truth_vid and hasattr( mt, 'info.SEQR_INTERNAL_TRUTH_VID', diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py index e8318f043..e0bec8f14 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py @@ -5,6 +5,7 @@ import luigi.worker from v03_pipeline.lib.core import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS from v03_pipeline.lib.paths import relatedness_check_tsv_path from v03_pipeline.lib.tasks.write_metadata_for_run import WriteMetadataForRunTask from v03_pipeline.lib.test.misc import copy_project_pedigree_to_mocked_dir @@ -54,7 +55,7 @@ def test_write_metadata_for_run_task( sample_type=SampleType.WGS, callset_path=TEST_VCF, project_guids=['R0113_test_project', 'R0114_project4'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id='run_123456', ) worker.add(write_metadata_for_run_task) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py index 305150699..9504965c4 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py @@ -7,6 +7,7 @@ from v03_pipeline.lib.core import DatasetType, ReferenceGenome, SampleType from v03_pipeline.lib.misc.io import remap_pedigree_hash +from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS from v03_pipeline.lib.paths import ( relatedness_check_table_path, sex_check_table_path, @@ -101,7 +102,7 @@ def test_write_remapped_and_subsetted_callset_task( callset_path=TEST_VCF, project_guids=['R0113_test_project'], project_i=0, - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], skip_expect_tdr_metrics=True, ) worker.add(wrsc_task) @@ -151,7 +152,7 @@ def test_write_remapped_and_subsetted_callset_task_failed_some_family_checks( callset_path=TEST_VCF, project_guids=['R0114_project4'], project_i=0, - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], skip_expect_tdr_metrics=True, ) worker.add(wrsc_task) @@ -244,7 +245,7 @@ def test_write_remapped_and_subsetted_callset_task_all_families_failed( callset_path=TEST_VCF, project_guids=['R0114_project4'], project_i=0, - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], skip_expect_tdr_metrics=True, ) worker.add(wrsc_task) @@ -256,7 +257,7 @@ def test_write_remapped_and_subsetted_callset_task_all_families_failed( sample_type=SampleType.WES, callset_path=TEST_VCF, project_guids=['R0114_project4'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) self.assertTrue(write_validation_errors_task.complete()) diff --git a/v03_pipeline/lib/tasks/write_sample_qc_json_test.py b/v03_pipeline/lib/tasks/write_sample_qc_json_test.py index 8d2bb699e..d1b0a3643 100644 --- a/v03_pipeline/lib/tasks/write_sample_qc_json_test.py +++ b/v03_pipeline/lib/tasks/write_sample_qc_json_test.py @@ -8,6 +8,7 @@ import luigi.worker from v03_pipeline.lib.core import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS from v03_pipeline.lib.paths import ancestry_model_rf_path from v03_pipeline.lib.tasks.write_sample_qc_json import WriteSampleQCJsonTask from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask @@ -113,7 +114,7 @@ def test_call_sample_qc( sample_type=SampleType.WGS, callset_path=TEST_VCF, project_guids=['R0113_test_project'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], ) worker.add(task) worker.run() diff --git a/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py b/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py index 913de418c..8f3a00107 100644 --- a/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py +++ b/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py @@ -5,6 +5,7 @@ import luigi.worker from v03_pipeline.lib.core import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS from v03_pipeline.lib.tasks.update_variant_annotations_table_with_new_samples import ( UpdateVariantAnnotationsTableWithNewSamplesTask, ) @@ -64,7 +65,7 @@ def test_sv_export_vcf( sample_type=SampleType.WGS, callset_path=TEST_SV_VCF, project_guids=['R0115_test_project2'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], skip_expect_tdr_metrics=True, ) ) From d952e214580d7892f1394af5c5c503453391543f Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 25 Nov 2025 22:57:15 -0500 Subject: [PATCH 04/17] fix a few --- v03_pipeline/bin/pipeline_worker_test.py | 3 ++- v03_pipeline/lib/tasks/validate_callset.py | 14 ++++---------- 2 files changed, 6 insertions(+), 11 deletions(-) diff --git a/v03_pipeline/bin/pipeline_worker_test.py b/v03_pipeline/bin/pipeline_worker_test.py index 9e41c5aec..b488e3d58 100644 --- a/v03_pipeline/bin/pipeline_worker_test.py +++ b/v03_pipeline/bin/pipeline_worker_test.py @@ -176,6 +176,7 @@ def test_process_queue_integration_test( 'sample_type': SampleType.WGS.value, 'reference_genome': ReferenceGenome.GRCh38.value, 'dataset_type': DatasetType.SNV_INDEL.value, + 'validations_to_skip': ['all'], } os.makedirs( loading_pipeline_queue_dir(), @@ -191,7 +192,7 @@ def test_process_queue_integration_test( json.dump(raw_request, f) process_queue(local_scheduler=True) mock_safe_post_to_slack.assert_called_once_with( - ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n "attempt_id": 0,\n "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n "dataset_type": "SNV_INDEL",\n "project_guids": [\n "R0113_test_project"\n ],\n "reference_genome": "GRCh38",\n "request_type": "LoadingPipelineRequest",\n "sample_type": "WGS",\n "skip_check_sex_and_relatedness": false,\n "skip_expect_tdr_metrics": false,\n "validations_to_skip": []\n}```', + ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n "attempt_id": 0,\n "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n "dataset_type": "SNV_INDEL",\n "project_guids": [\n "R0113_test_project"\n ],\n "reference_genome": "GRCh38",\n "request_type": "LoadingPipelineRequest",\n "sample_type": "WGS",\n "skip_check_sex_and_relatedness": false,\n "skip_expect_tdr_metrics": false,\n "validations_to_skip": [\n "validate_allele_depth_length",\n "validate_allele_type",\n "validate_expected_contig_frequency",\n "validate_no_duplicate_variants",\n "validate_sample_type"\n ]\n}```' ) with hfs.open( clickhouse_load_success_file_path( diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py index b13142bfe..8ee302b2e 100644 --- a/v03_pipeline/lib/tasks/validate_callset.py +++ b/v03_pipeline/lib/tasks/validate_callset.py @@ -3,7 +3,6 @@ import luigi.util from v03_pipeline.lib.misc.validation import ( - ALL_VALIDATIONS, SKIPPABLE_VALIDATIONS, SeqrValidationError, ) @@ -32,8 +31,7 @@ class ValidateCallsetTask(BaseUpdateTask): def validation_dependencies(self) -> dict[str, hl.Table]: deps = {} if ( - ALL_VALIDATIONS not in self.validations_to_skip - and 'validate_sample_type' not in self.validations_to_skip + 'validate_sample_type' not in self.validations_to_skip and self.dataset_type.can_run_validation ): deps['coding_and_noncoding_variants_ht'] = hl.read_table( @@ -64,8 +62,7 @@ def output(self) -> luigi.Target: def requires(self) -> list[luigi.Task]: requirements = [self.clone(WriteImportedCallsetTask)] if ( - ALL_VALIDATIONS not in self.validations_to_skip - and 'validate_sample_type' not in self.validations_to_skip + 'validate_sample_type' not in self.validations_to_skip and self.dataset_type.can_run_validation ): requirements = [ @@ -104,10 +101,7 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable: ), ) - if ( - ALL_VALIDATIONS in self.validations_to_skip - or not self.dataset_type.can_run_validation - ): + if not self.dataset_type.can_run_validation: return mt.select_globals( callset_path=self.callset_path, validated_sample_type=self.sample_type.value, @@ -115,7 +109,7 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable: validation_exceptions = [] for validation_f in SKIPPABLE_VALIDATIONS: try: - if validation_f in self.validations_to_skip: + if str(validation_f.__name__) in self.validations_to_skip: continue validation_f( mt, From 5ac896103c2762b3a401b6209bffd9d07b5ed766 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 25 Nov 2025 23:03:36 -0500 Subject: [PATCH 05/17] bump --- v03_pipeline/lib/misc/pedigree_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/misc/pedigree_test.py b/v03_pipeline/lib/misc/pedigree_test.py index 6071849d0..2142ea215 100644 --- a/v03_pipeline/lib/misc/pedigree_test.py +++ b/v03_pipeline/lib/misc/pedigree_test.py @@ -13,7 +13,7 @@ class PedigreesTest(unittest.TestCase): def test_empty_pedigree(self) -> None: - with self.assertRaises(ValueError): + with self.assertRaises(Exception): _ = import_pedigree(TEST_PEDIGREE_1) def test_parse_lineage(self) -> None: From ce9223583a4c7d7c4003904290269e8a4251ccc2 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 25 Nov 2025 23:11:06 -0500 Subject: [PATCH 06/17] pedigree --- v03_pipeline/lib/misc/pedigree_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/misc/pedigree_test.py b/v03_pipeline/lib/misc/pedigree_test.py index 2142ea215..f70d168ff 100644 --- a/v03_pipeline/lib/misc/pedigree_test.py +++ b/v03_pipeline/lib/misc/pedigree_test.py @@ -13,7 +13,7 @@ class PedigreesTest(unittest.TestCase): def test_empty_pedigree(self) -> None: - with self.assertRaises(Exception): + with self.assertRaises(Exception): # noqa: B017 _ = import_pedigree(TEST_PEDIGREE_1) def test_parse_lineage(self) -> None: From d54c4de98f4cd3d45d057c58ba490256d7e51216 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 25 Nov 2025 23:21:49 -0500 Subject: [PATCH 07/17] validations to skip --- v03_pipeline/api/app_test.py | 24 +++++++++++++++++++++++- v03_pipeline/api/model_test.py | 9 +-------- 2 files changed, 24 insertions(+), 9 deletions(-) diff --git a/v03_pipeline/api/app_test.py b/v03_pipeline/api/app_test.py index f3a68776e..2b09e3814 100644 --- a/v03_pipeline/api/app_test.py +++ b/v03_pipeline/api/app_test.py @@ -67,6 +67,28 @@ async def test_loading_pipeline_invalid_requests(self): 'callset_path must point to a file that exists' in log.output[0], ) + body = { + 'callset_path': CALLSET_PATH, + 'project_guids': ['project_a'], + 'sample_type': SampleType.WGS.value, + 'reference_genome': ReferenceGenome.GRCh38.value, + 'dataset_type': DatasetType.SNV_INDEL.value, + 'validations_to_skip': ['bad_validation'] + } + with self.assertLogs(level='ERROR') as log: + async with self.client.request( + 'POST', + '/loading_pipeline_enqueue', + json=body, + ) as resp: + self.assertEqual( + resp.status, + web_exceptions.HTTPBadRequest.status_code, + ) + self.assertTrue( + "input_value='bad_validation" in log.output[0], + ) + async def test_loading_pipeline_enqueue(self): body = { 'callset_path': CALLSET_PATH, @@ -96,9 +118,9 @@ async def test_loading_pipeline_enqueue(self): 'reference_genome': 'GRCh38', 'sample_type': 'WGS', 'skip_check_sex_and_relatedness': False, - 'skip_validation': False, 'skip_expect_tdr_metrics': False, 'attempt_id': 0, + 'validations_to_skip': [] }, }, ) diff --git a/v03_pipeline/api/model_test.py b/v03_pipeline/api/model_test.py index 7334b5120..e7dd6cd19 100644 --- a/v03_pipeline/api/model_test.py +++ b/v03_pipeline/api/model_test.py @@ -54,19 +54,12 @@ def test_validations_to_skip(self) -> None: 'reference_genome': ReferenceGenome.GRCh38.value, 'dataset_type': DatasetType.SNV_INDEL.value, } - raw_request = { - **shared_params, - 'skip_validation': True, - } - lpr = LoadingPipelineRequest.model_validate(raw_request) - self.assertGreater(len(lpr.validations_to_skip), 1) - raw_request = { **shared_params, 'validations_to_skip': ['all'], } lpr = LoadingPipelineRequest.model_validate(raw_request) - self.assertGreater(len(lpr.validations_to_skip), 1) + self.assertGreater(len(lpr.validations_to_skip), 2) raw_request = { **shared_params, From b455ded337c7d7aaf476d73c17a76f7744fabf20 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 25 Nov 2025 23:25:42 -0500 Subject: [PATCH 08/17] ruff --- v03_pipeline/api/app_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/v03_pipeline/api/app_test.py b/v03_pipeline/api/app_test.py index 2b09e3814..0ae059130 100644 --- a/v03_pipeline/api/app_test.py +++ b/v03_pipeline/api/app_test.py @@ -73,7 +73,7 @@ async def test_loading_pipeline_invalid_requests(self): 'sample_type': SampleType.WGS.value, 'reference_genome': ReferenceGenome.GRCh38.value, 'dataset_type': DatasetType.SNV_INDEL.value, - 'validations_to_skip': ['bad_validation'] + 'validations_to_skip': ['bad_validation'], } with self.assertLogs(level='ERROR') as log: async with self.client.request( @@ -120,7 +120,7 @@ async def test_loading_pipeline_enqueue(self): 'skip_check_sex_and_relatedness': False, 'skip_expect_tdr_metrics': False, 'attempt_id': 0, - 'validations_to_skip': [] + 'validations_to_skip': [], }, }, ) From 9300a615fb1537ea9bd2c5e906426e69d8453147 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Tue, 25 Nov 2025 23:28:01 -0500 Subject: [PATCH 09/17] ruff --- v03_pipeline/bin/pipeline_worker_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/bin/pipeline_worker_test.py b/v03_pipeline/bin/pipeline_worker_test.py index b488e3d58..7e3341549 100644 --- a/v03_pipeline/bin/pipeline_worker_test.py +++ b/v03_pipeline/bin/pipeline_worker_test.py @@ -192,7 +192,7 @@ def test_process_queue_integration_test( json.dump(raw_request, f) process_queue(local_scheduler=True) mock_safe_post_to_slack.assert_called_once_with( - ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n "attempt_id": 0,\n "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n "dataset_type": "SNV_INDEL",\n "project_guids": [\n "R0113_test_project"\n ],\n "reference_genome": "GRCh38",\n "request_type": "LoadingPipelineRequest",\n "sample_type": "WGS",\n "skip_check_sex_and_relatedness": false,\n "skip_expect_tdr_metrics": false,\n "validations_to_skip": [\n "validate_allele_depth_length",\n "validate_allele_type",\n "validate_expected_contig_frequency",\n "validate_no_duplicate_variants",\n "validate_sample_type"\n ]\n}```' + ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n "attempt_id": 0,\n "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n "dataset_type": "SNV_INDEL",\n "project_guids": [\n "R0113_test_project"\n ],\n "reference_genome": "GRCh38",\n "request_type": "LoadingPipelineRequest",\n "sample_type": "WGS",\n "skip_check_sex_and_relatedness": false,\n "skip_expect_tdr_metrics": false,\n "validations_to_skip": [\n "validate_allele_depth_length",\n "validate_allele_type",\n "validate_expected_contig_frequency",\n "validate_no_duplicate_variants",\n "validate_sample_type"\n ]\n}```', ) with hfs.open( clickhouse_load_success_file_path( From 7989e593212fe647a0b6a82ac604d837cdcabcaa Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 26 Nov 2025 00:34:41 -0500 Subject: [PATCH 10/17] fix arg --- .../update_variant_annotations_table_with_new_samples_test.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index 50da39fc6..4755730dd 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -271,7 +271,7 @@ def test_multiple_update_vat( sample_type=SampleType.WGS, callset_path=TEST_SNV_INDEL_VCF, project_guids=['R0113_test_project'], - skip_validation=False, + validations_to_skip=[], run_id=TEST_RUN_ID, ) worker.add(uvatwns_task_3) @@ -332,7 +332,7 @@ def test_multiple_update_vat( sample_type=SampleType.WGS, callset_path=TEST_SNV_INDEL_VCF, project_guids=['R0114_project4'], - skip_validation=False, + validations_to_skip=[], run_id=TEST_RUN_ID + '-another-run', ) worker.add(uvatwns_task_4) From 1d876e3d951714da2f71a6b00625a960adbd64a2 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 26 Nov 2025 01:03:34 -0500 Subject: [PATCH 11/17] all validations --- v03_pipeline/lib/tasks/validate_callset.py | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py index 8ee302b2e..4d378fa46 100644 --- a/v03_pipeline/lib/tasks/validate_callset.py +++ b/v03_pipeline/lib/tasks/validate_callset.py @@ -3,6 +3,7 @@ import luigi.util from v03_pipeline.lib.misc.validation import ( + ALL_VALIDATIONS, SKIPPABLE_VALIDATIONS, SeqrValidationError, ) @@ -31,7 +32,8 @@ class ValidateCallsetTask(BaseUpdateTask): def validation_dependencies(self) -> dict[str, hl.Table]: deps = {} if ( - 'validate_sample_type' not in self.validations_to_skip + ALL_VALIDATIONS not in self.validations_to_skip + and 'validate_sample_type' not in self.validations_to_skip and self.dataset_type.can_run_validation ): deps['coding_and_noncoding_variants_ht'] = hl.read_table( @@ -62,7 +64,8 @@ def output(self) -> luigi.Target: def requires(self) -> list[luigi.Task]: requirements = [self.clone(WriteImportedCallsetTask)] if ( - 'validate_sample_type' not in self.validations_to_skip + ALL_VALIDATIONS not in self.validations_to_skip + and 'validate_sample_type' not in self.validations_to_skip and self.dataset_type.can_run_validation ): requirements = [ @@ -101,7 +104,10 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable: ), ) - if not self.dataset_type.can_run_validation: + if ( + ALL_VALIDATIONS in self.validations_to_skip + or not self.dataset_type.can_run_validation + ): return mt.select_globals( callset_path=self.callset_path, validated_sample_type=self.sample_type.value, From 2d23d2ce414789b6dfb4ae1e75100862f32b221c Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Wed, 26 Nov 2025 01:30:39 -0500 Subject: [PATCH 12/17] ruff --- v03_pipeline/lib/tasks/validate_callset_test.py | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/v03_pipeline/lib/tasks/validate_callset_test.py b/v03_pipeline/lib/tasks/validate_callset_test.py index 6340dfc6a..895ec1bb0 100644 --- a/v03_pipeline/lib/tasks/validate_callset_test.py +++ b/v03_pipeline/lib/tasks/validate_callset_test.py @@ -55,7 +55,6 @@ def test_validate_callset_multiple_exceptions( # all contigs but chr1, and contains non-coding variants. callset_path=MULTIPLE_VALIDATION_EXCEPTIONS_VCF, project_guids=['project_a'], - skip_validation=False, run_id=TEST_RUN_ID, ) worker.add(validate_callset_task) @@ -68,7 +67,6 @@ def test_validate_callset_multiple_exceptions( sample_type=SampleType.WES, callset_path=MULTIPLE_VALIDATION_EXCEPTIONS_VCF, project_guids=['project_a'], - skip_validation=False, run_id=TEST_RUN_ID, ) self.assertTrue(write_validation_errors_task.complete()) @@ -79,8 +77,8 @@ def test_validate_callset_multiple_exceptions( 'project_guids': ['project_a'], 'error_messages': [ 'Alleles with invalid allele are present in the callset. This appears to be a GVCF containing records for sites with no variants.', - "Variants are present multiple times in the callset: ['1-902088-G-A']", 'Missing the following expected contigs:chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr20, chr21, chr22, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX', + "Variants are present multiple times in the callset: ['1-902088-G-A']", 'Sample type validation error: dataset sample-type is specified as WES but appears to be WGS because it contains many common non-coding variants', ], }, From 48777c873bb71d48ac720f9a0e04360456125a53 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 28 Nov 2025 13:21:08 -0500 Subject: [PATCH 13/17] ruff --- v03_pipeline/bin/pipeline_worker_test.py | 1 - 1 file changed, 1 deletion(-) diff --git a/v03_pipeline/bin/pipeline_worker_test.py b/v03_pipeline/bin/pipeline_worker_test.py index c8ddf32a7..7e3341549 100644 --- a/v03_pipeline/bin/pipeline_worker_test.py +++ b/v03_pipeline/bin/pipeline_worker_test.py @@ -193,7 +193,6 @@ def test_process_queue_integration_test( process_queue(local_scheduler=True) mock_safe_post_to_slack.assert_called_once_with( ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n "attempt_id": 0,\n "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n "dataset_type": "SNV_INDEL",\n "project_guids": [\n "R0113_test_project"\n ],\n "reference_genome": "GRCh38",\n "request_type": "LoadingPipelineRequest",\n "sample_type": "WGS",\n "skip_check_sex_and_relatedness": false,\n "skip_expect_tdr_metrics": false,\n "validations_to_skip": [\n "validate_allele_depth_length",\n "validate_allele_type",\n "validate_expected_contig_frequency",\n "validate_no_duplicate_variants",\n "validate_sample_type"\n ]\n}```', - ) with hfs.open( clickhouse_load_success_file_path( From 530c2126ff06147f1ee8319a7b6ffd56d72baa3b Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 28 Nov 2025 16:38:15 -0500 Subject: [PATCH 14/17] skippable validations --- ..._variant_annotations_table_with_new_samples_test.py | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index 4755730dd..01652f94f 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -18,6 +18,7 @@ SV_TYPES, TRANSCRIPT_CONSEQUENCE_TERMS, ) +from v03_pipeline.lib.misc.validation import SKIPPABLE_VALIDATIONS from v03_pipeline.lib.core import ( DatasetType, ReferenceGenome, @@ -132,8 +133,13 @@ def test_missing_interval_reference_dataset( ) @patch('v03_pipeline.lib.tasks.update_new_variants_with_caids.Env') @patch( - 'v03_pipeline.lib.tasks.validate_callset.validate_expected_contig_frequency', - partial(validate_expected_contig_frequency, min_rows_per_contig=25), + 'v03_pipeline.lib.tasks.validate_callset.SKIPPABLE_VALIDATIONS', + [ + x + for x in SKIPPABLE_VALIDATIONS + if x.__str__ != 'validate_expected_contig_frequency' + ] + + [partial(validate_expected_contig_frequency, min_rows_per_contig=25)], ) @patch.object(ReferenceGenome, 'standard_contigs', new_callable=PropertyMock) @patch('v03_pipeline.lib.vep.hl.vep') From 2093286920b6dd8b21db58bb60f1d1bb65da637c Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Fri, 28 Nov 2025 16:57:36 -0500 Subject: [PATCH 15/17] ruff --- .../update_variant_annotations_table_with_new_samples_test.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index 01652f94f..4e08c6869 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -18,7 +18,6 @@ SV_TYPES, TRANSCRIPT_CONSEQUENCE_TERMS, ) -from v03_pipeline.lib.misc.validation import SKIPPABLE_VALIDATIONS from v03_pipeline.lib.core import ( DatasetType, ReferenceGenome, @@ -27,6 +26,7 @@ from v03_pipeline.lib.misc.io import remap_pedigree_hash from v03_pipeline.lib.misc.validation import ( ALL_VALIDATIONS, + SKIPPABLE_VALIDATIONS, validate_expected_contig_frequency, ) from v03_pipeline.lib.paths import ( From a26b1b908adacce5dae05f1faf6bb9fabdaae12c Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Mon, 1 Dec 2025 13:53:50 -0500 Subject: [PATCH 16/17] functools --- ...ariant_annotations_table_with_new_samples_test.py | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index 4e08c6869..344aaac83 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -1,5 +1,5 @@ +import functools import shutil -from functools import partial from unittest.mock import Mock, PropertyMock, patch import hail as hl @@ -139,7 +139,15 @@ def test_missing_interval_reference_dataset( for x in SKIPPABLE_VALIDATIONS if x.__str__ != 'validate_expected_contig_frequency' ] - + [partial(validate_expected_contig_frequency, min_rows_per_contig=25)], + + [ + functools.update_wrapper( + functools.partial( + validate_expected_contig_frequency, + min_rows_per_contig=25, + ), + validate_expected_contig_frequency, + ), + ], ) @patch.object(ReferenceGenome, 'standard_contigs', new_callable=PropertyMock) @patch('v03_pipeline.lib.vep.hl.vep') From e5d0c4c3517a2fe6b783d82179311766e02a6e13 Mon Sep 17 00:00:00 2001 From: Benjamin Blankenmeister Date: Mon, 1 Dec 2025 14:55:03 -0500 Subject: [PATCH 17/17] cleaner mocks --- .../update_variant_annotations_table_with_new_samples_test.py | 2 +- v03_pipeline/lib/tasks/validate_callset.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index 344aaac83..c2b253bdb 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -137,7 +137,7 @@ def test_missing_interval_reference_dataset( [ x for x in SKIPPABLE_VALIDATIONS - if x.__str__ != 'validate_expected_contig_frequency' + if x.__name__ != 'validate_expected_contig_frequency' ] + [ functools.update_wrapper( diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py index 4d378fa46..2e91ec2a6 100644 --- a/v03_pipeline/lib/tasks/validate_callset.py +++ b/v03_pipeline/lib/tasks/validate_callset.py @@ -115,7 +115,7 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable: validation_exceptions = [] for validation_f in SKIPPABLE_VALIDATIONS: try: - if str(validation_f.__name__) in self.validations_to_skip: + if validation_f.__name__ in self.validations_to_skip: continue validation_f( mt,