diff --git a/v03_pipeline/api/app_test.py b/v03_pipeline/api/app_test.py index f3a68776e..0ae059130 100644 --- a/v03_pipeline/api/app_test.py +++ b/v03_pipeline/api/app_test.py @@ -67,6 +67,28 @@ async def test_loading_pipeline_invalid_requests(self): 'callset_path must point to a file that exists' in log.output[0], ) + body = { + 'callset_path': CALLSET_PATH, + 'project_guids': ['project_a'], + 'sample_type': SampleType.WGS.value, + 'reference_genome': ReferenceGenome.GRCh38.value, + 'dataset_type': DatasetType.SNV_INDEL.value, + 'validations_to_skip': ['bad_validation'], + } + with self.assertLogs(level='ERROR') as log: + async with self.client.request( + 'POST', + '/loading_pipeline_enqueue', + json=body, + ) as resp: + self.assertEqual( + resp.status, + web_exceptions.HTTPBadRequest.status_code, + ) + self.assertTrue( + "input_value='bad_validation" in log.output[0], + ) + async def test_loading_pipeline_enqueue(self): body = { 'callset_path': CALLSET_PATH, @@ -96,9 +118,9 @@ async def test_loading_pipeline_enqueue(self): 'reference_genome': 'GRCh38', 'sample_type': 'WGS', 'skip_check_sex_and_relatedness': False, - 'skip_validation': False, 'skip_expect_tdr_metrics': False, 'attempt_id': 0, + 'validations_to_skip': [], }, }, ) diff --git a/v03_pipeline/api/model.py b/v03_pipeline/api/model.py index 57f62036e..9d3e8848c 100644 --- a/v03_pipeline/api/model.py +++ b/v03_pipeline/api/model.py @@ -1,9 +1,20 @@ +from typing import Literal + import hailtop.fs as hfs -from pydantic import AliasChoices, BaseModel, Field, conint, field_validator +from pydantic import ( + AliasChoices, + BaseModel, + Field, + conint, + field_validator, + root_validator, +) from v03_pipeline.lib.core import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS, SKIPPABLE_VALIDATIONS MAX_LOADING_PIPELINE_ATTEMPTS = 3 +STRINGIFIED_SKIPPABLE_VALIDATIONS = [f.__name__ for f in SKIPPABLE_VALIDATIONS] VALID_FILE_TYPES = ['vcf', 'vcf.gz', 'vcf.bgz', 'mt'] @@ -26,9 +37,9 @@ class LoadingPipelineRequest(PipelineRunnerRequest): sample_type: SampleType reference_genome: ReferenceGenome dataset_type: DatasetType - skip_validation: bool = False skip_check_sex_and_relatedness: bool = False skip_expect_tdr_metrics: bool = False + validations_to_skip: list[Literal[*STRINGIFIED_SKIPPABLE_VALIDATIONS]] = [] def incr_attempt(self): if self.attempt_id == (MAX_LOADING_PIPELINE_ATTEMPTS - 1): @@ -36,6 +47,26 @@ def incr_attempt(self): self.attempt_id += 1 return True + @field_validator('validations_to_skip') + @classmethod + def must_be_known_validation(cls, validations_to_skip): + for v in validations_to_skip: + if v not in set(STRINGIFIED_SKIPPABLE_VALIDATIONS): + msg = f'{v} is not a valid validator' + raise ValueError(msg) + return validations_to_skip + + @root_validator( + pre=True, + ) # the root validator runs before Pydantic parses or coerces field values. + @classmethod + def override_all_validations(cls, values): + if values.get('validations_to_skip') == [ + ALL_VALIDATIONS, + ]: + values['validations_to_skip'] = STRINGIFIED_SKIPPABLE_VALIDATIONS + return values + @field_validator('callset_path') @classmethod def check_valid_callset_path(cls, callset_path: str) -> str: diff --git a/v03_pipeline/api/model_test.py b/v03_pipeline/api/model_test.py index e48331aaa..e7dd6cd19 100644 --- a/v03_pipeline/api/model_test.py +++ b/v03_pipeline/api/model_test.py @@ -46,6 +46,35 @@ def test_invalid_loading_pipeline_requests(self) -> None: ), ) + def test_validations_to_skip(self) -> None: + shared_params = { + 'callset_path': CALLSET_PATH, + 'projects_to_run': ['project_a'], + 'sample_type': SampleType.WGS.value, + 'reference_genome': ReferenceGenome.GRCh38.value, + 'dataset_type': DatasetType.SNV_INDEL.value, + } + raw_request = { + **shared_params, + 'validations_to_skip': ['all'], + } + lpr = LoadingPipelineRequest.model_validate(raw_request) + self.assertGreater(len(lpr.validations_to_skip), 2) + + raw_request = { + **shared_params, + 'validations_to_skip': ['validate_sample_type'], + } + lpr = LoadingPipelineRequest.model_validate(raw_request) + self.assertEqual(len(lpr.validations_to_skip), 1) + + raw_request = { + **shared_params, + 'validations_to_skip': ['validate_blended_exome'], + } + with self.assertRaises(ValueError): + LoadingPipelineRequest.model_validate(raw_request) + def test_delete_families_request(self) -> None: raw_request = {'project_guid': 'project_a', 'family_guids': []} with self.assertRaises(ValueError): diff --git a/v03_pipeline/bin/pipeline_worker_test.py b/v03_pipeline/bin/pipeline_worker_test.py index 961d439a2..7e3341549 100644 --- a/v03_pipeline/bin/pipeline_worker_test.py +++ b/v03_pipeline/bin/pipeline_worker_test.py @@ -176,7 +176,7 @@ def test_process_queue_integration_test( 'sample_type': SampleType.WGS.value, 'reference_genome': ReferenceGenome.GRCh38.value, 'dataset_type': DatasetType.SNV_INDEL.value, - 'skip_validation': True, + 'validations_to_skip': ['all'], } os.makedirs( loading_pipeline_queue_dir(), @@ -192,7 +192,7 @@ def test_process_queue_integration_test( json.dump(raw_request, f) process_queue(local_scheduler=True) mock_safe_post_to_slack.assert_called_once_with( - ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n "attempt_id": 0,\n "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n "dataset_type": "SNV_INDEL",\n "project_guids": [\n "R0113_test_project"\n ],\n "reference_genome": "GRCh38",\n "request_type": "LoadingPipelineRequest",\n "sample_type": "WGS",\n "skip_check_sex_and_relatedness": false,\n "skip_expect_tdr_metrics": false,\n "skip_validation": true\n}```', + ':white_check_mark: Pipeline Runner Request Success! :white_check_mark:\nRun ID: 20250916-200704-123456\n```{\n "attempt_id": 0,\n "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n "dataset_type": "SNV_INDEL",\n "project_guids": [\n "R0113_test_project"\n ],\n "reference_genome": "GRCh38",\n "request_type": "LoadingPipelineRequest",\n "sample_type": "WGS",\n "skip_check_sex_and_relatedness": false,\n "skip_expect_tdr_metrics": false,\n "validations_to_skip": [\n "validate_allele_depth_length",\n "validate_allele_type",\n "validate_expected_contig_frequency",\n "validate_no_duplicate_variants",\n "validate_sample_type"\n ]\n}```', ) with hfs.open( clickhouse_load_success_file_path( @@ -263,7 +263,7 @@ def test_process_failure( process_queue(local_scheduler=True) process_queue(local_scheduler=True) mock_safe_post_to_slack.assert_called_once_with( - ':failed: Pipeline Runner Request Failed :failed:\nRun ID: 20250918-200704-123456\n```{\n "attempt_id": 2,\n "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n "dataset_type": "SNV_INDEL",\n "project_guids": [\n "project_a"\n ],\n "reference_genome": "GRCh38",\n "request_type": "LoadingPipelineRequest",\n "sample_type": "WGS",\n "skip_check_sex_and_relatedness": false,\n "skip_expect_tdr_metrics": false,\n "skip_validation": false\n}```\nReason: there were failed tasks', + ':failed: Pipeline Runner Request Failed :failed:\nRun ID: 20250918-200704-123456\n```{\n "attempt_id": 2,\n "callset_path": "v03_pipeline/var/test/callsets/1kg_30variants.vcf",\n "dataset_type": "SNV_INDEL",\n "project_guids": [\n "project_a"\n ],\n "reference_genome": "GRCh38",\n "request_type": "LoadingPipelineRequest",\n "sample_type": "WGS",\n "skip_check_sex_and_relatedness": false,\n "skip_expect_tdr_metrics": false,\n "validations_to_skip": []\n}```\nReason: there were failed tasks', ) self.assertEqual(len(os.listdir(loading_pipeline_queue_dir())), 0) with open( diff --git a/v03_pipeline/lib/misc/io.py b/v03_pipeline/lib/misc/io.py index d0a7e044e..f6d901ce9 100644 --- a/v03_pipeline/lib/misc/io.py +++ b/v03_pipeline/lib/misc/io.py @@ -81,7 +81,7 @@ def compute_hail_n_partitions(file_size_b: int) -> int: ) def split_multi_hts( mt: hl.MatrixTable, - skip_validation: bool, + skip_validate_no_duplicate_variants: bool, max_samples_split_multi_shuffle=MAX_SAMPLES_SPLIT_MULTI_SHUFFLE, ) -> hl.MatrixTable: bi = mt.filter_rows(hl.len(mt.alleles) == BIALLELIC) @@ -98,7 +98,7 @@ def split_multi_hts( mt = split.union_rows(bi) # If we've disabled validation (which is expected to throw an exception # for duplicate variants, we would like to distinc ) - if skip_validation: + if skip_validate_no_duplicate_variants: return mt.distinct_by_row() return mt diff --git a/v03_pipeline/lib/misc/validation.py b/v03_pipeline/lib/misc/validation.py index 280525462..1c480c20c 100644 --- a/v03_pipeline/lib/misc/validation.py +++ b/v03_pipeline/lib/misc/validation.py @@ -9,6 +9,7 @@ SampleType, ) +ALL_VALIDATIONS = 'all' AMBIGUOUS_THRESHOLD_PERC: float = 0.01 # Fraction of samples identified as "ambiguous_sex" above which an error will be thrown. MIN_ROWS_PER_CONTIG = 100 SAMPLE_TYPE_MATCH_THRESHOLD = 0.3 @@ -191,3 +192,12 @@ def validate_sample_type( if has_noncoding and has_coding and sample_type != SampleType.WGS: msg = 'Sample type validation error: dataset sample-type is specified as WES but appears to be WGS because it contains many common non-coding variants' raise SeqrValidationError(msg) + + +SKIPPABLE_VALIDATIONS = [ + validate_allele_depth_length, + validate_allele_type, + validate_expected_contig_frequency, + validate_no_duplicate_variants, + validate_sample_type, +] diff --git a/v03_pipeline/lib/tasks/base/base_loading_run_params.py b/v03_pipeline/lib/tasks/base/base_loading_run_params.py index 5cba7d46f..3cd447cc5 100644 --- a/v03_pipeline/lib/tasks/base/base_loading_run_params.py +++ b/v03_pipeline/lib/tasks/base/base_loading_run_params.py @@ -28,10 +28,7 @@ class BaseLoadingRunParams(luigi.Task): default=False, parsing=luigi.BoolParameter.EXPLICIT_PARSING, ) - skip_validation = luigi.BoolParameter( - default=False, - parsing=luigi.BoolParameter.EXPLICIT_PARSING, - ) + validations_to_skip = luigi.ListParameter(default=[]) is_new_gcnv_joint_call = luigi.BoolParameter( default=False, parsing=luigi.BoolParameter.EXPLICIT_PARSING, diff --git a/v03_pipeline/lib/tasks/dataproc/misc_test.py b/v03_pipeline/lib/tasks/dataproc/misc_test.py index 347e2e0b9..983275c6e 100644 --- a/v03_pipeline/lib/tasks/dataproc/misc_test.py +++ b/v03_pipeline/lib/tasks/dataproc/misc_test.py @@ -39,8 +39,8 @@ def test_to_kebab_str_args(self, _: Mock): 'False', '--skip-expect-tdr-metrics', 'False', - '--skip-validation', - 'False', + '--validations-to-skip', + '[]', '--is-new-gcnv-joint-call', 'False', '--run-id', diff --git a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py index 90d8d6dfa..43fa38a14 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_entries_parquet_test.py @@ -10,6 +10,7 @@ ReferenceGenome, SampleType, ) +from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS from v03_pipeline.lib.paths import ( db_id_to_gene_id_path, new_entries_parquet_path, @@ -114,7 +115,7 @@ def test_write_new_entries_parquet(self): sample_type=SampleType.WGS, callset_path=TEST_SNV_INDEL_VCF, project_guids=['R0113_test_project', 'R0114_project4'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) @@ -214,7 +215,7 @@ def test_mito_write_new_entries_parquet(self): sample_type=SampleType.WGS, callset_path=TEST_MITO_CALLSET, project_guids=['R0116_test_project3'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) @@ -269,7 +270,7 @@ def test_sv_write_new_entries_parquet(self): sample_type=SampleType.WGS, callset_path=TEST_SV_VCF_2, project_guids=['R0115_test_project2'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) @@ -352,7 +353,7 @@ def test_gcnv_write_new_entries_parquet(self): sample_type=SampleType.WES, callset_path=TEST_GCNV_BED_FILE, project_guids=['R0115_test_project2'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) diff --git a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py index 5af285095..35d2d94e2 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_transcripts_parquet_test.py @@ -9,6 +9,7 @@ ReferenceGenome, SampleType, ) +from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS from v03_pipeline.lib.paths import ( new_transcripts_parquet_path, new_variants_table_path, @@ -87,7 +88,7 @@ def test_write_new_transcripts_parquet_test( project_guids=[ 'fake_project', ], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) @@ -187,7 +188,7 @@ def test_grch37_write_new_transcripts_parquet_test( project_guids=[ 'fake_project', ], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) diff --git a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py index a3c7fc431..69b0c5b36 100644 --- a/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py +++ b/v03_pipeline/lib/tasks/exports/write_new_variants_parquet_test.py @@ -10,6 +10,7 @@ ReferenceGenome, SampleType, ) +from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS from v03_pipeline.lib.paths import ( new_variants_parquet_path, new_variants_table_path, @@ -107,7 +108,7 @@ def test_write_new_variants_parquet_test( project_guids=[ 'fake_project', ], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) @@ -231,7 +232,7 @@ def test_grch37_write_new_variants_parquet_test( project_guids=[ 'fake_project', ], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) @@ -342,7 +343,7 @@ def test_mito_write_new_variants_parquet_test( project_guids=[ 'fake_project', ], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) @@ -436,7 +437,7 @@ def test_sv_write_new_variants_parquet_test( project_guids=[ 'fake_project', ], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) @@ -513,7 +514,7 @@ def test_gcnv_write_new_variants_parquet_test( project_guids=[ 'fake_project', ], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(task) diff --git a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py index 9a03d75cc..c2b253bdb 100644 --- a/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py +++ b/v03_pipeline/lib/tasks/update_variant_annotations_table_with_new_samples_test.py @@ -1,5 +1,5 @@ +import functools import shutil -from functools import partial from unittest.mock import Mock, PropertyMock, patch import hail as hl @@ -24,7 +24,11 @@ SampleType, ) from v03_pipeline.lib.misc.io import remap_pedigree_hash -from v03_pipeline.lib.misc.validation import validate_expected_contig_frequency +from v03_pipeline.lib.misc.validation import ( + ALL_VALIDATIONS, + SKIPPABLE_VALIDATIONS, + validate_expected_contig_frequency, +) from v03_pipeline.lib.paths import ( valid_reference_dataset_path, ) @@ -86,7 +90,7 @@ def test_missing_pedigree( sample_type=SampleType.WGS, callset_path=TEST_SNV_INDEL_VCF, project_guids=['R0113_test_project'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker = luigi.worker.Worker() @@ -116,7 +120,7 @@ def test_missing_interval_reference_dataset( sample_type=SampleType.WGS, callset_path=TEST_SNV_INDEL_VCF, project_guids=['R0113_test_project'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker = luigi.worker.Worker() @@ -129,8 +133,21 @@ def test_missing_interval_reference_dataset( ) @patch('v03_pipeline.lib.tasks.update_new_variants_with_caids.Env') @patch( - 'v03_pipeline.lib.tasks.validate_callset.validate_expected_contig_frequency', - partial(validate_expected_contig_frequency, min_rows_per_contig=25), + 'v03_pipeline.lib.tasks.validate_callset.SKIPPABLE_VALIDATIONS', + [ + x + for x in SKIPPABLE_VALIDATIONS + if x.__name__ != 'validate_expected_contig_frequency' + ] + + [ + functools.update_wrapper( + functools.partial( + validate_expected_contig_frequency, + min_rows_per_contig=25, + ), + validate_expected_contig_frequency, + ), + ], ) @patch.object(ReferenceGenome, 'standard_contigs', new_callable=PropertyMock) @patch('v03_pipeline.lib.vep.hl.vep') @@ -268,7 +285,7 @@ def test_multiple_update_vat( sample_type=SampleType.WGS, callset_path=TEST_SNV_INDEL_VCF, project_guids=['R0113_test_project'], - skip_validation=False, + validations_to_skip=[], run_id=TEST_RUN_ID, ) worker.add(uvatwns_task_3) @@ -329,7 +346,7 @@ def test_multiple_update_vat( sample_type=SampleType.WGS, callset_path=TEST_SNV_INDEL_VCF, project_guids=['R0114_project4'], - skip_validation=False, + validations_to_skip=[], run_id=TEST_RUN_ID + '-another-run', ) worker.add(uvatwns_task_4) @@ -527,7 +544,7 @@ def test_update_vat_grch37( sample_type=SampleType.WGS, callset_path=TEST_SNV_INDEL_VCF, project_guids=['R0113_test_project'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(uvatwns_task) @@ -683,7 +700,7 @@ def test_update_vat_without_accessing_private_datasets( sample_type=SampleType.WGS, callset_path=TEST_SNV_INDEL_VCF, project_guids=['R0113_test_project'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) worker.add(uvatwns_task) @@ -726,7 +743,7 @@ def test_mito_update_vat( sample_type=SampleType.WGS, callset_path=TEST_MITO_MT, project_guids=['R0115_test_project2'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) ) @@ -834,7 +851,7 @@ def test_sv_multiple_vcf_update_vat( sample_type=SampleType.WGS, callset_path=TEST_SV_VCF, project_guids=['R0115_test_project2'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) ) @@ -1294,7 +1311,7 @@ def test_sv_multiple_vcf_update_vat( sample_type=SampleType.WGS, callset_path=TEST_SV_VCF_2, project_guids=['R0115_test_project2'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id='second_run_id', ) ) @@ -1414,7 +1431,7 @@ def test_sv_multiple_project_single_vcf( sample_type=SampleType.WGS, callset_path=TEST_SV_VCF_2, project_guids=['R0116_test_project3', 'R0117_test_project4'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id='run_id', ) ) @@ -1446,7 +1463,7 @@ def test_gcnv_update_vat_multiple( sample_type=SampleType.WES, callset_path=TEST_GCNV_BED_FILE, project_guids=['R0115_test_project2'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) ) @@ -1585,7 +1602,7 @@ def test_gcnv_update_vat_multiple( sample_type=SampleType.WES, callset_path=TEST_GCNV_BED_FILE_2, project_guids=['R0115_test_project2'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id='second_run_id', ) ) diff --git a/v03_pipeline/lib/tasks/validate_callset.py b/v03_pipeline/lib/tasks/validate_callset.py index e8f5d6f34..2e91ec2a6 100644 --- a/v03_pipeline/lib/tasks/validate_callset.py +++ b/v03_pipeline/lib/tasks/validate_callset.py @@ -3,12 +3,9 @@ import luigi.util from v03_pipeline.lib.misc.validation import ( + ALL_VALIDATIONS, + SKIPPABLE_VALIDATIONS, SeqrValidationError, - validate_allele_depth_length, - validate_allele_type, - validate_expected_contig_frequency, - validate_no_duplicate_variants, - validate_sample_type, ) from v03_pipeline.lib.paths import ( imported_callset_path, @@ -31,6 +28,22 @@ @luigi.util.inherits(BaseLoadingRunParams) class ValidateCallsetTask(BaseUpdateTask): + @property + def validation_dependencies(self) -> dict[str, hl.Table]: + deps = {} + if ( + ALL_VALIDATIONS not in self.validations_to_skip + and 'validate_sample_type' not in self.validations_to_skip + and self.dataset_type.can_run_validation + ): + deps['coding_and_noncoding_variants_ht'] = hl.read_table( + valid_reference_dataset_path( + self.reference_genome, + ReferenceDataset.gnomad_coding_and_noncoding, + ), + ) + return deps + def complete(self) -> luigi.Target: if super().complete(): mt = hl.read_matrix_table(self.output().path) @@ -50,7 +63,11 @@ def output(self) -> luigi.Target: def requires(self) -> list[luigi.Task]: requirements = [self.clone(WriteImportedCallsetTask)] - if not self.skip_validation and self.dataset_type.can_run_validation: + if ( + ALL_VALIDATIONS not in self.validations_to_skip + and 'validate_sample_type' not in self.validations_to_skip + and self.dataset_type.can_run_validation + ): requirements = [ *requirements, ( @@ -87,29 +104,22 @@ def update_table(self, mt: hl.MatrixTable) -> hl.MatrixTable: ), ) - validation_exceptions = [] - if self.skip_validation or not self.dataset_type.can_run_validation: + if ( + ALL_VALIDATIONS in self.validations_to_skip + or not self.dataset_type.can_run_validation + ): return mt.select_globals( callset_path=self.callset_path, validated_sample_type=self.sample_type.value, ) - coding_and_noncoding_variants_ht = hl.read_table( - valid_reference_dataset_path( - self.reference_genome, - ReferenceDataset.gnomad_coding_and_noncoding, - ), - ) - for validation_f in [ - validate_allele_depth_length, - validate_allele_type, - validate_no_duplicate_variants, - validate_expected_contig_frequency, - validate_sample_type, - ]: + validation_exceptions = [] + for validation_f in SKIPPABLE_VALIDATIONS: try: + if validation_f.__name__ in self.validations_to_skip: + continue validation_f( mt, - coding_and_noncoding_variants_ht=coding_and_noncoding_variants_ht, + **self.validation_dependencies, **self.param_kwargs, ) except SeqrValidationError as e: diff --git a/v03_pipeline/lib/tasks/validate_callset_test.py b/v03_pipeline/lib/tasks/validate_callset_test.py index 6340dfc6a..895ec1bb0 100644 --- a/v03_pipeline/lib/tasks/validate_callset_test.py +++ b/v03_pipeline/lib/tasks/validate_callset_test.py @@ -55,7 +55,6 @@ def test_validate_callset_multiple_exceptions( # all contigs but chr1, and contains non-coding variants. callset_path=MULTIPLE_VALIDATION_EXCEPTIONS_VCF, project_guids=['project_a'], - skip_validation=False, run_id=TEST_RUN_ID, ) worker.add(validate_callset_task) @@ -68,7 +67,6 @@ def test_validate_callset_multiple_exceptions( sample_type=SampleType.WES, callset_path=MULTIPLE_VALIDATION_EXCEPTIONS_VCF, project_guids=['project_a'], - skip_validation=False, run_id=TEST_RUN_ID, ) self.assertTrue(write_validation_errors_task.complete()) @@ -79,8 +77,8 @@ def test_validate_callset_multiple_exceptions( 'project_guids': ['project_a'], 'error_messages': [ 'Alleles with invalid allele are present in the callset. This appears to be a GVCF containing records for sites with no variants.', - "Variants are present multiple times in the callset: ['1-902088-G-A']", 'Missing the following expected contigs:chr10, chr11, chr12, chr13, chr14, chr15, chr16, chr17, chr18, chr19, chr2, chr20, chr21, chr22, chr3, chr4, chr5, chr6, chr7, chr8, chr9, chrX', + "Variants are present multiple times in the callset: ['1-902088-G-A']", 'Sample type validation error: dataset sample-type is specified as WES but appears to be WGS because it contains many common non-coding variants', ], }, diff --git a/v03_pipeline/lib/tasks/write_imported_callset.py b/v03_pipeline/lib/tasks/write_imported_callset.py index e4df70899..6a3230197 100644 --- a/v03_pipeline/lib/tasks/write_imported_callset.py +++ b/v03_pipeline/lib/tasks/write_imported_callset.py @@ -85,7 +85,10 @@ def create_table(self) -> hl.MatrixTable: ) if self.dataset_type.has_multi_allelic_variants: # NB: throws SeqrValidationError - mt = split_multi_hts(mt, self.skip_validation) + mt = split_multi_hts( + mt, + 'validate_no_duplicate_variants' in self.validations_to_skip, + ) if self.dataset_type.re_key_by_seqr_internal_truth_vid and hasattr( mt, 'info.SEQR_INTERNAL_TRUTH_VID', diff --git a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py index e8318f043..e0bec8f14 100644 --- a/v03_pipeline/lib/tasks/write_metadata_for_run_test.py +++ b/v03_pipeline/lib/tasks/write_metadata_for_run_test.py @@ -5,6 +5,7 @@ import luigi.worker from v03_pipeline.lib.core import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS from v03_pipeline.lib.paths import relatedness_check_tsv_path from v03_pipeline.lib.tasks.write_metadata_for_run import WriteMetadataForRunTask from v03_pipeline.lib.test.misc import copy_project_pedigree_to_mocked_dir @@ -54,7 +55,7 @@ def test_write_metadata_for_run_task( sample_type=SampleType.WGS, callset_path=TEST_VCF, project_guids=['R0113_test_project', 'R0114_project4'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id='run_123456', ) worker.add(write_metadata_for_run_task) diff --git a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py index 305150699..9504965c4 100644 --- a/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py +++ b/v03_pipeline/lib/tasks/write_remapped_and_subsetted_callset_test.py @@ -7,6 +7,7 @@ from v03_pipeline.lib.core import DatasetType, ReferenceGenome, SampleType from v03_pipeline.lib.misc.io import remap_pedigree_hash +from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS from v03_pipeline.lib.paths import ( relatedness_check_table_path, sex_check_table_path, @@ -101,7 +102,7 @@ def test_write_remapped_and_subsetted_callset_task( callset_path=TEST_VCF, project_guids=['R0113_test_project'], project_i=0, - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], skip_expect_tdr_metrics=True, ) worker.add(wrsc_task) @@ -151,7 +152,7 @@ def test_write_remapped_and_subsetted_callset_task_failed_some_family_checks( callset_path=TEST_VCF, project_guids=['R0114_project4'], project_i=0, - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], skip_expect_tdr_metrics=True, ) worker.add(wrsc_task) @@ -244,7 +245,7 @@ def test_write_remapped_and_subsetted_callset_task_all_families_failed( callset_path=TEST_VCF, project_guids=['R0114_project4'], project_i=0, - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], skip_expect_tdr_metrics=True, ) worker.add(wrsc_task) @@ -256,7 +257,7 @@ def test_write_remapped_and_subsetted_callset_task_all_families_failed( sample_type=SampleType.WES, callset_path=TEST_VCF, project_guids=['R0114_project4'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], run_id=TEST_RUN_ID, ) self.assertTrue(write_validation_errors_task.complete()) diff --git a/v03_pipeline/lib/tasks/write_sample_qc_json_test.py b/v03_pipeline/lib/tasks/write_sample_qc_json_test.py index 8d2bb699e..d1b0a3643 100644 --- a/v03_pipeline/lib/tasks/write_sample_qc_json_test.py +++ b/v03_pipeline/lib/tasks/write_sample_qc_json_test.py @@ -8,6 +8,7 @@ import luigi.worker from v03_pipeline.lib.core import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS from v03_pipeline.lib.paths import ancestry_model_rf_path from v03_pipeline.lib.tasks.write_sample_qc_json import WriteSampleQCJsonTask from v03_pipeline.lib.test.mock_complete_task import MockCompleteTask @@ -113,7 +114,7 @@ def test_call_sample_qc( sample_type=SampleType.WGS, callset_path=TEST_VCF, project_guids=['R0113_test_project'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], ) worker.add(task) worker.run() diff --git a/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py b/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py index 913de418c..8f3a00107 100644 --- a/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py +++ b/v03_pipeline/lib/tasks/write_variant_annotations_vcf_test.py @@ -5,6 +5,7 @@ import luigi.worker from v03_pipeline.lib.core import DatasetType, ReferenceGenome, SampleType +from v03_pipeline.lib.misc.validation import ALL_VALIDATIONS from v03_pipeline.lib.tasks.update_variant_annotations_table_with_new_samples import ( UpdateVariantAnnotationsTableWithNewSamplesTask, ) @@ -64,7 +65,7 @@ def test_sv_export_vcf( sample_type=SampleType.WGS, callset_path=TEST_SV_VCF, project_guids=['R0115_test_project2'], - skip_validation=True, + validations_to_skip=[ALL_VALIDATIONS], skip_expect_tdr_metrics=True, ) )