diff --git a/.github/workflows/tests.yml b/.github/workflows/tests.yml index 21eecaa..4a83044 100644 --- a/.github/workflows/tests.yml +++ b/.github/workflows/tests.yml @@ -19,16 +19,17 @@ jobs: steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v4 - name: Set up Python ${{ matrix.python-version }} - uses: actions/setup-python@v3 + uses: actions/setup-python@v5 with: python-version: ${{ matrix.python-version }} cache: "pip" + cache-dependency-path: requirements.txt - name: Install dependencies run: | python -m pip install --upgrade pip - python -m pip install flake8 pytest pytest-cov coveralls + python -m pip install flake8 ruff pytest pytest-cov coveralls pip install -r requirements.txt pip install . - name: Lint with flake8 @@ -53,3 +54,5 @@ jobs: ./test.sh - name: Publish coverage to Coveralls uses: coverallsapp/github-action@v2.2.3 + with: + parallel: true diff --git a/develop.sh b/develop.sh new file mode 100755 index 0000000..a7b89c6 --- /dev/null +++ b/develop.sh @@ -0,0 +1,3 @@ +set -e + +pip install -e . diff --git a/lint.sh b/lint.sh index 554e4b0..17cd87b 100755 --- a/lint.sh +++ b/lint.sh @@ -1,12 +1,5 @@ #!/bin/bash set -o errexit -# getting false positives due to this issue with pylint: -# https://bitbucket.org/logilab/pylint/issues/701/false-positives-with-not-an-iterable-and - -find varcode tests -name '*.py' \ - | xargs pylint \ - --errors-only \ - --disable=unsubscriptable-object,not-an-iterable - -echo 'Passes pylint check' +ruff check varcode tests +echo 'Passes ruff check' diff --git a/tests/test_effect_helpers.py b/tests/test_effect_helpers.py new file mode 100644 index 0000000..667389f --- /dev/null +++ b/tests/test_effect_helpers.py @@ -0,0 +1,242 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from varcode.effects.effect_helpers import ( + changes_exonic_splice_site, + variant_overlaps_interval, +) + + +class DummyTranscript(object): + def __init__(self, sequence, n_exons=2): + self.sequence = sequence + self.exons = [object()] * n_exons + + +def test_variant_overlaps_interval_single_base_at_interval_boundaries(): + assert variant_overlaps_interval( + variant_start=3, + n_ref_bases=1, + interval_start=3, + interval_end=5) + assert variant_overlaps_interval( + variant_start=5, + n_ref_bases=1, + interval_start=3, + interval_end=5) + + +def test_variant_overlaps_interval_excludes_adjacent_bases(): + assert not variant_overlaps_interval( + variant_start=2, + n_ref_bases=1, + interval_start=3, + interval_end=5) + assert not variant_overlaps_interval( + variant_start=6, + n_ref_bases=1, + interval_start=3, + interval_end=5) + + +def test_variant_overlaps_interval_spanning_variants(): + assert variant_overlaps_interval( + variant_start=2, + n_ref_bases=2, + interval_start=3, + interval_end=5) + assert not variant_overlaps_interval( + variant_start=1, + n_ref_bases=2, + interval_start=3, + interval_end=5) + + +def test_variant_overlaps_interval_insertions(): + assert variant_overlaps_interval( + variant_start=3, + n_ref_bases=0, + interval_start=3, + interval_end=5) + assert variant_overlaps_interval( + variant_start=5, + n_ref_bases=0, + interval_start=3, + interval_end=5) + assert not variant_overlaps_interval( + variant_start=2, + n_ref_bases=0, + interval_start=3, + interval_end=5) + + +def test_changes_exonic_splice_site_breaking_substitution_in_motif(): + transcript = DummyTranscript("AATCAGAA") + result = changes_exonic_splice_site( + transcript_offset=4, + transcript=transcript, + transcript_ref="A", + transcript_alt="C", + exon_start_offset=0, + exon_end_offset=5, + exon_number=1) + assert result is True + + +def test_changes_exonic_splice_site_preserving_substitution_in_motif(): + transcript = DummyTranscript("AATCAGAA") + result = changes_exonic_splice_site( + transcript_offset=3, + transcript=transcript, + transcript_ref="C", + transcript_alt="A", + exon_start_offset=0, + exon_end_offset=5, + exon_number=1) + assert result is None + + +def test_changes_exonic_splice_site_ignores_noncanonical_reference_motif(): + # Exon-end motif is CTG (not MAG), so this path should not trigger. + transcript = DummyTranscript("AATCTGAA") + result = changes_exonic_splice_site( + transcript_offset=4, + transcript=transcript, + transcript_ref="T", + transcript_alt="A", + exon_start_offset=0, + exon_end_offset=5, + exon_number=1) + assert result is None + + +def test_changes_exonic_splice_site_skips_end_motif_check_for_last_exon(): + # Last exon has no downstream splice donor boundary in this model. + transcript = DummyTranscript("AATCAGAA", n_exons=1) + result = changes_exonic_splice_site( + transcript_offset=4, + transcript=transcript, + transcript_ref="A", + transcript_alt="C", + exon_start_offset=0, + exon_end_offset=5, + exon_number=1) + assert result is None + + +def test_changes_exonic_splice_site_breaking_deletion_in_motif(): + transcript = DummyTranscript("AATCAGAA") + result = changes_exonic_splice_site( + transcript_offset=4, + transcript=transcript, + transcript_ref="A", + transcript_alt="", + exon_start_offset=0, + exon_end_offset=5, + exon_number=1) + assert result is True + + +def test_changes_exonic_splice_site_preserving_insertion_in_motif(): + transcript = DummyTranscript("AATCAGAA") + result = changes_exonic_splice_site( + transcript_offset=3, + transcript=transcript, + transcript_ref="", + transcript_alt="A", + exon_start_offset=0, + exon_end_offset=5, + exon_number=1) + assert result is None + + +def test_changes_exonic_splice_site_breaking_insertion_in_motif(): + transcript = DummyTranscript("AATCAGAA") + result = changes_exonic_splice_site( + transcript_offset=3, + transcript=transcript, + transcript_ref="", + transcript_alt="T", + exon_start_offset=0, + exon_end_offset=5, + exon_number=1) + assert result is True + + +def test_changes_exonic_splice_site_breaking_acceptor_purine_substitution(): + # Exon start base A->T for exon_number > 1 should break YAG|R. + transcript = DummyTranscript("CCATGGTT", n_exons=2) + result = changes_exonic_splice_site( + transcript_offset=2, + transcript=transcript, + transcript_ref="A", + transcript_alt="T", + exon_start_offset=2, + exon_end_offset=5, + exon_number=2) + assert result is True + + +def test_changes_exonic_splice_site_preserving_acceptor_purine_substitution(): + transcript = DummyTranscript("CCATGGTT", n_exons=2) + result = changes_exonic_splice_site( + transcript_offset=2, + transcript=transcript, + transcript_ref="A", + transcript_alt="G", + exon_start_offset=2, + exon_end_offset=5, + exon_number=2) + assert result is None + + +def test_changes_exonic_splice_site_breaking_acceptor_deletion(): + # Deleting the first exon base (A) reveals C (non-purine) at exon start. + transcript = DummyTranscript("CCACGGTT", n_exons=2) + result = changes_exonic_splice_site( + transcript_offset=2, + transcript=transcript, + transcript_ref="A", + transcript_alt="", + exon_start_offset=2, + exon_end_offset=5, + exon_number=2) + assert result is True + + +def test_changes_exonic_splice_site_preserving_acceptor_deletion(): + # Deleting first exon base (A) reveals G (purine), preserving YAG|R. + transcript = DummyTranscript("CCAGGGTT", n_exons=2) + result = changes_exonic_splice_site( + transcript_offset=2, + transcript=transcript, + transcript_ref="A", + transcript_alt="", + exon_start_offset=2, + exon_end_offset=5, + exon_number=2) + assert result is None + + +def test_changes_exonic_splice_site_ignores_variant_before_splice_window(): + # The canonical exon-end splice motif is at offsets 3..5: CAG. + # Variant is at offset 2, immediately before that window. + transcript = DummyTranscript("AATCAGAA") + result = changes_exonic_splice_site( + transcript_offset=2, + transcript=transcript, + transcript_ref="T", + transcript_alt="C", + exon_start_offset=0, + exon_end_offset=5, + exon_number=1) + assert result is None diff --git a/tests/test_effect_prediction.py b/tests/test_effect_prediction.py new file mode 100644 index 0000000..023440e --- /dev/null +++ b/tests/test_effect_prediction.py @@ -0,0 +1,111 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import pytest + +from varcode.effects import EffectCollection +from varcode.effects.effect_classes import ( + Intronic, + IntronicSpliceSite, + SpliceAcceptor, + SpliceDonor, +) +from varcode.effects.effect_prediction import ( + choose_intronic_effect_class, + predict_variant_effects, +) + + +class VariantWithGeneLookupError(object): + @property + def gene_ids(self): + raise RuntimeError("simulated lookup failure") + + @property + def transcripts(self): + return [] + + +class DummyVariant(object): + def __init__(self, start, end, is_insertion=False): + self.trimmed_base1_start = start + self.trimmed_base1_end = end + self.is_insertion = is_insertion + + +class DummyExon(object): + def __init__(self, strand, start, end): + self.strand = strand + self.start = start + self.end = end + + +def test_predict_variant_effects_returns_collection_on_lookup_error(): + effects = predict_variant_effects( + variant=VariantWithGeneLookupError(), + raise_on_error=False) + assert isinstance(effects, EffectCollection) + assert len(effects) == 0 + + +def test_predict_variant_effects_reraises_on_lookup_error(): + with pytest.raises(RuntimeError): + predict_variant_effects( + variant=VariantWithGeneLookupError(), + raise_on_error=True) + + +@pytest.mark.parametrize( + "variant,exon,distance_to_exon,expected_effect_class", + [ + # + strand, before exon, distance 1-2 => splice acceptor + (DummyVariant(start=9, end=9), DummyExon(strand="+", start=10, end=20), 1, SpliceAcceptor), + # + strand, after exon, distance 1-2 => splice donor + (DummyVariant(start=21, end=21), DummyExon(strand="+", start=10, end=20), 2, SpliceDonor), + # + strand, after exon, distance 3-6 => intronic splice site + (DummyVariant(start=21, end=21), DummyExon(strand="+", start=10, end=20), 6, IntronicSpliceSite), + # + strand, before exon, distance 3 => intronic splice site + (DummyVariant(start=7, end=7), DummyExon(strand="+", start=10, end=20), 3, IntronicSpliceSite), + # + strand, far from exon => intronic + (DummyVariant(start=6, end=6), DummyExon(strand="+", start=10, end=20), 4, Intronic), + # + strand insertion exactly at exon start is before exon + (DummyVariant(start=10, end=10, is_insertion=True), DummyExon(strand="+", start=10, end=20), 1, SpliceAcceptor), + # - strand, genomic position after exon is before exon in transcript direction + (DummyVariant(start=21, end=21), DummyExon(strand="-", start=10, end=20), 1, SpliceAcceptor), + # - strand, genomic position before exon is after exon in transcript direction + (DummyVariant(start=9, end=9), DummyExon(strand="-", start=10, end=20), 2, SpliceDonor), + # - strand insertion exactly at exon end counts as before exon + (DummyVariant(start=20, end=20, is_insertion=True), DummyExon(strand="-", start=10, end=20), 2, SpliceAcceptor), + # - strand, after exon (transcript direction), distance 3-6 => intronic splice site + (DummyVariant(start=8, end=8), DummyExon(strand="-", start=10, end=20), 6, IntronicSpliceSite), + # - strand, far from exon => intronic + (DummyVariant(start=8, end=8), DummyExon(strand="-", start=10, end=20), 7, Intronic), + ], +) +def test_choose_intronic_effect_class_paths( + variant, + exon, + distance_to_exon, + expected_effect_class): + effect_class = choose_intronic_effect_class( + variant=variant, + nearest_exon=exon, + distance_to_exon=distance_to_exon) + assert effect_class is expected_effect_class + + +def test_choose_intronic_effect_class_rejects_nonpositive_distance(): + with pytest.raises(AssertionError): + choose_intronic_effect_class( + variant=DummyVariant(start=9, end=9), + nearest_exon=DummyExon(strand="+", start=10, end=20), + distance_to_exon=0) diff --git a/tests/test_mouse.py b/tests/test_mouse.py index 319e916..74dc00a 100644 --- a/tests/test_mouse.py +++ b/tests/test_mouse.py @@ -12,7 +12,7 @@ from .common import eq_ -from varcode import load_vcf, load_vcf_fast, Variant +from varcode import load_vcf, Variant from varcode.effects import Substitution from pyensembl import Genome, EnsemblRelease from .data import data_path diff --git a/tests/test_reference.py b/tests/test_reference.py index 06f18b0..64f7801 100644 --- a/tests/test_reference.py +++ b/tests/test_reference.py @@ -38,7 +38,7 @@ def test_most_recent_assembly(): eq_(most_recent_assembly_name(['ncbi36']), 'ncbi36') eq_(most_recent_assembly_name(['ncbi36', '35']), 'ncbi36') def generate_reference_name_aliases(): - with warnings.catch_warnings(record=True) as w: + with warnings.catch_warnings(record=True): for assembly_name, aliases in ensembl_reference_aliases.items(): candidate_list = [assembly_name] + list(aliases) for candidate in candidate_list: @@ -64,4 +64,3 @@ def generate_reference_name_fasta_filenames(): @pytest.mark.parametrize(['candidate', 'assembly_name'], generate_reference_name_fasta_filenames()) def test_reference_name_fasta_filenames(candidate, assembly_name): eq_(infer_reference_name(candidate), assembly_name) - diff --git a/varcode/cli/variant_args.py b/varcode/cli/variant_args.py index 756415a..d31a27b 100644 --- a/varcode/cli/variant_args.py +++ b/varcode/cli/variant_args.py @@ -115,12 +115,15 @@ def download_and_install_reference_data(variant_collections): def variant_collection_from_args(args, required=True): variant_collections = [] - for vcf_path in args.vcf: - variant_collections.append( - load_vcf(vcf_path, genome=args.genome)) - - for maf_path in args.maf: - variant_collections.append(load_maf(maf_path)) + variant_collections.extend( + load_vcf(vcf_path, genome=args.genome) + for vcf_path in args.vcf + ) + + variant_collections.extend( + load_maf(maf_path) + for maf_path in args.maf + ) if args.variant: if not args.genome: diff --git a/varcode/effects/common.py b/varcode/effects/common.py index d39358f..82b0f8a 100644 --- a/varcode/effects/common.py +++ b/varcode/effects/common.py @@ -10,13 +10,9 @@ # See the License for the specific language governing permissions and # limitations under the License. -from Bio.Seq import Seq - - - def bio_seq_to_str(seq): if type(seq) is str: return seq else: return str(seq) - \ No newline at end of file + diff --git a/varcode/effects/effect_collection.py b/varcode/effects/effect_collection.py index de1f381..d6646e8 100644 --- a/varcode/effects/effect_collection.py +++ b/varcode/effects/effect_collection.py @@ -179,11 +179,13 @@ def detailed_string(self): lines.append(" Gene: %s (%s)" % (gene_name, gene_id)) # place transcript effects with more significant impact # on top (e.g. FrameShift should go before NoncodingTranscript) - for effect in sorted( + lines.extend( + " -- %s" % effect + for effect in sorted( gene_effects, key=effect_priority, - reverse=True): - lines.append(" -- %s" % effect) + reverse=True) + ) # if we only printed one effect for this gene then # it's redundant to print it again as the highest priority effect diff --git a/varcode/effects/effect_helpers.py b/varcode/effects/effect_helpers.py index 0f6aff8..b1fc0bf 100644 --- a/varcode/effects/effect_helpers.py +++ b/varcode/effects/effect_helpers.py @@ -16,6 +16,7 @@ from ..nucleotides import PURINE_NUCLEOTIDES, AMINO_NUCLEOTIDES +from .mutate import insert_after, substitute def variant_overlaps_interval( variant_start, @@ -47,7 +48,7 @@ def variant_overlaps_interval( # end after the insertion point, they must be fully contained # by the other interval return interval_start <= variant_start and interval_end >= variant_start - variant_end = variant_start + n_ref_bases + variant_end = variant_start + n_ref_bases - 1 """ if self._changes_exonic_splice_site( strand_ref, @@ -66,6 +67,36 @@ def matches_exon_end_pattern(seq): return False return seq[-3] in AMINO_NUCLEOTIDES and seq[-2] == "A" and seq[-1] == "G" + +def mutate_exon_sequence( + transcript, + transcript_offset, + transcript_ref, + transcript_alt, + exon_start_offset, + exon_end_offset): + """ + Apply a transcript-level variant to an exon sequence. + + Used for a simple splice motif model that compares the reference and + mutated exon-end trinucleotides. + """ + exon_sequence = transcript.sequence[exon_start_offset:exon_end_offset + 1] + variant_offset_in_exon = transcript_offset - exon_start_offset + + if len(transcript_ref) == 0: + return insert_after( + sequence=exon_sequence, + offset=variant_offset_in_exon, + new_residues=transcript_alt) + + return substitute( + sequence=exon_sequence, + offset=variant_offset_in_exon, + ref=transcript_ref, + alt=transcript_alt) + + def changes_exonic_splice_site( transcript_offset, transcript, @@ -150,11 +181,17 @@ def changes_exonic_splice_site( exon_end_offset - 2:exon_end_offset + 1] if matches_exon_end_pattern(end_of_reference_exon): - # if the last three nucleotides conform to the consensus - # sequence then treat any deviation as an ExonicSpliceSite - # mutation - end_of_variant_exon = end_of_reference_exon - if matches_exon_end_pattern(end_of_variant_exon): - # end of exon matches splicing signal, check if it still - # does after the mutation + # Simple exon-end motif model: + # - If reference end matches MAG and + # - mutation changes the final exon trinucleotide away from + # MAG, mark as an exonic splice-site effect. + mutated_exon_sequence = mutate_exon_sequence( + transcript=transcript, + transcript_offset=transcript_offset, + transcript_ref=transcript_ref, + transcript_alt=transcript_alt, + exon_start_offset=exon_start_offset, + exon_end_offset=exon_end_offset) + end_of_variant_exon = mutated_exon_sequence[-3:] + if not matches_exon_end_pattern(end_of_variant_exon): return True diff --git a/varcode/effects/effect_ordering.py b/varcode/effects/effect_ordering.py index 38f6112..5932b2e 100644 --- a/varcode/effects/effect_ordering.py +++ b/varcode/effects/effect_ordering.py @@ -478,8 +478,8 @@ def top_priority_effect_for_single_gene(effects): # get set of indices of all effects with maximum CDS length max_cds_length_indices = { i - for (i, l) in enumerate(cds_lengths) - if l == max_cds_length + for (i, length) in enumerate(cds_lengths) + if length == max_cds_length } seq_lengths = [length_of_associated_transcript(e) for e in effects] @@ -489,8 +489,8 @@ def top_priority_effect_for_single_gene(effects): # has maximum sequence length max_seq_length_indices = { i - for (i, l) in enumerate(seq_lengths) - if l == max_seq_length + for (i, length) in enumerate(seq_lengths) + if length == max_seq_length } # which effects have transcripts with both the longest CDS and @@ -559,4 +559,3 @@ def top_priority_effect(effects): # if all effects were without genes then choose the best among those assert len(effects_without_genes) > 0 return max(effects_without_genes, key=multi_gene_effect_sort_key) - diff --git a/varcode/effects/effect_prediction.py b/varcode/effects/effect_prediction.py index f38b80a..85fdf81 100644 --- a/varcode/effects/effect_prediction.py +++ b/varcode/effects/effect_prediction.py @@ -62,11 +62,11 @@ def predict_variant_effects(variant, raise_on_error=False): try: gene_ids = variant.gene_ids transcripts = variant.transcripts - except: + except Exception: if raise_on_error: raise else: - return [] + return EffectCollection([]) if len(gene_ids) == 0: effects = [Intergenic(variant)] diff --git a/varcode/reference.py b/varcode/reference.py index 8b8be26..1809570 100644 --- a/varcode/reference.py +++ b/varcode/reference.py @@ -12,7 +12,6 @@ from __future__ import print_function, division, absolute_import -from collections import defaultdict import os from warnings import warn import re @@ -344,4 +343,4 @@ def infer_genome(genome_object_string_or_int): "instance, got %s : %s") % ( str(genome_object_string_or_int), type(genome_object_string_or_int))) - return genome, converted_ucsc_to_ensembl \ No newline at end of file + return genome, converted_ucsc_to_ensembl diff --git a/varcode/variant_collection.py b/varcode/variant_collection.py index 9faea44..cc4245c 100644 --- a/varcode/variant_collection.py +++ b/varcode/variant_collection.py @@ -179,8 +179,10 @@ def detailed_string(self): } gene_name_str = ", ".join(sorted(gene_names)) lines.append(" -- %s (%s):" % (gene_name_str, gene_id)) - for variant in variant_group: - lines.append(" ---- %s" % variant) + lines.extend( + " ---- %s" % variant + for variant in variant_group + ) header = self.short_string() joined_lines = "\n".join(lines) return "%s\n%s" % (header, joined_lines) @@ -342,4 +344,4 @@ def row_from_variant(variant): def clone_without_ucsc_data(self): variants = [v.clone_without_ucsc_data() for v in self] - return self.clone_with_new_elements(variants) \ No newline at end of file + return self.clone_with_new_elements(variants) diff --git a/varcode/vcf_output.py b/varcode/vcf_output.py index ec5d89f..4058e5b 100644 --- a/varcode/vcf_output.py +++ b/varcode/vcf_output.py @@ -79,7 +79,7 @@ def build_filter_field(variant): corresponding VCF representations. """ filter_metadata = get_metadata_field('filter', variant) - if type(filter_metadata) == list: + if isinstance(filter_metadata, list): return 'PASS' if filter_metadata == [] else ';'.join(filter_metadata) else: # TODO: Can the filter field ever be something other than the 3 @@ -102,7 +102,7 @@ def build_info_pair(key, val): if val is True: return key - if type(val) == list: + if isinstance(val, list): val = ','.join(map(str, val)) else: val = str(val) @@ -132,7 +132,7 @@ def build_sample_field(sample): def build_sample_val(sample_val): """Build a specific value for a sample (i.e., one value for one column).""" - if type(sample_val) is list: + if isinstance(sample_val, list): return ','.join(map(str, sample_val)) elif sample_val is not None: return str(sample_val) diff --git a/varcode/version.py b/varcode/version.py index a955fda..bc86c94 100644 --- a/varcode/version.py +++ b/varcode/version.py @@ -1 +1 @@ -__version__ = "1.2.1" +__version__ = "1.2.2"