-
Notifications
You must be signed in to change notification settings - Fork 0
1 update small variant vcf list #2
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from all commits
2011b98
b178f0f
74d4541
a55e39c
141c519
02fb98f
8f625af
938c6e7
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,192 @@ | ||
| """ | ||
| This module contains the code for the `update_small_variant_vcf_list` command. | ||
| The command takes two arguments, `results_dir`, which is a string that specifies the directory where the results of the latest TSO500 run are stored. | ||
|
Collaborator
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Should this functionality be used by other modules developed by other people, apart from the CLI? If so, it would be great to have that summarized somewhere so that people are aware of it and can easily use it; an example of your code using the functionality somewhere else would probably be enough. |
||
| """ | ||
|
|
||
| import glob | ||
| import logging | ||
| import re | ||
| from datetime import datetime | ||
| from pathlib import Path | ||
|
|
||
| import pandas | ||
|
|
||
| # Use logger that was set up in CLI | ||
| logger = logging.getLogger(__name__) | ||
|
|
||
|
|
||
| class VcfList: | ||
| """ | ||
| Represents small variant VCF list. | ||
|
|
||
| Attributes: | ||
| dataframe (Dataframe): Dataframe representing the current version of small variant VCF list. | ||
| inpred_id_regex (str): Regular expression matching InPreD IDs. | ||
| output (str): Path to updated version of small variant VCF list. | ||
| tumor_sample_types (set[str]): Single letter codes representing a tumor sample. | ||
| vcf_list_columns (list[str]): List of dataframe column names. | ||
| vcfs (list[str]): Small variant VCF(s) located in TSO500 results directory. | ||
| """ | ||
|
|
||
| vcf_list_columns = ["vcf", "sample_type"] | ||
|
|
||
| def __init__( | ||
| self, | ||
| results_dir: Path, | ||
| glob_pattern: str, | ||
| vcf_list: Path | None, | ||
| inpred_id_regex: str, | ||
| tumor_sample_types: str, | ||
| output: str, | ||
| ): | ||
| """ | ||
| Create new instance of SmallVariantVcfList. | ||
| """ | ||
| self.vcfs = glob.glob(f"{results_dir}/{glob_pattern}") | ||
| self.inpred_id_regex = rf"{inpred_id_regex}" | ||
| self.tumor_sample_types = set(tumor_sample_types.split(",")) | ||
| self.dataframe = pandas.DataFrame(columns=self.vcf_list_columns) | ||
|
|
||
| # Try reading small variant VCF list or start from scratch | ||
| if vcf_list: | ||
| try: | ||
| self.dataframe = pandas.read_csv( | ||
| vcf_list, sep="\t", names=self.vcf_list_columns, on_bad_lines="warn" | ||
| ) | ||
| except FileNotFoundError: | ||
| logger.warning( | ||
| f"{vcf_list} not found, creating new small variant VCF list." | ||
| ) | ||
| else: | ||
| logger.info("no small variant VCF list specified, creating new one.") | ||
|
|
||
| # Replace placeholder with actual date | ||
| if "<YYYYMMDD>" in output: | ||
| now = datetime.now() | ||
| self.output = output.replace("<YYYYMMDD>", now.strftime("%Y%m%d")) | ||
| else: | ||
| self.output = output | ||
|
|
||
| def __eq__(self, other): | ||
| """ | ||
| Compare to other class instance. | ||
| """ | ||
| if not isinstance(other, VcfList): | ||
| return NotImplemented | ||
| if self.dataframe != other.dataframe: | ||
| return False | ||
| if self.inpred_id_regex != other.inpred_id_regex: | ||
| return False | ||
| if self.output != other.output: | ||
| return False | ||
| if self.tumor_sample_types != other.tumor_sample_types: | ||
| return False | ||
| return self.vcfs != other.vcfs | ||
|
|
||
| def update(self): | ||
| """ | ||
| Add VCF(s) from results directory to small variant VCF list. | ||
| """ | ||
|
|
||
| # Loop over all small variant VCFs | ||
| for vcf in self.vcfs: | ||
| # Avoid duplication | ||
| if vcf in self.dataframe["vcf"].values: | ||
| logger.warning(f"{vcf} is already in small variant VCF list, skipping.") | ||
| continue | ||
|
|
||
| # Parse InPreD ID to get patient ID and sample type | ||
| match = re.search(self.inpred_id_regex, vcf) | ||
| try: | ||
| patient_id = match.group("patient_id") | ||
| sample_type = match.group("sample_type") | ||
| except AttributeError: | ||
| logger.warning(f"could not parse InPreD ID from {vcf}, skipping.") | ||
| continue | ||
|
|
||
| # Check if VCF is eligible for small variant VCF list and add if yes | ||
| small_variant_vcf = Vcf( | ||
| vcf, patient_id, sample_type, self.tumor_sample_types | ||
| ) | ||
| if not small_variant_vcf.include: | ||
| continue | ||
| else: | ||
| self.dataframe.loc[len(self.dataframe)] = small_variant_vcf.row() | ||
|
|
||
| # Check if new patient ID is represented multiple times | ||
| patient_sample_count = self.dataframe["vcf"].str.contains(patient_id).sum() | ||
| if patient_sample_count > 1: | ||
| logger.warning( | ||
| f"patient {patient_id} has {patient_sample_count} vcf(s) in the small variant VCF list." | ||
| ) | ||
|
|
||
| # Write updated small variant VCF list to file | ||
| self.dataframe.drop_duplicates().to_csv( | ||
| self.output, sep="\t", header=False, index=False | ||
| ) | ||
|
|
||
|
|
||
class Vcf:
    """
    Represents small variant VCF.

    Attributes:
        include (bool): Whether to add the vcf to the small variant VCF file or not.
        patient_id (str): ID of patient that the VCF belongs to.
        sample_type (str): Single letter code representing type of sample, e.g. T = tumor.
        vcf (str): Path to VCF file.
    """

    include = True

    def __init__(
        self, vcf: str, patient_id: str, sample_type: str, tumor_sample_types: set
    ):
        """
        Create new instance of Vcf.

        Args:
            vcf: Path to VCF file.
            patient_id: ID of patient that the VCF belongs to; IDs starting with
                "IPC" are control samples and are excluded.
            sample_type: Single letter sample type code parsed from the InPreD ID.
            tumor_sample_types: Codes that are accepted as tumor and stored as "T".
        """
        self.vcf = vcf
        self.patient_id = patient_id
        # Always set the sample type so that excluded instances can still be
        # compared and inspected without raising AttributeError
        self.sample_type = sample_type

        # Exclude control sample starting with IPC
        if patient_id.startswith("IPC"):
            logger.warning(f"{self.patient_id} is a control sample, skipping.")
            self.include = False
            return

        # T (tumor) and N (normal) are accepted as they are
        if sample_type in ("T", "N"):
            return

        # Ensure any other sample type is included in tumor_sample_types
        if sample_type not in tumor_sample_types:
            logger.warning(
                f"{self.vcf} has sample type {sample_type} which is not {tumor_sample_types} or N, skipping."
            )
            self.include = False
            return

        # Reset any sample type in tumor_sample_types with T (tumor)
        logger.warning(
            f"sample type code {sample_type} for {vcf} will be replaced with T"
        )
        self.sample_type = "T"

    def __eq__(self, other):
        """
        Compare to other class instance.

        Two instances are equal when include flag, patient ID, sample type and
        VCF path are all equal.
        """
        if not isinstance(other, Vcf):
            return NotImplemented
        return (
            self.include == other.include
            and self.patient_id == other.patient_id
            and self.sample_type == other.sample_type
            and self.vcf == other.vcf
        )

    def row(self):
        """
        Return small variant VCF list row.

        Returns:
            list[str]: VCF path followed by the sample type.
        """
        return [self.vcf, self.sample_type]
This file was deleted.
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| tests/test_data/update_small_variant_vcf_list_main/create_new_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1 @@ | ||
| tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
What happens if a sample associated with a patient from the latest TSO500 run was sequenced in one of the older runs (for example, the tumor DNA sample of patient A is sequenced in the latest run, but the normal DNA sample was sequenced in one of the older runs)? Would it be useful for the older VCF file to be added to the VCF list, or not really?