Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@ requires = ["setuptools", "setuptools-scm", "setuptools-git-versioning"]
build-backend = "setuptools.build_meta"

[project]
dependencies = ["typer>=0.24.1"]
dependencies = ["pandas>=3.0.1", "typer>=0.24.1"]
dynamic = ["version"]
name = "tsoppy"
requires-python = ">=3.14"
Expand Down
101 changes: 91 additions & 10 deletions src/tsoppy/cli.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,27 @@
"""

import importlib.metadata
import logging
import re
from pathlib import Path
from typing import Annotated

import typer

from tsoppy.update_small_variant_vcf_list.main import VcfList

# Configure logging for the CLI at import time: INFO level, with every record
# formatted as "<timestamp> <level>: <message>".
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s: %(message)s",
    datefmt="%Y/%m/%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

# Typer application object; the CLI commands below are registered on it.
app = typer.Typer()

# Version of the installed "tsoppy" distribution, read from package metadata
# (per the original note, the version itself is set from the git tag).
app_version = importlib.metadata.version("tsoppy")


Expand All @@ -19,17 +35,82 @@ def version():
print(f"tsoppy version {app_version}")


def glob_pattern_callback(value: str) -> str:
    """
    Validate the glob pattern option.

    Returns the pattern unchanged when it ends with '.vcf'; otherwise a
    typer.BadParameter error is raised so the CLI reports the invalid value.
    """
    is_vcf_pattern = value.endswith(".vcf")
    if is_vcf_pattern:
        return value
    raise typer.BadParameter("Glob pattern must end with '.vcf'.")


def inpred_id_regex_callback(value: str) -> str:
    """
    Validate the inpred_id_regex option.

    The pattern is compiled so that malformed regular expressions are reported
    as a CLI usage error, and the compiled pattern is inspected for the named
    capture groups 'patient_id' and 'sample_type'. Checking the compiled
    pattern (instead of searching the text for '<patient_id>') rejects input
    that merely contains that text without defining a named group.

    Args:
        value: Regular expression given on the command line.

    Returns:
        The unchanged regular expression string.

    Raises:
        typer.BadParameter: If the pattern does not compile or a required
            named group is missing.
    """
    try:
        compiled = re.compile(value)
    except re.error as err:
        raise typer.BadParameter(
            f"inpred_id_regex is not a valid regular expression: {err}"
        ) from err

    # groupindex maps every named group of the compiled pattern to its number
    for group_name in ("patient_id", "sample_type"):
        if group_name not in compiled.groupindex:
            raise typer.BadParameter(
                f"inpred_id_regex must contain a named group '{group_name}'."
            )
    return value


def tumor_sample_types_callback(value: str) -> str:
    """
    Validate that tumor_sample_types is a comma-separated list of single letters.

    A list with a single entry (e.g. "T") is valid as well as longer lists
    (e.g. "C,D,d"); empty entries, whitespace and multi-letter codes are not.

    Args:
        value: Value given for the tumor_sample_types option.

    Returns:
        The unchanged value.

    Raises:
        typer.BadParameter: If the value is not a comma-separated list of
            single letters.
    """
    # One letter, optionally followed by any number of ",<letter>" pairs
    if not re.fullmatch(r"[A-Za-z](?:,[A-Za-z])*", value):
        raise typer.BadParameter(
            "tumor_sample_types must be comma-separated list of single letters."
        )
    return value


@app.command()
def update_small_variant_vcf_list(
    # Required option (no default): directory that is searched with glob_pattern.
    results_dir: Annotated[
        Path,
        typer.Option(
            help="Directory where the results of the latest TSO500 run are stored."
        ),
    ],
    glob_pattern: Annotated[
        str,
        typer.Option(
            help="Glob pattern to search for small variant VCF files in the results directory.",
            callback=glob_pattern_callback,
        ),
    ] = "**/Results/**/*_MergedSmallVariants.genome.vcf",
    # Raw string: "\D" and "\d" are regex classes, not string escape sequences.
    inpred_id_regex: Annotated[
        str,
        typer.Option(
            help="Regular expression to extract the inpred_id from the VCF file name.",
            callback=inpred_id_regex_callback,
        ),
    ] = r"(?P<patient_id>\D{3}\d{4})-\D\d{2}-(?P<sample_type>\D)\d{2}-\D\d{2}.*.vcf$",
    # "<YYYYMMDD>" is a placeholder that VcfList replaces with the current date.
    output: Annotated[
        str, typer.Option(help="Name of new small variant VCF list.")
    ] = "small_variant_vcf_list_<YYYYMMDD>.tsv",
    # NOTE(review): tumor_sample_types_callback is defined in this module but is
    # not attached to this option - confirm whether validation is intended here.
    tumor_sample_types: Annotated[
        str,
        typer.Option(
            help="Comma-separated list of sample types that are considered tumor samples."
        ),
    ] = "C,D,d,L,M,P,p,R,r,T,X",
    # Optional existing list; when omitted VcfList starts a new list.
    vcf_list: Annotated[
        Path | None, typer.Option(help="Path to list of small variant VCF files.")
    ] = None,
):
    """
    Updates the small variant VCF list based on VCF(s) in results directory.
    """
    logger.info("Start updating small variant VCF list.")
    # Argument order follows the VcfList constructor signature
    small_variant_vcf_list = VcfList(
        results_dir, glob_pattern, vcf_list, inpred_id_regex, tumor_sample_types, output
    )
    small_variant_vcf_list.update()
    logger.info("Finished updating small variant VCF list.")
Empty file.
192 changes: 192 additions & 0 deletions src/tsoppy/update_small_variant_vcf_list/main.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,192 @@
"""
This module contains the code for the `update_small_variant_vcf_list` command.
The command searches `results_dir`, the directory where the results of the latest TSO500 run are stored, for small variant VCF files and adds them to the small variant VCF list.
Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

What happens if a patient from the latest TSO500 run has another sample that was sequenced in one of the older runs (for example, the tumor DNA sample of patient A is sequenced in the latest run but the normal DNA sample in one of the older runs)? Would it be useful for the older VCF file to be added to the VCF list as well, or not really?

Copy link
Copy Markdown
Collaborator

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Should this functionality be used by other modules developed by other people, i.e. outside the CLI? If so, it would be great to have that summed up somewhere so that people are aware of it and can easily use it - an example of your code using the functionality somewhere else would probably be enough.

"""

import glob
import logging
import re
from datetime import datetime
from pathlib import Path

import pandas

# Use logger that was set up in CLI
logger = logging.getLogger(__name__)


class VcfList:
"""
Represents small variant VCF list.

Attributes:
dataframe (Dataframe): Dataframe representing the current version of small variant VCF list.
inpred_id_regex (str): Regular expression matching InPreD IDs.
output (str): Path to updated version of small variant VCF list.
tumor_sample_types (set[str]): Single letter codes representing a tumor sample.
vcf_list_columns (list[str]): List of dataframe column names.
vcfs (list[str]): Small variant VCF(s) located in TSO500 results directory.
"""

vcf_list_columns = ["vcf", "sample_type"]

def __init__(
self,
results_dir: Path,
glob_pattern: str,
vcf_list: Path | None,
inpred_id_regex: str,
tumor_sample_types: str,
output: str,
):
"""
Create new instance of SmallVariantVcfList.
"""
self.vcfs = glob.glob(f"{results_dir}/{glob_pattern}")
self.inpred_id_regex = rf"{inpred_id_regex}"
self.tumor_sample_types = set(tumor_sample_types.split(","))
self.dataframe = pandas.DataFrame(columns=self.vcf_list_columns)

# Try reading small variant VCF list or start from scratch
if vcf_list:
try:
self.dataframe = pandas.read_csv(
vcf_list, sep="\t", names=self.vcf_list_columns, on_bad_lines="warn"
)
except FileNotFoundError:
logger.warning(
f"{vcf_list} not found, creating new small variant VCF list."
)
else:
logger.info("no small variant VCF list specified, creating new one.")

# Replace placeholder with actual date
if "<YYYYMMDD>" in output:
now = datetime.now()
self.output = output.replace("<YYYYMMDD>", now.strftime("%Y%m%d"))
else:
self.output = output

def __eq__(self, other):
"""
Compare to other class instance.
"""
if not isinstance(other, VcfList):
return NotImplemented
if self.dataframe != other.dataframe:
return False
if self.inpred_id_regex != other.inpred_id_regex:
return False
if self.output != other.output:
return False
if self.tumor_sample_types != other.tumor_sample_types:
return False
return self.vcfs != other.vcfs

def update(self):
"""
Add VCF(s) from results directory to small variant VCF list.
"""

# Loop over all small variant VCFs
for vcf in self.vcfs:
# Avoid duplication
if vcf in self.dataframe["vcf"].values:
logger.warning(f"{vcf} is already in small variant VCF list, skipping.")
continue

# Parse InPreD ID to get patient ID and sample type
match = re.search(self.inpred_id_regex, vcf)
try:
patient_id = match.group("patient_id")
sample_type = match.group("sample_type")
except AttributeError:
logger.warning(f"could not parse InPreD ID from {vcf}, skipping.")
continue

# Check if VCF is eligible for small variant VCF list and add if yes
small_variant_vcf = Vcf(
vcf, patient_id, sample_type, self.tumor_sample_types
)
if not small_variant_vcf.include:
continue
else:
self.dataframe.loc[len(self.dataframe)] = small_variant_vcf.row()

# Check if new patient ID is represented multiple times
patient_sample_count = self.dataframe["vcf"].str.contains(patient_id).sum()
if patient_sample_count > 1:
logger.warning(
f"patient {patient_id} has {patient_sample_count} vcf(s) in the small variant VCF list."
)

# Write updated small variant VCF list to file
self.dataframe.drop_duplicates().to_csv(
self.output, sep="\t", header=False, index=False
)


class Vcf:
    """
    Represents small variant VCF.

    Attributes:
        include (bool): Whether to add the vcf to the small variant VCF file or not.
        patient_id (str): ID of patient that the VCF belongs to.
        sample_type (str): Single letter code representing type of sample, e.g. T = tumor, N = normal.
        vcf (str): Path to VCF file.
    """

    include = True

    def __init__(
        self, vcf: str, patient_id: str, sample_type: str, tumor_sample_types: set
    ):
        """
        Create new instance of Vcf.

        Args:
            vcf: Path to VCF file.
            patient_id: ID of patient that the VCF belongs to.
            sample_type: Single letter sample type code parsed from the InPreD ID.
            tumor_sample_types: Codes that are considered tumor samples and are
                therefore replaced with T.
        """
        self.vcf = vcf
        self.patient_id = patient_id
        # Always record the sample type so that __eq__ and row() also work for
        # VCFs that end up being excluded.
        self.sample_type = sample_type
        self.include = True

        # Exclude control sample starting with IPC
        if patient_id.startswith("IPC"):
            logger.warning(f"{self.patient_id} is a control sample, skipping.")
            self.include = False
            return

        # T (tumor) and N (normal) are accepted as they are
        if sample_type in ("T", "N"):
            return

        # Ensure any other sample type is included in tumor_sample_types
        if sample_type not in tumor_sample_types:
            logger.warning(
                f"{self.vcf} has sample type {sample_type} which is not {tumor_sample_types} or N, skipping."
            )
            self.include = False
            return

        # Reset any sample type in tumor_sample_types with T (tumor)
        logger.warning(
            f"sample type code {sample_type} for {vcf} will be replaced with T"
        )
        self.sample_type = "T"

    def __eq__(self, other):
        """
        Compare to other class instance.

        Two instances are equal when include flag, patient ID, sample type and
        VCF path are all equal.
        """
        if not isinstance(other, Vcf):
            return NotImplemented
        return (
            self.include == other.include
            and self.patient_id == other.patient_id
            and self.sample_type == other.sample_type
            and self.vcf == other.vcf
        )

    def row(self):
        """
        Return small variant VCF list row.

        Returns:
            list[str]: VCF path and sample type, in the column order of the
            small variant VCF list.
        """
        return [self.vcf, self.sample_type]
10 changes: 0 additions & 10 deletions tests/cli_test.py

This file was deleted.

Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
tests/test_data/update_small_variant_vcf_list_main/create_new_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N
Loading