InPreD · marrip · Mar 30, 2026 · Mar 30, 2026 · Mar 30, 2026 · Mar 30, 2026
@@ -3,7 +3,7 @@ requires = ["setuptools", "setuptools-scm", "setuptools-git-versioning"]
 build-backend = "setuptools.build_meta"
 
 [project]
-dependencies = ["typer>=0.24.1"]
+dependencies = ["pandas>=3.0.1", "typer>=0.24.1"]
 dynamic = ["version"]
 name = "tsoppy"
 requires-python = ">=3.14"

@@ -3,11 +3,27 @@
 """
 
 import importlib.metadata
+import logging
+import re
+from pathlib import Path
 from typing import Annotated
 
 import typer
 
+from tsoppy.update_small_variant_vcf_list.main import VcfList
+
+# Configure root logging for the CLI: INFO level, each record formatted as "<timestamp> <level>: <message>".
+logging.basicConfig(
+    level=logging.INFO,
+    format="%(asctime)s %(levelname)s: %(message)s",
+    datefmt="%Y/%m/%d %H:%M:%S",
+)
+logger = logging.getLogger(__name__)
+
+# Typer app on which the CLI commands below are registered via @app.command().
 app = typer.Typer()
+
+# Version of the installed tsoppy distribution, read from package metadata (presumably derived from the git tag at build time).
 app_version = importlib.metadata.version("tsoppy")
 
 
@@ -19,17 +35,82 @@ def version():
     print(f"tsoppy version {app_version}")
 
 
+def glob_pattern_callback(value: str) -> str:
+    """
+    Validate the glob pattern option.
+
+    Returns the pattern unchanged when it ends with '.vcf' and raises
+    typer.BadParameter otherwise.
+    """
+    if value.endswith(".vcf"):
+        return value
+    raise typer.BadParameter("Glob pattern must end with '.vcf'.")
+
+
+def inpred_id_regex_callback(value: str) -> str:
+    """
+    Callback function ensuring inpred_id_regex is a valid regular expression
+    that defines the required named capture groups.
+
+    The pattern is compiled so that a malformed expression is rejected on the
+    command line instead of failing later while VCF file names are parsed.
+    The group names are read from the compiled pattern, so only real named
+    groups are accepted and not a stray '<patient_id>' substring.
+
+    Raises:
+        typer.BadParameter: If the pattern does not compile or if one of the
+            named groups 'patient_id' or 'sample_type' is missing.
+    """
+    try:
+        named_groups = re.compile(value).groupindex
+    except re.error as error:
+        raise typer.BadParameter(
+            f"inpred_id_regex is not a valid regular expression: {error}"
+        ) from error
+    for group in ("patient_id", "sample_type"):
+        if group not in named_groups:
+            raise typer.BadParameter(
+                f"inpred_id_regex must contain a named group '{group}'."
+            )
+    return value
+
+
+def tumor_sample_types_callback(value: str) -> str:
+    """
+    Callback function to ensure tumor_sample_types is a comma-separated list of single letters.
+
+    A list with a single entry (e.g. 'T') is accepted as well.
+
+    Raises:
+        typer.BadParameter: If the value is not of the form 'A,b,C'.
+    """
+    # re.fullmatch already anchors the pattern at both ends
+    if not re.fullmatch(r"[A-Za-z](,[A-Za-z])*", value):
+        raise typer.BadParameter(
+            "tumor_sample_types must be comma-separated list of single letters."
+        )
+    return value
+
+
 @app.command()
-def placeholder(
-    user_name: Annotated[str, typer.Option("--name", "-n")],
-    user_id: Annotated[str, typer.Option("--id", "-i")],
-    verbose: Annotated[bool, typer.Option("--verbose", "-v")] = False,
+def update_small_variant_vcf_list(
+    results_dir: Annotated[
+        Path | None,
+        typer.Option(
+            help="Directory where the results of the latest TSO500 run are stored."
+        ),
+    ],
+    glob_pattern: Annotated[
+        str,
+        typer.Option(
+            help="Glob pattern to search for small variant VCF files in the results directory.",
+            callback=glob_pattern_callback,
+        ),
+    ] = "**/Results/**/*_MergedSmallVariants.genome.vcf",
+    # Raw string: the pattern contains regex escapes (\D, \d) that are invalid
+    # escape sequences in a regular string literal.
+    inpred_id_regex: Annotated[
+        str,
+        typer.Option(
+            help="Regular expression to extract the inpred_id from the VCF file name.",
+            callback=inpred_id_regex_callback,
+        ),
+    ] = r"(?P<patient_id>\D{3}\d{4})-\D\d{2}-(?P<sample_type>\D)\d{2}-\D\d{2}.*.vcf$",
+    output: Annotated[
+        str, typer.Option(help="Name of new small variant VCF list.")
+    ] = "small_variant_vcf_list_<YYYYMMDD>.tsv",
+    tumor_sample_types: Annotated[
+        str,
+        typer.Option(
+            help="Comma-separated list of sample types that are considered tumor samples.",
+            callback=tumor_sample_types_callback,
+        ),
+    ] = "C,D,d,L,M,P,p,R,r,T,X",
+    vcf_list: Annotated[
+        Path | None, typer.Option(help="Path to list of small variant VCF files.")
+    ] = None,
 ):
     """
-    This is the helptext for the placeholder command that demonstrates how to
-    use Typer for CLI applications.
+    Updates the small variant VCF list based on VCF(s) in results directory.
     """
-    if verbose:
-        print(f"{user_name} has the following id: {user_id}")
-    else:
-        print(f"{user_name}: {user_id}")
+    logger.info("Start updating small variant VCF list.")
+    # All option values have been validated by their callbacks at this point;
+    # VcfList collects the VCFs and update() writes the new list to `output`.
+    small_variant_vcf_list = VcfList(
+        results_dir, glob_pattern, vcf_list, inpred_id_regex, tumor_sample_types, output
+    )
+    small_variant_vcf_list.update()
+    logger.info("Finished updating small variant VCF list.")
@@ -0,0 +1,192 @@
+"""
+This module contains the code for the `update_small_variant_vcf_list` command.
+The command scans the directory given by `results_dir`, where the results of the latest TSO500 run are stored, for small variant VCF files and adds them to the small variant VCF list.
+"""
+
+import glob
+import logging
+import re
+from datetime import datetime
+from pathlib import Path
+
+import pandas
+
+# Use logger that was set up in CLI
+logger = logging.getLogger(__name__)
+
+
+class VcfList:
+    """
+    Represents small variant VCF list.
+
+    Attributes:
+        dataframe (DataFrame): Dataframe representing the current version of small variant VCF list.
+        inpred_id_regex (str): Regular expression matching InPreD IDs.
+        output (str): Path to updated version of small variant VCF list.
+        tumor_sample_types (set[str]): Single letter codes representing a tumor sample.
+        vcf_list_columns (list[str]): List of dataframe column names.
+        vcfs (list[str]): Small variant VCF(s) located in TSO500 results directory.
+    """
+
+    vcf_list_columns = ["vcf", "sample_type"]
+
+    def __init__(
+        self,
+        results_dir: Path,
+        glob_pattern: str,
+        vcf_list: Path | None,
+        inpred_id_regex: str,
+        tumor_sample_types: str,
+        output: str,
+    ):
+        """
+        Create new instance of VcfList.
+
+        Args:
+            results_dir: Directory that is searched for small variant VCFs.
+            glob_pattern: Glob pattern, relative to results_dir, matching the VCFs.
+            vcf_list: Existing small variant VCF list (tab-separated, no header) or None.
+            inpred_id_regex: Regular expression with the named groups 'patient_id' and 'sample_type'.
+            tumor_sample_types: Comma-separated sample type codes that count as tumor.
+            output: Output file name; '<YYYYMMDD>' is replaced with the current date.
+        """
+        # recursive=True is needed for '**' to match nested directories;
+        # without it '**' behaves like a single '*'
+        self.vcfs = glob.glob(f"{results_dir}/{glob_pattern}", recursive=True)
+        self.inpred_id_regex = inpred_id_regex
+        self.tumor_sample_types = set(tumor_sample_types.split(","))
+        self.dataframe = pandas.DataFrame(columns=self.vcf_list_columns)
+
+        # Try reading small variant VCF list or start from scratch
+        if vcf_list:
+            try:
+                self.dataframe = pandas.read_csv(
+                    vcf_list, sep="\t", names=self.vcf_list_columns, on_bad_lines="warn"
+                )
+            except FileNotFoundError:
+                logger.warning(
+                    f"{vcf_list} not found, creating new small variant VCF list."
+                )
+        else:
+            logger.info("no small variant VCF list specified, creating new one.")
+
+        # Replace placeholder with actual date (no-op if the placeholder is absent)
+        self.output = output.replace(
+            "<YYYYMMDD>", datetime.now().strftime("%Y%m%d")
+        )
+
+    def __eq__(self, other):
+        """
+        Compare to other class instance.
+
+        Two instances are equal when their dataframes, regular expressions,
+        output paths, tumor sample types and VCF lists are all equal.
+        """
+        if not isinstance(other, VcfList):
+            return NotImplemented
+        # DataFrame.equals returns a single bool, whereas '!=' on dataframes is
+        # element-wise and cannot be used as a condition
+        return (
+            self.dataframe.equals(other.dataframe)
+            and self.inpred_id_regex == other.inpred_id_regex
+            and self.output == other.output
+            and self.tumor_sample_types == other.tumor_sample_types
+            and self.vcfs == other.vcfs
+        )
+
+    def update(self):
+        """
+        Add VCF(s) from results directory to small variant VCF list.
+
+        VCFs that are already listed, whose file name does not match
+        inpred_id_regex or that are not eligible (see Vcf) are skipped. The
+        resulting list is written to self.output as tab-separated file
+        without header.
+        """
+
+        # Loop over all small variant VCFs
+        for vcf in self.vcfs:
+            # Avoid duplication
+            if vcf in self.dataframe["vcf"].values:
+                logger.warning(f"{vcf} is already in small variant VCF list, skipping.")
+                continue
+
+            # Parse InPreD ID to get patient ID and sample type
+            match = re.search(self.inpred_id_regex, vcf)
+            if match is None:
+                logger.warning(f"could not parse InPreD ID from {vcf}, skipping.")
+                continue
+            patient_id = match.group("patient_id")
+            sample_type = match.group("sample_type")
+
+            # Check if VCF is eligible for small variant VCF list and add if yes
+            small_variant_vcf = Vcf(
+                vcf, patient_id, sample_type, self.tumor_sample_types
+            )
+            if not small_variant_vcf.include:
+                continue
+            self.dataframe.loc[len(self.dataframe)] = small_variant_vcf.row()
+
+            # Check if new patient ID is represented multiple times; the ID is
+            # matched literally, not as a regular expression
+            patient_sample_count = (
+                self.dataframe["vcf"].str.contains(patient_id, regex=False).sum()
+            )
+            if patient_sample_count > 1:
+                logger.warning(
+                    f"patient {patient_id} has {patient_sample_count} vcf(s) in the small variant VCF list."
+                )
+
+        # Write updated small variant VCF list to file
+        self.dataframe.drop_duplicates().to_csv(
+            self.output, sep="\t", header=False, index=False
+        )
+
+
+class Vcf:
+    """
+    Represents small variant VCF.
+
+    Attributes:
+        include (bool): Whether to add the vcf to the small variant VCF file or not.
+        patient_id (str): ID of patient that the VCF belongs to.
+        sample_type (str): Single letter code representing type of sample, e.g. T = tumor.
+        vcf (str): Path to VCF file.
+    """
+
+    include = True
+
+    def __init__(
+        self, vcf: str, patient_id: str, sample_type: str, tumor_sample_types: set
+    ):
+        """
+        Create new instance of Vcf.
+
+        Args:
+            vcf: Path to VCF file.
+            patient_id: ID of patient that the VCF belongs to.
+            sample_type: Single letter sample type code parsed from the file name.
+            tumor_sample_types: Codes that are treated as tumor and stored as T.
+        """
+        self.vcf = vcf
+        self.patient_id = patient_id
+        # Always store the parsed sample type so that the attribute also exists
+        # on excluded instances (it is read by __eq__ and row)
+        self.sample_type = sample_type
+
+        # Exclude control sample starting with IPC
+        if patient_id.startswith("IPC"):
+            logger.warning(f"{self.patient_id} is a control sample, skipping.")
+            self.include = False
+            return
+
+        # T and N are kept as they are
+        if sample_type in ("T", "N"):
+            return
+
+        # Ensure any other sample type is included in tumor_sample_types
+        if sample_type not in tumor_sample_types:
+            logger.warning(
+                f"{self.vcf} has sample type {sample_type} which is not {tumor_sample_types} or N, skipping."
+            )
+            self.include = False
+            return
+
+        # Reset any sample type in tumor_sample_types with T
+        logger.warning(
+            f"sample type code {sample_type} for {vcf} will be replaced with T"
+        )
+        self.sample_type = "T"
+
+    def __eq__(self, other):
+        """
+        Compare to other class instance.
+
+        Two instances are equal when include, patient_id, sample_type and vcf
+        are all equal.
+        """
+        if not isinstance(other, Vcf):
+            return NotImplemented
+        return (
+            self.include == other.include
+            and self.patient_id == other.patient_id
+            and self.sample_type == other.sample_type
+            and self.vcf == other.vcf
+        )
+
+    def row(self):
+        """
+        Return small variant VCF list row.
+
+        The row holds the VCF path and the sample type, in the column order of
+        the small variant VCF list.
+        """
+        return [self.vcf, self.sample_type]
@@ -0,0 +1 @@
+tests/test_data/update_small_variant_vcf_list_main/create_new_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf	N
@@ -0,0 +1 @@
+tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf	N
@@ -0,0 +1 @@
+tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf	N
@@ -0,0 +1 @@
+tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf	N
@@ -0,0 +1 @@
+tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf	N
Original file line number	Diff line number	Diff line change
		@@ -0,0 +1 @@
		tests/test_data/update_small_variant_vcf_list_main/create_new_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N