From 2011b984ad51e4727aea764526718e69bec6fa0a Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Mon, 30 Mar 2026 11:18:15 +0200 Subject: [PATCH 1/8] chore: add pandas to deps --- pyproject.toml | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/pyproject.toml b/pyproject.toml index 1107b88..90e9a51 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -3,7 +3,7 @@ requires = ["setuptools", "setuptools-scm", "setuptools-git-versioning"] build-backend = "setuptools.build_meta" [project] -dependencies = ["typer>=0.24.1"] +dependencies = ["pandas>=3.0.1", "typer>=0.24.1"] dynamic = ["version"] name = "tsoppy" requires-python = ">=3.14" From b178f0f1ffaec23cad7712d6a0e9beb56dee87f6 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Mon, 30 Mar 2026 11:19:21 +0200 Subject: [PATCH 2/8] feat: add update small variant vcf list subpackage --- .../update_small_variant_vcf_list/__init__.py | 0 .../update_small_variant_vcf_list/main.py | 179 ++++++++++++++++++ 2 files changed, 179 insertions(+) create mode 100644 src/tsoppy/update_small_variant_vcf_list/__init__.py create mode 100644 src/tsoppy/update_small_variant_vcf_list/main.py diff --git a/src/tsoppy/update_small_variant_vcf_list/__init__.py b/src/tsoppy/update_small_variant_vcf_list/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/src/tsoppy/update_small_variant_vcf_list/main.py b/src/tsoppy/update_small_variant_vcf_list/main.py new file mode 100644 index 0000000..b906a18 --- /dev/null +++ b/src/tsoppy/update_small_variant_vcf_list/main.py @@ -0,0 +1,179 @@ +""" +This module contains the code for the `update_small_variant_vcf_list` command. +The command takes two arguments, `results_dir`, which is a string that specifies the directory where the results of the latest TSO500 run are stored. +""" + +import glob +import logging +import re +from datetime import datetime +from pathlib import Path + +import pandas + +# Use logger that was set up in CLI +logger = logging.getLogger(__name__) + + +class VcfList: + """ + Represents small variant VCF list. + + Attributes: + dataframe (Dataframe): Dataframe representing the current version of small variant VCF list. + inpred_id_regex (str): Regular expression matching InPreD IDs. + output (str): Path to updated version of small variant VCF list. + tumor_sample_types (set[str]): Single letter codes representing a tumor sample. + vcf_list_columns (list[str]): List of dataframe column names. + vcfs (list[str]): Small variant VCF(s) located in TSO500 results directory. + """ + vcf_list_columns = ["vcf", "sample_type"] + + def __init__(self, results_dir: Path, glob_pattern: str, vcf_list: Path | None, inpred_id_regex: str, tumor_sample_types: str, output: str): + """ + Create new instance of SmallVariantVcfList. + """ + self.vcfs = glob.glob(f"{results_dir}/{glob_pattern}") + self.inpred_id_regex = rf"{inpred_id_regex}" + self.tumor_sample_types = set(tumor_sample_types.split(",")) + self.dataframe = pandas.DataFrame(columns=self.vcf_list_columns) + + # Try reading small variant VCF list or start from scratch + if vcf_list: + try: + self.dataframe = pandas.read_csv( + vcf_list, sep="\t", names=self.vcf_list_columns, on_bad_lines='warn') + except FileNotFoundError: + logger.warning( + f"{vcf_list} not found, creating new small variant VCF list.") + else: + logger.info( + f"no small variant VCF list specified, creating new one.") + + # Replace placeholder with actual date + if "" in output: + now = datetime.now() + self.output = output.replace("", now.strftime("%Y%m%d")) + else: + self.output = output + + def __eq__(self, other): + """ + Compare to other class instance. + """ + if not isinstance(other, VcfList): + return NotImplemented + if self.dataframe != other.dataframe: + return False + if self.inpred_id_regex != other.inpred_id_regex: + return False + if self.output != other.output: + return False + if self.tumor_sample_types != other.tumor_sample_types: + return False + return self.vcfs != other.vcfs + + def update(self): + """ + Add VCF(s) from results directory to small variant VCF list. + """ + + # Loop over all small variant VCFs + for vcf in self.vcfs: + + # Avoid duplication + if vcf in self.dataframe["vcf"].values: + logger.warning( + f"{vcf} is already in small variant VCF list, skipping.") + continue + + # Parse InPreD ID to get patient ID and sample type + match = re.search(self.inpred_id_regex, vcf) + try: + patient_id = match.group("patient_id") + sample_type = match.group("sample_type") + except AttributeError: + logger.warning( + f"could not parse InPreD ID from {vcf}, skipping.") + continue + + # Check if VCF is eligible for small variant VCF list and add if yes + small_variant_vcf = Vcf( + vcf, patient_id, sample_type, self.tumor_sample_types) + if not small_variant_vcf.include: + continue + else: + self.dataframe.loc[len(self.dataframe) + ] = small_variant_vcf.row() + + # Check if new patient ID is represented multiple times + patient_sample_count = self.dataframe["vcf"].str.contains( + patient_id).sum() + if patient_sample_count > 1: + logger.warning( + f"patient {patient_id} has {patient_sample_count} vcf(s) in the small variant VCF list.") + + # Write updated small variant VCF list to file + self.dataframe.drop_duplicates().to_csv( + self.output, sep="\t", header=False, index=False) + + +class Vcf: + """ + Represents small variant VCF. + + Attributes: + include (bool): Whether to add the vcf to the small variant VCF file or not. + patient_id (str): ID of patient that the VCF belongs to. + sample_type (str): Single letter code representing type of sample, e.g. T = tumor. + vcf (str): Path to VCF file. + """ + include = True + + def __init__(self, vcf: str, patient_id: str, sample_type: str, tumor_sample_types: set): + """ + Create new instance of SmallVariantVcf. + """ + self.vcf = vcf + self.patient_id = patient_id + + # Exclude control sample starting with IPC + if patient_id.startswith("IPC"): + logger.warning(f"{self.patient_id} is a control sample, skipping.") + self.include = False + return + + # Ensure sample type is N or included in tumor_sample_types + if sample_type != "T" and sample_type != "N": + if sample_type not in tumor_sample_types: + logger.warning( + f"{self.vcf} has sample type {sample_type} which is not {tumor_sample_types} or N, skipping.") + self.include = False + return + else: + # Reset any sample type in tumor_sample_types with T + logger.warning( + f"sample type code {sample_type} for {vcf} will be replaced with T") + self.sample_type = "T" + else: + self.sample_type = sample_type + + def __eq__(self, other): + """ + Compare to other class instance. + """ + if not isinstance(other, Vcf): + return NotImplemented + if self.include != other.include: + return False + if self.patient_id != other.patient_id: + return False + if self.sample_type != other.sample_type: + return False + return self.vcf != other.vcf + + def row(self): + """ + Return small variant VCF list row. + """ + return [self.vcf, self.sample_type] From 74d454151ec698e3c7007665052ec1eb9f06450a Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Mon, 30 Mar 2026 11:20:49 +0200 Subject: [PATCH 3/8] test: add tests and test data for update small variant vcf list subpackage --- tests/cli_test.py | 10 -- ...D01-N01-A01_MergedSmallVariants.genome.vcf | 0 .../TSO500_vcf_list_expected.tsv | 1 + ...D01-N01-A01_MergedSmallVariants.genome.vcf | 0 .../TSO500_vcf_list.tsv | 0 .../TSO500_vcf_list_expected.tsv | 0 ...D01-N01-A01_MergedSmallVariants.genome.vcf | 0 .../skip_existing_vcf/TSO500_vcf_list.tsv | 1 + .../TSO500_vcf_list_expected.tsv | 1 + ...D01-N01-A01_MergedSmallVariants.genome.vcf | 0 .../TSO500_vcf_list_expected.tsv | 1 + ...D01-N01-A01_MergedSmallVariants.genome.vcf | 0 .../TSO500_vcf_list.tsv | 0 .../TSO500_vcf_list_expected.tsv | 1 + ...test_update_small_variant_vcf_list_main.py | 135 ++++++++++++++++++ 15 files changed, 140 insertions(+), 10 deletions(-) delete mode 100644 tests/cli_test.py create mode 100644 tests/test_data/update_small_variant_vcf_list_main/create_new_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf create mode 100644 tests/test_data/update_small_variant_vcf_list_main/create_new_small_variant_vcf_list/TSO500_vcf_list_expected.tsv create mode 100644 tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/LocalApp/Results/IPH00002-D01-N01-A01/IPH00002-D01-N01-A01_MergedSmallVariants.genome.vcf create mode 100644 tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/TSO500_vcf_list.tsv create mode 100644 tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/TSO500_vcf_list_expected.tsv create mode 100644 tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf create mode 100644 tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list.tsv create mode 100644 tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list_expected.tsv create mode 100644 tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf create mode 100644 tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/TSO500_vcf_list_expected.tsv create mode 100644 tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf create mode 100644 tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/TSO500_vcf_list.tsv create mode 100644 tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/TSO500_vcf_list_expected.tsv create mode 100644 tests/test_update_small_variant_vcf_list_main.py diff --git a/tests/cli_test.py b/tests/cli_test.py deleted file mode 100644 index 7b8ee75..0000000 --- a/tests/cli_test.py +++ /dev/null @@ -1,10 +0,0 @@ -""" -CLI module unit tests for tsoppy. -""" - - -def test_placeholder(): - """ - Unit test for the placeholder command in the CLI module. - """ - assert True diff --git a/tests/test_data/update_small_variant_vcf_list_main/create_new_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf b/tests/test_data/update_small_variant_vcf_list_main/create_new_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/update_small_variant_vcf_list_main/create_new_small_variant_vcf_list/TSO500_vcf_list_expected.tsv b/tests/test_data/update_small_variant_vcf_list_main/create_new_small_variant_vcf_list/TSO500_vcf_list_expected.tsv new file mode 100644 index 0000000..2de322d --- /dev/null +++ b/tests/test_data/update_small_variant_vcf_list_main/create_new_small_variant_vcf_list/TSO500_vcf_list_expected.tsv @@ -0,0 +1 @@ +tests/test_data/update_small_variant_vcf_list_main/create_new_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N diff --git a/tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/LocalApp/Results/IPH00002-D01-N01-A01/IPH00002-D01-N01-A01_MergedSmallVariants.genome.vcf b/tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/LocalApp/Results/IPH00002-D01-N01-A01/IPH00002-D01-N01-A01_MergedSmallVariants.genome.vcf new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/TSO500_vcf_list.tsv b/tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/TSO500_vcf_list.tsv new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/TSO500_vcf_list_expected.tsv b/tests/test_data/update_small_variant_vcf_list_main/inpred_id_not_parsable/TSO500_vcf_list_expected.tsv new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf b/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list.tsv b/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list.tsv new file mode 100644 index 0000000..166d2a7 --- /dev/null +++ b/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list.tsv @@ -0,0 +1 @@ +tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N diff --git a/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list_expected.tsv b/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list_expected.tsv new file mode 100644 index 0000000..166d2a7 --- /dev/null +++ b/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list_expected.tsv @@ -0,0 +1 @@ +tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N diff --git a/tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf b/tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/TSO500_vcf_list_expected.tsv b/tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/TSO500_vcf_list_expected.tsv new file mode 100644 index 0000000..91a1e5c --- /dev/null +++ b/tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/TSO500_vcf_list_expected.tsv @@ -0,0 +1 @@ +tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N diff --git a/tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf b/tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/TSO500_vcf_list.tsv b/tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/TSO500_vcf_list.tsv new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/TSO500_vcf_list_expected.tsv b/tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/TSO500_vcf_list_expected.tsv new file mode 100644 index 0000000..682e578 --- /dev/null +++ b/tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/TSO500_vcf_list_expected.tsv @@ -0,0 +1 @@ +tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N diff --git a/tests/test_update_small_variant_vcf_list_main.py b/tests/test_update_small_variant_vcf_list_main.py new file mode 100644 index 0000000..857f166 --- /dev/null +++ b/tests/test_update_small_variant_vcf_list_main.py @@ -0,0 +1,135 @@ +""" +update small variant vcf list subpackage main module unit tests. +""" +from os import path +import os +import filecmp +import unittest +from tsoppy.update_small_variant_vcf_list.main import VcfList, Vcf + +# Define path to test data - cannot be absolute due to different paths locally and in CI +test_data_dir = "tests/test_data/update_small_variant_vcf_list_main" + +# test constants +glob_pattern = "**/Results/**/*_MergedSmallVariants.genome.vcf" +tumor_sample_types = "C,D,d,L,M,P,p,R,r,T,X" +inpred_id_regex = "(?P\\D{3}\\d{4})-\\D\\d{2}-(?P\\D)\\d{2}-\\D\\d{2}.*.vcf$" + + +class TestVcfList(unittest.TestCase): + def test_update(self): + test_cases = [ + { + "name": "successfully update small variant vcf list", + "results_dir": path.join(test_data_dir, "successfully_update_small_variant_vcf_list"), + "glob_pattern": glob_pattern, + "vcf_list": path.join(test_data_dir, "successfully_update_small_variant_vcf_list/TSO500_vcf_list.tsv"), + "inpred_id_regex": inpred_id_regex, + "tumor_sample_types": tumor_sample_types, + "output": path.join(test_data_dir, "successfully_update_small_variant_vcf_list/TSO500_vcf_list_updated.tsv"), + "expected": path.join(test_data_dir, "successfully_update_small_variant_vcf_list/TSO500_vcf_list_expected.tsv"), + }, + { + "name": "create new small variant vcf list", + "results_dir": path.join(test_data_dir, "create_new_small_variant_vcf_list"), + "glob_pattern": glob_pattern, + "vcf_list": None, + "inpred_id_regex": inpred_id_regex, + "tumor_sample_types": tumor_sample_types, + "output": path.join(test_data_dir, "create_new_small_variant_vcf_list/TSO500_vcf_list_updated.tsv"), + "expected": path.join(test_data_dir, "create_new_small_variant_vcf_list/TSO500_vcf_list_expected.tsv"), + }, + { + "name": "small variant vcf list does not exist", + "results_dir": path.join(test_data_dir, "small_variant_vcf_list_does_not_exist"), + "glob_pattern": glob_pattern, + "vcf_list": path.join(test_data_dir, "small_variant_vcf_list_does_not_exist/TSO500_vcf_list.tsv"), + "inpred_id_regex": inpred_id_regex, + "tumor_sample_types": tumor_sample_types, + "output": path.join(test_data_dir, "small_variant_vcf_list_does_not_exist/TSO500_vcf_list_updated.tsv"), + "expected": path.join(test_data_dir, "small_variant_vcf_list_does_not_exist/TSO500_vcf_list_expected.tsv"), + }, + { + "name": "skip existing vcf", + "results_dir": path.join(test_data_dir, "skip_existing_vcf"), + "glob_pattern": glob_pattern, + "vcf_list": path.join(test_data_dir, "skip_existing_vcf/TSO500_vcf_list.tsv"), + "inpred_id_regex": inpred_id_regex, + "tumor_sample_types": tumor_sample_types, + "output": path.join(test_data_dir, "skip_existing_vcf/TSO500_vcf_list_updated.tsv"), + "expected": path.join(test_data_dir, "skip_existing_vcf/TSO500_vcf_list_expected.tsv"), + }, + { + "name": "inpred id not parsable", + "results_dir": path.join(test_data_dir, "inpred_id_not_parsable"), + "glob_pattern": glob_pattern, + "vcf_list": path.join(test_data_dir, "inpred_id_not_parsable/TSO500_vcf_list.tsv"), + "inpred_id_regex": inpred_id_regex, + "tumor_sample_types": tumor_sample_types, + "output": path.join(test_data_dir, "inpred_id_not_parsable/TSO500_vcf_list_updated.tsv"), + "expected": path.join(test_data_dir, "inpred_id_not_parsable/TSO500_vcf_list_expected.tsv"), + }, + ] + + for test_case in test_cases: + with self.subTest(msg=test_case["name"]): + got = VcfList( + test_case["results_dir"], test_case["glob_pattern"], test_case["vcf_list"], test_case["inpred_id_regex"], test_case["tumor_sample_types"], test_case["output"]) + got.update() + assert filecmp.cmp(test_case["output"], test_case["expected"]) + os.remove(test_case["output"]) + + +class TestVcf(unittest.TestCase): + def test_init(self): + test_cases = [ + { + "name": "include sample", + "vcf": "IPH0001-01-T01-01_MergedSmallVariants.genome.vcf", + "patient_id": "IPH0001", + "sample_type": "T", + "tumor_sample_types": tumor_sample_types, + "expected": True, + }, + { + "name": "sample is control", + "vcf": "IPC0001-01-T01-01_MergedSmallVariants.genome.vcf", + "patient_id": "IPC0001", + "sample_type": "T", + "tumor_sample_types": tumor_sample_types, + "expected": False, + }, + { + "name": "sample is neither tumor nor normal", + "vcf": "IPH0001-01-A01-01_MergedSmallVariants.genome.vcf", + "patient_id": "IPH0001", + "sample_type": "A", + "tumor_sample_types": tumor_sample_types, + "expected": False, + }, + ] + + for test_case in test_cases: + with self.subTest(msg=test_case["name"]): + got = Vcf( + test_case["vcf"], test_case["patient_id"], test_case["sample_type"], test_case["tumor_sample_types"]) + assert got.include == test_case["expected"] + + def test_row(self): + test_cases = [ + { + "name": "successfully return row", + "vcf": "IPH0001-01-T01-01_MergedSmallVariants.genome.vcf", + "patient_id": "IPH0001", + "sample_type": "T", + "tumor_sample_types": tumor_sample_types, + "expected": ["IPH0001-01-T01-01_MergedSmallVariants.genome.vcf", "T"], + }, + ] + + for test_case in test_cases: + with self.subTest(msg=test_case["name"]): + vcf = Vcf( + test_case["vcf"], test_case["patient_id"], test_case["sample_type"], test_case["tumor_sample_types"]) + got = vcf.row() + assert got == test_case["expected"] From a55e39cdfbeee29025b73271d8d4c5be052d9bf7 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Mon, 30 Mar 2026 11:22:43 +0200 Subject: [PATCH 4/8] feat: add logger and set defaults --- src/tsoppy/cli.py | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/src/tsoppy/cli.py b/src/tsoppy/cli.py index 3c0030a..8b23d47 100644 --- a/src/tsoppy/cli.py +++ b/src/tsoppy/cli.py @@ -3,10 +3,17 @@ """ import importlib.metadata +import logging from typing import Annotated import typer +# Set up logging for the CLI. The logging level is set to INFO, and the log messages will include the timestamp, log level, and message. +logging.basicConfig(level=logging.INFO, + format='%(asctime)s %(levelname)s: %(message)s', + datefmt='%Y/%m/%d %H:%M:%S') +logger = logging.getLogger(__name__) + app = typer.Typer() app_version = importlib.metadata.version("tsoppy") From 141c519bcc0576065be9964a563adaecedcbc436 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Mon, 30 Mar 2026 11:25:13 +0200 Subject: [PATCH 5/8] feat: expose update small variant vcf list subpackage via command in cli --- src/tsoppy/cli.py | 64 +++++++++++++++++++++++++++++++++++++++-------- 1 file changed, 54 insertions(+), 10 deletions(-) diff --git a/src/tsoppy/cli.py b/src/tsoppy/cli.py index 8b23d47..9ca08e8 100644 --- a/src/tsoppy/cli.py +++ b/src/tsoppy/cli.py @@ -4,10 +4,14 @@ import importlib.metadata import logging +import re +from pathlib import Path from typing import Annotated import typer +from tsoppy.update_small_variant_vcf_list.main import VcfList + # Set up logging for the CLI. The logging level is set to INFO, and the log messages will include the timestamp, log level, and message. logging.basicConfig(level=logging.INFO, format='%(asctime)s %(levelname)s: %(message)s', @@ -26,17 +30,57 @@ def version(): print(f"tsoppy version {app_version}") +def glob_pattern_callback(value: str) -> str: + """ + Callback function checking that the glob pattern ends with '.vcf'. + """ + if not value.endswith(".vcf"): + raise typer.BadParameter("Glob pattern must end with '.vcf'.") + return value + + +def inpred_id_regex_callback(value: str) -> str: + """ + Callback function ensuring inpred_id_regex contains the required named capture groups. + """ + if "" not in value: + raise typer.BadParameter( + "inpred_id_regex must contain a named group 'patient_id'.") + if "" not in value: + raise typer.BadParameter( + "inpred_id_regex must contain a named group 'sample_type'.") + return value + + +def tumor_sample_types_callback(value: str) -> str: + """ + Callback function to ensure tumor_sample_types is a comma-separated list of single letters. + """ + if not re.fullmatch(r"^([A-Za-z],)+[A-Za-z]$", value): + raise typer.BadParameter( + "tumor_sample_types must be comma-separated list of single letters.") + return value + + @app.command() -def placeholder( - user_name: Annotated[str, typer.Option("--name", "-n")], - user_id: Annotated[str, typer.Option("--id", "-i")], - verbose: Annotated[bool, typer.Option("--verbose", "-v")] = False, +def update_small_variant_vcf_list( + results_dir: Annotated[Path | None, typer.Option(help="Directory where the results of the latest TSO500 run are stored.")], + glob_pattern: Annotated[str, typer.Option( + help="Glob pattern to search for small variant VCF files in the results directory.", callback=glob_pattern_callback)] = "**/Results/**/*_MergedSmallVariants.genome.vcf", + inpred_id_regex: Annotated[str, typer.Option( + help="Regular expression to extract the inpred_id from the VCF file name.", callback=inpred_id_regex_callback)] = "(?P\D{3}\d{4})-\D\d{2}-(?P\D)\d{2}-\D\d{2}.*.vcf$", + output: Annotated[str, typer.Option( + help="Name of new small variant VCF list.")] = f"small_variant_vcf_list_.tsv", + tumor_sample_types: Annotated[str, typer.Option( + help="Comma-separated list of sample types that are considered tumor samples.")] = "C,D,d,L,M,P,p,R,r,T,X", + vcf_list: Annotated[Path | None, typer.Option( + help="Path to list of small variant VCF files.")] = None, ): """ - This is the helptext for the placeholder command that demonstrates how to - use Typer for CLI applications. + Updates the small variant VCF list based on VCF(s) in results directory. """ - if verbose: - print(f"{user_name} has the following id: {user_id}") - else: - print(f"{user_name}: {user_id}") + logger.info("Start updating small variant VCF list.") + small_variant_vcf_list = VcfList( + results_dir, glob_pattern, vcf_list, inpred_id_regex, tumor_sample_types, output) + small_variant_vcf_list.update() + logger.info("Finished updating small variant VCF list.") From 02fb98fed777ea76268209de79fa1ce33fc98ed1 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Mon, 30 Mar 2026 11:25:37 +0200 Subject: [PATCH 6/8] style: include comments --- src/tsoppy/cli.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/src/tsoppy/cli.py b/src/tsoppy/cli.py index 9ca08e8..eb0b590 100644 --- a/src/tsoppy/cli.py +++ b/src/tsoppy/cli.py @@ -18,7 +18,10 @@ datefmt='%Y/%m/%d %H:%M:%S') logger = logging.getLogger(__name__) +# Create a Typer app for the CLI. The app will be used to define the commands and their arguments. app = typer.Typer() + +# app_version will be set from git tag. app_version = importlib.metadata.version("tsoppy") From 8f625af922e0f163d13a6ebea1c42d7cb435ff45 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Mon, 30 Mar 2026 11:26:39 +0200 Subject: [PATCH 7/8] chore: lint packages --- tests/test_update_small_variant_vcf_list_main.py | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/tests/test_update_small_variant_vcf_list_main.py b/tests/test_update_small_variant_vcf_list_main.py index 857f166..c069cfe 100644 --- a/tests/test_update_small_variant_vcf_list_main.py +++ b/tests/test_update_small_variant_vcf_list_main.py @@ -1,11 +1,12 @@ """ update small variant vcf list subpackage main module unit tests. """ -from os import path -import os import filecmp +import os import unittest -from tsoppy.update_small_variant_vcf_list.main import VcfList, Vcf +from os import path + +from tsoppy.update_small_variant_vcf_list.main import Vcf, VcfList # Define path to test data - cannot be absolute due to different paths locally and in CI test_data_dir = "tests/test_data/update_small_variant_vcf_list_main" From 938c6e769f02f59ff581fef71b5b655ff6d3f133 Mon Sep 17 00:00:00 2001 From: Martin Rippin Date: Mon, 30 Mar 2026 11:33:53 +0200 Subject: [PATCH 8/8] chore: ruff lint --- src/tsoppy/cli.py | 63 +++++++---- .../update_small_variant_vcf_list/main.py | 53 +++++---- ...test_update_small_variant_vcf_list_main.py | 101 ++++++++++++++---- 3 files changed, 158 insertions(+), 59 deletions(-) diff --git a/src/tsoppy/cli.py b/src/tsoppy/cli.py index eb0b590..8dabc70 100644 --- a/src/tsoppy/cli.py +++ b/src/tsoppy/cli.py @@ -13,9 +13,11 @@ from tsoppy.update_small_variant_vcf_list.main import VcfList # Set up logging for the CLI. The logging level is set to INFO, and the log messages will include the timestamp, log level, and message. -logging.basicConfig(level=logging.INFO, - format='%(asctime)s %(levelname)s: %(message)s', - datefmt='%Y/%m/%d %H:%M:%S') +logging.basicConfig( + level=logging.INFO, + format="%(asctime)s %(levelname)s: %(message)s", + datefmt="%Y/%m/%d %H:%M:%S", +) logger = logging.getLogger(__name__) # Create a Typer app for the CLI. The app will be used to define the commands and their arguments. @@ -48,10 +50,12 @@ def inpred_id_regex_callback(value: str) -> str: """ if "" not in value: raise typer.BadParameter( - "inpred_id_regex must contain a named group 'patient_id'.") + "inpred_id_regex must contain a named group 'patient_id'." + ) if "" not in value: raise typer.BadParameter( - "inpred_id_regex must contain a named group 'sample_type'.") + "inpred_id_regex must contain a named group 'sample_type'." + ) return value @@ -61,29 +65,52 @@ def tumor_sample_types_callback(value: str) -> str: """ if not re.fullmatch(r"^([A-Za-z],)+[A-Za-z]$", value): raise typer.BadParameter( - "tumor_sample_types must be comma-separated list of single letters.") + "tumor_sample_types must be comma-separated list of single letters." + ) return value @app.command() def update_small_variant_vcf_list( - results_dir: Annotated[Path | None, typer.Option(help="Directory where the results of the latest TSO500 run are stored.")], - glob_pattern: Annotated[str, typer.Option( - help="Glob pattern to search for small variant VCF files in the results directory.", callback=glob_pattern_callback)] = "**/Results/**/*_MergedSmallVariants.genome.vcf", - inpred_id_regex: Annotated[str, typer.Option( - help="Regular expression to extract the inpred_id from the VCF file name.", callback=inpred_id_regex_callback)] = "(?P\D{3}\d{4})-\D\d{2}-(?P\D)\d{2}-\D\d{2}.*.vcf$", - output: Annotated[str, typer.Option( - help="Name of new small variant VCF list.")] = f"small_variant_vcf_list_.tsv", - tumor_sample_types: Annotated[str, typer.Option( - help="Comma-separated list of sample types that are considered tumor samples.")] = "C,D,d,L,M,P,p,R,r,T,X", - vcf_list: Annotated[Path | None, typer.Option( - help="Path to list of small variant VCF files.")] = None, + results_dir: Annotated[ + Path | None, + typer.Option( + help="Directory where the results of the latest TSO500 run are stored." + ), + ], + glob_pattern: Annotated[ + str, + typer.Option( + help="Glob pattern to search for small variant VCF files in the results directory.", + callback=glob_pattern_callback, + ), + ] = "**/Results/**/*_MergedSmallVariants.genome.vcf", + inpred_id_regex: Annotated[ + str, + typer.Option( + help="Regular expression to extract the inpred_id from the VCF file name.", + callback=inpred_id_regex_callback, + ), + ] = "(?P\D{3}\d{4})-\D\d{2}-(?P\D)\d{2}-\D\d{2}.*.vcf$", + output: Annotated[ + str, typer.Option(help="Name of new small variant VCF list.") + ] = "small_variant_vcf_list_.tsv", + tumor_sample_types: Annotated[ + str, + typer.Option( + help="Comma-separated list of sample types that are considered tumor samples." + ), + ] = "C,D,d,L,M,P,p,R,r,T,X", + vcf_list: Annotated[ + Path | None, typer.Option(help="Path to list of small variant VCF files.") + ] = None, ): """ Updates the small variant VCF list based on VCF(s) in results directory. """ logger.info("Start updating small variant VCF list.") small_variant_vcf_list = VcfList( - results_dir, glob_pattern, vcf_list, inpred_id_regex, tumor_sample_types, output) + results_dir, glob_pattern, vcf_list, inpred_id_regex, tumor_sample_types, output + ) small_variant_vcf_list.update() logger.info("Finished updating small variant VCF list.") diff --git a/src/tsoppy/update_small_variant_vcf_list/main.py b/src/tsoppy/update_small_variant_vcf_list/main.py index b906a18..d5428e5 100644 --- a/src/tsoppy/update_small_variant_vcf_list/main.py +++ b/src/tsoppy/update_small_variant_vcf_list/main.py @@ -27,9 +27,18 @@ class VcfList: vcf_list_columns (list[str]): List of dataframe column names. vcfs (list[str]): Small variant VCF(s) located in TSO500 results directory. """ + vcf_list_columns = ["vcf", "sample_type"] - def __init__(self, results_dir: Path, glob_pattern: str, vcf_list: Path | None, inpred_id_regex: str, tumor_sample_types: str, output: str): + def __init__( + self, + results_dir: Path, + glob_pattern: str, + vcf_list: Path | None, + inpred_id_regex: str, + tumor_sample_types: str, + output: str, + ): """ Create new instance of SmallVariantVcfList. """ @@ -42,13 +51,14 @@ def __init__(self, results_dir: Path, glob_pattern: str, vcf_list: Path | None, if vcf_list: try: self.dataframe = pandas.read_csv( - vcf_list, sep="\t", names=self.vcf_list_columns, on_bad_lines='warn') + vcf_list, sep="\t", names=self.vcf_list_columns, on_bad_lines="warn" + ) except FileNotFoundError: logger.warning( - f"{vcf_list} not found, creating new small variant VCF list.") + f"{vcf_list} not found, creating new small variant VCF list." + ) else: - logger.info( - f"no small variant VCF list specified, creating new one.") + logger.info("no small variant VCF list specified, creating new one.") # Replace placeholder with actual date if "" in output: @@ -80,11 +90,9 @@ def update(self): # Loop over all small variant VCFs for vcf in self.vcfs: - # Avoid duplication if vcf in self.dataframe["vcf"].values: - logger.warning( - f"{vcf} is already in small variant VCF list, skipping.") + logger.warning(f"{vcf} is already in small variant VCF list, skipping.") continue # Parse InPreD ID to get patient ID and sample type @@ -93,29 +101,29 @@ def update(self): patient_id = match.group("patient_id") sample_type = match.group("sample_type") except AttributeError: - logger.warning( - f"could not parse InPreD ID from {vcf}, skipping.") + logger.warning(f"could not parse InPreD ID from {vcf}, skipping.") continue # Check if VCF is eligible for small variant VCF list and add if yes small_variant_vcf = Vcf( - vcf, patient_id, sample_type, self.tumor_sample_types) + vcf, patient_id, sample_type, self.tumor_sample_types + ) if not small_variant_vcf.include: continue else: - self.dataframe.loc[len(self.dataframe) - ] = small_variant_vcf.row() + self.dataframe.loc[len(self.dataframe)] = small_variant_vcf.row() # Check if new patient ID is represented multiple times - patient_sample_count = self.dataframe["vcf"].str.contains( - patient_id).sum() + patient_sample_count = self.dataframe["vcf"].str.contains(patient_id).sum() if patient_sample_count > 1: logger.warning( - f"patient {patient_id} has {patient_sample_count} vcf(s) in the small variant VCF list.") + f"patient {patient_id} has {patient_sample_count} vcf(s) in the small variant VCF list." + ) # Write updated small variant VCF list to file self.dataframe.drop_duplicates().to_csv( - self.output, sep="\t", header=False, index=False) + self.output, sep="\t", header=False, index=False + ) class Vcf: @@ -128,9 +136,12 @@ class Vcf: sample_type (str): Single letter code representing type of sample, e.g. T = tumor. vcf (str): Path to VCF file. """ + include = True - def __init__(self, vcf: str, patient_id: str, sample_type: str, tumor_sample_types: set): + def __init__( + self, vcf: str, patient_id: str, sample_type: str, tumor_sample_types: set + ): """ Create new instance of SmallVariantVcf. """ @@ -147,13 +158,15 @@ def __init__(self, vcf: str, patient_id: str, sample_type: str, tumor_sample_typ if sample_type != "T" and sample_type != "N": if sample_type not in tumor_sample_types: logger.warning( - f"{self.vcf} has sample type {sample_type} which is not {tumor_sample_types} or N, skipping.") + f"{self.vcf} has sample type {sample_type} which is not {tumor_sample_types} or N, skipping." + ) self.include = False return else: # Reset any sample type in tumor_sample_types with T logger.warning( - f"sample type code {sample_type} for {vcf} will be replaced with T") + f"sample type code {sample_type} for {vcf} will be replaced with T" + ) self.sample_type = "T" else: self.sample_type = sample_type diff --git a/tests/test_update_small_variant_vcf_list_main.py b/tests/test_update_small_variant_vcf_list_main.py index c069cfe..dde4b56 100644 --- a/tests/test_update_small_variant_vcf_list_main.py +++ b/tests/test_update_small_variant_vcf_list_main.py @@ -1,6 +1,7 @@ """ update small variant vcf list subpackage main module unit tests. """ + import filecmp import os import unittest @@ -14,7 +15,9 @@ # test constants glob_pattern = "**/Results/**/*_MergedSmallVariants.genome.vcf" tumor_sample_types = "C,D,d,L,M,P,p,R,r,T,X" -inpred_id_regex = "(?P\\D{3}\\d{4})-\\D\\d{2}-(?P\\D)\\d{2}-\\D\\d{2}.*.vcf$" +inpred_id_regex = ( + "(?P\\D{3}\\d{4})-\\D\\d{2}-(?P\\D)\\d{2}-\\D\\d{2}.*.vcf$" +) class TestVcfList(unittest.TestCase): @@ -22,60 +25,108 @@ def test_update(self): test_cases = [ { "name": "successfully update small variant vcf list", - "results_dir": path.join(test_data_dir, "successfully_update_small_variant_vcf_list"), + "results_dir": path.join( + test_data_dir, "successfully_update_small_variant_vcf_list" + ), "glob_pattern": glob_pattern, - "vcf_list": path.join(test_data_dir, "successfully_update_small_variant_vcf_list/TSO500_vcf_list.tsv"), + "vcf_list": path.join( + test_data_dir, + "successfully_update_small_variant_vcf_list/TSO500_vcf_list.tsv", + ), "inpred_id_regex": inpred_id_regex, "tumor_sample_types": tumor_sample_types, - "output": path.join(test_data_dir, "successfully_update_small_variant_vcf_list/TSO500_vcf_list_updated.tsv"), - "expected": path.join(test_data_dir, "successfully_update_small_variant_vcf_list/TSO500_vcf_list_expected.tsv"), + "output": path.join( + test_data_dir, + "successfully_update_small_variant_vcf_list/TSO500_vcf_list_updated.tsv", + ), + "expected": path.join( + test_data_dir, + "successfully_update_small_variant_vcf_list/TSO500_vcf_list_expected.tsv", + ), }, { "name": "create new small variant vcf list", - "results_dir": path.join(test_data_dir, "create_new_small_variant_vcf_list"), + "results_dir": path.join( + test_data_dir, "create_new_small_variant_vcf_list" + ), "glob_pattern": glob_pattern, "vcf_list": None, "inpred_id_regex": inpred_id_regex, "tumor_sample_types": tumor_sample_types, - "output": path.join(test_data_dir, "create_new_small_variant_vcf_list/TSO500_vcf_list_updated.tsv"), - "expected": path.join(test_data_dir, "create_new_small_variant_vcf_list/TSO500_vcf_list_expected.tsv"), + "output": path.join( + test_data_dir, + "create_new_small_variant_vcf_list/TSO500_vcf_list_updated.tsv", + ), + "expected": path.join( + test_data_dir, + "create_new_small_variant_vcf_list/TSO500_vcf_list_expected.tsv", + ), }, { "name": "small variant vcf list does not exist", - "results_dir": path.join(test_data_dir, "small_variant_vcf_list_does_not_exist"), + "results_dir": path.join( + test_data_dir, "small_variant_vcf_list_does_not_exist" + ), "glob_pattern": glob_pattern, - "vcf_list": path.join(test_data_dir, "small_variant_vcf_list_does_not_exist/TSO500_vcf_list.tsv"), + "vcf_list": path.join( + test_data_dir, + "small_variant_vcf_list_does_not_exist/TSO500_vcf_list.tsv", + ), "inpred_id_regex": inpred_id_regex, "tumor_sample_types": tumor_sample_types, - "output": path.join(test_data_dir, "small_variant_vcf_list_does_not_exist/TSO500_vcf_list_updated.tsv"), - "expected": path.join(test_data_dir, "small_variant_vcf_list_does_not_exist/TSO500_vcf_list_expected.tsv"), + "output": path.join( + test_data_dir, + "small_variant_vcf_list_does_not_exist/TSO500_vcf_list_updated.tsv", + ), + "expected": path.join( + test_data_dir, + "small_variant_vcf_list_does_not_exist/TSO500_vcf_list_expected.tsv", + ), }, { "name": "skip existing vcf", "results_dir": path.join(test_data_dir, "skip_existing_vcf"), "glob_pattern": glob_pattern, - "vcf_list": path.join(test_data_dir, "skip_existing_vcf/TSO500_vcf_list.tsv"), + "vcf_list": path.join( + test_data_dir, "skip_existing_vcf/TSO500_vcf_list.tsv" + ), "inpred_id_regex": inpred_id_regex, "tumor_sample_types": tumor_sample_types, - "output": path.join(test_data_dir, "skip_existing_vcf/TSO500_vcf_list_updated.tsv"), - "expected": path.join(test_data_dir, "skip_existing_vcf/TSO500_vcf_list_expected.tsv"), + "output": path.join( + test_data_dir, "skip_existing_vcf/TSO500_vcf_list_updated.tsv" + ), + "expected": path.join( + test_data_dir, "skip_existing_vcf/TSO500_vcf_list_expected.tsv" + ), }, { "name": "inpred id not parsable", "results_dir": path.join(test_data_dir, "inpred_id_not_parsable"), "glob_pattern": glob_pattern, - "vcf_list": path.join(test_data_dir, "inpred_id_not_parsable/TSO500_vcf_list.tsv"), + "vcf_list": path.join( + test_data_dir, "inpred_id_not_parsable/TSO500_vcf_list.tsv" + ), "inpred_id_regex": inpred_id_regex, "tumor_sample_types": tumor_sample_types, - "output": path.join(test_data_dir, "inpred_id_not_parsable/TSO500_vcf_list_updated.tsv"), - "expected": path.join(test_data_dir, "inpred_id_not_parsable/TSO500_vcf_list_expected.tsv"), + "output": path.join( + test_data_dir, "inpred_id_not_parsable/TSO500_vcf_list_updated.tsv" + ), + "expected": path.join( + test_data_dir, "inpred_id_not_parsable/TSO500_vcf_list_expected.tsv" + ), }, ] for test_case in test_cases: with self.subTest(msg=test_case["name"]): got = VcfList( - test_case["results_dir"], test_case["glob_pattern"], test_case["vcf_list"], test_case["inpred_id_regex"], test_case["tumor_sample_types"], test_case["output"]) + test_case["results_dir"], + test_case["glob_pattern"], + test_case["vcf_list"], + test_case["inpred_id_regex"], + test_case["tumor_sample_types"], + test_case["output"], + ) got.update() assert filecmp.cmp(test_case["output"], test_case["expected"]) os.remove(test_case["output"]) @@ -113,7 +164,11 @@ def test_init(self): for test_case in test_cases: with self.subTest(msg=test_case["name"]): got = Vcf( - test_case["vcf"], test_case["patient_id"], test_case["sample_type"], test_case["tumor_sample_types"]) + test_case["vcf"], + test_case["patient_id"], + test_case["sample_type"], + test_case["tumor_sample_types"], + ) assert got.include == test_case["expected"] def test_row(self): @@ -131,6 +186,10 @@ def test_row(self): for test_case in test_cases: with self.subTest(msg=test_case["name"]): vcf = Vcf( - test_case["vcf"], test_case["patient_id"], test_case["sample_type"], test_case["tumor_sample_types"]) + test_case["vcf"], + test_case["patient_id"], + test_case["sample_type"], + test_case["tumor_sample_types"], + ) got = vcf.row() assert got == test_case["expected"]