"""
Command line interface of tsoppy.
"""
# NOTE(review): reconstructed from a whitespace-collapsed diff. The original
# module docstring and the `version` command are only partially visible in the
# hunks; they are unchanged by the patch and intentionally not reproduced here.

import importlib.metadata
import logging
import re
from pathlib import Path
from typing import Annotated

import typer

from tsoppy.update_small_variant_vcf_list.main import VcfList

# Set up logging for the CLI. The logging level is set to INFO, and the log
# messages include the timestamp, log level, and message.
logging.basicConfig(
    level=logging.INFO,
    format="%(asctime)s %(levelname)s: %(message)s",
    datefmt="%Y/%m/%d %H:%M:%S",
)
logger = logging.getLogger(__name__)

# Create a Typer app for the CLI. The app is used to define the commands and
# their arguments.
app = typer.Typer()

# app_version is read from the installed package metadata (set from git tag).
app_version = importlib.metadata.version("tsoppy")

# Named capture groups that Vcf reads from the InPreD ID regex match.
_REQUIRED_REGEX_GROUPS = ("patient_id", "sample_type")


def glob_pattern_callback(value: str) -> str:
    """
    Check that the glob pattern ends with '.vcf'.

    Raises:
        typer.BadParameter: If the pattern does not end with '.vcf'.
    """
    if not value.endswith(".vcf"):
        raise typer.BadParameter("Glob pattern must end with '.vcf'.")
    return value


def inpred_id_regex_callback(value: str) -> str:
    """
    Ensure inpred_id_regex compiles and defines the required named groups.

    The pattern is compiled and its group index inspected instead of searching
    for a substring, so an invalid expression is rejected at the command line
    rather than failing later while the VCF paths are parsed.

    Raises:
        typer.BadParameter: If the pattern is not a valid regular expression
            or lacks the named group 'patient_id' or 'sample_type'.
    """
    try:
        compiled = re.compile(value)
    except re.error as err:
        raise typer.BadParameter(
            f"inpred_id_regex is not a valid regular expression: {err}"
        ) from err
    for group in _REQUIRED_REGEX_GROUPS:
        if group not in compiled.groupindex:
            raise typer.BadParameter(
                f"inpred_id_regex must contain a named group '{group}'."
            )
    return value


def tumor_sample_types_callback(value: str) -> str:
    """
    Ensure tumor_sample_types is a comma-separated list of single letters.

    A single letter without any comma (e.g. "T") is accepted as a list of one.

    Raises:
        typer.BadParameter: If the value has any other shape.
    """
    if not re.fullmatch(r"[A-Za-z](,[A-Za-z])*", value):
        raise typer.BadParameter(
            "tumor_sample_types must be comma-separated list of single letters."
        )
    return value


@app.command()
def update_small_variant_vcf_list(
    results_dir: Annotated[
        Path,
        typer.Option(
            help="Directory where the results of the latest TSO500 run are stored."
        ),
    ],
    glob_pattern: Annotated[
        str,
        typer.Option(
            help="Glob pattern to search for small variant VCF files in the results directory.",
            callback=glob_pattern_callback,
        ),
    ] = "**/Results/**/*_MergedSmallVariants.genome.vcf",
    inpred_id_regex: Annotated[
        str,
        typer.Option(
            help="Regular expression to extract the inpred_id from the VCF file name.",
            callback=inpred_id_regex_callback,
        ),
        # Raw string: "\D" and "\d" are regex classes, not Python escapes.
    ] = r"(?P<patient_id>\D{3}\d{4})-\D\d{2}-(?P<sample_type>\D)\d{2}-\D\d{2}.*.vcf$",
    output: Annotated[
        str, typer.Option(help="Name of new small variant VCF list.")
        # NOTE(review): the placeholder token was stripped from the diff;
        # "<date>" is a reconstruction - confirm against the repository.
    ] = "small_variant_vcf_list_<date>.tsv",
    tumor_sample_types: Annotated[
        str,
        typer.Option(
            help="Comma-separated list of sample types that are considered tumor samples.",
            callback=tumor_sample_types_callback,
        ),
    ] = "C,D,d,L,M,P,p,R,r,T,X",
    vcf_list: Annotated[
        Path | None, typer.Option(help="Path to list of small variant VCF files.")
    ] = None,
):
    """
    Updates the small variant VCF list based on VCF(s) in results directory.
    """
    logger.info("Start updating small variant VCF list.")
    small_variant_vcf_list = VcfList(
        results_dir, glob_pattern, vcf_list, inpred_id_regex, tumor_sample_types, output
    )
    small_variant_vcf_list.update()
    logger.info("Finished updating small variant VCF list.")
"""
This module defines the classes 'VcfList' and 'Vcf'.
'VcfList' takes a directory holding TSO500 results, a glob to identify small variant vcf files, the current small variant vcf list,
a regular expression matching InPreD IDs, a set of tumor sample types and the path to the new small variant vcf list.
'VcfList' has a method to update the current small variant vcf list with vcfs found in the TSO500 results directory.
'Vcf' is defined by a path to a small variant vcf, a regular expression matching InPreD IDs and a set of tumor sample types.
'Vcf' provides a method to create a new row for a polars dataframe.
"""

import glob
import logging
import re
from datetime import datetime
from pathlib import Path

import polars

# Use logger that was set up in CLI
logger = logging.getLogger(__name__)

# Token in the output name that is replaced with the current date (YYYYMMDD).
# NOTE(review): the token was stripped from the diff this was reconstructed
# from; "<date>" is an assumption - confirm against the repository.
DATE_PLACEHOLDER = "<date>"


class InvalidSampleType(Exception):
    """
    Exception if sample type is not valid.

    Attributes:
        msg (str): Human readable description.
        sample_type (str | None): The offending sample type code, if known.
    """

    def __init__(self, msg="sample type is not valid", sample_type=None):
        self.msg = msg
        self.sample_type = sample_type
        super().__init__(self.msg)

    def __str__(self):
        return self.msg


class VcfList:
    """
    Represents small variant VCF list.

    Attributes:
        dataframe (polars.DataFrame): Dataframe representing the current version of small variant VCF list.
        inpred_id_regex (str): Regular expression matching InPreD IDs.
        output (str): Path to updated version of small variant VCF list.
        tumor_sample_types (set[str]): Single letter codes representing a tumor sample.
        vcf_list_columns (dict): Maps dataframe column names to their polars dtype.
        vcfs (list[str]): Small variant VCF(s) located in TSO500 results directory.
    """

    vcf_list_columns = {"vcf": polars.String, "sample_type": polars.String}

    def __init__(
        self,
        results_dir: Path,
        glob_pattern: str,
        vcf_list: Path | None,
        inpred_id_regex: str,
        tumor_sample_types: str,
        output: str,
    ):
        """
        Create new instance of VcfList.

        Args:
            results_dir: Directory that is searched for small variant VCFs.
            glob_pattern: Glob pattern, relative to results_dir.
            vcf_list: Existing tab-separated, header-less list or None.
            inpred_id_regex: Regular expression with the named groups
                'patient_id' and 'sample_type'.
            tumor_sample_types: Comma-separated single letter codes.
            output: Path of the list to write; DATE_PLACEHOLDER is replaced
                with the current date.
        """
        # recursive=True lets "**" span any number of directories; without it
        # "**" behaves like "*". A path can match a pattern with several "**"
        # in more than one way, hence the set; sorting keeps runs reproducible.
        self.vcfs = sorted(
            set(glob.glob(f"{results_dir}/{glob_pattern}", recursive=True))
        )
        self.inpred_id_regex = inpred_id_regex
        self.tumor_sample_types = set(tumor_sample_types.split(","))
        self.dataframe = polars.DataFrame(schema=self.vcf_list_columns)

        # Try reading small variant VCF list or start from scratch
        if vcf_list:
            try:
                self.dataframe = polars.read_csv(
                    source=vcf_list,
                    separator="\t",
                    schema=self.vcf_list_columns,
                    ignore_errors=True,
                    has_header=False,
                    raise_if_empty=False,
                )
            except FileNotFoundError:
                logger.warning(
                    f"{vcf_list} not found, creating new small variant VCF list."
                )
        else:
            logger.info("no small variant VCF list specified, creating new one.")

        # Replace placeholder with actual date (no-op if it is absent)
        self.output = output.replace(
            DATE_PLACEHOLDER, datetime.now().strftime("%Y%m%d")
        )

    def __eq__(self, other):
        """
        Compare to other class instance.

        Two instances are equal when dataframe, regex, output, tumor sample
        types and discovered VCFs all agree.
        """
        if not isinstance(other, VcfList):
            return NotImplemented
        # DataFrame "!=" is element-wise and has no truth value; use equals().
        return (
            self.dataframe.equals(other.dataframe)
            and self.inpred_id_regex == other.inpred_id_regex
            and self.output == other.output
            and self.tumor_sample_types == other.tumor_sample_types
            and self.vcfs == other.vcfs
        )

    def update(self):
        """
        Add VCF(s) from results directory to small variant VCF list.

        VCFs whose InPreD ID cannot be parsed, whose sample type is invalid,
        that are already listed, or that belong to a control sample (patient
        ID starting with 'IPC') are skipped with a warning. The resulting list
        is written tab-separated and without header to self.output.
        """
        # Paths already listed; a set keeps the duplicate check cheap
        known_vcfs = set(self.dataframe["vcf"].to_list())

        # Loop over all small variant VCFs
        for vcf in self.vcfs:
            # Try to create vcf class instance. The handlers must only use
            # `vcf` and the exception: no instance exists when Vcf() raises.
            try:
                small_variant_vcf = Vcf(
                    vcf, self.inpred_id_regex, self.tumor_sample_types
                )
            except AttributeError:
                logger.warning(f"could not parse InPreD ID from {vcf}, skipping.")
                continue
            except InvalidSampleType as err:
                logger.warning(
                    f"{vcf} has sample type {err.sample_type} which is not {self.tumor_sample_types} or N(ormal), skipping."
                )
                continue

            # Avoid duplication
            if small_variant_vcf.vcf in known_vcfs:
                logger.warning(f"{vcf} is already in small variant VCF list, skipping.")
                continue

            # Exclude control samples
            if small_variant_vcf.patient_id.startswith("IPC"):
                logger.warning(
                    f"{small_variant_vcf.patient_id} is a control sample, skipping."
                )
                continue

            # Add vcf to list
            self.dataframe = polars.concat([self.dataframe, small_variant_vcf.row()])
            known_vcfs.add(small_variant_vcf.vcf)

            # Check if new patient ID is represented multiple times. The "vcf"
            # column holds paths, so look for the ID inside each path.
            patient_sample_count = (
                self.dataframe["vcf"]
                .str.contains(small_variant_vcf.patient_id, literal=True)
                .sum()
            )
            if patient_sample_count > 1:
                logger.warning(
                    f"patient {small_variant_vcf.patient_id} has {patient_sample_count} vcf(s) in the small variant VCF list."
                )

        # Write updated small variant VCF list to file; keep row order stable
        self.dataframe.unique(maintain_order=True).write_csv(
            file=self.output, separator="\t", include_header=False
        )


class Vcf:
    """
    Represents small variant VCF.

    Attributes:
        patient_id (str): ID of patient that the VCF belongs to.
        sample_type (str): Single letter code representing type of sample, e.g. T = tumor.
        vcf (str): Path to VCF file.
    """

    def __init__(self, vcf: str, inpred_id_regex: str, tumor_sample_types: set):
        """
        Create new instance of Vcf.

        Raises:
            AttributeError: If inpred_id_regex does not match the VCF path.
            InvalidSampleType: If the sample type is neither 'N' nor included
                in tumor_sample_types.
        """
        self.vcf = vcf

        # Parse InPreD ID to get patient ID and sample type
        match = re.search(inpred_id_regex, self.vcf)
        if match is None:
            # AttributeError is kept because callers catch exactly this type
            raise AttributeError(f"could not parse InPreD ID from {self.vcf}")
        self.patient_id = match.group("patient_id")
        self.sample_type = match.group("sample_type")

        # Validate sample type is N(ormal) or included in tumor_sample_types
        if self.sample_type != "N":
            if self.sample_type not in tumor_sample_types:
                raise InvalidSampleType(sample_type=self.sample_type)
            # Reset any sample type in tumor_sample_types with T
            if self.sample_type != "T":
                logger.warning(
                    f"sample type code {self.sample_type} for {self.vcf} will be replaced with T"
                )
            self.sample_type = "T"

    def __eq__(self, other):
        """
        Compare to other class instance.
        """
        if not isinstance(other, Vcf):
            return NotImplemented
        return (
            self.patient_id == other.patient_id
            and self.sample_type == other.sample_type
            and self.vcf == other.vcf
        )

    def row(self):
        """
        Return small variant VCF list row as a one-row polars dataframe.
        """
        return polars.DataFrame({"vcf": [self.vcf], "sample_type": [self.sample_type]})
100644 index 0000000..e69de29 diff --git a/tests/test_data/update_small_variant_vcf_list_main/sample_is_control/TSO500_vcf_list_expected.tsv b/tests/test_data/update_small_variant_vcf_list_main/sample_is_control/TSO500_vcf_list_expected.tsv new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf b/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list.tsv b/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list.tsv new file mode 100644 index 0000000..166d2a7 --- /dev/null +++ b/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list.tsv @@ -0,0 +1 @@ +tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N diff --git a/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list_expected.tsv b/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list_expected.tsv new file mode 100644 index 0000000..166d2a7 --- /dev/null +++ b/tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/TSO500_vcf_list_expected.tsv @@ -0,0 +1 @@ +tests/test_data/update_small_variant_vcf_list_main/skip_existing_vcf/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N diff --git a/tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf 
b/tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/TSO500_vcf_list_expected.tsv b/tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/TSO500_vcf_list_expected.tsv new file mode 100644 index 0000000..91a1e5c --- /dev/null +++ b/tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/TSO500_vcf_list_expected.tsv @@ -0,0 +1 @@ +tests/test_data/update_small_variant_vcf_list_main/small_variant_vcf_list_does_not_exist/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf N diff --git a/tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf b/tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/LocalApp/Results/IPH0001-D01-N01-A01/IPH0001-D01-N01-A01_MergedSmallVariants.genome.vcf new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/TSO500_vcf_list.tsv b/tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/TSO500_vcf_list.tsv new file mode 100644 index 0000000..e69de29 diff --git a/tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/TSO500_vcf_list_expected.tsv b/tests/test_data/update_small_variant_vcf_list_main/successfully_update_small_variant_vcf_list/TSO500_vcf_list_expected.tsv new file mode 100644 index 0000000..682e578 --- /dev/null +++ 
"""
update small variant vcf list subpackage main module unit tests.
"""

import filecmp
import os
import unittest
from contextlib import nullcontext, suppress
from os import path

import polars
import pytest

from tsoppy.update_small_variant_vcf_list.main import InvalidSampleType, Vcf, VcfList

# Define path to test data - cannot be absolute due to different paths locally and in CI
test_data_dir = "tests/test_data/update_small_variant_vcf_list_main"

# test constants
glob_pattern = "**/Results/**/*_MergedSmallVariants.genome.vcf"
tumor_sample_types = "C,D,d,L,M,P,p,R,r,T,X"
# Vcf expects the parsed set; VcfList expects the comma-separated string.
tumor_sample_type_set = set(tumor_sample_types.split(","))
inpred_id_regex = (
    r"(?P<patient_id>\D{3}\d{4})-\D\d{2}-(?P<sample_type>\D)\d{2}-\D\d{2}.*.vcf$"
)


def _update_case(name, directory, vcf_list="TSO500_vcf_list.tsv"):
    """
    Build one VcfList.update test case for a test data sub-directory.

    vcf_list is the file name of the input list inside the directory, or None
    to run without an input list.
    """
    case_dir = path.join(test_data_dir, directory)
    return {
        "name": name,
        "results_dir": case_dir,
        "glob_pattern": glob_pattern,
        "vcf_list": path.join(case_dir, vcf_list) if vcf_list else None,
        "inpred_id_regex": inpred_id_regex,
        "tumor_sample_types": tumor_sample_types,
        "output": path.join(case_dir, "TSO500_vcf_list_updated.tsv"),
        "expected": path.join(case_dir, "TSO500_vcf_list_expected.tsv"),
    }


class TestVcfList(unittest.TestCase):
    def test_update(self):
        test_cases = [
            _update_case(
                "successfully update small variant vcf list",
                "successfully_update_small_variant_vcf_list",
            ),
            _update_case(
                "create new small variant vcf list",
                "create_new_small_variant_vcf_list",
                vcf_list=None,
            ),
            # The referenced input list is deliberately absent from test data
            _update_case(
                "small variant vcf list does not exist",
                "small_variant_vcf_list_does_not_exist",
            ),
            _update_case("skip existing vcf", "skip_existing_vcf"),
            _update_case("sample is control", "sample_is_control"),
        ]

        for test_case in test_cases:
            with self.subTest(msg=test_case["name"]):
                got = VcfList(
                    test_case["results_dir"],
                    test_case["glob_pattern"],
                    test_case["vcf_list"],
                    test_case["inpred_id_regex"],
                    test_case["tumor_sample_types"],
                    test_case["output"],
                )
                try:
                    got.update()
                    # shallow=False forces a content comparison
                    self.assertTrue(
                        filecmp.cmp(
                            test_case["output"], test_case["expected"], shallow=False
                        )
                    )
                finally:
                    # Remove the output even when the comparison fails
                    with suppress(FileNotFoundError):
                        os.remove(test_case["output"])


class TestVcf(unittest.TestCase):
    def test_init(self):
        test_cases = [
            {
                "name": "include sample",
                "vcf": "IPH0001-D01-T01-A01_MergedSmallVariants.genome.vcf",
                "inpred_id_regex": inpred_id_regex,
                "tumor_sample_types": tumor_sample_type_set,
                "exception": nullcontext(),
                "patient_id": "IPH0001",
                "sample_type": "T",
            },
            {
                "name": "inpred id is not parsable",
                "vcf": "IPH0001D01-T01-A01_MergedSmallVariants.genome.vcf",
                "inpred_id_regex": inpred_id_regex,
                "tumor_sample_types": tumor_sample_type_set,
                "exception": pytest.raises(AttributeError),
                "patient_id": None,
                "sample_type": None,
            },
            {
                "name": "sample is neither tumor nor normal",
                "vcf": "IPH0001-D01-A01-A01_MergedSmallVariants.genome.vcf",
                "inpred_id_regex": inpred_id_regex,
                "tumor_sample_types": tumor_sample_type_set,
                "exception": pytest.raises(InvalidSampleType),
                "patient_id": "IPH0001",
                "sample_type": "A",
            },
        ]

        for test_case in test_cases:
            with self.subTest(msg=test_case["name"]):
                # For the failing cases the constructor raises and the
                # attribute checks below are never reached.
                with test_case["exception"]:
                    got = Vcf(
                        test_case["vcf"],
                        test_case["inpred_id_regex"],
                        test_case["tumor_sample_types"],
                    )
                    self.assertEqual(got.patient_id, test_case["patient_id"])
                    self.assertEqual(got.sample_type, test_case["sample_type"])

    def test_row(self):
        test_cases = [
            {
                "name": "successfully return row",
                "vcf": "IPH0001-D01-T01-A01_MergedSmallVariants.genome.vcf",
                "inpred_id_regex": inpred_id_regex,
                "patient_id": "IPH0001",
                "sample_type": "T",
                "tumor_sample_types": tumor_sample_type_set,
                "expected": polars.DataFrame(
                    {
                        "vcf": ["IPH0001-D01-T01-A01_MergedSmallVariants.genome.vcf"],
                        "sample_type": ["T"],
                    }
                ),
            },
        ]

        for test_case in test_cases:
            with self.subTest(msg=test_case["name"]):
                vcf = Vcf(
                    test_case["vcf"],
                    test_case["inpred_id_regex"],
                    test_case["tumor_sample_types"],
                )
                got = vcf.row()
                self.assertTrue(got.equals(test_case["expected"]))