From e339a93a113e5be1b0ffb14be14e3919df39d2eb Mon Sep 17 00:00:00 2001 From: Barathi Subramanian <79915093+barathi-1993@users.noreply.github.com> Date: Wed, 4 Feb 2026 19:21:29 -0800 Subject: [PATCH 1/4] Add files via upload --- src/thunder/config/dataset/STARC_9.yaml | 42 +++++++++++++++++++++++++ 1 file changed, 42 insertions(+) create mode 100644 src/thunder/config/dataset/STARC_9.yaml diff --git a/src/thunder/config/dataset/STARC_9.yaml b/src/thunder/config/dataset/STARC_9.yaml new file mode 100644 index 0000000..0ed2eda --- /dev/null +++ b/src/thunder/config/dataset/STARC_9.yaml @@ -0,0 +1,42 @@ +dataset_name: STARC_9 +nb_classes: 9 +base_data_folder: ${oc.env:THUNDER_BASE_DATA_FOLDER}/datasets/ +compatible_tasks: ["adversarial_attack", "alignment_scoring", "image_retrieval", "knn", "linear_probing", "pre_computing_embeddings", "simple_shot", "transformation_invariance", "zero_shot_vlm"] +nb_train_samples: 630000 +nb_val_samples: 540000 +nb_test_samples: 20000 +md5sum: "2a238a6340b693cd2b10d15f6afa2053" +image_sizes: [[256, 256]] +mpp: 0.5 +cancer_type: colorectal +classes: ["ADI", "LYM", "MUC", "MUS", "NCS", "NOR", "BLD", "FCT", "TUM"] +class_to_id: + ADI: 0 + LYM: 1 + MUC: 2 + MUS: 3 + NCS: 4 + NOR: 5 + BLD: 6 + FCT: 7 + TUM: 8 +id_to_class: + 0: ADI + 1: LYM + 2: MUC + 3: MUS + 4: NCS + 5: NOR + 6: BLD + 7: FCT + 8: TUM +id_to_classname: # From KEEP paper + 0: adipose tissue + 1: lymphoid tissue + 2: mucin + 3: muscle + 4: necrosis + 5: normal mucosa + 6: blood + 7: fibroconnective tissue + 8: tumor \ No newline at end of file From ce6efeb4c9ea5c93773cd20dda0498ba725f77f9 Mon Sep 17 00:00:00 2001 From: Barathi Subramanian <79915093+barathi-1993@users.noreply.github.com> Date: Tue, 24 Mar 2026 19:43:50 -0700 Subject: [PATCH 2/4] Update STARC_9.yaml --- src/thunder/config/dataset/STARC_9.yaml | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/src/thunder/config/dataset/STARC_9.yaml b/src/thunder/config/dataset/STARC_9.yaml index 0ed2eda..43a5c40 100644 --- a/src/thunder/config/dataset/STARC_9.yaml +++ b/src/thunder/config/dataset/STARC_9.yaml @@ -3,8 +3,8 @@ nb_classes: 9 base_data_folder: ${oc.env:THUNDER_BASE_DATA_FOLDER}/datasets/ compatible_tasks: ["adversarial_attack", "alignment_scoring", "image_retrieval", "knn", "linear_probing", "pre_computing_embeddings", "simple_shot", "transformation_invariance", "zero_shot_vlm"] nb_train_samples: 630000 -nb_val_samples: 540000 -nb_test_samples: 20000 +nb_val_samples: 20000 +nb_test_samples: 54000 md5sum: "2a238a6340b693cd2b10d15f6afa2053" image_sizes: [[256, 256]] mpp: 0.5 @@ -39,4 +39,4 @@ id_to_classname: # From KEEP paper 5: normal mucosa 6: blood 7: fibroconnective tissue - 8: tumor \ No newline at end of file + 8: tumor From 3a91041c3e457ad5b09a73919f6218d75bea8fcd Mon Sep 17 00:00:00 2001 From: Barathi Subramanian <79915093+barathi-1993@users.noreply.github.com> Date: Tue, 24 Mar 2026 19:49:24 -0700 Subject: [PATCH 3/4] Add files via upload --- src/thunder/datasets/dataset/starc9.py | 224 +++++++++++++++++++++++++ 1 file changed, 224 insertions(+) create mode 100644 src/thunder/datasets/dataset/starc9.py diff --git a/src/thunder/datasets/dataset/starc9.py b/src/thunder/datasets/dataset/starc9.py new file mode 100644 index 0000000..127fb77 --- /dev/null +++ b/src/thunder/datasets/dataset/starc9.py @@ -0,0 +1,224 @@ +import os +import json +import zipfile +from pathlib import Path +from collections import defaultdict +from typing import Dict, List, Tuple + +from huggingface_hub import snapshot_download + + +CLASS_TO_ID = { + "ADI": 0, + "LYM": 1, + "MUC": 2, + "MUS": 3, + "NCS": 4, + "NOR": 5, + "BLD": 6, + "FCT": 7, + "TUM": 8, +} + +VALID_EXTS = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp"} + + +def download_starc9(root_folder: str) -> str: + """ + Download the STARC-9 dataset from Hugging Face and extract all zip files. + + Final split mapping: + - train: Training_data_normalized + - val: Validation_data/STANFORD-CRC-HE-VAL-SMALL + - test: Validation_data/STANFORD-CRC-HE-VAL-LARGE + + CURATED-TCGA is intentionally ignored here. + """ + dataset_root = os.path.join(root_folder, "starc_9") + + snapshot_download( + repo_id="Path2AI/STARC-9", + repo_type="dataset", + local_dir=dataset_root, + local_dir_use_symlinks=False, + ) + + extract_all_zips(dataset_root) + flatten_nested_class_dirs(dataset_root) + return dataset_root + + +def extract_all_zips(root_dir: str) -> None: + """ + Recursively extract every .zip under root_dir into a folder with the same stem. + """ + for current_root, _, files in os.walk(root_dir): + for file_name in files: + if not file_name.lower().endswith(".zip"): + continue + + zip_path = os.path.join(current_root, file_name) + extract_dir = os.path.join(current_root, Path(file_name).stem) + + if os.path.exists(extract_dir) and any(Path(extract_dir).iterdir()): + continue + + os.makedirs(extract_dir, exist_ok=True) + with zipfile.ZipFile(zip_path, "r") as zf: + zf.extractall(extract_dir) + + +def flatten_nested_class_dirs(root_dir: str) -> None: + """ + Fix common extraction issue like: + ADI/ADI/*.png + into: + ADI/*.png + """ + for split_root, class_dirs in find_candidate_class_roots(root_dir): + for class_name in class_dirs: + class_dir = Path(split_root) / class_name + nested_dir = class_dir / class_name + if nested_dir.is_dir(): + for item in nested_dir.iterdir(): + target = class_dir / item.name + if not target.exists(): + item.rename(target) + try: + nested_dir.rmdir() + except OSError: + pass + + +def find_candidate_class_roots(root_dir: str) -> List[Tuple[str, List[str]]]: + """ + Find directories that contain some/all class folders. + """ + candidates = [] + expected = set(CLASS_TO_ID.keys()) + + for current_root, dirnames, _ in os.walk(root_dir): + present = sorted([d for d in dirnames if d in expected]) + if present: + candidates.append((current_root, present)) + return candidates + + +def collect_images_from_class_root(class_root: str) -> Tuple[List[str], List[int], Dict[str, int]]: + """ + Read all images from a directory structured like: + class_root/ + ADI/ + LYM/ + ... + """ + images: List[str] = [] + labels: List[int] = [] + class_counts: Dict[str, int] = defaultdict(int) + + class_root_path = Path(class_root) + if not class_root_path.exists(): + raise FileNotFoundError(f"Class root does not exist: {class_root}") + + missing_classes = [c for c in CLASS_TO_ID if not (class_root_path / c).exists()] + if missing_classes: + raise FileNotFoundError( + f"Missing expected class folders under {class_root}: {missing_classes}" + ) + + for class_name, class_id in CLASS_TO_ID.items(): + class_dir = class_root_path / class_name + for img_path in sorted(class_dir.rglob("*")): + if img_path.is_file() and img_path.suffix.lower() in VALID_EXTS: + images.append(str(img_path.resolve())) + labels.append(class_id) + class_counts[class_name] += 1 + + return images, labels, dict(class_counts) + + +def create_splits_starc9(base_folder: str) -> Dict: + """ + Build train/val/test splits using only STANFORD validation sets. + + train = Training_data_normalized + val = Validation_data/STANFORD-CRC-HE-VAL-SMALL + test = Validation_data/STANFORD-CRC-HE-VAL-LARGE + """ + dataset_root = os.path.join(base_folder, "starc_9") + + train_root = os.path.join(dataset_root, "Training_data_normalized") + val_root = os.path.join(dataset_root, "Validation_data", "STANFORD-CRC-HE-VAL-SMALL") + test_root = os.path.join(dataset_root, "Validation_data", "STANFORD-CRC-HE-VAL-LARGE") + + train_images, train_labels, train_counts = collect_images_from_class_root(train_root) + val_images, val_labels, val_counts = collect_images_from_class_root(val_root) + test_images, test_labels, test_counts = collect_images_from_class_root(test_root) + + splits = { + "train": {"images": train_images, "labels": train_labels}, + "val": {"images": val_images, "labels": val_labels}, + "test": {"images": test_images, "labels": test_labels}, + "meta": { + "dataset_name": "STARC_9", + "class_to_id": CLASS_TO_ID, + "num_classes": len(CLASS_TO_ID), + "counts": { + "train": { + "total": len(train_images), + "per_class": train_counts, + }, + "val": { + "total": len(val_images), + "per_class": val_counts, + }, + "test": { + "total": len(test_images), + "per_class": test_counts, + }, + }, + "notes": [ + "CURATED-TCGA-CRC-HE-20K-NORMALIZED is intentionally excluded.", + "Validation uses STANFORD-CRC-HE-VAL-SMALL only.", + "Test uses STANFORD-CRC-HE-VAL-LARGE only.", + ], + }, + } + + os.makedirs(os.path.join(base_folder, "data_splits"), exist_ok=True) + out_json = os.path.join(base_folder, "data_splits", "starc_9.json") + with open(out_json, "w", encoding="utf-8") as f: + json.dump(splits, f, indent=2) + + print("\nSaved split file to:", out_json) + print("\nSample counts") + print("Train:", len(train_images)) + print("Val :", len(val_images)) + print("Test :", len(test_images)) + + print("\nPer-class counts") + print("Train:", train_counts) + print("Val :", val_counts) + print("Test :", test_counts) + + return splits + + +def main(): + """ + Edit this path before running. + """ + base_folder = "./datasets" + + print("Downloading STARC-9...") + dataset_root = download_starc9(base_folder) + print("Downloaded to:", dataset_root) + + print("\nCreating splits...") + create_splits_starc9(base_folder) + + print("\nDone.") + + +if __name__ == "__main__": + main() \ No newline at end of file From 7454a9155bf04868de205725de3ce77d87b88f9c Mon Sep 17 00:00:00 2001 From: PierreMarza Date: Wed, 1 Apr 2026 17:31:21 +0200 Subject: [PATCH 4/4] thunder-specific changes --- .../dataset/{STARC_9.yaml => starc9.yaml} | 8 +- src/thunder/datasets/__init__.py | 1 + src/thunder/datasets/data_splits.py | 4 + src/thunder/datasets/dataset/__init__.py | 1 + src/thunder/datasets/dataset/starc9.py | 221 +++++++----------- src/thunder/datasets/download.py | 5 + src/thunder/utils/constants.py | 1 + 7 files changed, 98 insertions(+), 143 deletions(-) rename src/thunder/config/dataset/{STARC_9.yaml => starc9.yaml} (86%) diff --git a/src/thunder/config/dataset/STARC_9.yaml b/src/thunder/config/dataset/starc9.yaml similarity index 86% rename from src/thunder/config/dataset/STARC_9.yaml rename to src/thunder/config/dataset/starc9.yaml index 43a5c40..762fc09 100644 --- a/src/thunder/config/dataset/STARC_9.yaml +++ b/src/thunder/config/dataset/starc9.yaml @@ -1,11 +1,11 @@ -dataset_name: STARC_9 +dataset_name: starc9 nb_classes: 9 base_data_folder: ${oc.env:THUNDER_BASE_DATA_FOLDER}/datasets/ compatible_tasks: ["adversarial_attack", "alignment_scoring", "image_retrieval", "knn", "linear_probing", "pre_computing_embeddings", "simple_shot", "transformation_invariance", "zero_shot_vlm"] nb_train_samples: 630000 -nb_val_samples: 20000 +nb_val_samples: 18000 nb_test_samples: 54000 -md5sum: "2a238a6340b693cd2b10d15f6afa2053" +md5sum: "3010519777b46827fdb16e656ed74975" image_sizes: [[256, 256]] mpp: 0.5 cancer_type: colorectal @@ -30,7 +30,7 @@ id_to_class: 6: BLD 7: FCT 8: TUM -id_to_classname: # From KEEP paper +id_to_classname: 0: adipose tissue 1: lymphoid tissue 2: mucin diff --git a/src/thunder/datasets/__init__.py b/src/thunder/datasets/__init__.py index b8a1839..04bc8a6 100644 --- a/src/thunder/datasets/__init__.py +++ b/src/thunder/datasets/__init__.py @@ -16,6 +16,7 @@ spider_colorectal, spider_skin, spider_thorax, + starc9, tcga_crc_msi, tcga_tils, tcga_uniform, diff --git a/src/thunder/datasets/data_splits.py b/src/thunder/datasets/data_splits.py index e770205..2009bfa 100644 --- a/src/thunder/datasets/data_splits.py +++ b/src/thunder/datasets/data_splits.py @@ -39,6 +39,7 @@ def generate_splits(datasets: Union[List[str], str]) -> None: "spider_colorectal", "spider_skin", "spider_thorax", + "starc9", ] elif datasets[0] == "classification": datasets = [ @@ -58,6 +59,7 @@ def generate_splits(datasets: Union[List[str], str]) -> None: "spider_colorectal", "spider_skin", "spider_thorax", + "starc9", ] elif datasets[0] == "segmentation": datasets = [ @@ -104,6 +106,7 @@ def generate_splits_for_dataset(dataset_name: str) -> None: create_splits_spider_colorectal, create_splits_spider_skin, create_splits_spider_thorax, + create_splits_starc9, create_splits_tcga_crc_msi, create_splits_tcga_tils, create_splits_tcga_uniform, @@ -128,6 +131,7 @@ def generate_splits_for_dataset(dataset_name: str) -> None: "spider_colorectal": create_splits_spider_colorectal, "spider_skin": create_splits_spider_skin, "spider_thorax": create_splits_spider_thorax, + "starc9": create_splits_starc9, # Segmentation "ocelot": create_splits_ocelot, "pannuke": create_splits_pannuke, diff --git a/src/thunder/datasets/dataset/__init__.py b/src/thunder/datasets/dataset/__init__.py index f4288b5..2308237 100644 --- a/src/thunder/datasets/dataset/__init__.py +++ b/src/thunder/datasets/dataset/__init__.py @@ -25,6 +25,7 @@ ) from .spider_skin import create_splits_spider_skin, download_spider_skin from .spider_thorax import create_splits_spider_thorax, download_spider_thorax +from .starc9 import create_splits_starc9, download_starc9 from .tcga_crc_msi import create_splits_tcga_crc_msi, download_tcga_crc_msi from .tcga_tils import create_splits_tcga_tils, download_tcga_tils from .tcga_uniform import create_splits_tcga_uniform, download_tcga_uniform diff --git a/src/thunder/datasets/dataset/starc9.py b/src/thunder/datasets/dataset/starc9.py index 127fb77..c1e5d9b 100644 --- a/src/thunder/datasets/dataset/starc9.py +++ b/src/thunder/datasets/dataset/starc9.py @@ -1,13 +1,5 @@ -import os -import json -import zipfile -from pathlib import Path -from collections import defaultdict from typing import Dict, List, Tuple -from huggingface_hub import snapshot_download - - CLASS_TO_ID = { "ADI": 0, "LYM": 1, @@ -23,7 +15,7 @@ VALID_EXTS = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp"} -def download_starc9(root_folder: str) -> str: +def download_starc9(root_folder: str) -> None: """ Download the STARC-9 dataset from Hugging Face and extract all zip files. @@ -34,77 +26,48 @@ def download_starc9(root_folder: str) -> str: CURATED-TCGA is intentionally ignored here. """ - dataset_root = os.path.join(root_folder, "starc_9") + from huggingface_hub import snapshot_download snapshot_download( repo_id="Path2AI/STARC-9", repo_type="dataset", - local_dir=dataset_root, + local_dir=root_folder, local_dir_use_symlinks=False, ) - extract_all_zips(dataset_root) - flatten_nested_class_dirs(dataset_root) - return dataset_root + extract_all_zips(root_folder) def extract_all_zips(root_dir: str) -> None: """ Recursively extract every .zip under root_dir into a folder with the same stem. """ + import os + from pathlib import Path + + from ..utils import unzip_file + for current_root, _, files in os.walk(root_dir): for file_name in files: if not file_name.lower().endswith(".zip"): continue - zip_path = os.path.join(current_root, file_name) - extract_dir = os.path.join(current_root, Path(file_name).stem) - - if os.path.exists(extract_dir) and any(Path(extract_dir).iterdir()): - continue - - os.makedirs(extract_dir, exist_ok=True) - with zipfile.ZipFile(zip_path, "r") as zf: - zf.extractall(extract_dir) - - -def flatten_nested_class_dirs(root_dir: str) -> None: - """ - Fix common extraction issue like: - ADI/ADI/*.png - into: - ADI/*.png - """ - for split_root, class_dirs in find_candidate_class_roots(root_dir): - for class_name in class_dirs: - class_dir = Path(split_root) / class_name - nested_dir = class_dir / class_name - if nested_dir.is_dir(): - for item in nested_dir.iterdir(): - target = class_dir / item.name - if not target.exists(): - item.rename(target) - try: - nested_dir.rmdir() - except OSError: - pass - - -def find_candidate_class_roots(root_dir: str) -> List[Tuple[str, List[str]]]: - """ - Find directories that contain some/all class folders. - """ - candidates = [] - expected = set(CLASS_TO_ID.keys()) + unzip_file( + os.path.join(current_root, file_name), + current_root, + ) - for current_root, dirnames, _ in os.walk(root_dir): - present = sorted([d for d in dirnames if d in expected]) - if present: - candidates.append((current_root, present)) - return candidates + # Renaming folder extracted from STANFORD-CRC-HE-VAL-LARGE-NORMALIZED.zip + if file_name == "STANFORD-CRC-HE-VAL-LARGE-NORMALIZED.zip": + os.rename( + os.path.join(current_root, "NORMALIZED"), + os.path.join(current_root, "STANFORD-CRC-HE-VAL-LARGE"), + ) -def collect_images_from_class_root(class_root: str) -> Tuple[List[str], List[int], Dict[str, int]]: +def collect_images_from_class_root( + class_root: str, +) -> Tuple[List[str], List[int], Dict[str, int]]: """ Read all images from a directory structured like: class_root/ @@ -112,9 +75,10 @@ def collect_images_from_class_root(class_root: str) -> Tuple[List[str], List[int LYM/ ... """ + from pathlib import Path + images: List[str] = [] labels: List[int] = [] - class_counts: Dict[str, int] = defaultdict(int) class_root_path = Path(class_root) if not class_root_path.exists(): @@ -132,93 +96,72 @@ def collect_images_from_class_root(class_root: str) -> Tuple[List[str], List[int if img_path.is_file() and img_path.suffix.lower() in VALID_EXTS: images.append(str(img_path.resolve())) labels.append(class_id) - class_counts[class_name] += 1 - - return images, labels, dict(class_counts) + return images, labels -def create_splits_starc9(base_folder: str) -> Dict: - """ - Build train/val/test splits using only STANFORD validation sets. - train = Training_data_normalized - val = Validation_data/STANFORD-CRC-HE-VAL-SMALL - test = Validation_data/STANFORD-CRC-HE-VAL-LARGE +def create_splits_starc9(base_folder: str, dataset_cfg: dict) -> None: """ - dataset_root = os.path.join(base_folder, "starc_9") + Generating data splits for the STARC-9 dataset. - train_root = os.path.join(dataset_root, "Training_data_normalized") - val_root = os.path.join(dataset_root, "Validation_data", "STANFORD-CRC-HE-VAL-SMALL") - test_root = os.path.join(dataset_root, "Validation_data", "STANFORD-CRC-HE-VAL-LARGE") - - train_images, train_labels, train_counts = collect_images_from_class_root(train_root) - val_images, val_labels, val_counts = collect_images_from_class_root(val_root) - test_images, test_labels, test_counts = collect_images_from_class_root(test_root) - - splits = { - "train": {"images": train_images, "labels": train_labels}, - "val": {"images": val_images, "labels": val_labels}, - "test": {"images": test_images, "labels": test_labels}, - "meta": { - "dataset_name": "STARC_9", - "class_to_id": CLASS_TO_ID, - "num_classes": len(CLASS_TO_ID), - "counts": { - "train": { - "total": len(train_images), - "per_class": train_counts, - }, - "val": { - "total": len(val_images), - "per_class": val_counts, - }, - "test": { - "total": len(test_images), - "per_class": test_counts, - }, - }, - "notes": [ - "CURATED-TCGA-CRC-HE-20K-NORMALIZED is intentionally excluded.", - "Validation uses STANFORD-CRC-HE-VAL-SMALL only.", - "Test uses STANFORD-CRC-HE-VAL-LARGE only.", - ], - }, - } - - os.makedirs(os.path.join(base_folder, "data_splits"), exist_ok=True) - out_json = os.path.join(base_folder, "data_splits", "starc_9.json") - with open(out_json, "w", encoding="utf-8") as f: - json.dump(splits, f, indent=2) - - print("\nSaved split file to:", out_json) - print("\nSample counts") - print("Train:", len(train_images)) - print("Val :", len(val_images)) - print("Test :", len(test_images)) - - print("\nPer-class counts") - print("Train:", train_counts) - print("Val :", val_counts) - print("Test :", test_counts) - - return splits - - -def main(): - """ - Edit this path before running. + :param base_folder: path to the main folder storing datasets. + :param dataset_cfg: dataset-specific config. """ - base_folder = "./datasets" + import os + + from ...utils.constants import UtilsConstants + from ...utils.utils import set_seed + from ..data_splits import ( + check_dataset, + create_few_shot_training_data, + init_dict, + save_dict, + ) - print("Downloading STARC-9...") - dataset_root = download_starc9(base_folder) - print("Downloaded to:", dataset_root) + # Setting the random seed + set_seed(UtilsConstants.DEFAULT_SEED.value) - print("\nCreating splits...") - create_splits_starc9(base_folder) + # Initializing dict + starc9_data_splits = init_dict() - print("\nDone.") + # Getting folder paths + dataset_root = os.path.join(base_folder, "starc9") + train_root = os.path.join(dataset_root, "Training_data_normalized") + val_root = os.path.join( + dataset_root, + "Validation_data", + "STANFORD-CRC-HE-VAL-SMALL", + ) + test_root = os.path.join( + dataset_root, + "Validation_data", + "STANFORD-CRC-HE-VAL-LARGE", + ) + # Collecting data + train_images, train_labels = collect_images_from_class_root(train_root) + val_images, val_labels = collect_images_from_class_root(val_root) + test_images, test_labels = collect_images_from_class_root(test_root) + + # Updating dict + starc9_data_splits["train"]["images"] = train_images + starc9_data_splits["train"]["labels"] = train_labels + starc9_data_splits["val"]["images"] = val_images + starc9_data_splits["val"]["labels"] = val_labels + starc9_data_splits["test"]["images"] = test_images + starc9_data_splits["test"]["labels"] = test_labels + + # Few-shot training data + starc9_data_splits = create_few_shot_training_data(starc9_data_splits) + + # Checking dataset characteristics + check_dataset( + starc9_data_splits, + dataset_cfg, + base_folder, + ) -if __name__ == "__main__": - main() \ No newline at end of file + # Saving dict + save_dict( + starc9_data_splits, os.path.join(base_folder, "data_splits", "starc9.json") + ) diff --git a/src/thunder/datasets/download.py b/src/thunder/datasets/download.py index b8127f0..d5efa46 100644 --- a/src/thunder/datasets/download.py +++ b/src/thunder/datasets/download.py @@ -27,6 +27,7 @@ def download_datasets(datasets: Union[List[str], str], make_splits: bool = False * spider_colorectal * spider_skin * spider_thorax + * starc9 * tcga_crc_msi * tcga_tils * tcga_uniform @@ -65,6 +66,7 @@ def download_datasets(datasets: Union[List[str], str], make_splits: bool = False "spider_colorectal", "spider_skin", "spider_thorax", + "starc9", "tcga_crc_msi", "tcga_tils", "tcga_uniform", @@ -84,6 +86,7 @@ def download_datasets(datasets: Union[List[str], str], make_splits: bool = False "spider_colorectal", "spider_skin", "spider_thorax", + "starc9", "tcga_crc_msi", "tcga_tils", "tcga_uniform", @@ -160,5 +163,7 @@ def download_dataset(dataset: str): download_spider_skin(root_folder) elif dataset == "spider_thorax": download_spider_thorax(root_folder) + elif dataset == "starc9": + download_starc9(root_folder) else: raise ValueError(f"Dataset {dataset} is not supported.") diff --git a/src/thunder/utils/constants.py b/src/thunder/utils/constants.py index 76a40bc..31633c1 100644 --- a/src/thunder/utils/constants.py +++ b/src/thunder/utils/constants.py @@ -61,6 +61,7 @@ class DatasetConstants(Enum): "spider_colorectal", "spider_skin", "spider_thorax", + "starc9", "tcga_crc_msi", "tcga_tils", "tcga_uniform",