Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 42 additions & 0 deletions src/thunder/config/dataset/starc9.yaml
Original file line number Diff line number Diff line change
@@ -0,0 +1,42 @@
# STARC-9 colorectal histopathology tile-classification dataset config.
dataset_name: starc9
# Nine tissue classes; see `classes` / `id_to_classname` below.
nb_classes: 9
base_data_folder: ${oc.env:THUNDER_BASE_DATA_FOLDER}/datasets/
compatible_tasks: ["adversarial_attack", "alignment_scoring", "image_retrieval", "knn", "linear_probing", "pre_computing_embeddings", "simple_shot", "transformation_invariance", "zero_shot_vlm"]
# Expected sample counts per split; validated by check_dataset at split time.
nb_train_samples: 630000
nb_val_samples: 18000
nb_test_samples: 54000
# Checksum of the generated data-splits file.
md5sum: "3010519777b46827fdb16e656ed74975"
# All tiles are 256x256 pixels.
image_sizes: [[256, 256]]
# Microns per pixel.
mpp: 0.5
cancer_type: colorectal
# Class acronyms; order must match the integer ids in `class_to_id`.
classes: ["ADI", "LYM", "MUC", "MUS", "NCS", "NOR", "BLD", "FCT", "TUM"]
# Acronym -> integer label id (must stay in sync with
# CLASS_TO_ID in src/thunder/datasets/dataset/starc9.py).
class_to_id:
  ADI: 0
  LYM: 1
  MUC: 2
  MUS: 3
  NCS: 4
  NOR: 5
  BLD: 6
  FCT: 7
  TUM: 8
# Inverse mapping: integer label id -> acronym.
id_to_class:
  0: ADI
  1: LYM
  2: MUC
  3: MUS
  4: NCS
  5: NOR
  6: BLD
  7: FCT
  8: TUM
# Human-readable class names (e.g. for zero-shot VLM prompts).
id_to_classname:
  0: adipose tissue
  1: lymphoid tissue
  2: mucin
  3: muscle
  4: necrosis
  5: normal mucosa
  6: blood
  7: fibroconnective tissue
  8: tumor
1 change: 1 addition & 0 deletions src/thunder/datasets/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
spider_colorectal,
spider_skin,
spider_thorax,
starc9,
tcga_crc_msi,
tcga_tils,
tcga_uniform,
Expand Down
4 changes: 4 additions & 0 deletions src/thunder/datasets/data_splits.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,6 +39,7 @@ def generate_splits(datasets: Union[List[str], str]) -> None:
"spider_colorectal",
"spider_skin",
"spider_thorax",
"starc9",
]
elif datasets[0] == "classification":
datasets = [
Expand All @@ -58,6 +59,7 @@ def generate_splits(datasets: Union[List[str], str]) -> None:
"spider_colorectal",
"spider_skin",
"spider_thorax",
"starc9",
]
elif datasets[0] == "segmentation":
datasets = [
Expand Down Expand Up @@ -104,6 +106,7 @@ def generate_splits_for_dataset(dataset_name: str) -> None:
create_splits_spider_colorectal,
create_splits_spider_skin,
create_splits_spider_thorax,
create_splits_starc9,
create_splits_tcga_crc_msi,
create_splits_tcga_tils,
create_splits_tcga_uniform,
Expand All @@ -128,6 +131,7 @@ def generate_splits_for_dataset(dataset_name: str) -> None:
"spider_colorectal": create_splits_spider_colorectal,
"spider_skin": create_splits_spider_skin,
"spider_thorax": create_splits_spider_thorax,
"starc9": create_splits_starc9,
# Segmentation
"ocelot": create_splits_ocelot,
"pannuke": create_splits_pannuke,
Expand Down
1 change: 1 addition & 0 deletions src/thunder/datasets/dataset/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@
)
from .spider_skin import create_splits_spider_skin, download_spider_skin
from .spider_thorax import create_splits_spider_thorax, download_spider_thorax
from .starc9 import create_splits_starc9, download_starc9
from .tcga_crc_msi import create_splits_tcga_crc_msi, download_tcga_crc_msi
from .tcga_tils import create_splits_tcga_tils, download_tcga_tils
from .tcga_uniform import create_splits_tcga_uniform, download_tcga_uniform
Expand Down
167 changes: 167 additions & 0 deletions src/thunder/datasets/dataset/starc9.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,167 @@
from typing import Dict, List, Tuple

# Class acronym -> integer label id. Must stay in sync with `class_to_id`
# in src/thunder/config/dataset/starc9.yaml.
CLASS_TO_ID = {
    "ADI": 0,  # adipose tissue
    "LYM": 1,  # lymphoid tissue
    "MUC": 2,  # mucin
    "MUS": 3,  # muscle
    "NCS": 4,  # necrosis
    "NOR": 5,  # normal mucosa
    "BLD": 6,  # blood
    "FCT": 7,  # fibroconnective tissue
    "TUM": 8,  # tumor
}

# Image file extensions accepted when scanning class folders; compared
# case-insensitively against `Path.suffix`.
VALID_EXTS = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp"}


def download_starc9(root_folder: str) -> None:
    """
    Fetch the STARC-9 dataset from the Hugging Face Hub into
    ``root_folder`` and unpack every zip archive it ships with.

    Final split mapping:
        - train: Training_data_normalized
        - val: Validation_data/STANFORD-CRC-HE-VAL-SMALL
        - test: Validation_data/STANFORD-CRC-HE-VAL-LARGE

    The CURATED-TCGA subset is deliberately left untouched.

    :param root_folder: destination directory for the dataset snapshot.
    """
    from huggingface_hub import snapshot_download

    # NOTE(review): `local_dir_use_symlinks` is deprecated (and ignored)
    # in recent huggingface_hub releases — confirm the pinned version.
    download_kwargs = dict(
        repo_id="Path2AI/STARC-9",
        repo_type="dataset",
        local_dir=root_folder,
        local_dir_use_symlinks=False,
    )
    snapshot_download(**download_kwargs)

    # All archives in the snapshot are extracted in place.
    extract_all_zips(root_folder)


def extract_all_zips(root_dir: str) -> None:
    """
    Recursively extract every ``.zip`` archive found under ``root_dir``.

    Each archive is extracted into the directory that contains it. The
    ``STANFORD-CRC-HE-VAL-LARGE-NORMALIZED.zip`` archive extracts into a
    folder named ``NORMALIZED``, which is renamed to
    ``STANFORD-CRC-HE-VAL-LARGE`` so the test-split root expected by
    ``create_splits_starc9`` exists.

    :param root_dir: directory tree to scan for zip archives.
    """
    # Fix: dropped the unused `from pathlib import Path` import.
    import os

    from ..utils import unzip_file

    for current_root, _, files in os.walk(root_dir):
        for file_name in files:
            if not file_name.lower().endswith(".zip"):
                continue

            unzip_file(
                os.path.join(current_root, file_name),
                current_root,
            )

            # The large validation archive extracts to "NORMALIZED";
            # rename it to the split folder name used downstream.
            # NOTE(review): re-running after a prior extraction will raise
            # here (source gone / target exists) — confirm whether an
            # idempotent re-download flow needs to be supported.
            if file_name == "STANFORD-CRC-HE-VAL-LARGE-NORMALIZED.zip":
                os.rename(
                    os.path.join(current_root, "NORMALIZED"),
                    os.path.join(current_root, "STANFORD-CRC-HE-VAL-LARGE"),
                )


def collect_images_from_class_root(
    class_root: str,
) -> Tuple[List[str], List[int]]:
    """
    Collect image paths and integer labels from a class-per-folder tree.

    Expected layout::

        class_root/
            ADI/
            LYM/
            ...

    :param class_root: directory containing one sub-folder per class
        acronym from ``CLASS_TO_ID``.
    :return: parallel lists of absolute image paths and class ids.
        (Fix: the annotation previously declared a three-element tuple
        including a ``Dict[str, int]``, but the function returns two
        values.)
    :raises FileNotFoundError: if ``class_root`` or any expected class
        folder is missing.
    """
    from pathlib import Path

    images: List[str] = []
    labels: List[int] = []

    class_root_path = Path(class_root)
    if not class_root_path.exists():
        raise FileNotFoundError(f"Class root does not exist: {class_root}")

    # Fail fast with the complete list of missing class folders rather
    # than erroring on the first one encountered mid-collection.
    missing_classes = [c for c in CLASS_TO_ID if not (class_root_path / c).exists()]
    if missing_classes:
        raise FileNotFoundError(
            f"Missing expected class folders under {class_root}: {missing_classes}"
        )

    for class_name, class_id in CLASS_TO_ID.items():
        class_dir = class_root_path / class_name
        # Sorted traversal keeps the generated splits deterministic.
        for img_path in sorted(class_dir.rglob("*")):
            if img_path.is_file() and img_path.suffix.lower() in VALID_EXTS:
                images.append(str(img_path.resolve()))
                labels.append(class_id)

    return images, labels


def create_splits_starc9(base_folder: str, dataset_cfg: dict) -> None:
    """
    Build and persist the train/val/test data splits for STARC-9.

    :param base_folder: path to the main folder storing datasets.
    :param dataset_cfg: dataset-specific config.
    """
    import os

    from ...utils.constants import UtilsConstants
    from ...utils.utils import set_seed
    from ..data_splits import (
        check_dataset,
        create_few_shot_training_data,
        init_dict,
        save_dict,
    )

    # Deterministic few-shot sampling.
    set_seed(UtilsConstants.DEFAULT_SEED.value)

    starc9_data_splits = init_dict()

    # Each split root is organised as one sub-folder per class acronym.
    dataset_root = os.path.join(base_folder, "starc9")
    split_roots = {
        "train": os.path.join(dataset_root, "Training_data_normalized"),
        "val": os.path.join(
            dataset_root,
            "Validation_data",
            "STANFORD-CRC-HE-VAL-SMALL",
        ),
        "test": os.path.join(
            dataset_root,
            "Validation_data",
            "STANFORD-CRC-HE-VAL-LARGE",
        ),
    }

    # Collect image paths and labels for every split.
    for split_name, split_root in split_roots.items():
        split_images, split_labels = collect_images_from_class_root(split_root)
        starc9_data_splits[split_name]["images"] = split_images
        starc9_data_splits[split_name]["labels"] = split_labels

    # Derive the few-shot training subsets from the full training split.
    starc9_data_splits = create_few_shot_training_data(starc9_data_splits)

    # Validate sample counts / class coverage against the dataset config.
    check_dataset(
        starc9_data_splits,
        dataset_cfg,
        base_folder,
    )

    save_dict(
        starc9_data_splits, os.path.join(base_folder, "data_splits", "starc9.json")
    )
5 changes: 5 additions & 0 deletions src/thunder/datasets/download.py
Original file line number Diff line number Diff line change
Expand Up @@ -27,6 +27,7 @@ def download_datasets(datasets: Union[List[str], str], make_splits: bool = False
* spider_colorectal
* spider_skin
* spider_thorax
* starc9
* tcga_crc_msi
* tcga_tils
* tcga_uniform
Expand Down Expand Up @@ -65,6 +66,7 @@ def download_datasets(datasets: Union[List[str], str], make_splits: bool = False
"spider_colorectal",
"spider_skin",
"spider_thorax",
"starc9",
"tcga_crc_msi",
"tcga_tils",
"tcga_uniform",
Expand All @@ -84,6 +86,7 @@ def download_datasets(datasets: Union[List[str], str], make_splits: bool = False
"spider_colorectal",
"spider_skin",
"spider_thorax",
"starc9",
"tcga_crc_msi",
"tcga_tils",
"tcga_uniform",
Expand Down Expand Up @@ -160,5 +163,7 @@ def download_dataset(dataset: str):
download_spider_skin(root_folder)
elif dataset == "spider_thorax":
download_spider_thorax(root_folder)
elif dataset == "starc9":
download_starc9(root_folder)
else:
raise ValueError(f"Dataset {dataset} is not supported.")
1 change: 1 addition & 0 deletions src/thunder/utils/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,7 @@ class DatasetConstants(Enum):
"spider_colorectal",
"spider_skin",
"spider_thorax",
"starc9",
"tcga_crc_msi",
"tcga_tils",
"tcga_uniform",
Expand Down