From e339a93a113e5be1b0ffb14be14e3919df39d2eb Mon Sep 17 00:00:00 2001
From: Barathi Subramanian <79915093+barathi-1993@users.noreply.github.com>
Date: Wed, 4 Feb 2026 19:21:29 -0800
Subject: [PATCH 1/4] Add STARC_9 dataset config

---
 src/thunder/config/dataset/STARC_9.yaml | 42 +++++++++++++++++++++++++
 1 file changed, 42 insertions(+)
 create mode 100644 src/thunder/config/dataset/STARC_9.yaml

diff --git a/src/thunder/config/dataset/STARC_9.yaml b/src/thunder/config/dataset/STARC_9.yaml
new file mode 100644
index 0000000..0ed2eda
--- /dev/null
+++ b/src/thunder/config/dataset/STARC_9.yaml
@@ -0,0 +1,42 @@
+dataset_name: STARC_9
+nb_classes: 9
+base_data_folder: ${oc.env:THUNDER_BASE_DATA_FOLDER}/datasets/
+compatible_tasks: ["adversarial_attack", "alignment_scoring", "image_retrieval", "knn", "linear_probing", "pre_computing_embeddings", "simple_shot", "transformation_invariance", "zero_shot_vlm"]
+nb_train_samples: 630000
+nb_val_samples: 540000
+nb_test_samples: 20000
+md5sum: "2a238a6340b693cd2b10d15f6afa2053"
+image_sizes: [[256, 256]]
+mpp: 0.5
+cancer_type: colorectal
+classes: ["ADI", "LYM", "MUC", "MUS", "NCS", "NOR", "BLD", "FCT", "TUM"]
+class_to_id:
+  ADI: 0
+  LYM: 1
+  MUC: 2
+  MUS: 3
+  NCS: 4
+  NOR: 5
+  BLD: 6
+  FCT: 7
+  TUM: 8
+id_to_class:
+  0: ADI
+  1: LYM
+  2: MUC
+  3: MUS
+  4: NCS
+  5: NOR
+  6: BLD
+  7: FCT
+  8: TUM
+id_to_classname: # From KEEP paper
+  0: adipose tissue
+  1: lymphoid tissue 
+  2: mucin
+  3: muscle
+  4: necrosis
+  5: normal mucosa 
+  6: blood
+  7: fibroconnective tissue 
+  8: tumor
\ No newline at end of file

From ce6efeb4c9ea5c93773cd20dda0498ba725f77f9 Mon Sep 17 00:00:00 2001
From: Barathi Subramanian <79915093+barathi-1993@users.noreply.github.com>
Date: Tue, 24 Mar 2026 19:43:50 -0700
Subject: [PATCH 2/4] Fix STARC_9 val/test sample counts and add trailing newline

---
 src/thunder/config/dataset/STARC_9.yaml | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/src/thunder/config/dataset/STARC_9.yaml b/src/thunder/config/dataset/STARC_9.yaml
index 0ed2eda..43a5c40 100644
--- a/src/thunder/config/dataset/STARC_9.yaml
+++ b/src/thunder/config/dataset/STARC_9.yaml
@@ -3,8 +3,8 @@ nb_classes: 9
 base_data_folder: ${oc.env:THUNDER_BASE_DATA_FOLDER}/datasets/
 compatible_tasks: ["adversarial_attack", "alignment_scoring", "image_retrieval", "knn", "linear_probing", "pre_computing_embeddings", "simple_shot", "transformation_invariance", "zero_shot_vlm"]
 nb_train_samples: 630000
-nb_val_samples: 540000
-nb_test_samples: 20000
+nb_val_samples: 20000
+nb_test_samples: 54000
 md5sum: "2a238a6340b693cd2b10d15f6afa2053"
 image_sizes: [[256, 256]]
 mpp: 0.5
@@ -39,4 +39,4 @@ id_to_classname: # From KEEP paper
   5: normal mucosa 
   6: blood
   7: fibroconnective tissue 
-  8: tumor
\ No newline at end of file
+  8: tumor

From 3a91041c3e457ad5b09a73919f6218d75bea8fcd Mon Sep 17 00:00:00 2001
From: Barathi Subramanian <79915093+barathi-1993@users.noreply.github.com>
Date: Tue, 24 Mar 2026 19:49:24 -0700
Subject: [PATCH 3/4] Add STARC-9 download and split-generation script

---
 src/thunder/datasets/dataset/starc9.py | 224 +++++++++++++++++++++++++
 1 file changed, 224 insertions(+)
 create mode 100644 src/thunder/datasets/dataset/starc9.py

diff --git a/src/thunder/datasets/dataset/starc9.py b/src/thunder/datasets/dataset/starc9.py
new file mode 100644
index 0000000..127fb77
--- /dev/null
+++ b/src/thunder/datasets/dataset/starc9.py
@@ -0,0 +1,224 @@
+import os
+import json
+import zipfile
+from pathlib import Path
+from collections import defaultdict
+from typing import Dict, List, Tuple
+
+from huggingface_hub import snapshot_download
+
+
+CLASS_TO_ID = {
+    "ADI": 0,
+    "LYM": 1,
+    "MUC": 2,
+    "MUS": 3,
+    "NCS": 4,
+    "NOR": 5,
+    "BLD": 6,
+    "FCT": 7,
+    "TUM": 8,
+}
+
+VALID_EXTS = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp"}
+
+
+def download_starc9(root_folder: str) -> str:
+    """
+    Download the STARC-9 dataset from Hugging Face and extract all zip files.
+
+    Final split mapping:
+    - train: Training_data_normalized
+    - val:   Validation_data/STANFORD-CRC-HE-VAL-SMALL
+    - test:  Validation_data/STANFORD-CRC-HE-VAL-LARGE
+
+    CURATED-TCGA is intentionally ignored here.
+    """
+    dataset_root = os.path.join(root_folder, "starc_9")
+
+    snapshot_download(
+        repo_id="Path2AI/STARC-9",
+        repo_type="dataset",
+        local_dir=dataset_root,
+        local_dir_use_symlinks=False,
+    )
+
+    extract_all_zips(dataset_root)
+    flatten_nested_class_dirs(dataset_root)
+    return dataset_root
+
+
+def extract_all_zips(root_dir: str) -> None:
+    """
+    Recursively extract every .zip under root_dir into a folder with the same stem.
+    """
+    for current_root, _, files in os.walk(root_dir):
+        for file_name in files:
+            if not file_name.lower().endswith(".zip"):
+                continue
+
+            zip_path = os.path.join(current_root, file_name)
+            extract_dir = os.path.join(current_root, Path(file_name).stem)
+
+            if os.path.exists(extract_dir) and any(Path(extract_dir).iterdir()):
+                continue
+
+            os.makedirs(extract_dir, exist_ok=True)
+            with zipfile.ZipFile(zip_path, "r") as zf:
+                zf.extractall(extract_dir)
+
+
+def flatten_nested_class_dirs(root_dir: str) -> None:
+    """
+    Fix common extraction issue like:
+        ADI/ADI/*.png
+    into:
+        ADI/*.png
+    """
+    for split_root, class_dirs in find_candidate_class_roots(root_dir):
+        for class_name in class_dirs:
+            class_dir = Path(split_root) / class_name
+            nested_dir = class_dir / class_name
+            if nested_dir.is_dir():
+                for item in nested_dir.iterdir():
+                    target = class_dir / item.name
+                    if not target.exists():
+                        item.rename(target)
+                try:
+                    nested_dir.rmdir()
+                except OSError:
+                    pass
+
+
+def find_candidate_class_roots(root_dir: str) -> List[Tuple[str, List[str]]]:
+    """
+    Find directories that contain some/all class folders.
+    """
+    candidates = []
+    expected = set(CLASS_TO_ID.keys())
+
+    for current_root, dirnames, _ in os.walk(root_dir):
+        present = sorted([d for d in dirnames if d in expected])
+        if present:
+            candidates.append((current_root, present))
+    return candidates
+
+
+def collect_images_from_class_root(class_root: str) -> Tuple[List[str], List[int], Dict[str, int]]:
+    """
+    Read all images from a directory structured like:
+        class_root/
+            ADI/
+            LYM/
+            ...
+    """
+    images: List[str] = []
+    labels: List[int] = []
+    class_counts: Dict[str, int] = defaultdict(int)
+
+    class_root_path = Path(class_root)
+    if not class_root_path.exists():
+        raise FileNotFoundError(f"Class root does not exist: {class_root}")
+
+    missing_classes = [c for c in CLASS_TO_ID if not (class_root_path / c).exists()]
+    if missing_classes:
+        raise FileNotFoundError(
+            f"Missing expected class folders under {class_root}: {missing_classes}"
+        )
+
+    for class_name, class_id in CLASS_TO_ID.items():
+        class_dir = class_root_path / class_name
+        for img_path in sorted(class_dir.rglob("*")):
+            if img_path.is_file() and img_path.suffix.lower() in VALID_EXTS:
+                images.append(str(img_path.resolve()))
+                labels.append(class_id)
+                class_counts[class_name] += 1
+
+    return images, labels, dict(class_counts)
+
+
+def create_splits_starc9(base_folder: str) -> Dict:
+    """
+    Build train/val/test splits using only STANFORD validation sets.
+
+    train = Training_data_normalized
+    val   = Validation_data/STANFORD-CRC-HE-VAL-SMALL
+    test  = Validation_data/STANFORD-CRC-HE-VAL-LARGE
+    """
+    dataset_root = os.path.join(base_folder, "starc_9")
+
+    train_root = os.path.join(dataset_root, "Training_data_normalized")
+    val_root = os.path.join(dataset_root, "Validation_data", "STANFORD-CRC-HE-VAL-SMALL")
+    test_root = os.path.join(dataset_root, "Validation_data", "STANFORD-CRC-HE-VAL-LARGE")
+
+    train_images, train_labels, train_counts = collect_images_from_class_root(train_root)
+    val_images, val_labels, val_counts = collect_images_from_class_root(val_root)
+    test_images, test_labels, test_counts = collect_images_from_class_root(test_root)
+
+    splits = {
+        "train": {"images": train_images, "labels": train_labels},
+        "val": {"images": val_images, "labels": val_labels},
+        "test": {"images": test_images, "labels": test_labels},
+        "meta": {
+            "dataset_name": "STARC_9",
+            "class_to_id": CLASS_TO_ID,
+            "num_classes": len(CLASS_TO_ID),
+            "counts": {
+                "train": {
+                    "total": len(train_images),
+                    "per_class": train_counts,
+                },
+                "val": {
+                    "total": len(val_images),
+                    "per_class": val_counts,
+                },
+                "test": {
+                    "total": len(test_images),
+                    "per_class": test_counts,
+                },
+            },
+            "notes": [
+                "CURATED-TCGA-CRC-HE-20K-NORMALIZED is intentionally excluded.",
+                "Validation uses STANFORD-CRC-HE-VAL-SMALL only.",
+                "Test uses STANFORD-CRC-HE-VAL-LARGE only.",
+            ],
+        },
+    }
+
+    os.makedirs(os.path.join(base_folder, "data_splits"), exist_ok=True)
+    out_json = os.path.join(base_folder, "data_splits", "starc_9.json")
+    with open(out_json, "w", encoding="utf-8") as f:
+        json.dump(splits, f, indent=2)
+
+    print("\nSaved split file to:", out_json)
+    print("\nSample counts")
+    print("Train:", len(train_images))
+    print("Val  :", len(val_images))
+    print("Test :", len(test_images))
+
+    print("\nPer-class counts")
+    print("Train:", train_counts)
+    print("Val  :", val_counts)
+    print("Test :", test_counts)
+
+    return splits
+
+
+def main():
+    """
+    Edit this path before running.
+    """
+    base_folder = "./datasets"
+
+    print("Downloading STARC-9...")
+    dataset_root = download_starc9(base_folder)
+    print("Downloaded to:", dataset_root)
+
+    print("\nCreating splits...")
+    create_splits_starc9(base_folder)
+
+    print("\nDone.")
+
+
+if __name__ == "__main__":
+    main()
\ No newline at end of file

From 7454a9155bf04868de205725de3ce77d87b88f9c Mon Sep 17 00:00:00 2001
From: PierreMarza <pierre.marza@gmail.com>
Date: Wed, 1 Apr 2026 17:31:21 +0200
Subject: [PATCH 4/4] thunder-specific changes

---
 .../dataset/{STARC_9.yaml => starc9.yaml}     |   8 +-
 src/thunder/datasets/__init__.py              |   1 +
 src/thunder/datasets/data_splits.py           |   4 +
 src/thunder/datasets/dataset/__init__.py      |   1 +
 src/thunder/datasets/dataset/starc9.py        | 221 +++++++-----------
 src/thunder/datasets/download.py              |   5 +
 src/thunder/utils/constants.py                |   1 +
 7 files changed, 98 insertions(+), 143 deletions(-)
 rename src/thunder/config/dataset/{STARC_9.yaml => starc9.yaml} (86%)

diff --git a/src/thunder/config/dataset/STARC_9.yaml b/src/thunder/config/dataset/starc9.yaml
similarity index 86%
rename from src/thunder/config/dataset/STARC_9.yaml
rename to src/thunder/config/dataset/starc9.yaml
index 43a5c40..762fc09 100644
--- a/src/thunder/config/dataset/STARC_9.yaml
+++ b/src/thunder/config/dataset/starc9.yaml
@@ -1,11 +1,11 @@
-dataset_name: STARC_9
+dataset_name: starc9
 nb_classes: 9
 base_data_folder: ${oc.env:THUNDER_BASE_DATA_FOLDER}/datasets/
 compatible_tasks: ["adversarial_attack", "alignment_scoring", "image_retrieval", "knn", "linear_probing", "pre_computing_embeddings", "simple_shot", "transformation_invariance", "zero_shot_vlm"]
 nb_train_samples: 630000
-nb_val_samples: 20000
+nb_val_samples: 18000
 nb_test_samples: 54000
-md5sum: "2a238a6340b693cd2b10d15f6afa2053"
+md5sum: "3010519777b46827fdb16e656ed74975"
 image_sizes: [[256, 256]]
 mpp: 0.5
 cancer_type: colorectal
@@ -30,7 +30,7 @@ id_to_class:
   6: BLD
   7: FCT
   8: TUM
-id_to_classname: # From KEEP paper
+id_to_classname:
   0: adipose tissue
   1: lymphoid tissue 
   2: mucin
diff --git a/src/thunder/datasets/__init__.py b/src/thunder/datasets/__init__.py
index b8a1839..04bc8a6 100644
--- a/src/thunder/datasets/__init__.py
+++ b/src/thunder/datasets/__init__.py
@@ -16,6 +16,7 @@
     spider_colorectal,
     spider_skin,
     spider_thorax,
+    starc9,
     tcga_crc_msi,
     tcga_tils,
     tcga_uniform,
diff --git a/src/thunder/datasets/data_splits.py b/src/thunder/datasets/data_splits.py
index e770205..2009bfa 100644
--- a/src/thunder/datasets/data_splits.py
+++ b/src/thunder/datasets/data_splits.py
@@ -39,6 +39,7 @@ def generate_splits(datasets: Union[List[str], str]) -> None:
                 "spider_colorectal",
                 "spider_skin",
                 "spider_thorax",
+                "starc9",
             ]
         elif datasets[0] == "classification":
             datasets = [
@@ -58,6 +59,7 @@ def generate_splits(datasets: Union[List[str], str]) -> None:
                 "spider_colorectal",
                 "spider_skin",
                 "spider_thorax",
+                "starc9",
             ]
         elif datasets[0] == "segmentation":
             datasets = [
@@ -104,6 +106,7 @@ def generate_splits_for_dataset(dataset_name: str) -> None:
         create_splits_spider_colorectal,
         create_splits_spider_skin,
         create_splits_spider_thorax,
+        create_splits_starc9,
         create_splits_tcga_crc_msi,
         create_splits_tcga_tils,
         create_splits_tcga_uniform,
@@ -128,6 +131,7 @@ def generate_splits_for_dataset(dataset_name: str) -> None:
         "spider_colorectal": create_splits_spider_colorectal,
         "spider_skin": create_splits_spider_skin,
         "spider_thorax": create_splits_spider_thorax,
+        "starc9": create_splits_starc9,
         # Segmentation
         "ocelot": create_splits_ocelot,
         "pannuke": create_splits_pannuke,
diff --git a/src/thunder/datasets/dataset/__init__.py b/src/thunder/datasets/dataset/__init__.py
index f4288b5..2308237 100644
--- a/src/thunder/datasets/dataset/__init__.py
+++ b/src/thunder/datasets/dataset/__init__.py
@@ -25,6 +25,7 @@
 )
 from .spider_skin import create_splits_spider_skin, download_spider_skin
 from .spider_thorax import create_splits_spider_thorax, download_spider_thorax
+from .starc9 import create_splits_starc9, download_starc9
 from .tcga_crc_msi import create_splits_tcga_crc_msi, download_tcga_crc_msi
 from .tcga_tils import create_splits_tcga_tils, download_tcga_tils
 from .tcga_uniform import create_splits_tcga_uniform, download_tcga_uniform
diff --git a/src/thunder/datasets/dataset/starc9.py b/src/thunder/datasets/dataset/starc9.py
index 127fb77..c1e5d9b 100644
--- a/src/thunder/datasets/dataset/starc9.py
+++ b/src/thunder/datasets/dataset/starc9.py
@@ -1,13 +1,5 @@
-import os
-import json
-import zipfile
-from pathlib import Path
-from collections import defaultdict
 from typing import Dict, List, Tuple
 
-from huggingface_hub import snapshot_download
-
-
 CLASS_TO_ID = {
     "ADI": 0,
     "LYM": 1,
@@ -23,7 +15,7 @@
 VALID_EXTS = {".png", ".jpg", ".jpeg", ".tif", ".tiff", ".bmp", ".webp"}
 
 
-def download_starc9(root_folder: str) -> str:
+def download_starc9(root_folder: str) -> None:
     """
     Download the STARC-9 dataset from Hugging Face and extract all zip files.
 
@@ -34,77 +26,48 @@ def download_starc9(root_folder: str) -> str:
 
     CURATED-TCGA is intentionally ignored here.
     """
-    dataset_root = os.path.join(root_folder, "starc_9")
+    from huggingface_hub import snapshot_download
 
     snapshot_download(
         repo_id="Path2AI/STARC-9",
         repo_type="dataset",
-        local_dir=dataset_root,
+        local_dir=root_folder,
         local_dir_use_symlinks=False,
     )
 
-    extract_all_zips(dataset_root)
-    flatten_nested_class_dirs(dataset_root)
-    return dataset_root
+    extract_all_zips(root_folder)
 
 
 def extract_all_zips(root_dir: str) -> None:
     """
     Recursively extract every .zip under root_dir into a folder with the same stem.
     """
+    import os
+    from pathlib import Path
+
+    from ..utils import unzip_file
+
     for current_root, _, files in os.walk(root_dir):
         for file_name in files:
             if not file_name.lower().endswith(".zip"):
                 continue
 
-            zip_path = os.path.join(current_root, file_name)
-            extract_dir = os.path.join(current_root, Path(file_name).stem)
-
-            if os.path.exists(extract_dir) and any(Path(extract_dir).iterdir()):
-                continue
-
-            os.makedirs(extract_dir, exist_ok=True)
-            with zipfile.ZipFile(zip_path, "r") as zf:
-                zf.extractall(extract_dir)
-
-
-def flatten_nested_class_dirs(root_dir: str) -> None:
-    """
-    Fix common extraction issue like:
-        ADI/ADI/*.png
-    into:
-        ADI/*.png
-    """
-    for split_root, class_dirs in find_candidate_class_roots(root_dir):
-        for class_name in class_dirs:
-            class_dir = Path(split_root) / class_name
-            nested_dir = class_dir / class_name
-            if nested_dir.is_dir():
-                for item in nested_dir.iterdir():
-                    target = class_dir / item.name
-                    if not target.exists():
-                        item.rename(target)
-                try:
-                    nested_dir.rmdir()
-                except OSError:
-                    pass
-
-
-def find_candidate_class_roots(root_dir: str) -> List[Tuple[str, List[str]]]:
-    """
-    Find directories that contain some/all class folders.
-    """
-    candidates = []
-    expected = set(CLASS_TO_ID.keys())
+            unzip_file(
+                os.path.join(current_root, file_name),
+                current_root,
+            )
 
-    for current_root, dirnames, _ in os.walk(root_dir):
-        present = sorted([d for d in dirnames if d in expected])
-        if present:
-            candidates.append((current_root, present))
-    return candidates
+            # Renaming folder extracted from STANFORD-CRC-HE-VAL-LARGE-NORMALIZED.zip
+            if file_name == "STANFORD-CRC-HE-VAL-LARGE-NORMALIZED.zip":
+                os.rename(
+                    os.path.join(current_root, "NORMALIZED"),
+                    os.path.join(current_root, "STANFORD-CRC-HE-VAL-LARGE"),
+                )
 
 
-def collect_images_from_class_root(class_root: str) -> Tuple[List[str], List[int], Dict[str, int]]:
+def collect_images_from_class_root(
+    class_root: str,
+) -> Tuple[List[str], List[int]]:
     """
     Read all images from a directory structured like:
         class_root/
@@ -112,9 +75,10 @@ def collect_images_from_class_root(class_root: str) -> Tuple[List[str], List[int
             LYM/
             ...
     """
+    from pathlib import Path
+
     images: List[str] = []
     labels: List[int] = []
-    class_counts: Dict[str, int] = defaultdict(int)
 
     class_root_path = Path(class_root)
     if not class_root_path.exists():
@@ -132,93 +96,72 @@ def collect_images_from_class_root(class_root: str) -> Tuple[List[str], List[int
             if img_path.is_file() and img_path.suffix.lower() in VALID_EXTS:
                 images.append(str(img_path.resolve()))
                 labels.append(class_id)
-                class_counts[class_name] += 1
-
-    return images, labels, dict(class_counts)
 
+    return images, labels
 
-def create_splits_starc9(base_folder: str) -> Dict:
-    """
-    Build train/val/test splits using only STANFORD validation sets.
 
-    train = Training_data_normalized
-    val   = Validation_data/STANFORD-CRC-HE-VAL-SMALL
-    test  = Validation_data/STANFORD-CRC-HE-VAL-LARGE
+def create_splits_starc9(base_folder: str, dataset_cfg: dict) -> None:
     """
-    dataset_root = os.path.join(base_folder, "starc_9")
+    Generating data splits for the STARC-9 dataset.
 
-    train_root = os.path.join(dataset_root, "Training_data_normalized")
-    val_root = os.path.join(dataset_root, "Validation_data", "STANFORD-CRC-HE-VAL-SMALL")
-    test_root = os.path.join(dataset_root, "Validation_data", "STANFORD-CRC-HE-VAL-LARGE")
-
-    train_images, train_labels, train_counts = collect_images_from_class_root(train_root)
-    val_images, val_labels, val_counts = collect_images_from_class_root(val_root)
-    test_images, test_labels, test_counts = collect_images_from_class_root(test_root)
-
-    splits = {
-        "train": {"images": train_images, "labels": train_labels},
-        "val": {"images": val_images, "labels": val_labels},
-        "test": {"images": test_images, "labels": test_labels},
-        "meta": {
-            "dataset_name": "STARC_9",
-            "class_to_id": CLASS_TO_ID,
-            "num_classes": len(CLASS_TO_ID),
-            "counts": {
-                "train": {
-                    "total": len(train_images),
-                    "per_class": train_counts,
-                },
-                "val": {
-                    "total": len(val_images),
-                    "per_class": val_counts,
-                },
-                "test": {
-                    "total": len(test_images),
-                    "per_class": test_counts,
-                },
-            },
-            "notes": [
-                "CURATED-TCGA-CRC-HE-20K-NORMALIZED is intentionally excluded.",
-                "Validation uses STANFORD-CRC-HE-VAL-SMALL only.",
-                "Test uses STANFORD-CRC-HE-VAL-LARGE only.",
-            ],
-        },
-    }
-
-    os.makedirs(os.path.join(base_folder, "data_splits"), exist_ok=True)
-    out_json = os.path.join(base_folder, "data_splits", "starc_9.json")
-    with open(out_json, "w", encoding="utf-8") as f:
-        json.dump(splits, f, indent=2)
-
-    print("\nSaved split file to:", out_json)
-    print("\nSample counts")
-    print("Train:", len(train_images))
-    print("Val  :", len(val_images))
-    print("Test :", len(test_images))
-
-    print("\nPer-class counts")
-    print("Train:", train_counts)
-    print("Val  :", val_counts)
-    print("Test :", test_counts)
-
-    return splits
-
-
-def main():
-    """
-    Edit this path before running.
+    :param base_folder: path to the main folder storing datasets.
+    :param dataset_cfg: dataset-specific config.
     """
-    base_folder = "./datasets"
+    import os
+
+    from ...utils.constants import UtilsConstants
+    from ...utils.utils import set_seed
+    from ..data_splits import (
+        check_dataset,
+        create_few_shot_training_data,
+        init_dict,
+        save_dict,
+    )
 
-    print("Downloading STARC-9...")
-    dataset_root = download_starc9(base_folder)
-    print("Downloaded to:", dataset_root)
+    # Setting the random seed
+    set_seed(UtilsConstants.DEFAULT_SEED.value)
 
-    print("\nCreating splits...")
-    create_splits_starc9(base_folder)
+    # Initializing dict
+    starc9_data_splits = init_dict()
 
-    print("\nDone.")
+    # Getting folder paths
+    dataset_root = os.path.join(base_folder, "starc9")
+    train_root = os.path.join(dataset_root, "Training_data_normalized")
+    val_root = os.path.join(
+        dataset_root,
+        "Validation_data",
+        "STANFORD-CRC-HE-VAL-SMALL",
+    )
+    test_root = os.path.join(
+        dataset_root,
+        "Validation_data",
+        "STANFORD-CRC-HE-VAL-LARGE",
+    )
 
+    # Collecting data
+    train_images, train_labels = collect_images_from_class_root(train_root)
+    val_images, val_labels = collect_images_from_class_root(val_root)
+    test_images, test_labels = collect_images_from_class_root(test_root)
+
+    # Updating dict
+    starc9_data_splits["train"]["images"] = train_images
+    starc9_data_splits["train"]["labels"] = train_labels
+    starc9_data_splits["val"]["images"] = val_images
+    starc9_data_splits["val"]["labels"] = val_labels
+    starc9_data_splits["test"]["images"] = test_images
+    starc9_data_splits["test"]["labels"] = test_labels
+
+    # Few-shot training data
+    starc9_data_splits = create_few_shot_training_data(starc9_data_splits)
+
+    # Checking dataset characteristics
+    check_dataset(
+        starc9_data_splits,
+        dataset_cfg,
+        base_folder,
+    )
 
-if __name__ == "__main__":
-    main()
\ No newline at end of file
+    # Saving dict
+    save_dict(
+        starc9_data_splits, os.path.join(base_folder, "data_splits", "starc9.json")
+    )
diff --git a/src/thunder/datasets/download.py b/src/thunder/datasets/download.py
index b8127f0..d5efa46 100644
--- a/src/thunder/datasets/download.py
+++ b/src/thunder/datasets/download.py
@@ -27,6 +27,7 @@ def download_datasets(datasets: Union[List[str], str], make_splits: bool = False
         * spider_colorectal
         * spider_skin
         * spider_thorax
+        * starc9
         * tcga_crc_msi
         * tcga_tils
         * tcga_uniform
@@ -65,6 +66,7 @@ def download_datasets(datasets: Union[List[str], str], make_splits: bool = False
                 "spider_colorectal",
                 "spider_skin",
                 "spider_thorax",
+                "starc9",
                 "tcga_crc_msi",
                 "tcga_tils",
                 "tcga_uniform",
@@ -84,6 +86,7 @@ def download_datasets(datasets: Union[List[str], str], make_splits: bool = False
                 "spider_colorectal",
                 "spider_skin",
                 "spider_thorax",
+                "starc9",
                 "tcga_crc_msi",
                 "tcga_tils",
                 "tcga_uniform",
@@ -160,5 +163,7 @@ def download_dataset(dataset: str):
         download_spider_skin(root_folder)
     elif dataset == "spider_thorax":
         download_spider_thorax(root_folder)
+    elif dataset == "starc9":
+        download_starc9(root_folder)
     else:
         raise ValueError(f"Dataset {dataset} is not supported.")
diff --git a/src/thunder/utils/constants.py b/src/thunder/utils/constants.py
index 76a40bc..31633c1 100644
--- a/src/thunder/utils/constants.py
+++ b/src/thunder/utils/constants.py
@@ -61,6 +61,7 @@ class DatasetConstants(Enum):
         "spider_colorectal",
         "spider_skin",
         "spider_thorax",
+        "starc9",
         "tcga_crc_msi",
         "tcga_tils",
         "tcga_uniform",