29 commits
- 45ea34d: add new attn (Jul 21, 2025)
- d0c4cba: fix imports (Jul 23, 2025)
- 32c18cc: jul 24 changes (Jul 26, 2025)
- 3fb558c: works but ooms? (sumo43, Jul 26, 2025)
- b508dea: temp remove vision encoder (sumo43, Jul 26, 2025)
- 71839a6: final (sumo43, Jul 26, 2025)
- 7fde526: Merge branch 'dev-updated' into mistral-small (teknium1, Jul 29, 2025)
- 7e50705: add mistral conversion (Jul 31, 2025)
- 8b3fef8: fix checkpoint loading, add finetuning script mirroring qwen (Jul 31, 2025)
- 0d43ccb: cleanup of mistral3 code (Jul 31, 2025)
- d1d7673: pt 1 multimodal sample packing preprocessing func (Aug 1, 2025)
- e3a0056: add packing with images (Aug 4, 2025)
- dc8f6cd: hf preprocess instead of mistral (Aug 5, 2025)
- cba826e: add back vision encoder, make change to preprocess (we now only keep … (Aug 14, 2025)
- ad0e63e: add multimodal packed ds, update to conversion script (Aug 14, 2025)
- c11f1dd: add instructions, set better default configs (Aug 14, 2025)
- 46cb25e: limit for testing preproc multimodal (Aug 14, 2025)
- a9fedf5: update readme (Aug 14, 2025)
- b486534: add interleaved packed ds (Aug 14, 2025)
- 84b2b7c: add interleaved text-image and textonly preprocess script & functiona… (Aug 15, 2025)
- 8c079d7: bugfix: limit keyword in multimodal data prprocess script (Aug 18, 2025)
- 24d6a7c: add conversion script back for mistral (Aug 20, 2025)
- 6968436: fix freqs_cis bug. add more configs (Aug 21, 2025)
- f252e0d: update gitignore (Aug 21, 2025)
- 220d7c8: small changes to scripts (Aug 21, 2025)
- 0a9f59b: remove junk and update gitignore (Aug 21, 2025)
- f1e4890: fix VLM embedding with TP (Aug 21, 2025)
- 2761c2c: temp fix for vision encoder loading in TP context (Aug 25, 2025)
- b8a4c0b: nvidia VLM dataset support (Aug 27, 2025)
14 changes: 14 additions & 0 deletions .gitignore
@@ -42,3 +42,17 @@ Sessionx.vim

# macOS dir files
.DS_Store

# Ignore everything inside scripts/
scripts/*

# Keep Python files in scripts/
!scripts/*.py

# Keep contents of these subdirectories
!scripts/example/
!scripts/example/**
!scripts/generate/
!scripts/generate/**
!scripts/estimate/
!scripts/estimate/**
23 changes: 23 additions & 0 deletions README.md
@@ -158,6 +158,29 @@ srun torchrun --nnodes 2

If your GPU count per node is not 8, adjust `--nproc_per_node` in the `torchrun` command and `#SBATCH --gpus-per-task` in the SBATCH command section.

## (NOUS) Training with sample packing and multimodality

### Training Qwen3-8B with sample packing
To preprocess and pack a text-only chat dataset, run `scripts/preprocess_data.py`:
```
python3 scripts/preprocess_data.py --dataset NousResearch/Hermes-3-Dataset --tokenizer Qwen/Qwen3-8B --chat --pack-to-sequence-length 8000 --split "train[:1000]" --save-to-disk ./dataset
```

Qwen3-8B can be trained using this dataset:
```
CONFIG_FILE="./torchtitan/models/qwen3/train_configs/qwen3_8b_finetuning.toml" ./run_train.sh
```
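Sample packing concatenates many short tokenized examples into fixed-length sequences so no compute is wasted on padding. The sketch below is only an illustration of the idea, not the repo's actual `preprocess_data.py` logic; the function name and boundary format are hypothetical:

```python
def pack_samples(token_lists, max_len):
    """Greedily concatenate tokenized samples into packs of at most max_len
    tokens, recording each sample's (start, end) boundaries so attention can
    later be masked per-document within a pack."""
    packs, boundaries = [], []
    current, cur_bounds, used = [], [], 0
    for tokens in token_lists:
        if len(tokens) > max_len:
            tokens = tokens[:max_len]  # truncate over-long samples
        # start a new pack when the next sample would not fit
        if used + len(tokens) > max_len and current:
            packs.append(current)
            boundaries.append(cur_bounds)
            current, cur_bounds, used = [], [], 0
        cur_bounds.append((used, used + len(tokens)))
        current = current + tokens
        used += len(tokens)
    if current:
        packs.append(current)
        boundaries.append(cur_bounds)
    return packs, boundaries
```

The recorded boundaries are what allow a block-diagonal attention mask, so packed samples do not attend to each other.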

### Training Mistral Small 3.1 with multimodal sample packing
To preprocess and pack a multimodal chat dataset, run `scripts/preprocess_multimodal_data.py`:
```
python3 scripts/preprocess_multimodal_data.py --dataset /home/shared/datasets/cambrian_sample.json --preprocessor mistralai/Mistral-Small-3.1-24B-Instruct-2503 --chat --pack-to-sequence-length 8000 --split "train" --save-to-disk ./multimodal_dataset --limit 1000
```

Mistral Small 3.1 can be trained using this dataset:
```
CONFIG_FILE="./torchtitan/models/mistral3/train_configs/mistral24b_finetuning.toml" ./run_train.sh
```

## Citation

59 changes: 59 additions & 0 deletions scripts/convert.py
@@ -0,0 +1,59 @@
import re

from datasets import load_dataset
from datasets.utils.info_utils import VerificationMode

# Load the vqa_8 split of the NVIDIA Llama-Nemotron VLM dataset, skipping
# checksum verification
ds = load_dataset(
    "nvidia/Llama-Nemotron-VLM-Dataset-v1",
    verification_mode=VerificationMode.NO_CHECKS,
    split="vqa_8",
)


def process_conversation(row):
    image_path = row["image"]
    original_conv = row["conversations"]

    messages = [
        {
            "role": "system",
            "content": [{"type": "text", "text": "You are a helpful assistant."}],
        }
    ]

    # The conversation alternates starting with the human turn, and the
    # <image> placeholder appears only in the first human message
    for i, turn in enumerate(original_conv):
        role = "user" if turn["from"] == "human" else "assistant"
        value = turn["value"]

        if role == "user" and i == 0 and "<image>" in value:
            # Split around <image>, keeping the delimiter so text and image
            # parts are interleaved in their original order
            parts = re.split(r"(\n?<image>\n?)", value)
            content = []
            for part in parts:
                if re.match(r"\n?<image>\n?", part):
                    content.append(
                        {"type": "image", "path": "./ChartQA Dataset/" + image_path}
                    )
                elif part.strip():
                    content.append({"type": "text", "text": part.strip()})
        else:
            content = [{"type": "text", "text": value.strip()}]

        messages.append({"role": role, "content": content})

    return {"conversations": messages}


# Apply the transformation and keep only the new "conversations" column
new_ds = ds.map(process_conversation, remove_columns=ds.column_names)

# Optionally, push to the Hugging Face Hub instead:
# new_ds.push_to_hub("new_dataset_name")
new_ds.save_to_disk("ChartQA_Subset")

# print(new_ds[0])
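The `re.split` pattern in `process_conversation` uses a capturing group, so the `<image>` delimiter itself is kept in the split result; that is what lets the loop interleave image and text parts in order. A small self-contained check of that behavior (the sample `value` string is made up for illustration):

```python
import re

value = "Describe the chart.\n<image>\nWhat trend do you see?"
# Capturing group keeps the matched delimiter in the output list
parts = re.split(r"(\n?<image>\n?)", value)
content = []
for part in parts:
    if re.match(r"\n?<image>\n?", part):
        content.append({"type": "image"})
    elif part.strip():
        content.append({"type": "text", "text": part.strip()})
# content now interleaves text and image entries in their original order
```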