From f84238ce686e74a3b871c1963b80edd60ba24cf0 Mon Sep 17 00:00:00 2001 From: Alfred Date: Fri, 26 Sep 2025 00:16:43 +0200 Subject: [PATCH 1/7] feature: distinct seeding in data generation --- .../coinrun/generate_coinrun_dataset.py | 33 ++++++++++++------- 1 file changed, 22 insertions(+), 11 deletions(-) diff --git a/data/jasmine_data/coinrun/generate_coinrun_dataset.py b/data/jasmine_data/coinrun/generate_coinrun_dataset.py index d933dc1..779f2e1 100644 --- a/data/jasmine_data/coinrun/generate_coinrun_dataset.py +++ b/data/jasmine_data/coinrun/generate_coinrun_dataset.py @@ -11,11 +11,12 @@ import tyro import json import os -from jasmine_data.utils import save_chunks +from data.jasmine_data.utils import save_chunks @dataclass class Args: + env_name: str = "coinrun" num_episodes_train: int = 10000 num_episodes_val: int = 500 num_episodes_test: int = 500 @@ -39,7 +40,7 @@ class Args: # --- Generate episodes --- -def generate_episodes(num_episodes, split): +def generate_episodes(num_episodes, split, start_seed, env_name): episode_idx = 0 episode_metadata = [] obs_chunks = [] @@ -47,8 +48,9 @@ def generate_episodes(num_episodes, split): file_idx = 0 output_dir_split = os.path.join(args.output_dir, split) while episode_idx < num_episodes: - seed = np.random.randint(0, 10000) - env = ProcgenGym3Env(num=1, env_name="coinrun", start_level=seed) + env = ProcgenGym3Env( + num=1, env_name=env_name, start_level=start_seed + episode_idx + ) observations_seq = [] actions_seq = [] @@ -116,21 +118,30 @@ def generate_episodes(num_episodes, split): def get_action_space(): - env = ProcgenGym3Env(num=1, env_name="coinrun", start_level=0) + env = ProcgenGym3Env(num=1, env_name=args.env_name, start_level=0) return env.ac_space.eltype.n def main(): - # Set random seed and create dataset directories - np.random.seed(args.seed) # --- Generate episodes --- - train_episode_metadata = generate_episodes(args.num_episodes_train, "train") - val_episode_metadata = generate_episodes(args.num_episodes_val, "val") - test_episode_metadata = generate_episodes(args.num_episodes_test, "test") + + train_start_seed = 0 + val_start_seed = args.num_episodes_train + test_start_seed = args.num_episodes_train + args.num_episodes_val + + train_episode_metadata = generate_episodes( + args.num_episodes_train, "train", train_start_seed, args.env_name + ) + val_episode_metadata = generate_episodes( + args.num_episodes_val, "val", val_start_seed, args.env_name + ) + test_episode_metadata = generate_episodes( + args.num_episodes_test, "test", test_start_seed, args.env_name + ) # --- Save metadata --- metadata = { - "env": "coinrun", + "env": args.env_name, "num_actions": get_action_space(), "num_episodes_train": args.num_episodes_train, "num_episodes_val": args.num_episodes_val, From 2ce2a5e003da5b5d246f239dfa108221c7f739d5 Mon Sep 17 00:00:00 2001 From: Alfred Date: Fri, 26 Sep 2025 00:18:47 +0200 Subject: [PATCH 2/7] chore: remove linebreak --- data/jasmine_data/coinrun/generate_coinrun_dataset.py | 1 - 1 file changed, 1 deletion(-) diff --git a/data/jasmine_data/coinrun/generate_coinrun_dataset.py b/data/jasmine_data/coinrun/generate_coinrun_dataset.py index 779f2e1..4e5d654 100644 --- a/data/jasmine_data/coinrun/generate_coinrun_dataset.py +++ b/data/jasmine_data/coinrun/generate_coinrun_dataset.py @@ -124,7 +124,6 @@ def get_action_space(): def main(): # --- Generate episodes --- - train_start_seed = 0 val_start_seed = args.num_episodes_train test_start_seed = args.num_episodes_train + args.num_episodes_val From 699e9ecb9e1c9f0401ba54cc046026be7dc26ff3 Mon Sep 17 00:00:00 2001 From: Alfred Date: Fri, 26 Sep 2025 00:27:38 +0200 Subject: [PATCH 3/7] fix: remove magic number --- data/jasmine_data/coinrun/generate_coinrun_dataset.py | 10 ++++++---- 1 file changed, 6 insertions(+), 4 deletions(-) diff --git a/data/jasmine_data/coinrun/generate_coinrun_dataset.py b/data/jasmine_data/coinrun/generate_coinrun_dataset.py index 4e5d654..7c34a91 100644 --- a/data/jasmine_data/coinrun/generate_coinrun_dataset.py +++ b/data/jasmine_data/coinrun/generate_coinrun_dataset.py @@ -123,11 +123,13 @@ def get_action_space(): def main(): - # --- Generate episodes --- - train_start_seed = 0 - val_start_seed = args.num_episodes_train - test_start_seed = args.num_episodes_train + args.num_episodes_val + # Set random seed and create dataset directories + np.random.seed(args.seed) + train_start_seed = np.random.randint(0, 1000) + val_start_seed = train_start_seed + args.num_episodes_train + test_start_seed = val_start_seed + args.num_episodes_val + # --- Generate episodes --- train_episode_metadata = generate_episodes( args.num_episodes_train, "train", train_start_seed, args.env_name ) From 4821564678008211df8cf588cd16ce760b186423 Mon Sep 17 00:00:00 2001 From: Alfred Date: Fri, 26 Sep 2025 15:47:44 +0200 Subject: [PATCH 4/7] chore: rename seeding variable to start level --- .../coinrun/generate_coinrun_dataset.py | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/data/jasmine_data/coinrun/generate_coinrun_dataset.py b/data/jasmine_data/coinrun/generate_coinrun_dataset.py index 7c34a91..26fda31 100644 --- a/data/jasmine_data/coinrun/generate_coinrun_dataset.py +++ b/data/jasmine_data/coinrun/generate_coinrun_dataset.py @@ -40,7 +40,7 @@ class Args: # --- Generate episodes --- -def generate_episodes(num_episodes, split, start_seed, env_name): +def generate_episodes(num_episodes, split, start_level, env_name): episode_idx = 0 episode_metadata = [] obs_chunks = [] @@ -49,7 +49,7 @@ def generate_episodes(num_episodes, split, start_seed, env_name): output_dir_split = os.path.join(args.output_dir, split) while episode_idx < num_episodes: env = ProcgenGym3Env( - num=1, env_name=env_name, start_level=start_seed + episode_idx + num=1, env_name=env_name, start_level=start_level + episode_idx ) observations_seq = [] @@ -125,19 +125,19 @@ def get_action_space(): def main(): # Set random seed and create dataset directories np.random.seed(args.seed) - train_start_seed = np.random.randint(0, 1000) - val_start_seed = train_start_seed + args.num_episodes_train - test_start_seed = val_start_seed + args.num_episodes_val + train_start_level = np.random.randint(0, 1000) + val_start_level = train_start_level + args.num_episodes_train + test_start_level = val_start_level + args.num_episodes_val # --- Generate episodes --- train_episode_metadata = generate_episodes( - args.num_episodes_train, "train", train_start_seed, args.env_name + args.num_episodes_train, "train", train_start_level, args.env_name ) val_episode_metadata = generate_episodes( - args.num_episodes_val, "val", val_start_seed, args.env_name + args.num_episodes_val, "val", val_start_level, args.env_name ) test_episode_metadata = generate_episodes( - args.num_episodes_test, "test", test_start_seed, args.env_name + args.num_episodes_test, "test", test_start_level, args.env_name ) # --- Save metadata --- From 01634c8719e941af6af148497aa45b5d63778461 Mon Sep 17 00:00:00 2001 From: Mihir Mahajan Date: Tue, 30 Sep 2025 15:58:36 +0200 Subject: [PATCH 5/7] changed import --- data/jasmine_data/coinrun/generate_coinrun_dataset.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/data/jasmine_data/coinrun/generate_coinrun_dataset.py b/data/jasmine_data/coinrun/generate_coinrun_dataset.py index 26fda31..233400c 100644 --- a/data/jasmine_data/coinrun/generate_coinrun_dataset.py +++ b/data/jasmine_data/coinrun/generate_coinrun_dataset.py @@ -11,7 +11,7 @@ import tyro import json import os -from data.jasmine_data.utils import save_chunks +from jasmine_data.utils import save_chunks @dataclass From 040899cb2a227c4f44166f86dc469a81f19d45d2 Mon Sep 17 00:00:00 2001 From: Alfred Date: Wed, 1 Oct 2025 16:20:49 +0200 Subject: [PATCH 6/7] debug: distinct seeding without chunking --- .../generat_coinrun_dataset_no_chunking.py | 132 ++++++++++++++++++ 1 file changed, 132 insertions(+) create mode 100644 data/jasmine_data/coinrun/generat_coinrun_dataset_no_chunking.py diff --git a/data/jasmine_data/coinrun/generat_coinrun_dataset_no_chunking.py b/data/jasmine_data/coinrun/generat_coinrun_dataset_no_chunking.py new file mode 100644 index 0000000..f141866 --- /dev/null +++ b/data/jasmine_data/coinrun/generat_coinrun_dataset_no_chunking.py @@ -0,0 +1,132 @@ +""" +Generates a dataset of random-action CoinRun episodes. +Episodes are saved individually as memory-mapped files for efficient loading. +""" + +from dataclasses import dataclass +from typing import Sequence + +from gym3 import types_np +import numpy as np +from procgen import ProcgenGym3Env +import tyro +import json +import os +import pickle +from array_record.python.array_record_module import ArrayRecordWriter + + +@dataclass +class Args: + env_name: str = "coinrun" + num_episodes_train: int = 10000 + num_episodes_val: int = 500 + num_episodes_test: int = 500 + output_dir: str = "data/coinrun_episodes" + min_episode_length: int = 1000 + max_episode_length: int = 1000 + seed: int = 0 + + +args = tyro.cli(Args) +assert ( + args.max_episode_length >= args.min_episode_length +), "Maximum episode length must be greater than or equal to minimum episode length." + + +# --- Generate episodes --- +def generate_episodes(num_episodes, split, start_level, env_name): + episode_idx = 0 + episode_metadata = [] + file_idx = 0 + output_dir_split = os.path.join(args.output_dir, split) + os.makedirs(output_dir_split, exist_ok=True) + + total_sequence_length = 0 + while episode_idx < num_episodes: + env = ProcgenGym3Env( + num=1, env_name=env_name, start_level=start_level + episode_idx + ) + + observations_seq = [] + actions_seq = [] + + # --- Run episode --- + step_t = 0 + first_obs = True + for step_t in range(args.max_episode_length): + _, obs, first = env.observe() + action = types_np.sample(env.ac_space, bshape=(env.num,)) + env.act(action) + observations_seq.append(obs["rgb"]) + actions_seq.append(action) + if first and not first_obs: + break + first_obs = False + + if step_t + 1 >= args.min_episode_length: + episode_data = np.concatenate(observations_seq, axis=0).astype(np.uint8) + + # save as array record + episode_path = os.path.join( + output_dir_split, f"episode_{episode_idx}.array_record" + ) + writer = ArrayRecordWriter(str(episode_path), "group_size:1") + writer.write( + pickle.dumps( + {"raw_video": episode_data.tobytes(), "sequence_length": step_t + 1} + ) + ) + total_sequence_length += step_t + 1 + writer.close() + + # save episode metadata + episode_metadata.append({"path": episode_path, "length": step_t + 1}) + episode_idx += 1 + + print(f"Done generating {split} split") + return total_sequence_length + + +def get_action_space(): + env = ProcgenGym3Env(num=1, env_name=args.env_name, start_level=0) + return env.ac_space.eltype.n + + +def main(): + # Set random seed and create dataset directories + np.random.seed(args.seed) + train_start_level = np.random.randint(0, 1000) + val_start_level = train_start_level + args.num_episodes_train + test_start_level = val_start_level + args.num_episodes_val + + # --- Generate episodes --- + train_total_sequence_length = generate_episodes( + args.num_episodes_train, "train", train_start_level, args.env_name + ) + val_total_sequence_length = generate_episodes( + args.num_episodes_val, "val", val_start_level, args.env_name + ) + test_total_sequence_length = generate_episodes( + args.num_episodes_test, "test", test_start_level, args.env_name + ) + + # --- Save metadata --- + metadata = { + "env": args.env_name, + "num_actions": get_action_space(), + "num_episodes_train": args.num_episodes_train, + "num_episodes_val": args.num_episodes_val, + "num_episodes_test": args.num_episodes_test, + "total_sequence_length_train": train_total_sequence_length, + "total_sequence_length_val": val_total_sequence_length, + "total_sequence_length_test": test_total_sequence_length, + } + with open(os.path.join(args.output_dir, "metadata.json"), "w") as f: + json.dump(metadata, f) + + print(f"Done generating dataset.") + + +if __name__ == "__main__": + main() From 47c92682fd13a48cf9fcf1e538dff52dd13edfdb Mon Sep 17 00:00:00 2001 From: Alfred Date: Wed, 1 Oct 2025 17:00:49 +0200 Subject: [PATCH 7/7] debug: save as npy and array record --- ...coinrun_dataset_no_chunking_npy_arr_rec.py | 144 ++++++++++++++++++ 1 file changed, 144 insertions(+) create mode 100644 data/jasmine_data/coinrun/generat_coinrun_dataset_no_chunking_npy_arr_rec.py diff --git a/data/jasmine_data/coinrun/generat_coinrun_dataset_no_chunking_npy_arr_rec.py b/data/jasmine_data/coinrun/generat_coinrun_dataset_no_chunking_npy_arr_rec.py new file mode 100644 index 0000000..a5bfe04 --- /dev/null +++ b/data/jasmine_data/coinrun/generat_coinrun_dataset_no_chunking_npy_arr_rec.py @@ -0,0 +1,144 @@ +""" +Generates a dataset of random-action CoinRun episodes. +Episodes are saved individually as memory-mapped files for efficient loading. +""" + +from dataclasses import dataclass +from typing import Sequence + +from gym3 import types_np +import numpy as np +from procgen import ProcgenGym3Env +import tyro +import json +import os +import pickle +from array_record.python.array_record_module import ArrayRecordWriter + + +@dataclass +class Args: + env_name: str = "coinrun" + num_episodes_train: int = 10000 + num_episodes_val: int = 500 + num_episodes_test: int = 500 + output_dir: str = "data/coinrun_episodes" + min_episode_length: int = 1000 + max_episode_length: int = 1000 + seed: int = 0 + + +args = tyro.cli(Args) +assert ( + args.max_episode_length >= args.min_episode_length +), "Maximum episode length must be greater than or equal to minimum episode length." + + +# --- Generate episodes --- +def generate_episodes(num_episodes, split, start_level, env_name): + episode_idx = 0 + episode_metadata = [] + file_idx = 0 + output_dir_split_npy = os.path.join(args.output_dir, "npy", split) + output_dir_split_arr_rec = os.path.join(args.output_dir, "array_record", split) + os.makedirs(output_dir_split_npy, exist_ok=True) + os.makedirs(output_dir_split_arr_rec, exist_ok=True) + + total_sequence_length = 0 + while episode_idx < num_episodes: + env = ProcgenGym3Env( + num=1, env_name=env_name, start_level=start_level + episode_idx + ) + + observations_seq = [] + actions_seq = [] + + # --- Run episode --- + step_t = 0 + first_obs = True + for step_t in range(args.max_episode_length): + _, obs, first = env.observe() + action = types_np.sample(env.ac_space, bshape=(env.num,)) + env.act(action) + observations_seq.append(obs["rgb"]) + actions_seq.append(action) + if first and not first_obs: + break + first_obs = False + + if step_t + 1 >= args.min_episode_length: + episode_data = np.concatenate(observations_seq, axis=0).astype(np.uint8) + + # save as npy + episode_path = os.path.join( + output_dir_split_npy, f"episode_{episode_idx}.npy" + ) + np.save(episode_path, episode_data) + + # save as array record + episode_path = os.path.join( + output_dir_split_arr_rec, f"episode_{episode_idx}.array_record" + ) + writer = ArrayRecordWriter(str(episode_path), "group_size:1") + writer.write( + pickle.dumps( + {"raw_video": episode_data.tobytes(), "sequence_length": step_t + 1} + ) + ) + total_sequence_length += step_t + 1 + writer.close() + + # save episode metadata + episode_metadata.append({"path": episode_path, "length": step_t + 1}) + episode_idx += 1 + + print(f"Done generating {split} split") + return total_sequence_length + + +def get_action_space(): + env = ProcgenGym3Env(num=1, env_name=args.env_name, start_level=0) + return env.ac_space.eltype.n + + +def main(): + # Set random seed and create dataset directories + np.random.seed(args.seed) + train_start_level = np.random.randint(0, 1000) + val_start_level = train_start_level + args.num_episodes_train + test_start_level = val_start_level + args.num_episodes_val + + # --- Generate episodes --- + train_total_sequence_length = generate_episodes( + args.num_episodes_train, "train", train_start_level, args.env_name + ) + val_total_sequence_length = generate_episodes( + args.num_episodes_val, "val", val_start_level, args.env_name + ) + test_total_sequence_length = generate_episodes( + args.num_episodes_test, "test", test_start_level, args.env_name + ) + + # --- Save metadata --- + metadata = { + "env": args.env_name, + "num_actions": get_action_space(), + "num_episodes_train": args.num_episodes_train, + "num_episodes_val": args.num_episodes_val, + "num_episodes_test": args.num_episodes_test, + "total_sequence_length_train": train_total_sequence_length, + "total_sequence_length_val": val_total_sequence_length, + "total_sequence_length_test": test_total_sequence_length, + } + + with open(os.path.join(args.output_dir, "npy", "metadata.json"), "w") as f: + json.dump(metadata, f) + + with open(os.path.join(args.output_dir, "array_record", "metadata.json"), "w") as f: + json.dump(metadata, f) + + print(f"Done generating dataset.") + + +if __name__ == "__main__": + main()