From 2f887f39e13f893d6a25c1ce22f0ae293c40ffbb Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Tue, 8 Jul 2025 15:05:15 +0200 Subject: [PATCH 1/5] fix: warmstart configs (avoid hardcoded paths) --- .../training/config_lorem_ipsum_long_fsdp1_warmstart.yaml | 2 +- .../training/config_lorem_ipsum_long_fsdp2_warmstart.yaml | 7 +++---- tutorials/warmstart/configs/warmstart_config.yaml | 2 +- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/config_files/training/config_lorem_ipsum_long_fsdp1_warmstart.yaml b/config_files/training/config_lorem_ipsum_long_fsdp1_warmstart.yaml index 98b66edc5..7235950a0 100644 --- a/config_files/training/config_lorem_ipsum_long_fsdp1_warmstart.yaml +++ b/config_files/training/config_lorem_ipsum_long_fsdp1_warmstart.yaml @@ -67,7 +67,7 @@ settings: variant_key: last_step_from_checkpoint_path config: checkpoint_path: ${settings.warmstart_checkpoint_paths.model_checkpoint_path} - warmstart_checkpoint_paths: ${warmstart_env:checkpoint_paths} + warmstart_checkpoint_paths: ${warmstart_env:checkpoint_paths} # use modalities warmstart [..] --last_checkpoint_info_file_path [..] 
collate_fn: component_key: collate_fn diff --git a/config_files/training/config_lorem_ipsum_long_fsdp2_warmstart.yaml b/config_files/training/config_lorem_ipsum_long_fsdp2_warmstart.yaml index c502acae9..d9f5c3aae 100644 --- a/config_files/training/config_lorem_ipsum_long_fsdp2_warmstart.yaml +++ b/config_files/training/config_lorem_ipsum_long_fsdp2_warmstart.yaml @@ -67,8 +67,7 @@ settings: variant_key: last_step_from_checkpoint_path config: checkpoint_path: ${settings.warmstart_checkpoint_paths.checkpoint_folder_path} - warmstart_checkpoint_paths: # ${warmstart_env:checkpoint_paths} - checkpoint_folder_path: /raid/fromm/modalities/data/checkpoints/2025-04-16__12-40-51_6dcbb1a0/eid_2025-04-16__12-40-51_6dcbb1a0-seen_steps_32-seen_tokens_65536-target_steps_162-target_tokens_331776 + warmstart_checkpoint_paths: ${warmstart_env:checkpoint_paths} # use modalities warmstart [..] --last_checkpoint_info_file_path [..] collate_fn: component_key: collate_fn @@ -259,8 +258,8 @@ model_raw: component_key: model variant_key: gpt2 config: - use_meta_device: true - use_weight_tying: false + use_meta_device: false + use_weight_tying: true sample_key: ${settings.referencing_keys.sample_key} poe_type: NOPE sequence_length: ${settings.step_profile.sequence_length} diff --git a/tutorials/warmstart/configs/warmstart_config.yaml b/tutorials/warmstart/configs/warmstart_config.yaml index 199c88be1..e546ed777 100644 --- a/tutorials/warmstart/configs/warmstart_config.yaml +++ b/tutorials/warmstart/configs/warmstart_config.yaml @@ -50,7 +50,7 @@ settings: variant_key: last_step_from_checkpoint_path config: checkpoint_path: ${settings.warmstart_checkpoint_paths.checkpoint_folder_path} - warmstart_checkpoint_paths: ${warmstart_env:checkpoint_paths} + warmstart_checkpoint_paths: ${warmstart_env:checkpoint_paths} # use modalities warmstart [..] --last_checkpoint_info_file_path [..] 
collate_fn: component_key: collate_fn From b2d197ca1e0f07d2a339ea67a43a593d70b6cd33 Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Tue, 8 Jul 2025 15:52:45 +0200 Subject: [PATCH 2/5] chore: migrate to torch==2.7.1 and flash-attn==2.8.0.post2 --- .github/workflows/build_and_deploy_documentation.yml | 2 +- .github/workflows/tests_full.yml | 4 ++-- README.md | 6 +++--- pyproject.toml | 2 +- 4 files changed, 7 insertions(+), 7 deletions(-) diff --git a/.github/workflows/build_and_deploy_documentation.yml b/.github/workflows/build_and_deploy_documentation.yml index c6595f53d..32096ed4b 100644 --- a/.github/workflows/build_and_deploy_documentation.yml +++ b/.github/workflows/build_and_deploy_documentation.yml @@ -25,7 +25,7 @@ jobs: run: | sudo apt-get update sudo apt-get install git -y - python -m pip install torch==2.6.0 + python -m pip install torch==2.7.1 python -m pip install --upgrade pip setuptools wheel export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE python -m pip install -e . diff --git a/.github/workflows/tests_full.yml b/.github/workflows/tests_full.yml index 72deb4490..a1326c62f 100644 --- a/.github/workflows/tests_full.yml +++ b/.github/workflows/tests_full.yml @@ -23,11 +23,11 @@ jobs: sudo apt-get update sudo apt-get install curl -y # required by coveralls sudo apt-get install git -y - python -m pip install torch==2.6.0 + python -m pip install torch==2.7.1 python -m pip install --upgrade pip setuptools wheel export FLASH_ATTENTION_SKIP_CUDA_BUILD=TRUE python -m pip install ninja # Lowers compilation time of flash attention significantly - python -m pip install flash-attn==2.7.4.post1 --no-build-isolation + python -m pip install flash-attn==2.8.0.post2 --no-build-isolation python -m pip install -e .[tests] - name: Run tests run: | diff --git a/README.md b/README.md index 1e6bdd569..fc90a57fe 100644 --- a/README.md +++ b/README.md @@ -42,9 +42,9 @@ conda create -n modalities python=3.11 conda activate modalities # install PyTorch, Ninja and Flash 
Attention (mandatory) -pip install torch==2.6.0 +pip install torch==2.7.1 pip install ninja # Lowers compilation time of flash attention significantly -pip install flash-attn==2.7.4.post1 --no-build-isolation +pip install flash-attn==2.8.0.post2 --no-build-isolation ``` ### Option 1: Installation from source @@ -82,7 +82,7 @@ uv venv --seed --python 3.11 --prompt modalities source .venv/bin/activate uv pip install torch uv pip install ninja -uv pip install --no-build-isolation flash-attn==2.7.4.post1 +uv pip install --no-build-isolation flash-attn==2.8.0.post2 # for developer: use [tests,linting] and install pre-commit hooks uv pip install -e .[tests,linting] pre-commit install --install-hooks diff --git a/pyproject.toml b/pyproject.toml index 5a3c84bf1..17bc6ee0e 100644 --- a/pyproject.toml +++ b/pyproject.toml @@ -6,7 +6,7 @@ description = "Modalities, a PyTorch-native framework for distributed and reprod readme = "README.md" dependencies = [ "numpy<2.0", - "torch==2.6.0", + "torch==2.7.1", "packaging", "tqdm", "pyyaml", From e2a21c417d6d64ecc51594a8ee24ce394fda6e2e Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Thu, 10 Jul 2025 16:12:43 +0200 Subject: [PATCH 3/5] chore: increase wandb init timeout --- src/modalities/__main__.py | 2 +- .../logging_broker/subscriber_impl/results_subscriber.py | 7 ++++++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/src/modalities/__main__.py b/src/modalities/__main__.py index 6e6018603..c6f08a97b 100644 --- a/src/modalities/__main__.py +++ b/src/modalities/__main__.py @@ -580,7 +580,7 @@ def build_components(self, components_model_type: Type[BaseModel]) -> BaseModel: return components def run(self, components: TrainingComponentsInstantiationModel): - """Entrypoint fo running the training process. + """Entrypoint for running the training process. We pass in a TrainingComponentsInstantiationModel, which is a pydantic model that contains all the components needed for the training process. 
diff --git a/src/modalities/logging_broker/subscriber_impl/results_subscriber.py b/src/modalities/logging_broker/subscriber_impl/results_subscriber.py index 05f3c2fd3..aa9aed58f 100644 --- a/src/modalities/logging_broker/subscriber_impl/results_subscriber.py +++ b/src/modalities/logging_broker/subscriber_impl/results_subscriber.py @@ -70,7 +70,12 @@ def __init__( with open(config_file_path, "r", encoding="utf-8") as file: config = yaml.safe_load(file) self.run = wandb.init( - project=project, name=experiment_id, mode=mode.value.lower(), dir=logging_directory, config=config + project=project, + name=experiment_id, + mode=mode.value.lower(), + dir=logging_directory, + config=config, + settings=wandb.Settings(init_timeout=120), ) self.run.log_artifact(config_file_path, name=f"config_{wandb.run.id}", type="config") From 4b630c1db95cad85dae19bf51114a281d786a356 Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Thu, 10 Jul 2025 16:50:38 +0200 Subject: [PATCH 4/5] fix: warmstart configs (reset weight_tying to false) --- .../training/config_lorem_ipsum_long_fsdp2_warmstart.yaml | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/config_files/training/config_lorem_ipsum_long_fsdp2_warmstart.yaml b/config_files/training/config_lorem_ipsum_long_fsdp2_warmstart.yaml index d9f5c3aae..d7b465364 100644 --- a/config_files/training/config_lorem_ipsum_long_fsdp2_warmstart.yaml +++ b/config_files/training/config_lorem_ipsum_long_fsdp2_warmstart.yaml @@ -258,8 +258,8 @@ model_raw: component_key: model variant_key: gpt2 config: - use_meta_device: false - use_weight_tying: true + use_meta_device: true + use_weight_tying: false sample_key: ${settings.referencing_keys.sample_key} poe_type: NOPE sequence_length: ${settings.step_profile.sequence_length} From f280e466bc3dac6ecb87d1e5f1c51ae9dd433ab4 Mon Sep 17 00:00:00 2001 From: Felix Stollenwerk Date: Thu, 10 Jul 2025 17:43:01 +0200 Subject: [PATCH 5/5] chore: pin torch version in uv installation instructions --- 
README.md | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/README.md b/README.md index fc90a57fe..686dc1ebe 100644 --- a/README.md +++ b/README.md @@ -80,7 +80,7 @@ pip install modalities curl -LsSf https://astral.sh/uv/install.sh | sh uv venv --seed --python 3.11 --prompt modalities source .venv/bin/activate -uv pip install torch +uv pip install torch==2.7.1 uv pip install ninja uv pip install --no-build-isolation flash-attn==2.8.0.post2 # for developer: use [tests,linting] and install pre-commit hooks