From b3888f342adb890f960186f5e44b790292e2bc48 Mon Sep 17 00:00:00 2001 From: Luke Friedrichs Date: Wed, 5 Nov 2025 15:46:56 +0100 Subject: [PATCH 1/8] fixed some weird behaivoir in deepspeed which does allow me to import moe Signed-off-by: Luke Friedrichs --- .../inference/v2/model_implementations/layer_container_base.py | 2 ++ 1 file changed, 2 insertions(+) diff --git a/deepspeed/inference/v2/model_implementations/layer_container_base.py b/deepspeed/inference/v2/model_implementations/layer_container_base.py index feb65b4a5f5d..aefd57a3742a 100644 --- a/deepspeed/inference/v2/model_implementations/layer_container_base.py +++ b/deepspeed/inference/v2/model_implementations/layer_container_base.py @@ -94,6 +94,7 @@ def __new__(cls, clsname, bases, attrs): # Check for invalid mappings if base_dependency not in all_names: + continue raise ValueError( "Target parameter \"{}\" not found in this layer. Valid targets are {}".format( base_dependency, all_names)) @@ -122,6 +123,7 @@ def __new__(cls, clsname, bases, attrs): raise ValueError( "ParametrizedList index inference can only work with a single glob: {}".format(src_name)) elif glob_count == 0: + continue raise ValueError( "Must have wildcard (*) in source name for ParametrizedList mapping: {}".format(src_name)) From cb0cde732f3250033b606352e0dd32da532c9218 Mon Sep 17 00:00:00 2001 From: Logan Adams <114770087+loadams@users.noreply.github.com> Date: Wed, 5 Nov 2025 17:22:32 -0800 Subject: [PATCH 2/8] Update version.txt after release (#7675) Signed-off-by: Luke Friedrichs --- version.txt | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/version.txt b/version.txt index 503a21deb47d..267d7e011fea 100644 --- a/version.txt +++ b/version.txt @@ -1 +1 @@ -0.18.2 +0.18.3 From e37bdefcb8d035587e18a559836157e79a10a559 Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Thu, 6 Nov 2025 11:42:22 -0800 Subject: [PATCH 3/8] [modal ci] fixes (#7676) 1. 
`modal-accelerate` now needs `uv` installed explicitly since the image changed to the 2025 one. 2. moved accelerate repo cloning into the job, since the original way was incorrect as it was caching some accelerate version and not updating it. 3. annotated how to actually test the CI work when changing the workflow, as `pull_request_target` will not run the updated .py+.yaml files. --------- Signed-off-by: Stas Bekman Signed-off-by: Luke Friedrichs --- .github/workflows/modal-accelerate.yml | 5 +++++ ci/accelerate.py | 29 +++++++++++++++++++------- 2 files changed, 27 insertions(+), 7 deletions(-) diff --git a/.github/workflows/modal-accelerate.yml b/.github/workflows/modal-accelerate.yml index 342fcd4707f4..f188e5f3e0e2 100644 --- a/.github/workflows/modal-accelerate.yml +++ b/.github/workflows/modal-accelerate.yml @@ -20,6 +20,11 @@ on: branches: - master + # you have to switch to `pull_request` if you need to change the CI job's python script, + # otherwise GH will use a master version of the CI files, ignoring the modifications in the PR - + # the other way is to use modal cli to test this job from one's host - it'd require setting up + # modal secrets + # pull_request: pull_request_target: paths-ignore: - 'docs/**' diff --git a/ci/accelerate.py b/ci/accelerate.py index f9fc09d75f19..cbf396a991f1 100644 --- a/ci/accelerate.py +++ b/ci/accelerate.py @@ -12,13 +12,10 @@ # yapf: disable image = (modal.Image .from_registry("pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel", add_python="3.10") - .run_commands("apt update && apt install -y libaio-dev") .apt_install("git") - .run_commands("uv pip install --system --compile-bytecode datasets==3.6.0") - .run_commands( - "git clone https://github.com/huggingface/accelerate && \ - uv pip install --system --compile-bytecode ./accelerate[testing]" - ) + .pip_install("uv") + # uv_pip_install already includes --compile-bytecode + .uv_pip_install("datasets==3.6.0", extra_options="--system") .pip_install_from_requirements(ROOT_PATH / 
"requirements/requirements.txt", gpu="any") .pip_install_from_requirements(ROOT_PATH / "requirements/requirements-dev.txt", gpu="any") .add_local_dir(ROOT_PATH , remote_path="/root/", copy=True) @@ -36,8 +33,26 @@ ) def pytest(): import subprocess + + cmd = "git clone https://github.com/huggingface/accelerate" + print(f"running: {cmd}") + subprocess.run( + cmd.split(), + check=True, + cwd=ROOT_PATH / ".", + ) + cmd = "uv pip install --system --compile-bytecode ./accelerate[testing]" + print(f"running: {cmd}") + subprocess.run( + cmd.split(), + check=True, + cwd=ROOT_PATH / ".", + ) + + cmd = "pytest ./accelerate/tests/deepspeed" + print(f"running: {cmd}") subprocess.run( - "pytest /accelerate/tests/deepspeed".split(), + cmd.split(), check=True, cwd=ROOT_PATH / ".", ) From 1834cbbd868e09bffba24aa5f1b3447cbd5f05ba Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Fri, 7 Nov 2025 15:25:10 -0800 Subject: [PATCH 4/8] leaf modules: explain better (#7674) add Masahiro's explanation to why that code is there. --------- Signed-off-by: Stas Bekman Signed-off-by: Luke Friedrichs --- deepspeed/runtime/zero/stage3.py | 1 + 1 file changed, 1 insertion(+) diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py index f8c146c81d2f..dc3974e1680e 100644 --- a/deepspeed/runtime/zero/stage3.py +++ b/deepspeed/runtime/zero/stage3.py @@ -1264,6 +1264,7 @@ def make_hook(params): def reduce_leaf_module_grads(module, grad_input, grad_output): for param in params: + # this takes care of grads for MoE experts that didn't participate in the current iteration/layer if param.grad is None: param.grad = torch.zeros_like(param) self.reduce_ready_partitions_and_remove_grads(param) From ad66ab294e57c1a357019b07ed220446237a96af Mon Sep 17 00:00:00 2001 From: Stas Bekman Date: Sat, 8 Nov 2025 06:05:21 -0800 Subject: [PATCH 5/8] disable nv-lightning-v100.yml cI (#7681) as we lost v100s - disable first so that it stops interfering with PRs, then port to modal. 
Signed-off-by: Luke Friedrichs --- .github/workflows/nv-lightning-v100.yml | 104 ++++++++++++------------ 1 file changed, 53 insertions(+), 51 deletions(-) diff --git a/.github/workflows/nv-lightning-v100.yml b/.github/workflows/nv-lightning-v100.yml index d31ae5569848..eeb8516a324d 100644 --- a/.github/workflows/nv-lightning-v100.yml +++ b/.github/workflows/nv-lightning-v100.yml @@ -1,51 +1,53 @@ -name: nv-lightning-v100 - -on: - workflow_dispatch: - pull_request: - paths-ignore: - - 'docs/**' - - 'blogs/**' - - 'deepspeed/inference/v2/**' - - 'tests/unit/inference/v2/**' - merge_group: - branches: [ master ] - schedule: - - cron: "0 0 * * *" - -concurrency: - group: ${{ github.workflow }}-${{ github.ref }} - cancel-in-progress: true - -jobs: - unit-tests: - runs-on: [self-hosted, nvidia, cu124, v100] - - steps: - - uses: actions/checkout@v4 - - - id: setup-venv - uses: ./.github/workflows/setup-venv - - - name: Install pytorch - run: | - pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124 - python -c "import torch; print('torch:', torch.__version__, torch)" - python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - - - name: Install deepspeed - run: | - pip install .[dev,autotuning] - ds_report - - - name: Python environment - run: | - pip list - - - name: PyTorch Lightning Tests - run: | - unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch - pip install pytorch-lightning - pip install "protobuf<4.21.0" - cd tests - pytest $PYTEST_OPTS lightning/ +# name: nv-lightning-v100 + +# disabled as the v100s are no more - need to port to modal while removing v100 + +# on: +# workflow_dispatch: +# pull_request: +# paths-ignore: +# - 'docs/**' +# - 'blogs/**' +# - 'deepspeed/inference/v2/**' +# - 'tests/unit/inference/v2/**' +# merge_group: +# branches: [ master ] +# schedule: +# - cron: "0 0 * * *" + +# concurrency: +# group: ${{ github.workflow }}-${{ github.ref }} +# 
cancel-in-progress: true + +# jobs: +# unit-tests: +# runs-on: [self-hosted, nvidia, cu124, v100] + +# steps: +# - uses: actions/checkout@v4 + +# - id: setup-venv +# uses: ./.github/workflows/setup-venv + +# - name: Install pytorch +# run: | +# pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124 +# python -c "import torch; print('torch:', torch.__version__, torch)" +# python -c "import torch; print('CUDA available:', torch.cuda.is_available())" + +# - name: Install deepspeed +# run: | +# pip install .[dev,autotuning] +# ds_report + +# - name: Python environment +# run: | +# pip list + +# - name: PyTorch Lightning Tests +# run: | +# unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch +# pip install pytorch-lightning +# pip install "protobuf<4.21.0" +# cd tests +# pytest $PYTEST_OPTS lightning/ From ed8c43630e8ae76e00d11b5d8ef3779b16ad6469 Mon Sep 17 00:00:00 2001 From: "Ma, Guokai" Date: Tue, 11 Nov 2025 13:26:36 +0800 Subject: [PATCH 6/8] allow separate learning rate "muon_lr" and "adam_lr" for muon optimizer (#7658) This PR allows separate learning rates for the muon and adam parts of the Muon optimizer. 
Following up https://github.com/deepspeedai/DeepSpeed/issues/7657 Signed-off-by: Guokai Ma Co-authored-by: Olatunji Ruwase Signed-off-by: Luke Friedrichs --- deepspeed/runtime/engine.py | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index bc1ec7bc1880..12867437d9dd 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -1599,15 +1599,21 @@ def _configure_basic_optimizer(self, model_parameters): param_groups = [] if muon_params: accepted_parameters = dict() - for key in ["lr", "momentum", "weight_decay"]: + for key in ["lr", "momentum", "weight_decay", "muon_lr"]: if key in optimizer_parameters: - accepted_parameters[key] = optimizer_parameters[key] + if key == "muon_lr": # muon_lr will override lr + accepted_parameters['lr'] = optimizer_parameters[key] + else: + accepted_parameters[key] = optimizer_parameters[key] param_groups.append(dict(params=muon_params, use_muon=True, **accepted_parameters)) if non_muon_params: accepted_parameters = dict() - for key in ["lr", "betas", "eps", "weight_decay"]: + for key in ["lr", "betas", "eps", "weight_decay", "adam_lr"]: if key in optimizer_parameters: - accepted_parameters[key] = optimizer_parameters[key] + if key == "adam_lr": # adam_lr will override lr + accepted_parameters['lr'] = optimizer_parameters[key] + else: + accepted_parameters[key] = optimizer_parameters[key] param_groups.append(dict(params=non_muon_params, use_muon=False, **accepted_parameters)) optimizer = MuonWithAuxAdam(param_groups) else: From 78ada8c616e11dc2902c0aa4f3e344ba9234db25 Mon Sep 17 00:00:00 2001 From: Luke Friedrichs Date: Tue, 11 Nov 2025 20:23:09 +0100 Subject: [PATCH 7/8] if no expert found in parameter that have expert in name it should continue Otherwise: `TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'` Signed-off-by: Luke Friedrichs --- deepspeed/runtime/engine.py | 1 + 1 file 
changed, 1 insertion(+) diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py index 12867437d9dd..94496984c580 100755 --- a/deepspeed/runtime/engine.py +++ b/deepspeed/runtime/engine.py @@ -3601,6 +3601,7 @@ def _save_moe_checkpoint(self, save_dir, tag, client_state={}, exclude_frozen_pa local_expert_id = None if not m: logger.warning(f'No expert found in key {key}.') + continue else: local_expert_id = m.group(1) From eba17528fbbe80cd686dcf4404619dcc84287c0f Mon Sep 17 00:00:00 2001 From: Luke Friedrichs Date: Tue, 11 Nov 2025 20:23:44 +0100 Subject: [PATCH 8/8] Revert "fixed some weird behaivoir in deepspeed which does allow me to import moe" This reverts commit 2f232b9dabe8ef59bfc9fa4e8f7f9b74e7451013. Signed-off-by: Luke Friedrichs --- .../inference/v2/model_implementations/layer_container_base.py | 2 -- 1 file changed, 2 deletions(-) diff --git a/deepspeed/inference/v2/model_implementations/layer_container_base.py b/deepspeed/inference/v2/model_implementations/layer_container_base.py index aefd57a3742a..feb65b4a5f5d 100644 --- a/deepspeed/inference/v2/model_implementations/layer_container_base.py +++ b/deepspeed/inference/v2/model_implementations/layer_container_base.py @@ -94,7 +94,6 @@ def __new__(cls, clsname, bases, attrs): # Check for invalid mappings if base_dependency not in all_names: - continue raise ValueError( "Target parameter \"{}\" not found in this layer. Valid targets are {}".format( base_dependency, all_names)) @@ -123,7 +122,6 @@ def __new__(cls, clsname, bases, attrs): raise ValueError( "ParametrizedList index inference can only work with a single glob: {}".format(src_name)) elif glob_count == 0: - continue raise ValueError( "Must have wildcard (*) in source name for ParametrizedList mapping: {}".format(src_name))