From b3888f342adb890f960186f5e44b790292e2bc48 Mon Sep 17 00:00:00 2001
From: Luke Friedrichs <lukefriedrichs@gmail.com>
Date: Wed, 5 Nov 2025 15:46:56 +0100
Subject: [PATCH 1/8] fixed some weird behavior in deepspeed which does allow
 me to import moe

Signed-off-by: Luke Friedrichs <lukefriedrichs@gmail.com>
---
 .../inference/v2/model_implementations/layer_container_base.py  | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/deepspeed/inference/v2/model_implementations/layer_container_base.py b/deepspeed/inference/v2/model_implementations/layer_container_base.py
index feb65b4a5f5d..aefd57a3742a 100644
--- a/deepspeed/inference/v2/model_implementations/layer_container_base.py
+++ b/deepspeed/inference/v2/model_implementations/layer_container_base.py
@@ -94,6 +94,7 @@ def __new__(cls, clsname, bases, attrs):
 
                     # Check for invalid mappings
                     if base_dependency not in all_names:
+                        continue
                         raise ValueError(
                             "Target parameter \"{}\" not found in this layer. Valid targets are {}".format(
                                 base_dependency, all_names))
@@ -122,6 +123,7 @@ def __new__(cls, clsname, bases, attrs):
                         raise ValueError(
                             "ParametrizedList index inference can only work with a single glob: {}".format(src_name))
                     elif glob_count == 0:
+                        continue
                         raise ValueError(
                             "Must have wildcard (*) in source name for ParametrizedList mapping: {}".format(src_name))
 

From cb0cde732f3250033b606352e0dd32da532c9218 Mon Sep 17 00:00:00 2001
From: Logan Adams <114770087+loadams@users.noreply.github.com>
Date: Wed, 5 Nov 2025 17:22:32 -0800
Subject: [PATCH 2/8] Update version.txt after release (#7675)

Signed-off-by: Luke Friedrichs <lukefriedrichs@gmail.com>
---
 version.txt | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/version.txt b/version.txt
index 503a21deb47d..267d7e011fea 100644
--- a/version.txt
+++ b/version.txt
@@ -1 +1 @@
-0.18.2
+0.18.3

From e37bdefcb8d035587e18a559836157e79a10a559 Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas00@users.noreply.github.com>
Date: Thu, 6 Nov 2025 11:42:22 -0800
Subject: [PATCH 3/8] [modal ci] fixes (#7676)

1. `modal-accelerate` now needs `uv` installed explicitly since the image
changed to the 2025 one.
2. moved the accelerate repo cloning into the job, since the original way
was incorrect: it was caching some accelerate version and not updating
it.
3. documented how to actually test the CI when changing the workflow,
as `pull_request_target` will not run the updated .py+.yaml
files.

---------

Signed-off-by: Stas Bekman <stas@stason.org>
Signed-off-by: Luke Friedrichs <lukefriedrichs@gmail.com>
---
 .github/workflows/modal-accelerate.yml |  5 +++++
 ci/accelerate.py                       | 29 +++++++++++++++++++-------
 2 files changed, 27 insertions(+), 7 deletions(-)

diff --git a/.github/workflows/modal-accelerate.yml b/.github/workflows/modal-accelerate.yml
index 342fcd4707f4..f188e5f3e0e2 100644
--- a/.github/workflows/modal-accelerate.yml
+++ b/.github/workflows/modal-accelerate.yml
@@ -20,6 +20,11 @@ on:
     branches:
       - master
 
+  # You have to switch to `pull_request` if you need to change the CI job's python script,
+  # otherwise GH will use the master version of the CI files, ignoring the modifications in the PR.
+  # The other way is to use the modal cli to test this job from one's host - it'd require setting up
+  # modal secrets.
+  # pull_request:
   pull_request_target:
     paths-ignore:
       - 'docs/**'
diff --git a/ci/accelerate.py b/ci/accelerate.py
index f9fc09d75f19..cbf396a991f1 100644
--- a/ci/accelerate.py
+++ b/ci/accelerate.py
@@ -12,13 +12,10 @@
 # yapf: disable
 image = (modal.Image
          .from_registry("pytorch/pytorch:2.6.0-cuda12.4-cudnn9-devel", add_python="3.10")
-         .run_commands("apt update && apt install -y libaio-dev")
          .apt_install("git")
-         .run_commands("uv pip install --system --compile-bytecode datasets==3.6.0")
-         .run_commands(
-                "git clone https://github.com/huggingface/accelerate && \
-                uv pip install --system --compile-bytecode ./accelerate[testing]"
-            )
+         .pip_install("uv")
+         # uv_pip_install already includes --compile-bytecode
+         .uv_pip_install("datasets==3.6.0", extra_options="--system")
          .pip_install_from_requirements(ROOT_PATH / "requirements/requirements.txt", gpu="any")
          .pip_install_from_requirements(ROOT_PATH / "requirements/requirements-dev.txt", gpu="any")
          .add_local_dir(ROOT_PATH , remote_path="/root/", copy=True)
@@ -36,8 +33,26 @@
 )
 def pytest():
     import subprocess
+
+    cmd = "git clone https://github.com/huggingface/accelerate"
+    print(f"running: {cmd}")
+    subprocess.run(
+        cmd.split(),
+        check=True,
+        cwd=ROOT_PATH / ".",
+    )
+    cmd = "uv pip install --system --compile-bytecode ./accelerate[testing]"
+    print(f"running: {cmd}")
+    subprocess.run(
+        cmd.split(),
+        check=True,
+        cwd=ROOT_PATH / ".",
+    )
+
+    cmd = "pytest ./accelerate/tests/deepspeed"
+    print(f"running: {cmd}")
     subprocess.run(
-        "pytest /accelerate/tests/deepspeed".split(),
+        cmd.split(),
         check=True,
         cwd=ROOT_PATH / ".",
     )

From 1834cbbd868e09bffba24aa5f1b3447cbd5f05ba Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas00@users.noreply.github.com>
Date: Fri, 7 Nov 2025 15:25:10 -0800
Subject: [PATCH 4/8] leaf modules: explain better (#7674)

Add Masahiro's explanation of why that code is there.

---------

Signed-off-by: Stas Bekman <stas@stason.org>
Signed-off-by: Luke Friedrichs <lukefriedrichs@gmail.com>
---
 deepspeed/runtime/zero/stage3.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/deepspeed/runtime/zero/stage3.py b/deepspeed/runtime/zero/stage3.py
index f8c146c81d2f..dc3974e1680e 100644
--- a/deepspeed/runtime/zero/stage3.py
+++ b/deepspeed/runtime/zero/stage3.py
@@ -1264,6 +1264,7 @@ def make_hook(params):
 
                 def reduce_leaf_module_grads(module, grad_input, grad_output):
                     for param in params:
+                        # this takes care of grads for MoE experts that didn't participate in the current iteration/layer
                         if param.grad is None:
                             param.grad = torch.zeros_like(param)
                         self.reduce_ready_partitions_and_remove_grads(param)

From ad66ab294e57c1a357019b07ed220446237a96af Mon Sep 17 00:00:00 2001
From: Stas Bekman <stas00@users.noreply.github.com>
Date: Sat, 8 Nov 2025 06:05:21 -0800
Subject: [PATCH 5/8] disable nv-lightning-v100.yml CI (#7681)

as we lost v100s - disable first so that it stops interfering with PRs,
then port to modal.

Signed-off-by: Luke Friedrichs <lukefriedrichs@gmail.com>
---
 .github/workflows/nv-lightning-v100.yml | 104 ++++++++++++------------
 1 file changed, 53 insertions(+), 51 deletions(-)

diff --git a/.github/workflows/nv-lightning-v100.yml b/.github/workflows/nv-lightning-v100.yml
index d31ae5569848..eeb8516a324d 100644
--- a/.github/workflows/nv-lightning-v100.yml
+++ b/.github/workflows/nv-lightning-v100.yml
@@ -1,51 +1,53 @@
-name: nv-lightning-v100
-
-on:
-  workflow_dispatch:
-  pull_request:
-    paths-ignore:
-      - 'docs/**'
-      - 'blogs/**'
-      - 'deepspeed/inference/v2/**'
-      - 'tests/unit/inference/v2/**'
-  merge_group:
-    branches: [ master ]
-  schedule:
-    - cron: "0 0 * * *"
-
-concurrency:
-  group: ${{ github.workflow }}-${{ github.ref }}
-  cancel-in-progress: true
-
-jobs:
-  unit-tests:
-    runs-on: [self-hosted, nvidia, cu124, v100]
-
-    steps:
-      - uses: actions/checkout@v4
-
-      - id: setup-venv
-        uses: ./.github/workflows/setup-venv
-
-      - name: Install pytorch
-        run: |
-          pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
-          python -c "import torch; print('torch:', torch.__version__, torch)"
-          python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
-
-      - name: Install deepspeed
-        run: |
-          pip install .[dev,autotuning]
-          ds_report
-
-      - name: Python environment
-        run: |
-          pip list
-
-      - name: PyTorch Lightning Tests
-        run: |
-          unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
-          pip install pytorch-lightning
-          pip install "protobuf<4.21.0"
-          cd tests
-          pytest $PYTEST_OPTS lightning/
+# name: nv-lightning-v100
+
+# disabled as the v100s are no more - need to port to modal while removing v100
+
+# on:
+#   workflow_dispatch:
+#   pull_request:
+#     paths-ignore:
+#       - 'docs/**'
+#       - 'blogs/**'
+#       - 'deepspeed/inference/v2/**'
+#       - 'tests/unit/inference/v2/**'
+#   merge_group:
+#     branches: [ master ]
+#   schedule:
+#     - cron: "0 0 * * *"
+
+# concurrency:
+#   group: ${{ github.workflow }}-${{ github.ref }}
+#   cancel-in-progress: true
+
+# jobs:
+#   unit-tests:
+#     runs-on: [self-hosted, nvidia, cu124, v100]
+
+#     steps:
+#       - uses: actions/checkout@v4
+
+#       - id: setup-venv
+#         uses: ./.github/workflows/setup-venv
+
+#       - name: Install pytorch
+#         run: |
+#           pip install -U --cache-dir $TORCH_CACHE torch torchvision --index-url https://download.pytorch.org/whl/cu124
+#           python -c "import torch; print('torch:', torch.__version__, torch)"
+#           python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
+
+#       - name: Install deepspeed
+#         run: |
+#           pip install .[dev,autotuning]
+#           ds_report
+
+#       - name: Python environment
+#         run: |
+#           pip list
+
+#       - name: PyTorch Lightning Tests
+#         run: |
+#           unset TORCH_CUDA_ARCH_LIST # only jit compile for current arch
+#           pip install pytorch-lightning
+#           pip install "protobuf<4.21.0"
+#           cd tests
+#           pytest $PYTEST_OPTS lightning/

From ed8c43630e8ae76e00d11b5d8ef3779b16ad6469 Mon Sep 17 00:00:00 2001
From: "Ma, Guokai" <guokai.ma@gmail.com>
Date: Tue, 11 Nov 2025 13:26:36 +0800
Subject: [PATCH 6/8] allow separate learning rate "muon_lr" and "adam_lr" for
 muon optimizer (#7658)

This PR allows separate learning rates for the Muon and Adam parts of the
Muon optimizer. It follows up on
https://github.com/deepspeedai/DeepSpeed/issues/7657

Signed-off-by: Guokai Ma <guokai.ma@intel.com>
Co-authored-by: Olatunji Ruwase <tunji.ruwase@snowflake.com>
Signed-off-by: Luke Friedrichs <lukefriedrichs@gmail.com>
---
 deepspeed/runtime/engine.py | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index bc1ec7bc1880..12867437d9dd 100755
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -1599,15 +1599,21 @@ def _configure_basic_optimizer(self, model_parameters):
             param_groups = []
             if muon_params:
                 accepted_parameters = dict()
-                for key in ["lr", "momentum", "weight_decay"]:
+                for key in ["lr", "momentum", "weight_decay", "muon_lr"]:
                     if key in optimizer_parameters:
-                        accepted_parameters[key] = optimizer_parameters[key]
+                        if key == "muon_lr":  # muon_lr will override lr
+                            accepted_parameters['lr'] = optimizer_parameters[key]
+                        else:
+                            accepted_parameters[key] = optimizer_parameters[key]
                 param_groups.append(dict(params=muon_params, use_muon=True, **accepted_parameters))
             if non_muon_params:
                 accepted_parameters = dict()
-                for key in ["lr", "betas", "eps", "weight_decay"]:
+                for key in ["lr", "betas", "eps", "weight_decay", "adam_lr"]:
                     if key in optimizer_parameters:
-                        accepted_parameters[key] = optimizer_parameters[key]
+                        if key == "adam_lr":  # adam_lr will override lr
+                            accepted_parameters['lr'] = optimizer_parameters[key]
+                        else:
+                            accepted_parameters[key] = optimizer_parameters[key]
                 param_groups.append(dict(params=non_muon_params, use_muon=False, **accepted_parameters))
             optimizer = MuonWithAuxAdam(param_groups)
         else:

From 78ada8c616e11dc2902c0aa4f3e344ba9234db25 Mon Sep 17 00:00:00 2001
From: Luke Friedrichs <lukefriedrichs@gmail.com>
Date: Tue, 11 Nov 2025 20:23:09 +0100
Subject: [PATCH 7/8] if no expert is found in a parameter that has expert in
 its name, it should continue

Otherwise: `TypeError: int() argument must be a string, a bytes-like object or a real number, not 'NoneType'`
Signed-off-by: Luke Friedrichs <lukefriedrichs@gmail.com>
---
 deepspeed/runtime/engine.py | 1 +
 1 file changed, 1 insertion(+)

diff --git a/deepspeed/runtime/engine.py b/deepspeed/runtime/engine.py
index 12867437d9dd..94496984c580 100755
--- a/deepspeed/runtime/engine.py
+++ b/deepspeed/runtime/engine.py
@@ -3601,6 +3601,7 @@ def _save_moe_checkpoint(self, save_dir, tag, client_state={}, exclude_frozen_pa
                     local_expert_id = None
                     if not m:
                         logger.warning(f'No expert found in key {key}.')
+                        continue
                     else:
                         local_expert_id = m.group(1)
 

From eba17528fbbe80cd686dcf4404619dcc84287c0f Mon Sep 17 00:00:00 2001
From: Luke Friedrichs <lukefriedrichs@gmail.com>
Date: Tue, 11 Nov 2025 20:23:44 +0100
Subject: [PATCH 8/8] Revert "fixed some weird behavior in deepspeed which
 does allow me to import moe"

This reverts commit 2f232b9dabe8ef59bfc9fa4e8f7f9b74e7451013.

Signed-off-by: Luke Friedrichs <lukefriedrichs@gmail.com>
---
 .../inference/v2/model_implementations/layer_container_base.py  | 2 --
 1 file changed, 2 deletions(-)

diff --git a/deepspeed/inference/v2/model_implementations/layer_container_base.py b/deepspeed/inference/v2/model_implementations/layer_container_base.py
index aefd57a3742a..feb65b4a5f5d 100644
--- a/deepspeed/inference/v2/model_implementations/layer_container_base.py
+++ b/deepspeed/inference/v2/model_implementations/layer_container_base.py
@@ -94,7 +94,6 @@ def __new__(cls, clsname, bases, attrs):
 
                     # Check for invalid mappings
                     if base_dependency not in all_names:
-                        continue
                         raise ValueError(
                             "Target parameter \"{}\" not found in this layer. Valid targets are {}".format(
                                 base_dependency, all_names))
@@ -123,7 +122,6 @@ def __new__(cls, clsname, bases, attrs):
                         raise ValueError(
                             "ParametrizedList index inference can only work with a single glob: {}".format(src_name))
                     elif glob_count == 0:
-                        continue
                         raise ValueError(
                             "Must have wildcard (*) in source name for ParametrizedList mapping: {}".format(src_name))