diff --git a/.github/workflows/nv-pre-compile-ops.yml b/.github/workflows/nv-pre-compile-ops.yml
index 53e2aad85a6b..9a90cc6602ba 100644
--- a/.github/workflows/nv-pre-compile-ops.yml
+++ b/.github/workflows/nv-pre-compile-ops.yml
@@ -23,11 +23,20 @@ jobs:
   unit-tests:
     runs-on: ubuntu-24.04
     container:
-      image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116
+      image: nvidia/cuda:12.6.3-devel-ubuntu22.04
 
     steps:
+        - name: Install system dependencies
+          run: |
+            apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y git python3 python3-dev python3-pip build-essential libaio-dev ninja-build  # noninteractive: no debconf prompts in the non-TTY job; toolchain + Python headers listed explicitly for the op build
+            ln -sf /usr/bin/python3 /usr/bin/python  # later steps invoke plain `python`
+
         - uses: actions/checkout@v4
 
+        - name: Install PyTorch
+          run: |
+            pip install torch==2.10.0 --index-url https://download.pytorch.org/whl/cu126  # cu126 wheel index, matching the CUDA 12.6 container image
+
         - name: environment
           run: |
             which python
@@ -36,7 +45,7 @@ jobs:
             #python -c "import torch; print('CUDA available:', torch.cuda.is_available())"
         - name: Compile DeepSpeed Ops
           run: |
-            DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_GDS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 DS_BUILD_DEEP_COMPILE=0 pip3 install .
+            DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_GDS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 DS_BUILD_DEEP_COMPILE=0 pip3 install .
         - name: DS Report
           run: |
-             ds_report
+             DS_ACCELERATOR=cuda ds_report  # NOTE(review): presumably pinned because the runner has no GPU for auto-detection; confirm
diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py
index 2a0770ac681b..4db8ae5ebdb7 100644
--- a/accelerator/abstract_accelerator.py
+++ b/accelerator/abstract_accelerator.py
@@ -178,10 +178,6 @@ def supported_dtypes(self):
         ...
 
     # Misc
-    @abc.abstractmethod
-    def amp(self):
-        ...
-
     @abc.abstractmethod
     def is_available(self):
         ...
diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py
index d933041bed55..6b414a6e04ef 100644
--- a/accelerator/cpu_accelerator.py
+++ b/accelerator/cpu_accelerator.py
@@ -199,9 +199,6 @@ def available_memory(self, device_index=None):
         return psutil.virtual_memory().available
 
     # Misc
-    def amp(self):
-        return torch.cpu.amp
-
     def is_available(self):
         return True
 
diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py
index c45903421324..42cb93f9581d 100644
--- a/accelerator/cuda_accelerator.py
+++ b/accelerator/cuda_accelerator.py
@@ -222,11 +222,6 @@ def supported_dtypes(self):
         return supported_dtypes
 
     # Misc
-    def amp(self):
-        if hasattr(torch.cuda, 'amp'):
-            return torch.cuda.amp
-        return None
-
     def is_available(self):
         return torch.cuda.is_available()
 
diff --git a/accelerator/hpu_accelerator.py b/accelerator/hpu_accelerator.py
index 9d82eb590902..c6bc94f06149 100644
--- a/accelerator/hpu_accelerator.py
+++ b/accelerator/hpu_accelerator.py
@@ -173,9 +173,6 @@ def supported_dtypes(self):
         return supported_dtypes
 
     # Misc
-    def amp(self):
-        return None
-
     def is_available(self):
         return self.hpu.is_available()
 
diff --git a/accelerator/mlu_accelerator.py b/accelerator/mlu_accelerator.py
index bef716f0ee4e..4689034692d1 100644
--- a/accelerator/mlu_accelerator.py
+++ b/accelerator/mlu_accelerator.py
@@ -162,11 +162,6 @@ def supported_dtypes(self):
         return supported_dtypes
 
     # Misc
-    def amp(self):
-        if hasattr(torch.mlu, 'amp'):
-            return torch.mlu.amp
-        return None
-
     def is_available(self):
         return torch.mlu.is_available()
 
diff --git a/accelerator/mps_accelerator.py b/accelerator/mps_accelerator.py
index aa8e86ef1ce0..0d67c9cc1f7e 100644
--- a/accelerator/mps_accelerator.py
+++ b/accelerator/mps_accelerator.py
@@ -156,9 +156,6 @@ def supported_dtypes(self):
         return [torch.float]
 
     # Misc
-    def amp(self):
-        return
-
     def is_available(self):
         return hasattr(torch.backends, "mps") and torch.backends.mps.is_available()
 
diff --git a/accelerator/npu_accelerator.py b/accelerator/npu_accelerator.py
index 7cf30a349c57..421050d19f22 100644
--- a/accelerator/npu_accelerator.py
+++ b/accelerator/npu_accelerator.py
@@ -163,11 +163,6 @@ def supported_dtypes(self):
         return [torch.float, torch.half, torch.bfloat16]
 
     # Misc
-    def amp(self):
-        if hasattr(torch.npu, 'amp'):
-            return torch.npu.amp
-        return None
-
     def is_available(self):
         return torch.npu.is_available()
 
diff --git a/accelerator/sdaa_accelerator.py b/accelerator/sdaa_accelerator.py
index 26113d38dd15..f185731d9385 100755
--- a/accelerator/sdaa_accelerator.py
+++ b/accelerator/sdaa_accelerator.py
@@ -192,11 +192,6 @@ def supported_dtypes(self):
         return supported_dtypes
 
     # Misc
-    def amp(self):
-        if hasattr(torch.sdaa, 'amp'):
-            return torch.sdaa.amp
-        return None
-
     def is_available(self):
         return torch.sdaa.is_available()
 
diff --git a/accelerator/xpu_accelerator.py b/accelerator/xpu_accelerator.py
index 0d8f0d0d2cc8..b2fb57d42cd1 100644
--- a/accelerator/xpu_accelerator.py
+++ b/accelerator/xpu_accelerator.py
@@ -153,9 +153,6 @@ def available_memory(self, device_index=None):
         return self.total_memory(device_index) - self.memory_allocated(device_index)
 
     # Misc
-    def amp(self):
-        return torch.amp
-
     def is_available(self):
         return torch.xpu.is_available()
 
diff --git a/deepspeed/runtime/zero/linear.py b/deepspeed/runtime/zero/linear.py
index 0a86e3c389b1..0fd02cdc67ef 100644
--- a/deepspeed/runtime/zero/linear.py
+++ b/deepspeed/runtime/zero/linear.py
@@ -23,7 +23,6 @@
 from torch.nn.parameter import Parameter
 from torch.nn import init
 from torch.nn.modules.module import Module
-from deepspeed.runtime.utils import noop_decorator
 from deepspeed import comm as dist
 from deepspeed.accelerator import get_accelerator
 
@@ -33,18 +32,10 @@ def print_rank_0(message, debug=False, force=False):
         print(message)
 
 
-try:
-    # Fix `torch.[device].amp.custom_fwd/bwd` FutureWarning in torch 2.4
-    if hasattr(torch, 'amp') and hasattr(torch.amp, 'custom_fwd') and hasattr(torch.amp, 'custom_bwd'):
-        autocast_custom_fwd = functools.partial(torch.amp.custom_fwd, device_type=get_accelerator().device_name())
-        autocast_custom_bwd = functools.partial(torch.amp.custom_bwd, device_type=get_accelerator().device_name())
-    else:
-        # original implementation
-        autocast_custom_fwd = get_accelerator().amp().custom_fwd
-        autocast_custom_bwd = get_accelerator().amp().custom_bwd
-except (ImportError, AttributeError) as exp:
-    autocast_custom_fwd = noop_decorator
-    autocast_custom_bwd = noop_decorator
+# Bind torch.amp.custom_fwd/custom_bwd to the active accelerator's device name.
+# NOTE(review): no fallback remains; a torch without torch.amp.custom_fwd/custom_bwd raises AttributeError on import.
+autocast_custom_fwd = functools.partial(torch.amp.custom_fwd, device_type=get_accelerator().device_name())
+autocast_custom_bwd = functools.partial(torch.amp.custom_bwd, device_type=get_accelerator().device_name())
 
 
 class LinearFunctionForZeroStage3(torch.autograd.Function):