diff --git a/.github/workflows/nv-pre-compile-ops.yml b/.github/workflows/nv-pre-compile-ops.yml index 53e2aad85a6b..9a90cc6602ba 100644 --- a/.github/workflows/nv-pre-compile-ops.yml +++ b/.github/workflows/nv-pre-compile-ops.yml @@ -23,11 +23,20 @@ jobs: unit-tests: runs-on: ubuntu-24.04 container: - image: deepspeed/gh-builder:ubuntu1804-py38-torch1131-cu116 + image: nvidia/cuda:12.6.3-devel-ubuntu22.04 steps: + - name: Install system dependencies + run: | + apt-get update && apt-get install -y git python3 python3-pip libaio-dev ninja-build + ln -sf /usr/bin/python3 /usr/bin/python + - uses: actions/checkout@v4 + - name: Install PyTorch + run: | + pip install torch==2.10.0 --index-url https://download.pytorch.org/whl/cu126 + - name: environment run: | which python @@ -36,7 +45,7 @@ jobs: #python -c "import torch; print('CUDA available:', torch.cuda.is_available())" - name: Compile DeepSpeed Ops run: | - DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_GDS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 DS_BUILD_DEEP_COMPILE=0 pip3 install . + DS_ACCELERATOR=cuda DS_ENABLE_NINJA=1 TORCH_CUDA_ARCH_LIST="7.0;7.5;8.0;8.6;8.9;9.0" DS_BUILD_OPS=1 DS_BUILD_SPARSE_ATTN=0 DS_BUILD_FP_QUANTIZER=0 DS_BUILD_CUTLASS_OPS=0 DS_BUILD_GDS=0 DS_BUILD_RAGGED_DEVICE_OPS=0 DS_BUILD_EVOFORMER_ATTN=0 DS_BUILD_DEEP_COMPILE=0 pip3 install . - name: DS Report run: | - ds_report + DS_ACCELERATOR=cuda ds_report diff --git a/accelerator/abstract_accelerator.py b/accelerator/abstract_accelerator.py index 2a0770ac681b..4db8ae5ebdb7 100644 --- a/accelerator/abstract_accelerator.py +++ b/accelerator/abstract_accelerator.py @@ -178,10 +178,6 @@ def supported_dtypes(self): ... # Misc - @abc.abstractmethod - def amp(self): - ... - @abc.abstractmethod def is_available(self): ... diff --git a/accelerator/cpu_accelerator.py b/accelerator/cpu_accelerator.py index d933041bed55..6b414a6e04ef 100644 --- a/accelerator/cpu_accelerator.py +++ b/accelerator/cpu_accelerator.py @@ -199,9 +199,6 @@ def available_memory(self, device_index=None): return psutil.virtual_memory().available # Misc - def amp(self): - return torch.cpu.amp - def is_available(self): return True diff --git a/accelerator/cuda_accelerator.py b/accelerator/cuda_accelerator.py index c45903421324..42cb93f9581d 100644 --- a/accelerator/cuda_accelerator.py +++ b/accelerator/cuda_accelerator.py @@ -222,11 +222,6 @@ def supported_dtypes(self): return supported_dtypes # Misc - def amp(self): - if hasattr(torch.cuda, 'amp'): - return torch.cuda.amp - return None - def is_available(self): return torch.cuda.is_available() diff --git a/accelerator/hpu_accelerator.py b/accelerator/hpu_accelerator.py index 9d82eb590902..c6bc94f06149 100644 --- a/accelerator/hpu_accelerator.py +++ b/accelerator/hpu_accelerator.py @@ -173,9 +173,6 @@ def supported_dtypes(self): return supported_dtypes # Misc - def amp(self): - return None - def is_available(self): return self.hpu.is_available() diff --git a/accelerator/mlu_accelerator.py b/accelerator/mlu_accelerator.py index bef716f0ee4e..4689034692d1 100644 --- a/accelerator/mlu_accelerator.py +++ b/accelerator/mlu_accelerator.py @@ -162,11 +162,6 @@ def supported_dtypes(self): return supported_dtypes # Misc - def amp(self): - if hasattr(torch.mlu, 'amp'): - return torch.mlu.amp - return None - def is_available(self): return torch.mlu.is_available() diff --git a/accelerator/mps_accelerator.py b/accelerator/mps_accelerator.py index aa8e86ef1ce0..0d67c9cc1f7e 100644 --- a/accelerator/mps_accelerator.py +++ b/accelerator/mps_accelerator.py @@ -156,9 +156,6 @@ def supported_dtypes(self): return [torch.float] # Misc - def amp(self): - return - def is_available(self): return hasattr(torch.backends, "mps") and torch.backends.mps.is_available() diff --git a/accelerator/npu_accelerator.py b/accelerator/npu_accelerator.py index 7cf30a349c57..421050d19f22 100644 --- a/accelerator/npu_accelerator.py +++ b/accelerator/npu_accelerator.py @@ -163,11 +163,6 @@ def supported_dtypes(self): return [torch.float, torch.half, torch.bfloat16] # Misc - def amp(self): - if hasattr(torch.npu, 'amp'): - return torch.npu.amp - return None - def is_available(self): return torch.npu.is_available() diff --git a/accelerator/sdaa_accelerator.py b/accelerator/sdaa_accelerator.py index 26113d38dd15..f185731d9385 100755 --- a/accelerator/sdaa_accelerator.py +++ b/accelerator/sdaa_accelerator.py @@ -192,11 +192,6 @@ def supported_dtypes(self): return supported_dtypes # Misc - def amp(self): - if hasattr(torch.sdaa, 'amp'): - return torch.sdaa.amp - return None - def is_available(self): return torch.sdaa.is_available() diff --git a/accelerator/xpu_accelerator.py b/accelerator/xpu_accelerator.py index 0d8f0d0d2cc8..b2fb57d42cd1 100644 --- a/accelerator/xpu_accelerator.py +++ b/accelerator/xpu_accelerator.py @@ -153,9 +153,6 @@ def available_memory(self, device_index=None): return self.total_memory(device_index) - self.memory_allocated(device_index) # Misc - def amp(self): - return torch.amp - def is_available(self): return torch.xpu.is_available() diff --git a/deepspeed/runtime/zero/linear.py b/deepspeed/runtime/zero/linear.py index 0a86e3c389b1..0fd02cdc67ef 100644 --- a/deepspeed/runtime/zero/linear.py +++ b/deepspeed/runtime/zero/linear.py @@ -23,7 +23,6 @@ from torch.nn.parameter import Parameter from torch.nn import init from torch.nn.modules.module import Module -from deepspeed.runtime.utils import noop_decorator from deepspeed import comm as dist from deepspeed.accelerator import get_accelerator @@ -33,18 +32,8 @@ def print_rank_0(message, debug=False, force=False): print(message) -try: - # Fix `torch.[device].amp.custom_fwd/bwd` FutureWarning in torch 2.4 - if hasattr(torch, 'amp') and hasattr(torch.amp, 'custom_fwd') and hasattr(torch.amp, 'custom_bwd'): - autocast_custom_fwd = functools.partial(torch.amp.custom_fwd, device_type=get_accelerator().device_name()) - autocast_custom_bwd = functools.partial(torch.amp.custom_bwd, device_type=get_accelerator().device_name()) - else: - # original implementation - autocast_custom_fwd = get_accelerator().amp().custom_fwd - autocast_custom_bwd = get_accelerator().amp().custom_bwd -except (ImportError, AttributeError) as exp: - autocast_custom_fwd = noop_decorator - autocast_custom_bwd = noop_decorator +autocast_custom_fwd = functools.partial(torch.amp.custom_fwd, device_type=get_accelerator().device_name()) +autocast_custom_bwd = functools.partial(torch.amp.custom_bwd, device_type=get_accelerator().device_name()) class LinearFunctionForZeroStage3(torch.autograd.Function):