[CI] Add multi-node test case for a2 (#3805)

Potabk · web-flow · commit f846bd20e4eb · 2025-10-27T23:10:17.000+08:00
### What this PR does / why we need it? This patch adds a multi-node test case for a2 ### Does this PR introduce _any_ user-facing change? ### How was this patch tested? - vLLM version: v0.11.0rc3 - vLLM main: vllm-project/vllm@c9461e0 --------- Signed-off-by: wangli <wangli858794774@gmail.com>
diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml
@@ -7,6 +7,10 @@ on:
         required: true
         type: string
         description: use a2 or a3
+      runner:
+        required: false
+        type: string
+        default: linux-aarch64-a3-0
       image:
         required: false
         type: string
@@ -62,7 +66,7 @@ concurrency:
 jobs:
   e2e:
     # This is a runner with no NPU for k8s controller
-    runs-on: linux-aarch64-a3-0
+    runs-on: ${{ inputs.runner }}
     container:
       image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
       env:
@@ -90,8 +94,7 @@ jobs:
             kubectl version --client=true
 
         # TODO: Add A2 tests
-        - name: Setup kubeconfig for A3
-          if: inputs.soc_version == 'a3'
+        - name: Decode kubeconfig from secrets
           run: |
             # Decode and save kubeconfig
             echo "${{ secrets.KUBECONFIG_B64 }}" | base64 -d > $KUBECONFIG
@@ -129,6 +132,12 @@ jobs:
               fi
             done
 
+            if [ "${{ inputs.soc_version }}" = "a3" ]; then
+              npu_per_node=16
+            else
+              npu_per_node=8
+            fi
+
             jinja2 tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 \
               -D size="$size" \
               -D replicas="$replicas" \
@@ -138,6 +147,7 @@ jobs:
               -D vllm_ascend_remote_url="$vllm_ascend_remote_url" \
               -D vllm_ascend_ref="$vllm_ascend_ref" \
               -D result_file_path="$result_file_path" \
+              -D npu_per_node="$npu_per_node" \
               --outfile lws.yaml
 
             kubectl apply -f ./lws.yaml
diff --git a/.github/workflows/vllm_ascend_test_nightly_a2.yaml b/.github/workflows/vllm_ascend_test_nightly_a2.yaml
@@ -61,3 +61,25 @@ jobs:
       vllm: v0.11.0
       runner: ${{ matrix.test_config.os }}
       tests: ${{ matrix.test_config.tests }}
+
+  multi-node-tests:  # A2 multi-node e2e suite, delegated to the reusable multi-node workflow
+    uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
+    needs: single-node-tests
+    if: always() && (github.event_name == 'schedule' || github.event_name == 'workflow_dispatch')  # always(): not skipped when single-node-tests fails
+    secrets:
+      KUBECONFIG_B64: ${{ secrets.KUBECONFIG_A2_B64 }}  # A2 kubeconfig, passed under the secret name the reusable workflow decodes
+    with:
+      config_file_path: ${{ matrix.test_config.config_file_path }}
+      size: ${{ matrix.test_config.size }}
+      replicas: 1
+      image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
+      runner: linux-aarch64-a2-0
+      soc_version: a2  # any value other than a3 makes the reusable workflow request 8 NPUs per node
+    strategy:
+      max-parallel: 1  # one matrix entry at a time
+      fail-fast: false
+      matrix:
+        test_config:
+          - name: multi-node-deepseek-dp
+            size: 2
+            config_file_path: tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml  # must match the config file's name on disk
diff --git a/.github/workflows/vllm_ascend_test_nightly_a3.yaml b/.github/workflows/vllm_ascend_test_nightly_a3.yaml
@@ -104,10 +104,10 @@ jobs:
     uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
     with:
       soc_version: a3
+      runner: linux-aarch64-a3-0
       image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
       replicas: 1
       size: ${{ matrix.test_config.size }}
       config_file_path: ${{ matrix.test_config.config_file_path }}
     secrets:
       KUBECONFIG_B64: ${{ secrets.KUBECONFIG_B64 }}
-
diff --git a/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml b/tests/e2e/nightly/multi_node/config/models/DeepSeek-R1-W8A8-A2.yaml
@@ -0,0 +1,57 @@
+test_name: "test DeepSeek-R1-W8A8 on A2"  # two-node data-parallel serving config
+model: "vllm-ascend/DeepSeek-R1-0528-W8A8"
+num_nodes: 2  # one `deployment` entry below per node
+npu_per_node: 8  # matches --data-parallel-size-local 2 x --tensor-parallel-size 4 on each node
+env_common:
+  VLLM_USE_MODELSCOPE: true
+  HCCL_BUFFSIZE: 1024
+  SERVER_PORT: 8080  # referenced as $SERVER_PORT by the first server_cmd
+  OMP_PROC_BIND: false
+  OMP_NUM_THREADS: 10
+
+
+deployment:
+  -  # first node: exposes the HTTP endpoint and uses its own $LOCAL_IP as the data-parallel address
+    server_cmd: >
+      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
+      --host 0.0.0.0
+      --port $SERVER_PORT
+      --data-parallel-size 4
+      --data-parallel-size-local 2
+      --data-parallel-address $LOCAL_IP
+      --data-parallel-rpc-port 13399
+      --no-enable-prefix-caching
+      --max-num-seqs 16
+      --tensor-parallel-size 4
+      --max-model-len 36864
+      --max-num-batched-tokens 6000
+      --enable-expert-parallel
+      --trust-remote-code
+      --quantization ascend
+      --gpu-memory-utilization 0.9
+      --enforce-eager
+      --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
+      --additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
+
+  -  # second node: headless; joins at $MASTER_IP with data-parallel ranks starting at 2 (no trailing backslashes: `>` already folds lines)
+    server_cmd: >
+      vllm serve vllm-ascend/DeepSeek-R1-0528-W8A8
+      --headless
+      --data-parallel-size 4
+      --data-parallel-rpc-port 13399
+      --data-parallel-size-local 2
+      --data-parallel-start-rank 2
+      --data-parallel-address $MASTER_IP
+      --no-enable-prefix-caching
+      --max-num-seqs 16
+      --tensor-parallel-size 4
+      --max-model-len 36864
+      --max-num-batched-tokens 6000
+      --enable-expert-parallel
+      --trust-remote-code
+      --quantization ascend
+      --gpu-memory-utilization 0.9
+      --enforce-eager
+      --speculative-config '{"num_speculative_tokens": 1, "method":"deepseek_mtp"}'
+      --additional-config '{"ascend_scheduler_config":{"enabled":false},"chunked_prefill_for_mla":true,"enable_weight_nz_layout":true}'
+benchmarks:  # NOTE(review): a bare key parses as null - confirm the config loader accepts that
diff --git a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2
@@ -37,11 +37,11 @@ spec:
                 bash /root/.cache/tests/run.sh
             resources:
               limits:
-                huawei.com/ascend-1980: "16"
+                huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
                 memory: 512Gi
                 ephemeral-storage: 100Gi
               requests:
-                huawei.com/ascend-1980: "16"
+                huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
                 ephemeral-storage: 100Gi
                 cpu: 125
             ports:
@@ -95,11 +95,11 @@ spec:
                 bash /root/.cache/tests/run.sh
             resources:
               limits:
-                huawei.com/ascend-1980: "16"
+                huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
                 memory: 512Gi
                 ephemeral-storage: 100Gi
               requests:
-                huawei.com/ascend-1980: "16"
+                huawei.com/ascend-1980: {{ npu_per_node | default("16") }}
                 ephemeral-storage: 100Gi
                 cpu: 125
             volumeMounts: