diff --git a/.github/Dockerfile.buildwheel b/.github/Dockerfile.buildwheel index 960e397434f..5326935a874 100644 --- a/.github/Dockerfile.buildwheel +++ b/.github/Dockerfile.buildwheel @@ -15,7 +15,7 @@ # This file is a part of the vllm-ascend project. # ARG PY_VERSION=3.11 -FROM quay.io/ascend/manylinux:8.2.rc1-910b-manylinux_2_28-py${PY_VERSION} +FROM quay.io/ascend/manylinux:8.3.rc1-910b-manylinux_2_28-py${PY_VERSION} ARG COMPILE_CUSTOM_KERNELS=1 diff --git a/.github/workflows/_accuracy_test.yaml b/.github/workflows/_accuracy_test.yaml index da02559e186..b9d155f2319 100644 --- a/.github/workflows/_accuracy_test.yaml +++ b/.github/workflows/_accuracy_test.yaml @@ -30,7 +30,7 @@ jobs: runs-on: ${{ inputs.runner }} name: ${{ inputs.model_name }} accuracy container: - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11 + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11 env: VLLM_USE_MODELSCOPE: True # 1. If version specified (work_dispatch), do specified branch accuracy test diff --git a/.github/workflows/_e2e_nightly_multi_node.yaml b/.github/workflows/_e2e_nightly_multi_node.yaml index db1a6414b85..3fa400e8a92 100644 --- a/.github/workflows/_e2e_nightly_multi_node.yaml +++ b/.github/workflows/_e2e_nightly_multi_node.yaml @@ -15,7 +15,7 @@ on: required: false type: string description: base image for pods - default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11" + default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11" config_file_path: required: true type: string @@ -69,7 +69,7 @@ jobs: # This is the runner with no NPU for k8s controller runs-on: ${{ inputs.runner }} container: - image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11 + image: m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11 env: KUBECONFIG: /tmp/kubeconfig KUBECTL: /root/.cache/.kube/kubectl diff --git a/.github/workflows/_e2e_nightly_single_node.yaml b/.github/workflows/_e2e_nightly_single_node.yaml index e77f5623328..ab77649f429 100644 --- a/.github/workflows/_e2e_nightly_single_node.yaml +++ b/.github/workflows/_e2e_nightly_single_node.yaml @@ -29,7 +29,7 @@ on: image: required: false type: string - default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11" + default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11" tests: required: true type: string diff --git a/.github/workflows/_kill_lws_resources.yaml b/.github/workflows/_kill_lws_resources.yaml index dd6549bde52..4c70ff2d7d2 100644 --- a/.github/workflows/_kill_lws_resources.yaml +++ b/.github/workflows/_kill_lws_resources.yaml @@ -24,7 +24,7 @@ jobs: # This is a runner with no NPU for k8s controller runs-on: ${{ inputs.runner }} container: - image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11 + image: m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11 env: KUBECONFIG: /tmp/kubeconfig KUBECTL: /root/.cache/.kube/kubectl diff --git a/.github/workflows/accuracy_test.yaml b/.github/workflows/accuracy_test.yaml index a7d3c786b80..695cf70e48e 100644 --- a/.github/workflows/accuracy_test.yaml +++ b/.github/workflows/accuracy_test.yaml @@ -70,5 +70,5 @@ jobs: with: vllm: v0.11.0 runner: linux-aarch64-${{ matrix.runner }} - image: 
swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11 + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11 model_name: ${{ matrix.model_name }} diff --git a/.github/workflows/nightly_benchmarks.yaml b/.github/workflows/nightly_benchmarks.yaml index 580e6333e62..a768cd66f5b 100644 --- a/.github/workflows/nightly_benchmarks.yaml +++ b/.github/workflows/nightly_benchmarks.yaml @@ -56,7 +56,7 @@ jobs: vllm_use_v1: 1 max-parallel: 1 container: - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11 + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11 volumes: - /usr/local/dcmi:/usr/local/dcmi - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi diff --git a/.github/workflows/vllm_ascend_dist.yaml b/.github/workflows/vllm_ascend_dist.yaml index f78486e19e8..2d7c8ef235f 100644 --- a/.github/workflows/vllm_ascend_dist.yaml +++ b/.github/workflows/vllm_ascend_dist.yaml @@ -75,7 +75,7 @@ jobs: name: vLLM Ascend test runs-on: ${{ matrix.os }} container: - image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11 + image: m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11 env: DEBIAN_FRONTEND: noninteractive steps: diff --git a/.github/workflows/vllm_ascend_test.yaml b/.github/workflows/vllm_ascend_test.yaml index 23fb6ea6517..619e87158af 100644 --- a/.github/workflows/vllm_ascend_test.yaml +++ b/.github/workflows/vllm_ascend_test.yaml @@ -76,6 +76,7 @@ jobs: if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }} runs-on: ubuntu-latest container: + # fixme: vllm-ascend install failed with 8.3.rc1 on github action image: quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11 env: VLLM_LOGGING_LEVEL: ERROR @@ -146,5 +147,5 @@ jobs: with: vllm: ${{ matrix.vllm_version }} runner: linux-aarch64-a2 - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11 + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11 type: light diff --git a/.github/workflows/vllm_ascend_test_310p.yaml b/.github/workflows/vllm_ascend_test_310p.yaml index 1de447fc31d..099f3e07fe0 100644 --- a/.github/workflows/vllm_ascend_test_310p.yaml +++ b/.github/workflows/vllm_ascend_test_310p.yaml @@ -58,7 +58,7 @@ jobs: runs-on: ${{ matrix.os }} container: # TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-310p-ubuntu22.04-py3.11 + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-310p-ubuntu22.04-py3.11 env: VLLM_LOGGING_LEVEL: ERROR VLLM_USE_MODELSCOPE: True diff --git a/.github/workflows/vllm_ascend_test_full.yaml b/.github/workflows/vllm_ascend_test_full.yaml index 088036e6d8a..e16b7619670 100644 --- a/.github/workflows/vllm_ascend_test_full.yaml +++ b/.github/workflows/vllm_ascend_test_full.yaml @@ -76,5 +76,5 @@ jobs: with: vllm: ${{ matrix.vllm_version }} runner: linux-aarch64-a2 - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11 + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11 type: full diff --git a/.github/workflows/vllm_ascend_test_full_vllm_main.yaml b/.github/workflows/vllm_ascend_test_full_vllm_main.yaml index 
48dc695f8ca..dbd632912af 100644 --- a/.github/workflows/vllm_ascend_test_full_vllm_main.yaml +++ b/.github/workflows/vllm_ascend_test_full_vllm_main.yaml @@ -41,5 +41,5 @@ jobs: with: vllm: main runner: linux-aarch64-a2 - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11 + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11 type: full diff --git a/.github/workflows/vllm_ascend_test_models.yaml b/.github/workflows/vllm_ascend_test_models.yaml index a26c5a3e232..beba0e44649 100644 --- a/.github/workflows/vllm_ascend_test_models.yaml +++ b/.github/workflows/vllm_ascend_test_models.yaml @@ -79,7 +79,7 @@ jobs: with: vllm: v0.11.0 runner: linux-aarch64-${{ matrix.runner }} - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11 + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11 model_name: ${{ matrix.model_name }} upload: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.vllm-ascend-version == 'latest' }} diff --git a/.github/workflows/vllm_ascend_test_nightly_a2.yaml b/.github/workflows/vllm_ascend_test_nightly_a2.yaml index 0842e7e10f1..19fc3b5dc9c 100644 --- a/.github/workflows/vllm_ascend_test_nightly_a2.yaml +++ b/.github/workflows/vllm_ascend_test_nightly_a2.yaml @@ -82,7 +82,7 @@ jobs: with: soc_version: a2 runner: linux-aarch64-a2-0 - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11 + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11 replicas: 1 size: ${{ matrix.test_config.size }} config_file_path: ${{ matrix.test_config.config_file_path }} diff --git a/.github/workflows/vllm_ascend_test_nightly_a3.yaml b/.github/workflows/vllm_ascend_test_nightly_a3.yaml index 2cd6d817e9f..d880a8bffbe 100644 --- a/.github/workflows/vllm_ascend_test_nightly_a3.yaml +++ b/.github/workflows/vllm_ascend_test_nightly_a3.yaml @@ -82,7 +82,7 @@ jobs: with: vllm: v0.11.0 runner: ${{ matrix.test_config.os }} - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11 + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-a3-ubuntu22.04-py3.11 tests: ${{ matrix.test_config.tests }} multi-node-tests: @@ -113,7 +113,7 @@ jobs: with: soc_version: a3 runner: linux-aarch64-a3-0 - image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11 + image: m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11 replicas: 1 size: ${{ matrix.test_config.size }} config_file_path: ${{ matrix.test_config.config_file_path }} diff --git a/.github/workflows/vllm_ascend_test_pd.yaml b/.github/workflows/vllm_ascend_test_pd.yaml index fee06bee829..778d83b6375 100644 --- a/.github/workflows/vllm_ascend_test_pd.yaml +++ b/.github/workflows/vllm_ascend_test_pd.yaml @@ -49,7 +49,7 @@ jobs: runs-on: linux-arm64-npu-static-8 container: - image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11 + image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11 volumes: - /usr/local/dcmi:/usr/local/dcmi - /usr/local/bin/npu-smi:/usr/local/bin/npu-smi diff --git a/Dockerfile b/Dockerfile index e988028ddc4..f01284b4911 100644 --- a/Dockerfile +++ b/Dockerfile @@ -15,7 +15,7 @@ # This file is a part of the vllm-ascend project. 
# -FROM quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11 +FROM quay.io/ascend/cann:8.3.rc1-910b-ubuntu22.04-py3.11 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" ARG COMPILE_CUSTOM_KERNELS=1 diff --git a/Dockerfile.310p b/Dockerfile.310p index 212058954c4..423f2143baa 100644 --- a/Dockerfile.310p +++ b/Dockerfile.310p @@ -15,7 +15,7 @@ # This file is a part of the vllm-ascend project. # -FROM quay.io/ascend/cann:8.2.rc1-310p-ubuntu22.04-py3.11 +FROM quay.io/ascend/cann:8.3.rc1-310p-ubuntu22.04-py3.11 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" ARG COMPILE_CUSTOM_KERNELS=1 diff --git a/Dockerfile.310p.openEuler b/Dockerfile.310p.openEuler index 38015239871..4da46b10602 100644 --- a/Dockerfile.310p.openEuler +++ b/Dockerfile.310p.openEuler @@ -15,7 +15,7 @@ # This file is a part of the vllm-ascend project. # -FROM quay.io/ascend/cann:8.2.rc1-310p-openeuler24.03-py3.11 +FROM quay.io/ascend/cann:8.3.rc1-310p-openeuler24.03-py3.11 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" ARG COMPILE_CUSTOM_KERNELS=1 diff --git a/Dockerfile.a3 b/Dockerfile.a3 index 320e8fc75c2..59c4cbbbebd 100644 --- a/Dockerfile.a3 +++ b/Dockerfile.a3 @@ -15,7 +15,7 @@ # This file is a part of the vllm-ascend project. # -FROM quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11 +FROM quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" ARG COMPILE_CUSTOM_KERNELS=1 diff --git a/Dockerfile.a3.openEuler b/Dockerfile.a3.openEuler index 1e6ba351cb8..2fabb3069a0 100644 --- a/Dockerfile.a3.openEuler +++ b/Dockerfile.a3.openEuler @@ -15,7 +15,7 @@ # This file is a part of the vllm-ascend project. # -FROM quay.io/ascend/cann:8.2.rc1-a3-openeuler24.03-py3.11 +FROM quay.io/ascend/cann:8.3.rc1-a3-openeuler24.03-py3.11 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" ARG COMPILE_CUSTOM_KERNELS=1 diff --git a/Dockerfile.openEuler b/Dockerfile.openEuler index 2486dbae642..e665518d97d 100644 --- a/Dockerfile.openEuler +++ b/Dockerfile.openEuler @@ -15,7 +15,7 @@ # This file is a part of the vllm-ascend project. 
# -FROM quay.io/ascend/cann:8.2.rc1-910b-openeuler24.03-py3.11 +FROM quay.io/ascend/cann:8.3.rc1-910b-openeuler24.03-py3.11 ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple" ARG COMPILE_CUSTOM_KERNELS=1 diff --git a/README.md b/README.md index b7952e54b30..1d0529bfd0f 100644 --- a/README.md +++ b/README.md @@ -42,7 +42,7 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l - OS: Linux - Software: * Python >= 3.9, < 3.12 - * CANN >= 8.2.rc1 (Ascend HDK version refers to [here](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html)) + * CANN >= 8.3.rc1 (Ascend HDK version refers to [here](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/releasenote/releasenote_0000.html)) * PyTorch == 2.7.1, torch-npu == 2.7.1 * vLLM (the same version as vllm-ascend) diff --git a/README.zh.md b/README.zh.md index 1840de021df..a28056fa3db 100644 --- a/README.zh.md +++ b/README.zh.md @@ -43,7 +43,7 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NP - 操作系统:Linux - 软件: * Python >= 3.9, < 3.12 - * CANN >= 8.2.rc1 (Ascend HDK 版本参考[这里](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html)) + * CANN >= 8.3.rc1 (Ascend HDK 版本参考[这里](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/releasenote/releasenote_0000.html)) * PyTorch == 2.7.1, torch-npu == 2.7.1 * vLLM (与vllm-ascend版本一致) diff --git a/docs/source/conf.py b/docs/source/conf.py index 766f1e2f282..149f327581d 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -75,7 +75,7 @@ 'pip_vllm_ascend_version': "0.11.0rc0", 'pip_vllm_version': "0.11.0", # CANN image tag - 'cann_image_tag': "8.2.rc1-910b-ubuntu22.04-py3.11", + 'cann_image_tag': "8.3.rc1-910b-ubuntu22.04-py3.11", # vllm version in ci 'ci_vllm_version': 'v0.11.0', } diff --git a/docs/source/developer_guide/contribution/multi_node_test.md b/docs/source/developer_guide/contribution/multi_node_test.md index 6e96084cca7..1d78c8e3536 100644 --- a/docs/source/developer_guide/contribution/multi_node_test.md +++ b/docs/source/developer_guide/contribution/multi_node_test.md @@ -90,7 +90,7 @@ currently, the multi-node test workflow defined in the [vllm_ascend_test_nightly uses: ./.github/workflows/_e2e_nightly_multi_node.yaml with: soc_version: a3 - image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11 + image: m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11 replicas: 1 size: ${{ matrix.test_config.size }} config_file_path: ${{ matrix.test_config.config_file_path }} diff --git a/docs/source/installation.md b/docs/source/installation.md index a08e9e22b35..e8bc8cbbeac 100644 --- a/docs/source/installation.md +++ b/docs/source/installation.md @@ -11,8 +11,8 @@ This document describes how to install vllm-ascend manually. 
| Software | Supported version | Note | |---------------|----------------------------------|-------------------------------------------| - | Ascend HDK | Refer to [here](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html) | Required for CANN | - | CANN | >= 8.2.RC1 | Required for vllm-ascend and torch-npu | + | Ascend HDK | Refer to [here](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/releasenote/releasenote_0000.html) | Required for CANN | + | CANN | >= 8.3.RC1 | Required for vllm-ascend and torch-npu | | torch-npu | == 2.7.1 | Required for vllm-ascend, No need to install manually, it will be auto installed in below steps | | torch | == 2.7.1 | Required for torch-npu and vllm | @@ -80,19 +80,19 @@ source vllm-ascend-env/bin/activate pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple attrs 'numpy<2.0.0' decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions # Download and install the CANN package. -wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.2.RC1/Ascend-cann-toolkit_8.2.RC1_linux-"$(uname -i)".run -chmod +x ./Ascend-cann-toolkit_8.2.RC1_linux-"$(uname -i)".run -./Ascend-cann-toolkit_8.2.RC1_linux-"$(uname -i)".run --full -# https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C22B800TP052/Ascend-cann-kernels-910b_8.2.rc1_linux-aarch64.run +wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.3.RC1/Ascend-cann-toolkit_8.3.RC1_linux-"$(uname -i)".run +chmod +x ./Ascend-cann-toolkit_8.3.RC1_linux-"$(uname -i)".run +./Ascend-cann-toolkit_8.3.RC1_linux-"$(uname -i)".run --full +# https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C22B800TP052/Ascend-cann-kernels-910b_8.3.rc1_linux-aarch64.run source /usr/local/Ascend/ascend-toolkit/set_env.sh -wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.2.RC1/Ascend-cann-kernels-910b_8.2.RC1_linux-"$(uname -i)".run -chmod +x ./Ascend-cann-kernels-910b_8.2.RC1_linux-"$(uname -i)".run -./Ascend-cann-kernels-910b_8.2.RC1_linux-"$(uname -i)".run --install +wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.3.RC1/Ascend-cann-kernels-910b_8.3.RC1_linux-"$(uname -i)".run +chmod +x ./Ascend-cann-kernels-910b_8.3.RC1_linux-"$(uname -i)".run +./Ascend-cann-kernels-910b_8.3.RC1_linux-"$(uname -i)".run --install -wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.2.RC1/Ascend-cann-nnal_8.2.RC1_linux-"$(uname -i)".run -chmod +x ./Ascend-cann-nnal_8.2.RC1_linux-"$(uname -i)".run -./Ascend-cann-nnal_8.2.RC1_linux-"$(uname -i)".run --install +wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.3.RC1/Ascend-cann-nnal_8.3.RC1_linux-"$(uname -i)".run +chmod +x ./Ascend-cann-nnal_8.3.RC1_linux-"$(uname -i)".run +./Ascend-cann-nnal_8.3.RC1_linux-"$(uname -i)".run --install source /usr/local/Ascend/nnal/atb/set_env.sh ``` diff --git a/examples/disaggregated_prefill_v1/mooncake_connector_deployment_guide.md b/examples/disaggregated_prefill_v1/mooncake_connector_deployment_guide.md index ea76f0d511d..563357f8ba5 100644 --- a/examples/disaggregated_prefill_v1/mooncake_connector_deployment_guide.md +++ 
b/examples/disaggregated_prefill_v1/mooncake_connector_deployment_guide.md @@ -4,7 +4,7 @@ * Software: * Python >= 3.9, < 3.12 - * CANN >= 8.2.rc1 + * CANN >= 8.3.rc1 * PyTorch == 2.7.1, torch-npu == 2.7.1 * vLLM (same version as vllm-ascend) * mooncake-transfer-engine reference documentation: https://github.com/kvcache-ai/Mooncake/blob/main/doc/zh/ascend_transport.md diff --git a/examples/disaggregated_prefill_v1/mooncake_connector_store_deployment_guide.md b/examples/disaggregated_prefill_v1/mooncake_connector_store_deployment_guide.md index 82640215a84..28dd83b7cd0 100644 --- a/examples/disaggregated_prefill_v1/mooncake_connector_store_deployment_guide.md +++ b/examples/disaggregated_prefill_v1/mooncake_connector_store_deployment_guide.md @@ -4,7 +4,7 @@ * Software: * Python >= 3.9, < 3.12 - * CANN >= 8.2.rc1 + * CANN >= 8.3.rc1 * PyTorch == 2.7.1, torch-npu == 2.7.1 * vLLM:main branch * vLLM-Ascend:main branch diff --git a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 index f619b59754d..712fe3f06b0 100644 --- a/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 +++ b/tests/e2e/nightly/multi_node/scripts/lws.yaml.jinja2 @@ -15,7 +15,7 @@ spec: spec: containers: - name: vllm-leader - image: {{ image | default("m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11") }} + image: {{ image | default("m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11") }} env: - name: CONFIG_YAML_PATH value: {{ config_file_path | default("DeepSeek-V3.yaml") }} @@ -75,7 +75,7 @@ spec: spec: containers: - name: vllm-worker - image: {{ image | default("m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11") }} + image: {{ image | default("m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11") }} env: - name: CONFIG_YAML_PATH value: {{ config_file_path | default("DeepSeek-V3.yaml") }} diff --git a/tests/e2e/vllm_interface/vllm_test.cfg b/tests/e2e/vllm_interface/vllm_test.cfg index 4d077b09092..9723d49cad7 100644 --- a/tests/e2e/vllm_interface/vllm_test.cfg +++ b/tests/e2e/vllm_interface/vllm_test.cfg @@ -1,2 +1,2 @@ # Base docker image used to build the vllm-ascend e2e test image, which is built in the vLLM repository -BASE_IMAGE_NAME="quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11" +BASE_IMAGE_NAME="quay.io/ascend/cann:8.3.rc1-910b-ubuntu22.04-py3.11" diff --git a/tests/ut/attention/test_attention_mask.py b/tests/ut/attention/test_attention_mask.py index a87d21bd742..c8139b7167b 100644 --- a/tests/ut/attention/test_attention_mask.py +++ b/tests/ut/attention/test_attention_mask.py @@ -74,11 +74,10 @@ def test_get_attn_mask(self): attn_mask = attention_mask_builder.get_attn_mask( max_seq_len=2048, dtype=torch.float16, device=torch.device("cpu")) self.assertEqual(attn_mask.shape, (2048, 2048)) - self.assertEqual(attn_mask[0][-1], - torch.tensor(float("-inf"), dtype=torch.float16)) - self.assertEqual(attention_mask_builder._seq_len_cached, 2048) + self.assertEqual(attn_mask[0][-1], torch.tensor(True)) + self.assertEqual(attention_mask_builder._seq_len_cached, 1024) self.assertEqual(attention_mask_builder.attn_mask_cache.shape, - (2048, 2048)) + (1024, 1024)) self.assertEqual(attention_mask_builder.attn_mask_cache[0][-1], torch.tensor(float("-inf"), dtype=torch.float16)) @@ -91,43 +90,5 @@ def test_get_splitfuse_attn_mask(self): dtype=torch.float16, device=torch.device("cpu"), ) - self.assertEqual(attn_mask.shape, (6, 100)) + self.assertEqual(attn_mask.shape, (2048, 2048)) 
self.assertEqual(attention_mask_builder._seq_len_cached, 1024) - - attn_mask = attention_mask_builder.get_splitfuse_attn_mask( - seq_lens=torch.tensor([10, 3000, 2000]), - position=torch.tensor([7, 8, 9, 2999, 1999]), - dtype=torch.float16, - device=torch.device("cpu"), - ) - self.assertEqual(attn_mask.shape, (5, 3000)) - self.assertEqual(attention_mask_builder._seq_len_cached, 3000) - - # splitfuse_attn_mask now only supports data types: torch.float16 and torch.bfloat16 - # otherwise raise ValueError - with self.assertRaises(ValueError): - attn_mask = attention_mask_builder.get_splitfuse_attn_mask( - seq_lens=torch.tensor([10, 20, 100]), - position=torch.tensor([7, 8, 9, 18, 19, 99]), - dtype=torch.int8, - device=torch.device("cpu"), - ) - - def test_mask_value_cleanliness(self): - attention_mask_builder = AttentionMaskBuilder(max_seq_len=6, - dtype=torch.bfloat16) - self.assertEqual(attention_mask_builder.attn_mask_cache[-2][-1], - torch.tensor(1, dtype=torch.bfloat16)) - - attn_mask = attention_mask_builder.get_splitfuse_attn_mask( - seq_lens=torch.tensor([6]), - position=torch.tensor([3, 4, 5]), - dtype=torch.bfloat16, - device=torch.device("cpu"), - ) - self.assertEqual( - attn_mask[-2][-1], - torch.tensor(-10000, dtype=torch.bfloat16, - device=attn_mask.device)) - self.assertEqual(attention_mask_builder.attn_mask_cache[-2][-1], - torch.tensor(1, dtype=torch.bfloat16)) diff --git a/tests/ut/attention/test_attention_v1.py b/tests/ut/attention/test_attention_v1.py index 237fd299fd9..dfb9a2a07f8 100644 --- a/tests/ut/attention/test_attention_v1.py +++ b/tests/ut/attention/test_attention_v1.py @@ -298,8 +298,9 @@ def test_forward_prefill_no_cache(self, mock_flash_attention, assert output.shape == (10, 8 * 64) @patch('torch_npu._npu_reshape_and_cache') - @patch('torch_npu._npu_flash_attention_qlens') - def test_forward_prefill_cache_hit(self, mock_flash_attention_qlens, + @patch('torch_npu.npu_fused_infer_attention_score') + def test_forward_prefill_cache_hit(self, + mock_npu_fused_infer_attention_score, mock_npu_reshape_and_cache): """Test forward pass in PrefillCacheHit state""" query = torch.randn(10, 8 * 64) @@ -308,6 +309,8 @@ def test_forward_prefill_cache_hit(self, mock_flash_attention_qlens, kv_cache = torch.empty(2, 5, 128, 8, 64) output = torch.empty_like(query) + mock_npu_fused_infer_attention_score.return_value = (output, 1) + metadata = self.attn_metadata metadata.attn_state = AscendAttentionState.PrefillCacheHit metadata.attn_mask = torch.randn(1, 1, 10, 10) @@ -323,7 +326,7 @@ def test_forward_prefill_cache_hit(self, mock_flash_attention_qlens, output = self.impl.forward(layer, query, key, value, kv_cache, metadata, output) - mock_flash_attention_qlens.assert_called_once() + mock_npu_fused_infer_attention_score.assert_called_once() assert output.shape == (10, 8 * 64) @patch('vllm_ascend.attention.attention_v1.get_forward_context') @@ -528,13 +531,11 @@ def test_forward_decode_only_swa_seq_len_mismatch( assert output.shape == (10, 8 * 64) - @patch('torch.version') @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False) @patch('torch_npu._npu_reshape_and_cache') @patch('vllm_ascend.attention.attention_v1.vanilla_chunked_prefill') def test_forward_head_size_192(self, mock_vanilla_prefill, - mock_npu_reshape_and_cache, mock_is_310p, - mock_version): + mock_npu_reshape_and_cache, mock_is_310p): """Test forward pass when head_size is 192""" self.impl.head_size = 192 @@ -554,7 +555,6 @@ def test_forward_head_size_192(self, mock_vanilla_prefill, metadata.num_decodes 
= 10 metadata.num_prefills = 0 layer = self.layer_no_quant - mock_version.cann = "8.4.RC1" mock_vanilla_prefill.return_value = MagicMock() output = self.impl_192.forward(layer, query, key, value, kv_cache, @@ -563,12 +563,11 @@ def test_forward_head_size_192(self, mock_vanilla_prefill, mock_vanilla_prefill.assert_called_once() assert output.shape == (10, 8 * 192) - @patch('torch.version') @patch('torch_npu._npu_reshape_and_cache') - @patch('torch_npu._npu_paged_attention_splitfuse') - def test_forward_normal_v1_situation(self, mock_paged_attention, - mock_npu_reshape_and_cache, - mock_version): + @patch('torch_npu.npu_fused_infer_attention_score') + def test_forward_normal_v1_situation(self, + mock_npu_fused_infer_attention_score, + mock_npu_reshape_and_cache): """Test forward pass in normal V1 situation""" query = torch.randn(10, 8 * 64) key = torch.randn(10, 8 * 64) @@ -576,6 +575,8 @@ def test_forward_normal_v1_situation(self, mock_paged_attention, kv_cache = torch.empty(2, 5, 128, 8, 64) output = torch.empty_like(query) + mock_npu_fused_infer_attention_score.return_value = (output, 1) + metadata = self.attn_metadata metadata.attn_mask = torch.randn(1, 1, 10, 10) metadata.query_lens = torch.tensor([10]) @@ -587,22 +588,20 @@ def test_forward_normal_v1_situation(self, mock_paged_attention, metadata.num_prefills = 10 layer = self.layer_no_quant - mock_version.cann = "8.4.RC1" - output = self.impl.forward(layer, query, key, value, kv_cache, metadata, output) - mock_paged_attention.assert_called_once() + mock_npu_fused_infer_attention_score.assert_called_once() assert output.shape == (10, 8 * 64) - @patch('torch.version') @patch('torch_npu.npu_format_cast') @patch('torch_npu._npu_reshape_and_cache') - @patch('torch_npu._npu_paged_attention_splitfuse') + @patch('torch_npu.npu_fused_infer_attention_score') @patch('vllm_ascend.attention.attention_v1.is_310p', return_value=True) - def test_forward_310p_device(self, mock_is_310p, mock_paged_attention, + def test_forward_310p_device(self, mock_is_310p, + mock_npu_fused_infer_attention_score, mock_npu_reshape_and_cache, - mock_npu_format_cast, mock_version): + mock_npu_format_cast): """Test forward pass on 310P device""" query = torch.randn(10, 8 * 64) key = torch.randn(10, 8 * 64) @@ -610,6 +609,8 @@ def test_forward_310p_device(self, mock_is_310p, mock_paged_attention, kv_cache = torch.empty(2, 5, 128, 8, 64) output = torch.empty_like(query) + mock_npu_fused_infer_attention_score.return_value = (output, 1) + metadata = self.attn_metadata metadata.attn_mask = torch.randn(1, 1, 10, 10) metadata.query_lens = torch.tensor([10]) @@ -622,12 +623,11 @@ def test_forward_310p_device(self, mock_is_310p, mock_paged_attention, layer = self.layer_no_quant mock_npu_format_cast.return_value = metadata.attn_mask - mock_version.cann = "8.4.RC1" output = self.impl.forward(layer, query, key, value, kv_cache, metadata, output) - mock_paged_attention.assert_called_once() + mock_npu_fused_infer_attention_score.assert_called_once() assert output.shape == (10, 8 * 64) @patch('torch_npu._npu_reshape_and_cache') diff --git a/tests/ut/ops/test_linear.py b/tests/ut/ops/test_linear.py index bc1751ac873..1b3a7268fc6 100644 --- a/tests/ut/ops/test_linear.py +++ b/tests/ut/ops/test_linear.py @@ -63,33 +63,20 @@ def setUp(self): @mock.patch("vllm_ascend.ops.linear.is_enable_nz") @mock.patch("torch_npu.npu_format_cast") - @mock.patch("torch.version") - def test_process_weights_after_loading_is_8_3_enable_nz( - self, mock_version, mock_format_cast, mock_is_nz): - 
mock_version.cann = "8.3.RC1" + def test_process_weights_after_loading_enable_nz(self, mock_format_cast, + mock_is_nz): mock_is_nz.return_value = 1 self.method.process_weights_after_loading(self.layer) mock_format_cast.assert_called_once() @mock.patch("vllm_ascend.ops.linear.is_enable_nz") @mock.patch("torch_npu.npu_format_cast") - @mock.patch("torch.version") - def test_process_weights_after_loading_is_8_3_disable_nz( - self, mock_version, mock_format_cast, mock_is_nz): - mock_version.cann = "8.3.RC1" + def test_process_weights_after_loading_disable_nz(self, mock_format_cast, + mock_is_nz): mock_is_nz.return_value = 0 self.method.process_weights_after_loading(self.layer) mock_format_cast.assert_not_called() - @mock.patch("vllm_ascend.ops.linear.is_enable_nz") - @mock.patch("torch.version") - def test_process_weights_after_loading_not_8_3(self, mock_version, - mock_is_nz): - mock_version.cann = "8.2.RC1" - mock_is_nz.return_value = 1 - # Should not raise exception - self.method.process_weights_after_loading(self.layer) - class TestAscendRowParallelLinear(BaseLinearTest): diff --git a/vllm_ascend/attention/attention_mask.py b/vllm_ascend/attention/attention_mask.py index 5a94b82b149..3514984d826 100644 --- a/vllm_ascend/attention/attention_mask.py +++ b/vllm_ascend/attention/attention_mask.py @@ -47,11 +47,10 @@ def __init__( self.attn_mask_cache = attn_mask self.device = device self.pooling_mask = None - if torch.version.cann.startswith("8.3"): - assigned_mask_dim = 2048 - self.chunked_prefill_attn_mask = torch.triu( - torch.ones(assigned_mask_dim, assigned_mask_dim), - diagonal=1).to(torch.int8).to(device) + assigned_mask_dim = 2048 + self.chunked_prefill_attn_mask = torch.triu( + torch.ones(assigned_mask_dim, assigned_mask_dim), + diagonal=1).to(torch.int8).to(device) @staticmethod def get_mask_scale_factor(dtype: torch.dtype = torch.float16): @@ -68,7 +67,7 @@ def get_mask_scale_factor(dtype: torch.dtype = torch.float16): def get_attn_mask(self, max_seq_len: int, dtype: torch.dtype, device: torch.device): - if max_seq_len == 2048 and torch.version.cann.startswith("8.3"): + if max_seq_len == 2048: return self.chunked_prefill_attn_mask.to(torch.bool) self._update_attn_cache(max_seq_len, dtype) return self.attn_mask_cache[:max_seq_len, :max_seq_len].contiguous( @@ -89,23 +88,7 @@ def get_splitfuse_attn_mask( dtype: torch.dtype = None, device: torch.device = None, ) -> torch.Tensor: - if torch.version.cann.startswith("8.3"): - return self.chunked_prefill_attn_mask - else: - if dtype not in [torch.float16, torch.bfloat16]: - raise ValueError( - "splitfuse_attn_mask now only supports bf16 and fp16") - max_seq_len = max(seq_lens, default=0) - self._update_attn_cache(max_seq_len, dtype) - # FIXME: Currently the mask value of chunked-prefill situation and Prefill-Only situation - # is not the same. Fix this in the future when kernel is ready. 
- mask_scale_factor = AttentionMaskBuilder.get_mask_scale_factor( - dtype) - attn_mask = torch.index_select(self.attn_mask_cache, - dim=0, - index=position)[:, :max_seq_len] - attn_mask *= mask_scale_factor - return attn_mask.contiguous().to(device, non_blocking=True) + return self.chunked_prefill_attn_mask def _update_attn_cache(self, seqlen: int, dtype: torch.dtype): if seqlen > self._seq_len_cached: diff --git a/vllm_ascend/attention/attention_v1.py b/vllm_ascend/attention/attention_v1.py index fb29c9dd6e0..59d6fcb2c1e 100644 --- a/vllm_ascend/attention/attention_v1.py +++ b/vllm_ascend/attention/attention_v1.py @@ -500,7 +500,7 @@ def _forward_prefill_cache_hit( block_table = attn_metadata.block_tables[:batch_size, :] num_block, block_size, _, _ = self.key_cache.shape # type: ignore - if torch.version.cann.startswith("8.3") and block_size == 128: + if block_size == 128: # TODO:The npu_fused_infer_attention_score op is planned to # be utilized in a wider range in upcoming versions. key = self.key_cache.view( # type: ignore @@ -680,43 +680,30 @@ def _forward_v1_style( attn_metadata.seq_lens = \ attn_metadata.seq_lens.to(device=query.device) - if torch.version.cann.startswith("8.3"): - # TODO:The npu_fused_infer_attention_score op is planned to - # be utilized in a wider range in upcoming versions. - num_block, block_size, _, _ = self.key_cache.shape # type: ignore - key = self.key_cache.view( # type: ignore - num_block, block_size, -1) - value = self.value_cache.view( # type: ignore - num_block, block_size, -1) + # TODO:The npu_fused_infer_attention_score op is planned to + # be utilized in a wider range in upcoming versions. + num_block, block_size, _, _ = self.key_cache.shape # type: ignore + key = self.key_cache.view( # type: ignore + num_block, block_size, -1) + value = self.value_cache.view( # type: ignore + num_block, block_size, -1) + + output, _ = torch_npu.npu_fused_infer_attention_score( + query=query, + key=key, + value=value, + atten_mask=attn_metadata.attn_mask, + block_table=attn_metadata.block_tables, + input_layout="TND", + block_size=block_size, + actual_seq_lengths=attn_metadata.actual_seq_lengths_q, + actual_seq_lengths_kv=attn_metadata.seq_lens_list, + num_key_value_heads=self.num_kv_heads, + num_heads=self.num_heads, + scale=self.scale, + sparse_mode=3, + ) - output, _ = torch_npu.npu_fused_infer_attention_score( - query=query, - key=key, - value=value, - atten_mask=attn_metadata.attn_mask, - block_table=attn_metadata.block_tables, - input_layout="TND", - block_size=block_size, - actual_seq_lengths=attn_metadata.actual_seq_lengths_q, - actual_seq_lengths_kv=attn_metadata.seq_lens_list, - num_key_value_heads=self.num_kv_heads, - num_heads=self.num_heads, - scale=self.scale, - sparse_mode=3, - ) - else: - torch_npu._npu_paged_attention_splitfuse( - query=query, - key_cache=self.key_cache, - value_cache=self.value_cache, - mask=attn_metadata.attn_mask, - block_table=attn_metadata.block_tables, - seq_len=attn_metadata.query_lens, - context_lens=attn_metadata.seq_lens, - num_kv_heads=self.num_kv_heads, - num_heads=self.num_heads, - scale_value=self.scale, - out=output) return output def _attention_with_nomask_and_mask(self, q: torch.Tensor, @@ -1155,12 +1142,11 @@ def forward( query, attn_metadata, output) # Normal V1 situation. else: - if torch.version.cann.startswith("8.3"): - # npu_fused_infer_attention_score does not support cases - # where query.shape[0] != attn_metadata.query_start_loc[-1]. - # Thus we need unpad it here. 
- num_tokens = attn_metadata.query_start_loc[-1] - query = query[:num_tokens] + # npu_fused_infer_attention_score does not support cases + # where query.shape[0] != attn_metadata.query_start_loc[-1]. + # Thus we need unpad it here. + num_tokens = attn_metadata.query_start_loc[-1] + query = query[:num_tokens] intermediate_output = self._forward_v1_style( query, attn_metadata, output) diff --git a/vllm_ascend/ops/linear.py b/vllm_ascend/ops/linear.py index cb738d10c12..eab312d5cf8 100644 --- a/vllm_ascend/ops/linear.py +++ b/vllm_ascend/ops/linear.py @@ -45,8 +45,8 @@ class AscendUnquantizedLinearMethod(UnquantizedLinearMethod): def process_weights_after_loading(self, layer: torch.nn.Module) -> None: super().process_weights_after_loading(layer) - if (is_enable_nz() and torch.version.cann.startswith("8.3") and - layer.weight.data.dtype in [torch.float16, torch.bfloat16]): + if (is_enable_nz() and layer.weight.data.dtype + in [torch.float16, torch.bfloat16]): layer.weight.data = torch_npu.npu_format_cast( layer.weight.data, ACL_FORMAT_FRACTAL_NZ) diff --git a/vllm_ascend/ops/linear_op.py b/vllm_ascend/ops/linear_op.py index be7fa316bf2..1271f8e986f 100644 --- a/vllm_ascend/ops/linear_op.py +++ b/vllm_ascend/ops/linear_op.py @@ -411,9 +411,8 @@ def matmul_and_reduce(self, input_parallel: torch.Tensor, quant_per_tensor) # For unquant - if mmrs_fusion and isinstance( - self.layer.quant_method, UnquantizedLinearMethod - ) and torch.version.cann.startswith("8.3"): + if mmrs_fusion and isinstance(self.layer.quant_method, + UnquantizedLinearMethod): output = torch_npu.npu_mm_reduce_scatter_base( x, self.layer.weight.t(), @@ -429,8 +428,7 @@ def matmul_and_reduce(self, input_parallel: torch.Tensor, elif mmrs_fusion and ( isinstance(self.layer.quant_method, AscendLinearMethod) and isinstance(self.layer.quant_method.quant_method, - AscendW8A8LinearMethod) - ) and torch.version.cann.startswith("8.3"): + AscendW8A8LinearMethod)): if x.dtype != torch.int8: x_quant = quant_per_tensor( x, self.layer.aclnn_input_scale_reciprocal, diff --git a/vllm_ascend/worker/model_runner_v1.py b/vllm_ascend/worker/model_runner_v1.py index a85d4cda026..a9ab3a4acde 100644 --- a/vllm_ascend/worker/model_runner_v1.py +++ b/vllm_ascend/worker/model_runner_v1.py @@ -367,13 +367,10 @@ def __init__(self, vllm_config: VllmConfig, device: torch.device): use_sparse=self.use_sparse) if self.pcp_size > 1: self.attn_mask_builder = None - elif torch.version.cann.startswith("8.3"): + else: self.attn_mask_builder = AttentionMaskBuilder( self.scheduler_config.max_num_batched_tokens, self.dtype, self.device) - else: - self.attn_mask_builder = AttentionMaskBuilder( - self.model_config.max_model_len, self.dtype) self._set_up_drafter() @@ -988,11 +985,8 @@ def _make_attention_mask(self, seq_lens, position, max_seq_len = max(seq_lens.max().item(), 0) return self.attn_mask_builder.get_attn_mask( max_seq_len, self.dtype, self.device) - elif torch.version.cann.startswith("8.3"): - return self.attn_mask_builder.get_splitfuse_attn_mask() else: - return self.attn_mask_builder.get_splitfuse_attn_mask( - seq_lens, position, self.dtype, self.device) + return self.attn_mask_builder.get_splitfuse_attn_mask() # Prefill without cache situation. elif attn_state == AscendAttentionState.PrefillNoCache: @@ -1001,12 +995,8 @@ def _make_attention_mask(self, seq_lens, position, max_seq_len, self.dtype, self.device) # Prefill with cache hit. 
elif attn_state == AscendAttentionState.PrefillCacheHit: - if torch.version.cann.startswith("8.3"): - return self.attn_mask_builder.get_attn_mask( - 2048, self.dtype, self.device) - else: - return self.attn_mask_builder.get_attn_mask( - 128, self.dtype, self.device) + return self.attn_mask_builder.get_attn_mask( + 2048, self.dtype, self.device) # Decode-only situation. else: return None
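
For reference, a minimal sketch of the unified attention path this patch settles on in `_forward_v1_style` (the CANN 8.2 fallback to `_npu_paged_attention_splitfuse` is removed). The function signature, tensor shapes, and the `attn_metadata` field names are assumptions taken from the diff; it is illustrative only and needs an Ascend NPU with `torch_npu` to actually run.

```python
import torch
import torch_npu


def fused_paged_attention(query, key_cache, value_cache, attn_metadata,
                          num_heads, num_kv_heads, scale):
    # Flatten the paged KV cache from (num_block, block_size, kv_heads, head_dim)
    # to the (num_block, block_size, hidden) layout used with input_layout="TND".
    num_block, block_size, _, _ = key_cache.shape
    key = key_cache.view(num_block, block_size, -1)
    value = value_cache.view(num_block, block_size, -1)

    # npu_fused_infer_attention_score does not support query.shape[0] !=
    # query_start_loc[-1], so padded tokens are trimmed first (as in the patch).
    num_tokens = attn_metadata.query_start_loc[-1]
    query = query[:num_tokens]

    output, _ = torch_npu.npu_fused_infer_attention_score(
        query=query,
        key=key,
        value=value,
        atten_mask=attn_metadata.attn_mask,
        block_table=attn_metadata.block_tables,
        input_layout="TND",
        block_size=block_size,
        actual_seq_lengths=attn_metadata.actual_seq_lengths_q,
        actual_seq_lengths_kv=attn_metadata.seq_lens_list,
        num_key_value_heads=num_kv_heads,
        num_heads=num_heads,
        scale=scale,
        sparse_mode=3,
    )
    return output
```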
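
Likewise, a small sketch of the mask construction that the patch makes unconditional in `AttentionMaskBuilder` (previously it was guarded by `torch.version.cann.startswith("8.3")`). The 2048x2048 dimension, the int8 storage, and the bool cast follow the diff; this is not the full class, just the recurring pattern.

```python
import torch

ASSIGNED_MASK_DIM = 2048


def build_chunked_prefill_mask(device: str = "cpu") -> torch.Tensor:
    # Upper-triangular mask (1 strictly above the diagonal) stored as int8;
    # the fused attention op consumes it together with sparse_mode=3.
    return torch.triu(
        torch.ones(ASSIGNED_MASK_DIM, ASSIGNED_MASK_DIM),
        diagonal=1).to(torch.int8).to(device)


mask = build_chunked_prefill_mask()
# In the patch, get_splitfuse_attn_mask() returns this mask as-is, while
# get_attn_mask(max_seq_len=2048, ...) returns it cast to bool.
bool_mask = mask.to(torch.bool)
```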
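
Finally, a sketch of the simplified weight post-processing in `AscendUnquantizedLinearMethod`: with CANN 8.3 as the baseline, the version check is gone and fp16/bf16 weights are cast to the FRACTAL_NZ format whenever NZ is enabled. The import path for `is_enable_nz` and `ACL_FORMAT_FRACTAL_NZ` is an assumption (the diff only shows them used in `vllm_ascend/ops/linear.py`), and the snippet requires `torch_npu` on an Ascend NPU.

```python
import torch
import torch_npu

# Assumed import location; the patch references these names in ops/linear.py.
from vllm_ascend.utils import ACL_FORMAT_FRACTAL_NZ, is_enable_nz


def maybe_cast_weight_to_nz(weight: torch.Tensor) -> torch.Tensor:
    # Cast only floating-point weights that benefit from the NZ layout;
    # everything else is returned unchanged.
    if is_enable_nz() and weight.dtype in (torch.float16, torch.bfloat16):
        return torch_npu.npu_format_cast(weight, ACL_FORMAT_FRACTAL_NZ)
    return weight
```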