Upgrade CANN to 8.3.rc1 (#3945)
### What this PR does / why we need it?
This PR upgrades CANN from 8.2.rc1 to 8.3.rc1 and removes the CANN version
check logic.
TODO: we noticed that UT runs fail with the CANN 8.3 image, so the base
image for UT is still 8.2. We'll fix it later.
- vLLM version: v0.11.0
- vLLM main:
83f478bb19
Signed-off-by: wangxiyuan <wangxiyuan1007@gmail.com>
This commit is contained in:
2
.github/Dockerfile.buildwheel
vendored
2
.github/Dockerfile.buildwheel
vendored
@@ -15,7 +15,7 @@
|
|||||||
# This file is a part of the vllm-ascend project.
|
# This file is a part of the vllm-ascend project.
|
||||||
#
|
#
|
||||||
ARG PY_VERSION=3.11
|
ARG PY_VERSION=3.11
|
||||||
FROM quay.io/ascend/manylinux:8.2.rc1-910b-manylinux_2_28-py${PY_VERSION}
|
FROM quay.io/ascend/manylinux:8.3.rc1-910b-manylinux_2_28-py${PY_VERSION}
|
||||||
|
|
||||||
ARG COMPILE_CUSTOM_KERNELS=1
|
ARG COMPILE_CUSTOM_KERNELS=1
|
||||||
|
|
||||||
|
|||||||
2
.github/workflows/_accuracy_test.yaml
vendored
2
.github/workflows/_accuracy_test.yaml
vendored
@@ -30,7 +30,7 @@ jobs:
|
|||||||
runs-on: ${{ inputs.runner }}
|
runs-on: ${{ inputs.runner }}
|
||||||
name: ${{ inputs.model_name }} accuracy
|
name: ${{ inputs.model_name }} accuracy
|
||||||
container:
|
container:
|
||||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
|
||||||
env:
|
env:
|
||||||
VLLM_USE_MODELSCOPE: True
|
VLLM_USE_MODELSCOPE: True
|
||||||
# 1. If version specified (work_dispatch), do specified branch accuracy test
|
# 1. If version specified (work_dispatch), do specified branch accuracy test
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ on:
|
|||||||
required: false
|
required: false
|
||||||
type: string
|
type: string
|
||||||
description: base image for pods
|
description: base image for pods
|
||||||
default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11"
|
default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11"
|
||||||
config_file_path:
|
config_file_path:
|
||||||
required: true
|
required: true
|
||||||
type: string
|
type: string
|
||||||
@@ -69,7 +69,7 @@ jobs:
|
|||||||
# This is the runner with no NPU for k8s controller
|
# This is the runner with no NPU for k8s controller
|
||||||
runs-on: ${{ inputs.runner }}
|
runs-on: ${{ inputs.runner }}
|
||||||
container:
|
container:
|
||||||
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
|
image: m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11
|
||||||
env:
|
env:
|
||||||
KUBECONFIG: /tmp/kubeconfig
|
KUBECONFIG: /tmp/kubeconfig
|
||||||
KUBECTL: /root/.cache/.kube/kubectl
|
KUBECTL: /root/.cache/.kube/kubectl
|
||||||
|
|||||||
@@ -29,7 +29,7 @@ on:
|
|||||||
image:
|
image:
|
||||||
required: false
|
required: false
|
||||||
type: string
|
type: string
|
||||||
default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11"
|
default: "swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11"
|
||||||
tests:
|
tests:
|
||||||
required: true
|
required: true
|
||||||
type: string
|
type: string
|
||||||
|
|||||||
2
.github/workflows/_kill_lws_resources.yaml
vendored
2
.github/workflows/_kill_lws_resources.yaml
vendored
@@ -24,7 +24,7 @@ jobs:
|
|||||||
# This is a runner with no NPU for k8s controller
|
# This is a runner with no NPU for k8s controller
|
||||||
runs-on: ${{ inputs.runner }}
|
runs-on: ${{ inputs.runner }}
|
||||||
container:
|
container:
|
||||||
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
|
image: m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11
|
||||||
env:
|
env:
|
||||||
KUBECONFIG: /tmp/kubeconfig
|
KUBECONFIG: /tmp/kubeconfig
|
||||||
KUBECTL: /root/.cache/.kube/kubectl
|
KUBECTL: /root/.cache/.kube/kubectl
|
||||||
|
|||||||
2
.github/workflows/accuracy_test.yaml
vendored
2
.github/workflows/accuracy_test.yaml
vendored
@@ -70,5 +70,5 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
vllm: v0.11.0
|
vllm: v0.11.0
|
||||||
runner: linux-aarch64-${{ matrix.runner }}
|
runner: linux-aarch64-${{ matrix.runner }}
|
||||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
|
||||||
model_name: ${{ matrix.model_name }}
|
model_name: ${{ matrix.model_name }}
|
||||||
|
|||||||
2
.github/workflows/nightly_benchmarks.yaml
vendored
2
.github/workflows/nightly_benchmarks.yaml
vendored
@@ -56,7 +56,7 @@ jobs:
|
|||||||
vllm_use_v1: 1
|
vllm_use_v1: 1
|
||||||
max-parallel: 1
|
max-parallel: 1
|
||||||
container:
|
container:
|
||||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
|
||||||
volumes:
|
volumes:
|
||||||
- /usr/local/dcmi:/usr/local/dcmi
|
- /usr/local/dcmi:/usr/local/dcmi
|
||||||
- /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
|
- /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
|
||||||
|
|||||||
2
.github/workflows/vllm_ascend_dist.yaml
vendored
2
.github/workflows/vllm_ascend_dist.yaml
vendored
@@ -75,7 +75,7 @@ jobs:
|
|||||||
name: vLLM Ascend test
|
name: vLLM Ascend test
|
||||||
runs-on: ${{ matrix.os }}
|
runs-on: ${{ matrix.os }}
|
||||||
container:
|
container:
|
||||||
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
|
image: m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11
|
||||||
env:
|
env:
|
||||||
DEBIAN_FRONTEND: noninteractive
|
DEBIAN_FRONTEND: noninteractive
|
||||||
steps:
|
steps:
|
||||||
|
|||||||
3
.github/workflows/vllm_ascend_test.yaml
vendored
3
.github/workflows/vllm_ascend_test.yaml
vendored
@@ -76,6 +76,7 @@ jobs:
|
|||||||
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
|
if: ${{ needs.lint.result == 'success' && (needs.changes.outputs.e2e_tracker == 'true' || needs.changes.outputs.ut_tracker == 'true') }}
|
||||||
runs-on: ubuntu-latest
|
runs-on: ubuntu-latest
|
||||||
container:
|
container:
|
||||||
|
# fixme: vllm-ascend install failed with 8.3.rc1 on github action
|
||||||
image: quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
image: quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
||||||
env:
|
env:
|
||||||
VLLM_LOGGING_LEVEL: ERROR
|
VLLM_LOGGING_LEVEL: ERROR
|
||||||
@@ -146,5 +147,5 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
vllm: ${{ matrix.vllm_version }}
|
vllm: ${{ matrix.vllm_version }}
|
||||||
runner: linux-aarch64-a2
|
runner: linux-aarch64-a2
|
||||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
|
||||||
type: light
|
type: light
|
||||||
|
|||||||
2
.github/workflows/vllm_ascend_test_310p.yaml
vendored
2
.github/workflows/vllm_ascend_test_310p.yaml
vendored
@@ -58,7 +58,7 @@ jobs:
|
|||||||
runs-on: ${{ matrix.os }}
|
runs-on: ${{ matrix.os }}
|
||||||
container:
|
container:
|
||||||
# TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready
|
# TODO(yikun): Remove m.daocloud.io prefix when infra proxy ready
|
||||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-310p-ubuntu22.04-py3.11
|
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-310p-ubuntu22.04-py3.11
|
||||||
env:
|
env:
|
||||||
VLLM_LOGGING_LEVEL: ERROR
|
VLLM_LOGGING_LEVEL: ERROR
|
||||||
VLLM_USE_MODELSCOPE: True
|
VLLM_USE_MODELSCOPE: True
|
||||||
|
|||||||
2
.github/workflows/vllm_ascend_test_full.yaml
vendored
2
.github/workflows/vllm_ascend_test_full.yaml
vendored
@@ -76,5 +76,5 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
vllm: ${{ matrix.vllm_version }}
|
vllm: ${{ matrix.vllm_version }}
|
||||||
runner: linux-aarch64-a2
|
runner: linux-aarch64-a2
|
||||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
|
||||||
type: full
|
type: full
|
||||||
|
|||||||
@@ -41,5 +41,5 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
vllm: main
|
vllm: main
|
||||||
runner: linux-aarch64-a2
|
runner: linux-aarch64-a2
|
||||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
|
||||||
type: full
|
type: full
|
||||||
|
|||||||
@@ -79,7 +79,7 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
vllm: v0.11.0
|
vllm: v0.11.0
|
||||||
runner: linux-aarch64-${{ matrix.runner }}
|
runner: linux-aarch64-${{ matrix.runner }}
|
||||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
|
||||||
model_name: ${{ matrix.model_name }}
|
model_name: ${{ matrix.model_name }}
|
||||||
upload: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.vllm-ascend-version == 'latest' }}
|
upload: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.vllm-ascend-version == 'latest' }}
|
||||||
|
|
||||||
|
|||||||
@@ -82,7 +82,7 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
soc_version: a2
|
soc_version: a2
|
||||||
runner: linux-aarch64-a2-0
|
runner: linux-aarch64-a2-0
|
||||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
|
||||||
replicas: 1
|
replicas: 1
|
||||||
size: ${{ matrix.test_config.size }}
|
size: ${{ matrix.test_config.size }}
|
||||||
config_file_path: ${{ matrix.test_config.config_file_path }}
|
config_file_path: ${{ matrix.test_config.config_file_path }}
|
||||||
|
|||||||
@@ -82,7 +82,7 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
vllm: v0.11.0
|
vllm: v0.11.0
|
||||||
runner: ${{ matrix.test_config.os }}
|
runner: ${{ matrix.test_config.os }}
|
||||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-a3-ubuntu22.04-py3.11
|
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-a3-ubuntu22.04-py3.11
|
||||||
tests: ${{ matrix.test_config.tests }}
|
tests: ${{ matrix.test_config.tests }}
|
||||||
|
|
||||||
multi-node-tests:
|
multi-node-tests:
|
||||||
@@ -113,7 +113,7 @@ jobs:
|
|||||||
with:
|
with:
|
||||||
soc_version: a3
|
soc_version: a3
|
||||||
runner: linux-aarch64-a3-0
|
runner: linux-aarch64-a3-0
|
||||||
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
|
image: m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11
|
||||||
replicas: 1
|
replicas: 1
|
||||||
size: ${{ matrix.test_config.size }}
|
size: ${{ matrix.test_config.size }}
|
||||||
config_file_path: ${{ matrix.test_config.config_file_path }}
|
config_file_path: ${{ matrix.test_config.config_file_path }}
|
||||||
|
|||||||
2
.github/workflows/vllm_ascend_test_pd.yaml
vendored
2
.github/workflows/vllm_ascend_test_pd.yaml
vendored
@@ -49,7 +49,7 @@ jobs:
|
|||||||
runs-on: linux-arm64-npu-static-8
|
runs-on: linux-arm64-npu-static-8
|
||||||
|
|
||||||
container:
|
container:
|
||||||
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
image: swr.cn-southwest-2.myhuaweicloud.com/base_image/ascend-ci/cann:8.3.rc1-910b-ubuntu22.04-py3.11
|
||||||
volumes:
|
volumes:
|
||||||
- /usr/local/dcmi:/usr/local/dcmi
|
- /usr/local/dcmi:/usr/local/dcmi
|
||||||
- /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
|
- /usr/local/bin/npu-smi:/usr/local/bin/npu-smi
|
||||||
|
|||||||
@@ -15,7 +15,7 @@
|
|||||||
# This file is a part of the vllm-ascend project.
|
# This file is a part of the vllm-ascend project.
|
||||||
#
|
#
|
||||||
|
|
||||||
FROM quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
FROM quay.io/ascend/cann:8.3.rc1-910b-ubuntu22.04-py3.11
|
||||||
|
|
||||||
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
||||||
ARG COMPILE_CUSTOM_KERNELS=1
|
ARG COMPILE_CUSTOM_KERNELS=1
|
||||||
|
|||||||
@@ -15,7 +15,7 @@
|
|||||||
# This file is a part of the vllm-ascend project.
|
# This file is a part of the vllm-ascend project.
|
||||||
#
|
#
|
||||||
|
|
||||||
FROM quay.io/ascend/cann:8.2.rc1-310p-ubuntu22.04-py3.11
|
FROM quay.io/ascend/cann:8.3.rc1-310p-ubuntu22.04-py3.11
|
||||||
|
|
||||||
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
||||||
ARG COMPILE_CUSTOM_KERNELS=1
|
ARG COMPILE_CUSTOM_KERNELS=1
|
||||||
|
|||||||
@@ -15,7 +15,7 @@
|
|||||||
# This file is a part of the vllm-ascend project.
|
# This file is a part of the vllm-ascend project.
|
||||||
#
|
#
|
||||||
|
|
||||||
FROM quay.io/ascend/cann:8.2.rc1-310p-openeuler24.03-py3.11
|
FROM quay.io/ascend/cann:8.3.rc1-310p-openeuler24.03-py3.11
|
||||||
|
|
||||||
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
||||||
ARG COMPILE_CUSTOM_KERNELS=1
|
ARG COMPILE_CUSTOM_KERNELS=1
|
||||||
|
|||||||
@@ -15,7 +15,7 @@
|
|||||||
# This file is a part of the vllm-ascend project.
|
# This file is a part of the vllm-ascend project.
|
||||||
#
|
#
|
||||||
|
|
||||||
FROM quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
|
FROM quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11
|
||||||
|
|
||||||
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
||||||
ARG COMPILE_CUSTOM_KERNELS=1
|
ARG COMPILE_CUSTOM_KERNELS=1
|
||||||
|
|||||||
@@ -15,7 +15,7 @@
|
|||||||
# This file is a part of the vllm-ascend project.
|
# This file is a part of the vllm-ascend project.
|
||||||
#
|
#
|
||||||
|
|
||||||
FROM quay.io/ascend/cann:8.2.rc1-a3-openeuler24.03-py3.11
|
FROM quay.io/ascend/cann:8.3.rc1-a3-openeuler24.03-py3.11
|
||||||
|
|
||||||
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
||||||
ARG COMPILE_CUSTOM_KERNELS=1
|
ARG COMPILE_CUSTOM_KERNELS=1
|
||||||
|
|||||||
@@ -15,7 +15,7 @@
|
|||||||
# This file is a part of the vllm-ascend project.
|
# This file is a part of the vllm-ascend project.
|
||||||
#
|
#
|
||||||
|
|
||||||
FROM quay.io/ascend/cann:8.2.rc1-910b-openeuler24.03-py3.11
|
FROM quay.io/ascend/cann:8.3.rc1-910b-openeuler24.03-py3.11
|
||||||
|
|
||||||
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
||||||
ARG COMPILE_CUSTOM_KERNELS=1
|
ARG COMPILE_CUSTOM_KERNELS=1
|
||||||
|
|||||||
@@ -42,7 +42,7 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l
|
|||||||
- OS: Linux
|
- OS: Linux
|
||||||
- Software:
|
- Software:
|
||||||
* Python >= 3.9, < 3.12
|
* Python >= 3.9, < 3.12
|
||||||
* CANN >= 8.2.rc1 (Ascend HDK version refers to [here](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html))
|
* CANN >= 8.3.rc1 (Ascend HDK version refers to [here](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/releasenote/releasenote_0000.html))
|
||||||
* PyTorch == 2.7.1, torch-npu == 2.7.1
|
* PyTorch == 2.7.1, torch-npu == 2.7.1
|
||||||
* vLLM (the same version as vllm-ascend)
|
* vLLM (the same version as vllm-ascend)
|
||||||
|
|
||||||
|
|||||||
@@ -43,7 +43,7 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NP
|
|||||||
- 操作系统:Linux
|
- 操作系统:Linux
|
||||||
- 软件:
|
- 软件:
|
||||||
* Python >= 3.9, < 3.12
|
* Python >= 3.9, < 3.12
|
||||||
* CANN >= 8.2.rc1 (Ascend HDK 版本参考[这里](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html))
|
* CANN >= 8.3.rc1 (Ascend HDK 版本参考[这里](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/releasenote/releasenote_0000.html))
|
||||||
* PyTorch == 2.7.1, torch-npu == 2.7.1
|
* PyTorch == 2.7.1, torch-npu == 2.7.1
|
||||||
* vLLM (与vllm-ascend版本一致)
|
* vLLM (与vllm-ascend版本一致)
|
||||||
|
|
||||||
|
|||||||
@@ -75,7 +75,7 @@ myst_substitutions = {
|
|||||||
'pip_vllm_ascend_version': "0.11.0rc0",
|
'pip_vllm_ascend_version': "0.11.0rc0",
|
||||||
'pip_vllm_version': "0.11.0",
|
'pip_vllm_version': "0.11.0",
|
||||||
# CANN image tag
|
# CANN image tag
|
||||||
'cann_image_tag': "8.2.rc1-910b-ubuntu22.04-py3.11",
|
'cann_image_tag': "8.3.rc1-910b-ubuntu22.04-py3.11",
|
||||||
# vllm version in ci
|
# vllm version in ci
|
||||||
'ci_vllm_version': 'v0.11.0',
|
'ci_vllm_version': 'v0.11.0',
|
||||||
}
|
}
|
||||||
|
|||||||
@@ -90,7 +90,7 @@ currently, the multi-node test workflow defined in the [vllm_ascend_test_nightly
|
|||||||
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
uses: ./.github/workflows/_e2e_nightly_multi_node.yaml
|
||||||
with:
|
with:
|
||||||
soc_version: a3
|
soc_version: a3
|
||||||
image: m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
|
image: m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11
|
||||||
replicas: 1
|
replicas: 1
|
||||||
size: ${{ matrix.test_config.size }}
|
size: ${{ matrix.test_config.size }}
|
||||||
config_file_path: ${{ matrix.test_config.config_file_path }}
|
config_file_path: ${{ matrix.test_config.config_file_path }}
|
||||||
|
|||||||
@@ -11,8 +11,8 @@ This document describes how to install vllm-ascend manually.
|
|||||||
|
|
||||||
| Software | Supported version | Note |
|
| Software | Supported version | Note |
|
||||||
|---------------|----------------------------------|-------------------------------------------|
|
|---------------|----------------------------------|-------------------------------------------|
|
||||||
| Ascend HDK | Refer to [here](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html) | Required for CANN |
|
| Ascend HDK | Refer to [here](https://www.hiascend.com/document/detail/zh/canncommercial/83RC1/releasenote/releasenote_0000.html) | Required for CANN |
|
||||||
| CANN | >= 8.2.RC1 | Required for vllm-ascend and torch-npu |
|
| CANN | >= 8.3.RC1 | Required for vllm-ascend and torch-npu |
|
||||||
| torch-npu | == 2.7.1 | Required for vllm-ascend, No need to install manually, it will be auto installed in below steps |
|
| torch-npu | == 2.7.1 | Required for vllm-ascend, No need to install manually, it will be auto installed in below steps |
|
||||||
| torch | == 2.7.1 | Required for torch-npu and vllm |
|
| torch | == 2.7.1 | Required for torch-npu and vllm |
|
||||||
|
|
||||||
@@ -80,19 +80,19 @@ source vllm-ascend-env/bin/activate
|
|||||||
pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple attrs 'numpy<2.0.0' decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions
|
pip3 install -i https://pypi.tuna.tsinghua.edu.cn/simple attrs 'numpy<2.0.0' decorator sympy cffi pyyaml pathlib2 psutil protobuf scipy requests absl-py wheel typing_extensions
|
||||||
|
|
||||||
# Download and install the CANN package.
|
# Download and install the CANN package.
|
||||||
wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.2.RC1/Ascend-cann-toolkit_8.2.RC1_linux-"$(uname -i)".run
|
wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.3.RC1/Ascend-cann-toolkit_8.3.RC1_linux-"$(uname -i)".run
|
||||||
chmod +x ./Ascend-cann-toolkit_8.2.RC1_linux-"$(uname -i)".run
|
chmod +x ./Ascend-cann-toolkit_8.3.RC1_linux-"$(uname -i)".run
|
||||||
./Ascend-cann-toolkit_8.2.RC1_linux-"$(uname -i)".run --full
|
./Ascend-cann-toolkit_8.3.RC1_linux-"$(uname -i)".run --full
|
||||||
# https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C22B800TP052/Ascend-cann-kernels-910b_8.2.rc1_linux-aarch64.run
|
# https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/Milan-ASL/Milan-ASL%20V100R001C22B800TP052/Ascend-cann-kernels-910b_8.3.rc1_linux-aarch64.run
|
||||||
|
|
||||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh
|
source /usr/local/Ascend/ascend-toolkit/set_env.sh
|
||||||
wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.2.RC1/Ascend-cann-kernels-910b_8.2.RC1_linux-"$(uname -i)".run
|
wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.3.RC1/Ascend-cann-kernels-910b_8.3.RC1_linux-"$(uname -i)".run
|
||||||
chmod +x ./Ascend-cann-kernels-910b_8.2.RC1_linux-"$(uname -i)".run
|
chmod +x ./Ascend-cann-kernels-910b_8.3.RC1_linux-"$(uname -i)".run
|
||||||
./Ascend-cann-kernels-910b_8.2.RC1_linux-"$(uname -i)".run --install
|
./Ascend-cann-kernels-910b_8.3.RC1_linux-"$(uname -i)".run --install
|
||||||
|
|
||||||
wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.2.RC1/Ascend-cann-nnal_8.2.RC1_linux-"$(uname -i)".run
|
wget --header="Referer: https://www.hiascend.com/" https://ascend-repo.obs.cn-east-2.myhuaweicloud.com/CANN/CANN%208.3.RC1/Ascend-cann-nnal_8.3.RC1_linux-"$(uname -i)".run
|
||||||
chmod +x ./Ascend-cann-nnal_8.2.RC1_linux-"$(uname -i)".run
|
chmod +x ./Ascend-cann-nnal_8.3.RC1_linux-"$(uname -i)".run
|
||||||
./Ascend-cann-nnal_8.2.RC1_linux-"$(uname -i)".run --install
|
./Ascend-cann-nnal_8.3.RC1_linux-"$(uname -i)".run --install
|
||||||
|
|
||||||
source /usr/local/Ascend/nnal/atb/set_env.sh
|
source /usr/local/Ascend/nnal/atb/set_env.sh
|
||||||
```
|
```
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
* Software:
|
* Software:
|
||||||
* Python >= 3.9, < 3.12
|
* Python >= 3.9, < 3.12
|
||||||
* CANN >= 8.2.rc1
|
* CANN >= 8.3.rc1
|
||||||
* PyTorch == 2.7.1, torch-npu == 2.7.1
|
* PyTorch == 2.7.1, torch-npu == 2.7.1
|
||||||
* vLLM (same version as vllm-ascend)
|
* vLLM (same version as vllm-ascend)
|
||||||
* mooncake-transfer-engine reference documentation: https://github.com/kvcache-ai/Mooncake/blob/main/doc/zh/ascend_transport.md
|
* mooncake-transfer-engine reference documentation: https://github.com/kvcache-ai/Mooncake/blob/main/doc/zh/ascend_transport.md
|
||||||
|
|||||||
@@ -4,7 +4,7 @@
|
|||||||
|
|
||||||
* Software:
|
* Software:
|
||||||
* Python >= 3.9, < 3.12
|
* Python >= 3.9, < 3.12
|
||||||
* CANN >= 8.2.rc1
|
* CANN >= 8.3.rc1
|
||||||
* PyTorch == 2.7.1, torch-npu == 2.7.1
|
* PyTorch == 2.7.1, torch-npu == 2.7.1
|
||||||
* vLLM:main branch
|
* vLLM:main branch
|
||||||
* vLLM-Ascend:main branch
|
* vLLM-Ascend:main branch
|
||||||
|
|||||||
@@ -15,7 +15,7 @@ spec:
|
|||||||
spec:
|
spec:
|
||||||
containers:
|
containers:
|
||||||
- name: vllm-leader
|
- name: vllm-leader
|
||||||
image: {{ image | default("m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11") }}
|
image: {{ image | default("m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11") }}
|
||||||
env:
|
env:
|
||||||
- name: CONFIG_YAML_PATH
|
- name: CONFIG_YAML_PATH
|
||||||
value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
|
value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
|
||||||
@@ -75,7 +75,7 @@ spec:
|
|||||||
spec:
|
spec:
|
||||||
containers:
|
containers:
|
||||||
- name: vllm-worker
|
- name: vllm-worker
|
||||||
image: {{ image | default("m.daocloud.io/quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11") }}
|
image: {{ image | default("m.daocloud.io/quay.io/ascend/cann:8.3.rc1-a3-ubuntu22.04-py3.11") }}
|
||||||
env:
|
env:
|
||||||
- name: CONFIG_YAML_PATH
|
- name: CONFIG_YAML_PATH
|
||||||
value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
|
value: {{ config_file_path | default("DeepSeek-V3.yaml") }}
|
||||||
|
|||||||
@@ -1,2 +1,2 @@
|
|||||||
# Base docker image used to build the vllm-ascend e2e test image, which is built in the vLLM repository
|
# Base docker image used to build the vllm-ascend e2e test image, which is built in the vLLM repository
|
||||||
BASE_IMAGE_NAME="quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11"
|
BASE_IMAGE_NAME="quay.io/ascend/cann:8.3.rc1-910b-ubuntu22.04-py3.11"
|
||||||
|
|||||||
@@ -74,11 +74,10 @@ class TestAttentionMaskBuilder(TestBase):
|
|||||||
attn_mask = attention_mask_builder.get_attn_mask(
|
attn_mask = attention_mask_builder.get_attn_mask(
|
||||||
max_seq_len=2048, dtype=torch.float16, device=torch.device("cpu"))
|
max_seq_len=2048, dtype=torch.float16, device=torch.device("cpu"))
|
||||||
self.assertEqual(attn_mask.shape, (2048, 2048))
|
self.assertEqual(attn_mask.shape, (2048, 2048))
|
||||||
self.assertEqual(attn_mask[0][-1],
|
self.assertEqual(attn_mask[0][-1], torch.tensor(True))
|
||||||
torch.tensor(float("-inf"), dtype=torch.float16))
|
self.assertEqual(attention_mask_builder._seq_len_cached, 1024)
|
||||||
self.assertEqual(attention_mask_builder._seq_len_cached, 2048)
|
|
||||||
self.assertEqual(attention_mask_builder.attn_mask_cache.shape,
|
self.assertEqual(attention_mask_builder.attn_mask_cache.shape,
|
||||||
(2048, 2048))
|
(1024, 1024))
|
||||||
self.assertEqual(attention_mask_builder.attn_mask_cache[0][-1],
|
self.assertEqual(attention_mask_builder.attn_mask_cache[0][-1],
|
||||||
torch.tensor(float("-inf"), dtype=torch.float16))
|
torch.tensor(float("-inf"), dtype=torch.float16))
|
||||||
|
|
||||||
@@ -91,43 +90,5 @@ class TestAttentionMaskBuilder(TestBase):
|
|||||||
dtype=torch.float16,
|
dtype=torch.float16,
|
||||||
device=torch.device("cpu"),
|
device=torch.device("cpu"),
|
||||||
)
|
)
|
||||||
self.assertEqual(attn_mask.shape, (6, 100))
|
self.assertEqual(attn_mask.shape, (2048, 2048))
|
||||||
self.assertEqual(attention_mask_builder._seq_len_cached, 1024)
|
self.assertEqual(attention_mask_builder._seq_len_cached, 1024)
|
||||||
|
|
||||||
attn_mask = attention_mask_builder.get_splitfuse_attn_mask(
|
|
||||||
seq_lens=torch.tensor([10, 3000, 2000]),
|
|
||||||
position=torch.tensor([7, 8, 9, 2999, 1999]),
|
|
||||||
dtype=torch.float16,
|
|
||||||
device=torch.device("cpu"),
|
|
||||||
)
|
|
||||||
self.assertEqual(attn_mask.shape, (5, 3000))
|
|
||||||
self.assertEqual(attention_mask_builder._seq_len_cached, 3000)
|
|
||||||
|
|
||||||
# splitfuse_attn_mask now only supports data types: torch.float16 and torch.bfloat16
|
|
||||||
# otherwise raise ValueError
|
|
||||||
with self.assertRaises(ValueError):
|
|
||||||
attn_mask = attention_mask_builder.get_splitfuse_attn_mask(
|
|
||||||
seq_lens=torch.tensor([10, 20, 100]),
|
|
||||||
position=torch.tensor([7, 8, 9, 18, 19, 99]),
|
|
||||||
dtype=torch.int8,
|
|
||||||
device=torch.device("cpu"),
|
|
||||||
)
|
|
||||||
|
|
||||||
def test_mask_value_cleanliness(self):
|
|
||||||
attention_mask_builder = AttentionMaskBuilder(max_seq_len=6,
|
|
||||||
dtype=torch.bfloat16)
|
|
||||||
self.assertEqual(attention_mask_builder.attn_mask_cache[-2][-1],
|
|
||||||
torch.tensor(1, dtype=torch.bfloat16))
|
|
||||||
|
|
||||||
attn_mask = attention_mask_builder.get_splitfuse_attn_mask(
|
|
||||||
seq_lens=torch.tensor([6]),
|
|
||||||
position=torch.tensor([3, 4, 5]),
|
|
||||||
dtype=torch.bfloat16,
|
|
||||||
device=torch.device("cpu"),
|
|
||||||
)
|
|
||||||
self.assertEqual(
|
|
||||||
attn_mask[-2][-1],
|
|
||||||
torch.tensor(-10000, dtype=torch.bfloat16,
|
|
||||||
device=attn_mask.device))
|
|
||||||
self.assertEqual(attention_mask_builder.attn_mask_cache[-2][-1],
|
|
||||||
torch.tensor(1, dtype=torch.bfloat16))
|
|
||||||
|
|||||||
@@ -298,8 +298,9 @@ class TestAscendAttentionBackendImpl(TestBase):
|
|||||||
assert output.shape == (10, 8 * 64)
|
assert output.shape == (10, 8 * 64)
|
||||||
|
|
||||||
@patch('torch_npu._npu_reshape_and_cache')
|
@patch('torch_npu._npu_reshape_and_cache')
|
||||||
@patch('torch_npu._npu_flash_attention_qlens')
|
@patch('torch_npu.npu_fused_infer_attention_score')
|
||||||
def test_forward_prefill_cache_hit(self, mock_flash_attention_qlens,
|
def test_forward_prefill_cache_hit(self,
|
||||||
|
mock_npu_fused_infer_attention_score,
|
||||||
mock_npu_reshape_and_cache):
|
mock_npu_reshape_and_cache):
|
||||||
"""Test forward pass in PrefillCacheHit state"""
|
"""Test forward pass in PrefillCacheHit state"""
|
||||||
query = torch.randn(10, 8 * 64)
|
query = torch.randn(10, 8 * 64)
|
||||||
@@ -308,6 +309,8 @@ class TestAscendAttentionBackendImpl(TestBase):
|
|||||||
kv_cache = torch.empty(2, 5, 128, 8, 64)
|
kv_cache = torch.empty(2, 5, 128, 8, 64)
|
||||||
output = torch.empty_like(query)
|
output = torch.empty_like(query)
|
||||||
|
|
||||||
|
mock_npu_fused_infer_attention_score.return_value = (output, 1)
|
||||||
|
|
||||||
metadata = self.attn_metadata
|
metadata = self.attn_metadata
|
||||||
metadata.attn_state = AscendAttentionState.PrefillCacheHit
|
metadata.attn_state = AscendAttentionState.PrefillCacheHit
|
||||||
metadata.attn_mask = torch.randn(1, 1, 10, 10)
|
metadata.attn_mask = torch.randn(1, 1, 10, 10)
|
||||||
@@ -323,7 +326,7 @@ class TestAscendAttentionBackendImpl(TestBase):
|
|||||||
output = self.impl.forward(layer, query, key, value, kv_cache,
|
output = self.impl.forward(layer, query, key, value, kv_cache,
|
||||||
metadata, output)
|
metadata, output)
|
||||||
|
|
||||||
mock_flash_attention_qlens.assert_called_once()
|
mock_npu_fused_infer_attention_score.assert_called_once()
|
||||||
assert output.shape == (10, 8 * 64)
|
assert output.shape == (10, 8 * 64)
|
||||||
|
|
||||||
@patch('vllm_ascend.attention.attention_v1.get_forward_context')
|
@patch('vllm_ascend.attention.attention_v1.get_forward_context')
|
||||||
@@ -528,13 +531,11 @@ class TestAscendAttentionBackendImpl(TestBase):
|
|||||||
|
|
||||||
assert output.shape == (10, 8 * 64)
|
assert output.shape == (10, 8 * 64)
|
||||||
|
|
||||||
@patch('torch.version')
|
|
||||||
@patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False)
|
@patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False)
|
||||||
@patch('torch_npu._npu_reshape_and_cache')
|
@patch('torch_npu._npu_reshape_and_cache')
|
||||||
@patch('vllm_ascend.attention.attention_v1.vanilla_chunked_prefill')
|
@patch('vllm_ascend.attention.attention_v1.vanilla_chunked_prefill')
|
||||||
def test_forward_head_size_192(self, mock_vanilla_prefill,
|
def test_forward_head_size_192(self, mock_vanilla_prefill,
|
||||||
mock_npu_reshape_and_cache, mock_is_310p,
|
mock_npu_reshape_and_cache, mock_is_310p):
|
||||||
mock_version):
|
|
||||||
"""Test forward pass when head_size is 192"""
|
"""Test forward pass when head_size is 192"""
|
||||||
|
|
||||||
self.impl.head_size = 192
|
self.impl.head_size = 192
|
||||||
@@ -554,7 +555,6 @@ class TestAscendAttentionBackendImpl(TestBase):
|
|||||||
metadata.num_decodes = 10
|
metadata.num_decodes = 10
|
||||||
metadata.num_prefills = 0
|
metadata.num_prefills = 0
|
||||||
layer = self.layer_no_quant
|
layer = self.layer_no_quant
|
||||||
mock_version.cann = "8.4.RC1"
|
|
||||||
mock_vanilla_prefill.return_value = MagicMock()
|
mock_vanilla_prefill.return_value = MagicMock()
|
||||||
|
|
||||||
output = self.impl_192.forward(layer, query, key, value, kv_cache,
|
output = self.impl_192.forward(layer, query, key, value, kv_cache,
|
||||||
@@ -563,12 +563,11 @@ class TestAscendAttentionBackendImpl(TestBase):
|
|||||||
mock_vanilla_prefill.assert_called_once()
|
mock_vanilla_prefill.assert_called_once()
|
||||||
assert output.shape == (10, 8 * 192)
|
assert output.shape == (10, 8 * 192)
|
||||||
|
|
||||||
@patch('torch.version')
|
|
||||||
@patch('torch_npu._npu_reshape_and_cache')
|
@patch('torch_npu._npu_reshape_and_cache')
|
||||||
@patch('torch_npu._npu_paged_attention_splitfuse')
|
@patch('torch_npu.npu_fused_infer_attention_score')
|
||||||
def test_forward_normal_v1_situation(self, mock_paged_attention,
|
def test_forward_normal_v1_situation(self,
|
||||||
mock_npu_reshape_and_cache,
|
mock_npu_fused_infer_attention_score,
|
||||||
mock_version):
|
mock_npu_reshape_and_cache):
|
||||||
"""Test forward pass in normal V1 situation"""
|
"""Test forward pass in normal V1 situation"""
|
||||||
query = torch.randn(10, 8 * 64)
|
query = torch.randn(10, 8 * 64)
|
||||||
key = torch.randn(10, 8 * 64)
|
key = torch.randn(10, 8 * 64)
|
||||||
@@ -576,6 +575,8 @@ class TestAscendAttentionBackendImpl(TestBase):
|
|||||||
kv_cache = torch.empty(2, 5, 128, 8, 64)
|
kv_cache = torch.empty(2, 5, 128, 8, 64)
|
||||||
output = torch.empty_like(query)
|
output = torch.empty_like(query)
|
||||||
|
|
||||||
|
mock_npu_fused_infer_attention_score.return_value = (output, 1)
|
||||||
|
|
||||||
metadata = self.attn_metadata
|
metadata = self.attn_metadata
|
||||||
metadata.attn_mask = torch.randn(1, 1, 10, 10)
|
metadata.attn_mask = torch.randn(1, 1, 10, 10)
|
||||||
metadata.query_lens = torch.tensor([10])
|
metadata.query_lens = torch.tensor([10])
|
||||||
@@ -587,22 +588,20 @@ class TestAscendAttentionBackendImpl(TestBase):
|
|||||||
metadata.num_prefills = 10
|
metadata.num_prefills = 10
|
||||||
layer = self.layer_no_quant
|
layer = self.layer_no_quant
|
||||||
|
|
||||||
mock_version.cann = "8.4.RC1"
|
|
||||||
|
|
||||||
output = self.impl.forward(layer, query, key, value, kv_cache,
|
output = self.impl.forward(layer, query, key, value, kv_cache,
|
||||||
metadata, output)
|
metadata, output)
|
||||||
|
|
||||||
mock_paged_attention.assert_called_once()
|
mock_npu_fused_infer_attention_score.assert_called_once()
|
||||||
assert output.shape == (10, 8 * 64)
|
assert output.shape == (10, 8 * 64)
|
||||||
|
|
||||||
@patch('torch.version')
|
|
||||||
@patch('torch_npu.npu_format_cast')
|
@patch('torch_npu.npu_format_cast')
|
||||||
@patch('torch_npu._npu_reshape_and_cache')
|
@patch('torch_npu._npu_reshape_and_cache')
|
||||||
@patch('torch_npu._npu_paged_attention_splitfuse')
|
@patch('torch_npu.npu_fused_infer_attention_score')
|
||||||
@patch('vllm_ascend.attention.attention_v1.is_310p', return_value=True)
|
@patch('vllm_ascend.attention.attention_v1.is_310p', return_value=True)
|
||||||
def test_forward_310p_device(self, mock_is_310p, mock_paged_attention,
|
def test_forward_310p_device(self, mock_is_310p,
|
||||||
|
mock_npu_fused_infer_attention_score,
|
||||||
mock_npu_reshape_and_cache,
|
mock_npu_reshape_and_cache,
|
||||||
mock_npu_format_cast, mock_version):
|
mock_npu_format_cast):
|
||||||
"""Test forward pass on 310P device"""
|
"""Test forward pass on 310P device"""
|
||||||
query = torch.randn(10, 8 * 64)
|
query = torch.randn(10, 8 * 64)
|
||||||
key = torch.randn(10, 8 * 64)
|
key = torch.randn(10, 8 * 64)
|
||||||
@@ -610,6 +609,8 @@ class TestAscendAttentionBackendImpl(TestBase):
|
|||||||
kv_cache = torch.empty(2, 5, 128, 8, 64)
|
kv_cache = torch.empty(2, 5, 128, 8, 64)
|
||||||
output = torch.empty_like(query)
|
output = torch.empty_like(query)
|
||||||
|
|
||||||
|
mock_npu_fused_infer_attention_score.return_value = (output, 1)
|
||||||
|
|
||||||
metadata = self.attn_metadata
|
metadata = self.attn_metadata
|
||||||
metadata.attn_mask = torch.randn(1, 1, 10, 10)
|
metadata.attn_mask = torch.randn(1, 1, 10, 10)
|
||||||
metadata.query_lens = torch.tensor([10])
|
metadata.query_lens = torch.tensor([10])
|
||||||
@@ -622,12 +623,11 @@ class TestAscendAttentionBackendImpl(TestBase):
|
|||||||
layer = self.layer_no_quant
|
layer = self.layer_no_quant
|
||||||
|
|
||||||
mock_npu_format_cast.return_value = metadata.attn_mask
|
mock_npu_format_cast.return_value = metadata.attn_mask
|
||||||
mock_version.cann = "8.4.RC1"
|
|
||||||
|
|
||||||
output = self.impl.forward(layer, query, key, value, kv_cache,
|
output = self.impl.forward(layer, query, key, value, kv_cache,
|
||||||
metadata, output)
|
metadata, output)
|
||||||
|
|
||||||
mock_paged_attention.assert_called_once()
|
mock_npu_fused_infer_attention_score.assert_called_once()
|
||||||
assert output.shape == (10, 8 * 64)
|
assert output.shape == (10, 8 * 64)
|
||||||
|
|
||||||
@patch('torch_npu._npu_reshape_and_cache')
|
@patch('torch_npu._npu_reshape_and_cache')
|
||||||
|
|||||||
@@ -63,33 +63,20 @@ class TestAscendUnquantizedLinearMethod(TestBase):
|
|||||||
|
|
||||||
@mock.patch("vllm_ascend.ops.linear.is_enable_nz")
|
@mock.patch("vllm_ascend.ops.linear.is_enable_nz")
|
||||||
@mock.patch("torch_npu.npu_format_cast")
|
@mock.patch("torch_npu.npu_format_cast")
|
||||||
@mock.patch("torch.version")
|
def test_process_weights_after_loading_enable_nz(self, mock_format_cast,
|
||||||
def test_process_weights_after_loading_is_8_3_enable_nz(
|
mock_is_nz):
|
||||||
self, mock_version, mock_format_cast, mock_is_nz):
|
|
||||||
mock_version.cann = "8.3.RC1"
|
|
||||||
mock_is_nz.return_value = 1
|
mock_is_nz.return_value = 1
|
||||||
self.method.process_weights_after_loading(self.layer)
|
self.method.process_weights_after_loading(self.layer)
|
||||||
mock_format_cast.assert_called_once()
|
mock_format_cast.assert_called_once()
|
||||||
|
|
||||||
@mock.patch("vllm_ascend.ops.linear.is_enable_nz")
|
@mock.patch("vllm_ascend.ops.linear.is_enable_nz")
|
||||||
@mock.patch("torch_npu.npu_format_cast")
|
@mock.patch("torch_npu.npu_format_cast")
|
||||||
@mock.patch("torch.version")
|
def test_process_weights_after_loading_disable_nz(self, mock_format_cast,
|
||||||
def test_process_weights_after_loading_is_8_3_disable_nz(
|
mock_is_nz):
|
||||||
self, mock_version, mock_format_cast, mock_is_nz):
|
|
||||||
mock_version.cann = "8.3.RC1"
|
|
||||||
mock_is_nz.return_value = 0
|
mock_is_nz.return_value = 0
|
||||||
self.method.process_weights_after_loading(self.layer)
|
self.method.process_weights_after_loading(self.layer)
|
||||||
mock_format_cast.assert_not_called()
|
mock_format_cast.assert_not_called()
|
||||||
|
|
||||||
@mock.patch("vllm_ascend.ops.linear.is_enable_nz")
|
|
||||||
@mock.patch("torch.version")
|
|
||||||
def test_process_weights_after_loading_not_8_3(self, mock_version,
|
|
||||||
mock_is_nz):
|
|
||||||
mock_version.cann = "8.2.RC1"
|
|
||||||
mock_is_nz.return_value = 1
|
|
||||||
# Should not raise exception
|
|
||||||
self.method.process_weights_after_loading(self.layer)
|
|
||||||
|
|
||||||
|
|
||||||
class TestAscendRowParallelLinear(BaseLinearTest):
|
class TestAscendRowParallelLinear(BaseLinearTest):
|
||||||
|
|
||||||
|
|||||||
@@ -47,11 +47,10 @@ class AttentionMaskBuilder:
|
|||||||
self.attn_mask_cache = attn_mask
|
self.attn_mask_cache = attn_mask
|
||||||
self.device = device
|
self.device = device
|
||||||
self.pooling_mask = None
|
self.pooling_mask = None
|
||||||
if torch.version.cann.startswith("8.3"):
|
assigned_mask_dim = 2048
|
||||||
assigned_mask_dim = 2048
|
self.chunked_prefill_attn_mask = torch.triu(
|
||||||
self.chunked_prefill_attn_mask = torch.triu(
|
torch.ones(assigned_mask_dim, assigned_mask_dim),
|
||||||
torch.ones(assigned_mask_dim, assigned_mask_dim),
|
diagonal=1).to(torch.int8).to(device)
|
||||||
diagonal=1).to(torch.int8).to(device)
|
|
||||||
|
|
||||||
@staticmethod
|
@staticmethod
|
||||||
def get_mask_scale_factor(dtype: torch.dtype = torch.float16):
|
def get_mask_scale_factor(dtype: torch.dtype = torch.float16):
|
||||||
@@ -68,7 +67,7 @@ class AttentionMaskBuilder:
|
|||||||
|
|
||||||
def get_attn_mask(self, max_seq_len: int, dtype: torch.dtype,
|
def get_attn_mask(self, max_seq_len: int, dtype: torch.dtype,
|
||||||
device: torch.device):
|
device: torch.device):
|
||||||
if max_seq_len == 2048 and torch.version.cann.startswith("8.3"):
|
if max_seq_len == 2048:
|
||||||
return self.chunked_prefill_attn_mask.to(torch.bool)
|
return self.chunked_prefill_attn_mask.to(torch.bool)
|
||||||
self._update_attn_cache(max_seq_len, dtype)
|
self._update_attn_cache(max_seq_len, dtype)
|
||||||
return self.attn_mask_cache[:max_seq_len, :max_seq_len].contiguous(
|
return self.attn_mask_cache[:max_seq_len, :max_seq_len].contiguous(
|
||||||
@@ -89,23 +88,7 @@ class AttentionMaskBuilder:
|
|||||||
dtype: torch.dtype = None,
|
dtype: torch.dtype = None,
|
||||||
device: torch.device = None,
|
device: torch.device = None,
|
||||||
) -> torch.Tensor:
|
) -> torch.Tensor:
|
||||||
if torch.version.cann.startswith("8.3"):
|
return self.chunked_prefill_attn_mask
|
||||||
return self.chunked_prefill_attn_mask
|
|
||||||
else:
|
|
||||||
if dtype not in [torch.float16, torch.bfloat16]:
|
|
||||||
raise ValueError(
|
|
||||||
"splitfuse_attn_mask now only supports bf16 and fp16")
|
|
||||||
max_seq_len = max(seq_lens, default=0)
|
|
||||||
self._update_attn_cache(max_seq_len, dtype)
|
|
||||||
# FIXME: Currently the mask value of chunked-prefill situation and Prefill-Only situation
|
|
||||||
# is not the same. Fix this in the future when kernel is ready.
|
|
||||||
mask_scale_factor = AttentionMaskBuilder.get_mask_scale_factor(
|
|
||||||
dtype)
|
|
||||||
attn_mask = torch.index_select(self.attn_mask_cache,
|
|
||||||
dim=0,
|
|
||||||
index=position)[:, :max_seq_len]
|
|
||||||
attn_mask *= mask_scale_factor
|
|
||||||
return attn_mask.contiguous().to(device, non_blocking=True)
|
|
||||||
|
|
||||||
def _update_attn_cache(self, seqlen: int, dtype: torch.dtype):
|
def _update_attn_cache(self, seqlen: int, dtype: torch.dtype):
|
||||||
if seqlen > self._seq_len_cached:
|
if seqlen > self._seq_len_cached:
|
||||||
|
|||||||
@@ -500,7 +500,7 @@ class AscendAttentionBackendImpl(AttentionImpl):
|
|||||||
block_table = attn_metadata.block_tables[:batch_size, :]
|
block_table = attn_metadata.block_tables[:batch_size, :]
|
||||||
num_block, block_size, _, _ = self.key_cache.shape # type: ignore
|
num_block, block_size, _, _ = self.key_cache.shape # type: ignore
|
||||||
|
|
||||||
if torch.version.cann.startswith("8.3") and block_size == 128:
|
if block_size == 128:
|
||||||
# TODO:The npu_fused_infer_attention_score op is planned to
|
# TODO:The npu_fused_infer_attention_score op is planned to
|
||||||
# be utilized in a wider range in upcoming versions.
|
# be utilized in a wider range in upcoming versions.
|
||||||
key = self.key_cache.view( # type: ignore
|
key = self.key_cache.view( # type: ignore
|
||||||
@@ -680,43 +680,30 @@ class AscendAttentionBackendImpl(AttentionImpl):
|
|||||||
attn_metadata.seq_lens = \
|
attn_metadata.seq_lens = \
|
||||||
attn_metadata.seq_lens.to(device=query.device)
|
attn_metadata.seq_lens.to(device=query.device)
|
||||||
|
|
||||||
if torch.version.cann.startswith("8.3"):
|
# TODO:The npu_fused_infer_attention_score op is planned to
|
||||||
# TODO:The npu_fused_infer_attention_score op is planned to
|
# be utilized in a wider range in upcoming versions.
|
||||||
# be utilized in a wider range in upcoming versions.
|
num_block, block_size, _, _ = self.key_cache.shape # type: ignore
|
||||||
num_block, block_size, _, _ = self.key_cache.shape # type: ignore
|
key = self.key_cache.view( # type: ignore
|
||||||
key = self.key_cache.view( # type: ignore
|
num_block, block_size, -1)
|
||||||
num_block, block_size, -1)
|
value = self.value_cache.view( # type: ignore
|
||||||
value = self.value_cache.view( # type: ignore
|
num_block, block_size, -1)
|
||||||
num_block, block_size, -1)
|
|
||||||
|
output, _ = torch_npu.npu_fused_infer_attention_score(
|
||||||
|
query=query,
|
||||||
|
key=key,
|
||||||
|
value=value,
|
||||||
|
atten_mask=attn_metadata.attn_mask,
|
||||||
|
block_table=attn_metadata.block_tables,
|
||||||
|
input_layout="TND",
|
||||||
|
block_size=block_size,
|
||||||
|
actual_seq_lengths=attn_metadata.actual_seq_lengths_q,
|
||||||
|
actual_seq_lengths_kv=attn_metadata.seq_lens_list,
|
||||||
|
num_key_value_heads=self.num_kv_heads,
|
||||||
|
num_heads=self.num_heads,
|
||||||
|
scale=self.scale,
|
||||||
|
sparse_mode=3,
|
||||||
|
)
|
||||||
|
|
||||||
output, _ = torch_npu.npu_fused_infer_attention_score(
|
|
||||||
query=query,
|
|
||||||
key=key,
|
|
||||||
value=value,
|
|
||||||
atten_mask=attn_metadata.attn_mask,
|
|
||||||
block_table=attn_metadata.block_tables,
|
|
||||||
input_layout="TND",
|
|
||||||
block_size=block_size,
|
|
||||||
actual_seq_lengths=attn_metadata.actual_seq_lengths_q,
|
|
||||||
actual_seq_lengths_kv=attn_metadata.seq_lens_list,
|
|
||||||
num_key_value_heads=self.num_kv_heads,
|
|
||||||
num_heads=self.num_heads,
|
|
||||||
scale=self.scale,
|
|
||||||
sparse_mode=3,
|
|
||||||
)
|
|
||||||
else:
|
|
||||||
torch_npu._npu_paged_attention_splitfuse(
|
|
||||||
query=query,
|
|
||||||
key_cache=self.key_cache,
|
|
||||||
value_cache=self.value_cache,
|
|
||||||
mask=attn_metadata.attn_mask,
|
|
||||||
block_table=attn_metadata.block_tables,
|
|
||||||
seq_len=attn_metadata.query_lens,
|
|
||||||
context_lens=attn_metadata.seq_lens,
|
|
||||||
num_kv_heads=self.num_kv_heads,
|
|
||||||
num_heads=self.num_heads,
|
|
||||||
scale_value=self.scale,
|
|
||||||
out=output)
|
|
||||||
return output
|
return output
|
||||||
|
|
||||||
def _attention_with_nomask_and_mask(self, q: torch.Tensor,
|
def _attention_with_nomask_and_mask(self, q: torch.Tensor,
|
||||||
@@ -1155,12 +1142,11 @@ class AscendAttentionBackendImpl(AttentionImpl):
|
|||||||
query, attn_metadata, output)
|
query, attn_metadata, output)
|
||||||
# Normal V1 situation.
|
# Normal V1 situation.
|
||||||
else:
|
else:
|
||||||
if torch.version.cann.startswith("8.3"):
|
# npu_fused_infer_attention_score does not support cases
|
||||||
# npu_fused_infer_attention_score does not support cases
|
# where query.shape[0] != attn_metadata.query_start_loc[-1].
|
||||||
# where query.shape[0] != attn_metadata.query_start_loc[-1].
|
# Thus we need unpad it here.
|
||||||
# Thus we need unpad it here.
|
num_tokens = attn_metadata.query_start_loc[-1]
|
||||||
num_tokens = attn_metadata.query_start_loc[-1]
|
query = query[:num_tokens]
|
||||||
query = query[:num_tokens]
|
|
||||||
intermediate_output = self._forward_v1_style(
|
intermediate_output = self._forward_v1_style(
|
||||||
query, attn_metadata, output)
|
query, attn_metadata, output)
|
||||||
|
|
||||||
|
|||||||
@@ -45,8 +45,8 @@ class AscendUnquantizedLinearMethod(UnquantizedLinearMethod):
|
|||||||
|
|
||||||
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
|
def process_weights_after_loading(self, layer: torch.nn.Module) -> None:
|
||||||
super().process_weights_after_loading(layer)
|
super().process_weights_after_loading(layer)
|
||||||
if (is_enable_nz() and torch.version.cann.startswith("8.3") and
|
if (is_enable_nz() and layer.weight.data.dtype
|
||||||
layer.weight.data.dtype in [torch.float16, torch.bfloat16]):
|
in [torch.float16, torch.bfloat16]):
|
||||||
layer.weight.data = torch_npu.npu_format_cast(
|
layer.weight.data = torch_npu.npu_format_cast(
|
||||||
layer.weight.data, ACL_FORMAT_FRACTAL_NZ)
|
layer.weight.data, ACL_FORMAT_FRACTAL_NZ)
|
||||||
|
|
||||||
|
|||||||
@@ -411,9 +411,8 @@ class SequenceRowParallelOp(CustomRowParallelOp):
|
|||||||
quant_per_tensor)
|
quant_per_tensor)
|
||||||
|
|
||||||
# For unquant
|
# For unquant
|
||||||
if mmrs_fusion and isinstance(
|
if mmrs_fusion and isinstance(self.layer.quant_method,
|
||||||
self.layer.quant_method, UnquantizedLinearMethod
|
UnquantizedLinearMethod):
|
||||||
) and torch.version.cann.startswith("8.3"):
|
|
||||||
output = torch_npu.npu_mm_reduce_scatter_base(
|
output = torch_npu.npu_mm_reduce_scatter_base(
|
||||||
x,
|
x,
|
||||||
self.layer.weight.t(),
|
self.layer.weight.t(),
|
||||||
@@ -429,8 +428,7 @@ class SequenceRowParallelOp(CustomRowParallelOp):
|
|||||||
elif mmrs_fusion and (
|
elif mmrs_fusion and (
|
||||||
isinstance(self.layer.quant_method, AscendLinearMethod)
|
isinstance(self.layer.quant_method, AscendLinearMethod)
|
||||||
and isinstance(self.layer.quant_method.quant_method,
|
and isinstance(self.layer.quant_method.quant_method,
|
||||||
AscendW8A8LinearMethod)
|
AscendW8A8LinearMethod)):
|
||||||
) and torch.version.cann.startswith("8.3"):
|
|
||||||
if x.dtype != torch.int8:
|
if x.dtype != torch.int8:
|
||||||
x_quant = quant_per_tensor(
|
x_quant = quant_per_tensor(
|
||||||
x, self.layer.aclnn_input_scale_reciprocal,
|
x, self.layer.aclnn_input_scale_reciprocal,
|
||||||
|
|||||||
@@ -367,13 +367,10 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
use_sparse=self.use_sparse)
|
use_sparse=self.use_sparse)
|
||||||
if self.pcp_size > 1:
|
if self.pcp_size > 1:
|
||||||
self.attn_mask_builder = None
|
self.attn_mask_builder = None
|
||||||
elif torch.version.cann.startswith("8.3"):
|
else:
|
||||||
self.attn_mask_builder = AttentionMaskBuilder(
|
self.attn_mask_builder = AttentionMaskBuilder(
|
||||||
self.scheduler_config.max_num_batched_tokens, self.dtype,
|
self.scheduler_config.max_num_batched_tokens, self.dtype,
|
||||||
self.device)
|
self.device)
|
||||||
else:
|
|
||||||
self.attn_mask_builder = AttentionMaskBuilder(
|
|
||||||
self.model_config.max_model_len, self.dtype)
|
|
||||||
|
|
||||||
self._set_up_drafter()
|
self._set_up_drafter()
|
||||||
|
|
||||||
@@ -988,11 +985,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
max_seq_len = max(seq_lens.max().item(), 0)
|
max_seq_len = max(seq_lens.max().item(), 0)
|
||||||
return self.attn_mask_builder.get_attn_mask(
|
return self.attn_mask_builder.get_attn_mask(
|
||||||
max_seq_len, self.dtype, self.device)
|
max_seq_len, self.dtype, self.device)
|
||||||
elif torch.version.cann.startswith("8.3"):
|
|
||||||
return self.attn_mask_builder.get_splitfuse_attn_mask()
|
|
||||||
else:
|
else:
|
||||||
return self.attn_mask_builder.get_splitfuse_attn_mask(
|
return self.attn_mask_builder.get_splitfuse_attn_mask()
|
||||||
seq_lens, position, self.dtype, self.device)
|
|
||||||
|
|
||||||
# Prefill without cache situation.
|
# Prefill without cache situation.
|
||||||
elif attn_state == AscendAttentionState.PrefillNoCache:
|
elif attn_state == AscendAttentionState.PrefillNoCache:
|
||||||
@@ -1001,12 +995,8 @@ class NPUModelRunner(LoRAModelRunnerMixin):
|
|||||||
max_seq_len, self.dtype, self.device)
|
max_seq_len, self.dtype, self.device)
|
||||||
# Prefill with cache hit.
|
# Prefill with cache hit.
|
||||||
elif attn_state == AscendAttentionState.PrefillCacheHit:
|
elif attn_state == AscendAttentionState.PrefillCacheHit:
|
||||||
if torch.version.cann.startswith("8.3"):
|
return self.attn_mask_builder.get_attn_mask(
|
||||||
return self.attn_mask_builder.get_attn_mask(
|
2048, self.dtype, self.device)
|
||||||
2048, self.dtype, self.device)
|
|
||||||
else:
|
|
||||||
return self.attn_mask_builder.get_attn_mask(
|
|
||||||
128, self.dtype, self.device)
|
|
||||||
# Decode-only situation.
|
# Decode-only situation.
|
||||||
else:
|
else:
|
||||||
return None
|
return None
|
||||||
|
|||||||
Reference in New Issue
Block a user