Merge pull request 'v0.11.0rc0' (#1) from v0.11.0rc0 into main

Reviewed-on: http://git.modelhub.org.cn:980/EngineX-Ascend/enginex-ascend-910-vllm/pulls/1
2025-10-21 10:18:25 +08:00
278 changed files with 28131 additions and 11709 deletions


@@ -15,7 +15,7 @@
# This file is a part of the vllm-ascend project.
#
FROM git.modelhub.org.cn:9443/enginex-ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11
FROM quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.10.1.1
ARG VLLM_TAG=v0.11.0rc3
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# On x86, triton is installed by vllm. But on Ascend, triton doesn't work correctly, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
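For a local check of this version bump, the pinned tag can be overridden at build time through the Dockerfile's existing build arguments; a minimal sketch (the image name `vllm-ascend:dev` is an arbitrary local tag):

```bash
# Build with the new default tag, or override VLLM_TAG to test another vLLM release
docker build -f Dockerfile \
  --build-arg VLLM_TAG=v0.11.0rc3 \
  --build-arg COMPILE_CUSTOM_KERNELS=1 \
  -t vllm-ascend:dev .
```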

Dockerfile.310p (new file)

@@ -0,0 +1,61 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.2.rc1-310p-ubuntu22.04-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0rc3
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# On x86, triton is installed by vllm. But on Ascend, triton doesn't work correctly, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export SOC_VERSION=ASCEND310P3 && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

Dockerfile.310p.openEuler (new file)

@@ -0,0 +1,59 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.2.rc1-310p-openeuler24.03-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN yum update -y && \
yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
rm -rf /var/cache/yum
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0rc3
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# On x86, triton is installed by vllm. But on Ascend, triton doesn't work correctly, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
export SOC_VERSION=ASCEND310P3 && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

Dockerfile.a3 (new file)

@@ -0,0 +1,60 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
# Define environments
ENV DEBIAN_FRONTEND=noninteractive
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN apt-get update -y && \
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
rm -rf /var/cache/apt/* && \
rm -rf /var/lib/apt/lists/*
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
RUN pip config set global.index-url ${PIP_INDEX_URL}
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0rc3
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# On x86, triton is installed by vllm. But on Ascend, triton doesn't work correctly, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

Dockerfile.a3.openEuler (new file)

@@ -0,0 +1,58 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.2.rc1-a3-openeuler24.03-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN yum update -y && \
yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
rm -rf /var/cache/yum
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0rc3
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# On x86, triton is installed by vllm. But on Ascend, triton doesn't work correctly, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]

Dockerfile.openEuler (new file)

@@ -0,0 +1,58 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
FROM quay.io/ascend/cann:8.2.rc1-910b-openeuler24.03-py3.11
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
ARG COMPILE_CUSTOM_KERNELS=1
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
RUN yum update -y && \
yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
rm -rf /var/cache/yum
RUN pip config set global.index-url ${PIP_INDEX_URL}
WORKDIR /workspace
COPY . /vllm-workspace/vllm-ascend/
# Install vLLM
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
ARG VLLM_TAG=v0.11.0rc3
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
# On x86, triton is installed by vllm. But on Ascend, triton doesn't work correctly, so we need to uninstall it.
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip uninstall -y triton && \
python3 -m pip cache purge
# Install vllm-ascend
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
source /usr/local/Ascend/nnal/atb/set_env.sh && \
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
python3 -m pip cache purge
# Install modelscope (for fast download) and ray (for multinode)
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
python3 -m pip cache purge
CMD ["/bin/bash"]


@@ -4,7 +4,7 @@
## Images
Latest RC Version: git.modelhub.org.cn:9443/enginex-ascend/vllm-ascend:v0.10.0rc1
Latest RC Version: git.modelhub.org.cn:9443/enginex-ascend/vllm-ascend:v0.11.0rc0
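To try the RC image, a pull sketch (assumes access to the private registry above):

```bash
docker pull git.modelhub.org.cn:9443/enginex-ascend/vllm-ascend:v0.11.0rc0
```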
## Overview
@@ -77,5 +77,5 @@ curl -X POST http://localhost:10086/v1/chat/completions \
| Version | Release type | Doc |
|------------|--------------|--------------------------------------|
|v0.10.1rc1| Latest RC version |See [QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation Guide](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details|
|v0.9.1| Latest official/stable version |See [QuickStart](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [Installation Guide](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html) for more details|
|v0.11.0rc0| Latest RC version |See [QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation Guide](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details|
|v0.9.1| Latest official/stable version |See [QuickStart](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [Installation Guide](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html) for more details|


@@ -42,7 +42,7 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l
- OS: Linux
- Software:
* Python >= 3.9, < 3.12
* CANN >= 8.2.rc1
* CANN >= 8.2.rc1 (Ascend HDK version refers to [here](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html))
* PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724
* vLLM (the same version as vllm-ascend)
@@ -52,7 +52,7 @@ Please use the following recommended versions to get started quickly:
| Version | Release type | Doc |
|------------|--------------|--------------------------------------|
|v0.10.1rc1|Latest release candidate|[QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details|
|v0.11.0rc0|Latest release candidate|[QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details|
|v0.9.1|Latest stable version|[QuickStart](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html) for more details|
## Contributing
@@ -73,7 +73,7 @@ Below is maintained branches:
| Branch | Status | Note |
|------------|--------------|--------------------------------------|
| main | Maintained | CI commitment for vLLM main branch and vLLM 0.10.x branch |
| main | Maintained | CI commitment for vLLM main branch and vLLM v0.11.0 tag |
| v0.7.1-dev | Unmaintained | Only doc fixes are allowed |
| v0.7.3-dev | Maintained | CI commitment for vLLM 0.7.3 version; only bug fixes are allowed and no new release tags any more. |
| v0.9.1-dev | Maintained | CI commitment for vLLM 0.9.1 version |


@@ -43,7 +43,7 @@ The vLLM Ascend plugin (`vllm-ascend`) is a community-maintained project that lets vLLM run on Ascend NPU
- OS: Linux
- Software:
* Python >= 3.9, < 3.12
* CANN >= 8.2.rc1
* CANN >= 8.2.rc1 (see [here](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html) for the matching Ascend HDK version)
* PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724
* vLLM (same version as vllm-ascend)
@@ -53,7 +53,7 @@ The vLLM Ascend plugin (`vllm-ascend`) is a community-maintained project that lets vLLM run on Ascend NPU
| Version | Release type | Doc |
|------------|--------------|--------------------------------------|
|v0.10.1rc1| Latest RC version |See [QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation Guide](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details|
|v0.11.0rc0| Latest RC version |See [QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation Guide](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details|
|v0.9.1| Latest official/stable version |See [QuickStart](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [Installation Guide](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html) for more details|
## Contributing
@@ -73,7 +73,7 @@ vllm-ascend has a main branch and dev branches.
| Branch | Status | Note |
|------------|------------|---------------------|
| main | Maintained | CI maintenance based on the vLLM main branch |
| main | Maintained | CI maintenance based on the vLLM main branch and the latest vLLM release v0.11.0 |
| v0.7.1-dev | Unmaintained | Only doc fixes are allowed |
| v0.7.3-dev | Maintained | CI maintenance for vLLM v0.7.3; only bug fixes are allowed and no new release tags will be cut |
| v0.9.1-dev | Maintained | CI maintenance for vLLM v0.9.1 |


@@ -112,7 +112,7 @@ def test_get_masked_input_and_mask(
# Define custom function
def custom_fn():
return torch.ops._C.get_masked_input_and_mask(
return torch.ops._C_ascend.get_masked_input_and_mask(
input_tensor,
test_case["org_start"],
test_case["org_end"],


@@ -78,7 +78,9 @@ kill_npu_processes() {
ps -aux
lsof -t -i:8000 | xargs -r kill -9
pgrep python3 | xargs -r kill -9
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
pgrep VLLM | xargs -r kill -9
sleep 4
rm -rf ~/.config/vllm
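A quick hedged check after the cleanup, reusing the same tools as above; all three commands should print nothing if the kill succeeded:

```bash
pgrep -a VLLM      # any process renamed with the VLLM prefix
pgrep -a python3   # any plain python3 worker
lsof -t -i:8000    # anything still bound to the serving port
```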


@@ -23,7 +23,8 @@
"hf_split": "train",
"endpoint": "/v1/chat/completions",
"dataset_path": "lmarena-ai/vision-arena-bench-v0.1",
"num_prompts": 200
"num_prompts": 200,
"no_stream": ""
}
},
{


@@ -20,7 +20,6 @@
#include <torch_npu/csrc/core/npu/NPUStream.h>
#include <torch_npu/csrc/framework/OpCommand.h>
#include <torch_npu/csrc/npu/Module.h>
#include <pybind11/pybind11.h>
#include "acl/acl.h"
#include "ops.h"
#include "utils.h"
@@ -142,7 +141,7 @@ std::tuple<at::Tensor, at::Tensor> get_masked_input_and_mask(
TP2, rank 1:
|< -----------BASE----------- >|< -BASE PADDING- >|< -----------LORA PADDING----------- >|
corresponding token_id: | 512 | 513 | 514 | ... | 1009 | -1 | ... | -1 | -1 | ... | -1 | -1 | ... | -1 |
index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 519 | 520 | ... | 543 |
index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 519 | 520 | ... | 543 |
Parameters:
org_vocab_start_index //base embeddings start
org_vocab_end_index //base embeddings end
@@ -165,22 +164,22 @@ std::tuple<at::Tensor, at::Tensor> get_masked_input_and_mask(
// Create output tensors
at::Tensor masked_input = at::empty_like(input);
at::Tensor mask = at::empty_like(input).to(at::kBool);
// Get data pointers
void *input_ptr = input.data_ptr();
void *masked_input_ptr = masked_input.data_ptr();
void *mask_ptr = mask.data_ptr();
// Get current stream
aclrtStream stream = c10_npu::getCurrentNPUStream().stream();
// Get scalar type
at::ScalarType scalar_type = input.scalar_type();
// Create and configure OpCommand
at_npu::native::OpCommand cmd;
cmd.Name("get_masked_input_and_mask");
cmd.SetCustomHandler([scalar_type, size, stream,
cmd.SetCustomHandler([scalar_type, size, stream,
input_ptr, masked_input_ptr, mask_ptr,
org_vocab_start_index, org_vocab_end_index,
num_org_vocab_padding, added_vocab_start_index,
@@ -194,7 +193,7 @@ std::tuple<at::Tensor, at::Tensor> get_masked_input_and_mask(
get_masked_input_and_mask_impl(
stream,
input_ptr,
masked_input_ptr,
masked_input_ptr,
mask_ptr,
org_vocab_start_index,
org_vocab_end_index,
@@ -204,7 +203,7 @@ std::tuple<at::Tensor, at::Tensor> get_masked_input_and_mask(
size,
loop_cnt,
aiv_num);
return 0;
});
cmd.Run();
@@ -321,8 +320,8 @@ void sgmv_shrink(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indices, at
aclrtStream stream = c10_npu::getCurrentNPUStream().stream();
at_npu::native::OpCommand cmd;
cmd.Name("sgmv_shrink");
cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size,
seq_len_ptr, seq_len_size, y_ptr,
cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size,
seq_len_ptr, seq_len_size, y_ptr,
batch_size, input_hidden_token, lora_rank, scale_f]() -> int {
auto dtype = get_dtype_from_torch(scalar_type);
int device_id = 0;
@@ -331,7 +330,7 @@ void sgmv_shrink(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indices, at
int num_tokens_per_core = (batch_size + aiv_num - 1) / aiv_num;
TORCH_CHECK("num_tokens_per_core != 0", "num_tokens_per_core should not be 0");
sgmv_shrink_impl(dtype, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr, seq_len_size,
y_ptr, batch_size,
y_ptr, batch_size,
num_tokens_per_core, input_hidden_token, lora_rank, scale_f);
return 0;
});
@@ -368,7 +367,7 @@ at::Tensor sgmv_expand(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indic
aclrtStream stream = c10_npu::getCurrentNPUStream().stream();
at_npu::native::OpCommand cmd;
cmd.Name("sgmv_expand");
cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr, seq_len_size, y_ptr, y_out_ptr,
cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr, seq_len_size, y_ptr, y_out_ptr,
batch_size, lora_rank, slice_offset, slice_size, output_full_dim]() -> int {
auto dtype = get_dtype_from_torch(scalar_type);
int device_id = 0;
@@ -376,7 +375,7 @@ at::Tensor sgmv_expand(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indic
TORCH_CHECK(aclGetDeviceCapability(device_id, ACL_DEVICE_INFO_VECTOR_CORE_NUM, &aiv_num) == ACL_SUCCESS);
int num_tokens_per_core = (batch_size + aiv_num - 1) / aiv_num;
TORCH_CHECK("num_tokens_per_core != 0", "num_tokens_per_core should not be 0");
sgmv_expand_impl(dtype, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr, seq_len_size, y_ptr, y_out_ptr,
sgmv_expand_impl(dtype, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr, seq_len_size, y_ptr, y_out_ptr,
batch_size, num_tokens_per_core, lora_rank, slice_size, slice_offset, output_full_dim);
return 0;
});
@@ -385,7 +384,7 @@ at::Tensor sgmv_expand(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indic
}
} // namespace vllm_ascend
TORCH_LIBRARY_EXPAND(_C, ops)
TORCH_LIBRARY_EXPAND(CONCAT(_C, _ascend), ops)
{
// vLLM-Ascend custom ops
ops.def("weak_ref_tensor(Tensor input) -> Tensor");
@@ -424,5 +423,3 @@ TORCH_LIBRARY_EXPAND(_C, ops)
" int slice_offset, int slice_size) -> Tensor");
ops.impl("sgmv_expand", torch::kPrivateUse1, &vllm_ascend::sgmv_expand);
}
REGISTER_EXTENSION(_C)


@@ -40,7 +40,7 @@ std::tuple<at::Tensor, at::Tensor> rotary_embedding_meta(
at::Tensor &positions,
at::Tensor &query,
at::Tensor &key,
int64_t head_size,
int64_t head_size,
at::Tensor &cos_sin_cache,
bool is_neox) {
auto num_tokens = positions.sym_numel();
@@ -86,9 +86,9 @@ at::Tensor sgmv_expand_meta(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_
} // namespace vllm_ascend
namespace {
// Register the meta implementations of the custom kernels for symbolic tracing, this will also
// Register the meta implementations of the custom kernels for symbolic tracing, this will also
// the custom kernel been captured into aclgraph
TORCH_LIBRARY_IMPL_EXPAND(_C, Meta, ops) {
TORCH_LIBRARY_IMPL_EXPAND(CONCAT(_C, _ascend), Meta, ops) {
// Rotary embedding meta implementation
ops.impl("rotary_embedding", &vllm_ascend::meta::rotary_embedding_meta);
// Masked input and mask meta implementation
@@ -99,4 +99,4 @@ namespace {
ops.impl("sgmv_expand", &vllm_ascend::meta::sgmv_expand_meta);
}
}
}


@@ -22,6 +22,8 @@ Following is the Release Compatibility Matrix for vLLM Ascend Plugin:
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | MindIE Turbo |
|-------------|--------------|------------------|-------------|--------------------|--------------|
| v0.11.0rc0 | v0.11.0rc3 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | |
| v0.10.2rc1 | v0.10.2 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | |
| v0.10.1rc1 | v0.10.1/v0.10.1.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | |
| v0.10.0rc1 | v0.10.0 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | |
| v0.9.2rc1 | v0.9.2 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1.post1.dev20250619 | |
@@ -42,6 +44,8 @@ Following is the Release Compatibility Matrix for vLLM Ascend Plugin:
| Date | Event |
|------------|-------------------------------------------|
| 2025.09.30 | Release candidates, v0.11.0rc0 |
| 2025.09.16 | Release candidates, v0.10.2rc1 |
| 2025.09.04 | Release candidates, v0.10.1rc1 |
| 2025.09.03 | v0.9.1 Final release |
| 2025.08.22 | Release candidates, v0.9.1rc3 |
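Given the new v0.11.0rc0 row in the compatibility matrix above, a minimal install sketch for the matched pair (the exact pip pins are taken from the `conf.py` hunk later in this change):

```bash
pip install vllm==0.11.0 vllm-ascend==0.11.0rc0
```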


@@ -65,19 +65,19 @@ myst_substitutions = {
# the branch of vllm, used in vllm clone
# - main branch: 'main'
# - vX.Y.Z branch: 'vX.Y.Z'
'vllm_version': 'v0.10.1.1',
'vllm_version': 'v0.11.0rc3',
# the branch of vllm-ascend, used in vllm-ascend clone and image tag
# - main branch: 'main'
# - vX.Y.Z branch: latest vllm-ascend release tag
'vllm_ascend_version': 'v0.10.1rc1',
'vllm_ascend_version': 'v0.11.0rc0',
# the newest release version of vllm-ascend and matched vLLM, used in pip install.
# This value should be updated when a release is cut.
'pip_vllm_ascend_version': "0.10.1rc1",
'pip_vllm_version': "0.10.1.1",
'pip_vllm_ascend_version': "0.11.0rc0",
'pip_vllm_version': "0.11.0",
# CANN image tag
'cann_image_tag': "8.2.rc1-910b-ubuntu22.04-py3.11",
# vllm version in ci
'ci_vllm_version': 'v0.10.1.1',
'ci_vllm_version': 'v0.11.0rc3',
}
# Add any paths that contain templates here, relative to this directory.


@@ -0,0 +1,20 @@
# deepseek-ai/DeepSeek-V2-Lite
- **vLLM Version**: vLLM: 0.10.1.1 ([1da94e6](https://github.com/vllm-project/vllm/commit/1da94e6)), **vLLM Ascend Version**: v0.10.1rc1 ([7e16b4a](https://github.com/vllm-project/vllm-ascend/commit/7e16b4a))
- **Software Environment**: **CANN**: 8.2.RC1, **PyTorch**: 2.7.1, **torch-npu**: 2.7.1.dev20250724
- **Hardware Environment**: Atlas A2 Series
- **Parallel mode**: TP2
- **Execution mode**: ACLGraph
**Command**:
```bash
export MODEL_ARGS='pretrained=deepseek-ai/DeepSeek-V2-Lite,tensor_parallel_size=2,dtype=auto,trust_remote_code=True,max_model_len=4096,enforce_eager=True'
lm_eval --model vllm --model_args $MODEL_ARGS --tasks gsm8k \
--batch_size auto
```
| Task | Metric | Value | Stderr |
|-----------------------|-------------|----------:|-------:|
| gsm8k | exact_match,strict-match | ✅0.3813 | ± 0.0134 |
| gsm8k | exact_match,flexible-extract | ✅0.3836 | ± 0.0134 |


@@ -0,0 +1,19 @@
# Qwen/Qwen2.5-VL-7B-Instruct
- **vLLM Version**: vLLM: 0.10.1.1 ([1da94e6](https://github.com/vllm-project/vllm/commit/1da94e6)), **vLLM Ascend Version**: v0.10.1rc1 ([7e16b4a](https://github.com/vllm-project/vllm-ascend/commit/7e16b4a))
- **Software Environment**: **CANN**: 8.2.RC1, **PyTorch**: 2.7.1, **torch-npu**: 2.7.1.dev20250724
- **Hardware Environment**: Atlas A2 Series
- **Parallel mode**: TP1
- **Execution mode**: ACLGraph
**Command**:
```bash
export MODEL_ARGS='pretrained=Qwen/Qwen2.5-VL-7B-Instruct,tensor_parallel_size=1,dtype=auto,trust_remote_code=False,max_model_len=8192'
lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks mmmu_val \
--apply_chat_template True --fewshot_as_multiturn True --batch_size auto
```
| Task | Metric | Value | Stderr |
|-----------------------|-------------|----------:|-------:|
| mmmu_val | acc,none | ✅0.52 | ± 0.0162 |


@@ -0,0 +1,21 @@
# Qwen/Qwen3-30B-A3B
- **vLLM Version**: vLLM: 0.10.1.1 ([1da94e6](https://github.com/vllm-project/vllm/commit/1da94e6)), **vLLM Ascend Version**: v0.10.1rc1 ([7e16b4a](https://github.com/vllm-project/vllm-ascend/commit/7e16b4a))
- **Software Environment**: **CANN**: 8.2.RC1, **PyTorch**: 2.7.1, **torch-npu**: 2.7.1.dev20250724
- **Hardware Environment**: Atlas A2 Series
- **Parallel mode**: TP2 + EP
- **Execution mode**: ACLGraph
**Command**:
```bash
export MODEL_ARGS='pretrained=Qwen/Qwen3-30B-A3B,tensor_parallel_size=2,dtype=auto,trust_remote_code=False,max_model_len=4096,gpu_memory_utilization=0.6,enable_expert_parallel=True'
lm_eval --model vllm --model_args $MODEL_ARGS --tasks gsm8k,ceval-valid \
--num_fewshot 5 --batch_size auto
```
| Task | Metric | Value | Stderr |
|-----------------------|-------------|----------:|-------:|
| gsm8k | exact_match,strict-match | ✅0.8923 | ± 0.0085 |
| gsm8k | exact_match,flexible-extract | ✅0.8506 | ± 0.0098 |
| ceval-valid | acc,none | ✅0.8358 | ± 0.0099 |


@@ -0,0 +1,21 @@
# Qwen/Qwen3-8B-Base
- **vLLM Version**: vLLM: 0.10.1.1 ([1da94e6](https://github.com/vllm-project/vllm/commit/1da94e6)), **vLLM Ascend Version**: v0.10.1rc1 ([7e16b4a](https://github.com/vllm-project/vllm-ascend/commit/7e16b4a))
- **Software Environment**: **CANN**: 8.2.RC1, **PyTorch**: 2.7.1, **torch-npu**: 2.7.1.dev20250724
- **Hardware Environment**: Atlas A2 Series
- **Parallel mode**: TP1
- **Execution mode**: ACLGraph
**Command**:
```bash
export MODEL_ARGS='pretrained=Qwen/Qwen3-8B-Base,tensor_parallel_size=1,dtype=auto,trust_remote_code=False,max_model_len=4096'
lm_eval --model vllm --model_args $MODEL_ARGS --tasks gsm8k,ceval-valid \
--apply_chat_template True --fewshot_as_multiturn True --num_fewshot 5 --batch_size auto
```
| Task | Metric | Value | Stderr |
|-----------------------|-------------|----------:|-------:|
| gsm8k | exact_match,strict-match | ✅0.8271 | ± 0.0104 |
| gsm8k | exact_match,flexible-extract | ✅0.8294 | ± 0.0104 |
| ceval-valid | acc,none | ✅0.815 | ± 0.0103 |


@@ -3,4 +3,8 @@
:::{toctree}
:caption: Accuracy Report
:maxdepth: 1
DeepSeek-V2-Lite
Qwen2.5-VL-7B-Instruct
Qwen3-30B-A3B
Qwen3-8B-Base
:::


@@ -61,7 +61,6 @@ from torch import nn
from vllm.attention import Attention
from vllm.config import VllmConfig
from vllm.sequence import IntermediateTensors
from vllm.model_executor.sampling_metadata import SamplingMetadata
class CustomAttention(nn.Module):
def __init__(self, vllm_config: VllmConfig, prefix: str):


@@ -3,7 +3,7 @@
## Version Specific FAQs
- [[v0.9.1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/2643)
- [[v0.10.1rc1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/2630)
- [[v0.11.0rc1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/3222)
## General FAQs
@@ -196,3 +196,21 @@ export ATB_LLM_LCOC_ENABLE=0
### 19. How to fix the error "ImportError: Please install vllm[audio] for audio support" for Qwen2.5-Omni model
The `Qwen2.5-Omni` model requires the `librosa` package. Install the `qwen-omni-utils` package to ensure all dependencies are met: `pip install qwen-omni-utils`.
This package pulls in `librosa` and its related dependencies, resolving the `ImportError: No module named 'librosa'` issue and ensuring audio processing works correctly.
### 20. How to troubleshoot and resolve size capture failures resulting from stream resource exhaustion, and what are the underlying causes?
```
error example in detail:
ERROR 09-26 10:48:07 [model_runner_v1.py:3029] ACLgraph sizes capture fail: RuntimeError:
ERROR 09-26 10:48:07 [model_runner_v1.py:3029] ACLgraph has insufficient available streams to capture the configured number of sizes.Please verify both the availability of adequate streams and the appropriateness of the configured size count.
```
Recommended mitigation strategies:
1. Manually configure the compilation_config parameter with a reduced size set: '{"cudagraph_capture_sizes":[size1, size2, size3, ...]}' (see the sketch after this list).
2. Employ ACLgraph's full graph mode as an alternative to the piece-wise approach.
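A sketch of the first mitigation via vLLM's `--compilation-config` flag; the model and size list below are illustrative, not a recommendation:

```bash
vllm serve Qwen/Qwen3-30B-A3B \
  --compilation-config '{"cudagraph_capture_sizes": [1, 2, 4, 8]}'
```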
Root cause analysis:
The current stream requirement calculation for size captures only accounts for measurable factors including: data parallel size, tensor parallel size, expert parallel configuration, piece graph count, multistream overlap shared expert settings, and HCCL communication mode (AIV/AICPU). However, numerous unquantifiable elements - such as operator characteristics and specific hardware features - consume additional streams outside of this calculation framework, resulting in stream resource exhaustion during size capture operations.
### 21. Installing vllm-ascend will overwrite the existing torch-npu package
Installing vllm-ascend will overwrite the existing torch-npu package. If you need to install a specific version of torch-npu, you can manually install the specified version of torch-npu after installing vllm-ascend.
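A sketch of the workaround, re-pinning torch-npu after the install (the version below is the one referenced elsewhere in these docs; adjust it to the version you actually need):

```bash
pip install vllm-ascend
# vllm-ascend may have replaced torch-npu; re-pin the version you want
pip install torch-npu==2.7.1.dev20250724
```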


@@ -11,6 +11,7 @@ This document describes how to install vllm-ascend manually.
| Software | Supported version | Note |
|---------------|----------------------------------|-------------------------------------------|
| Ascend HDK | Refer to [here](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html) | Required for CANN |
| CANN | >= 8.2.RC1 | Required for vllm-ascend and torch-npu |
| torch-npu | >= 2.7.1.dev20250724 | Required for vllm-ascend. No need to install it manually; it will be installed automatically in the steps below |
| torch | >= 2.7.1 | Required for torch-npu and vllm |
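A quick way to check what is currently installed (a sketch; `npu-smi` becomes available once the Ascend driver is installed):

```bash
# Python-side package versions
pip show torch torch-npu vllm vllm-ascend | grep -E '^(Name|Version)'
# Driver, firmware and NPU health
npu-smi info
```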


@@ -148,10 +148,6 @@ msgid ""
" to be passed in."
msgstr "在为MOE模型使用专家负载均衡时需要传入专家映射路径。"
#: ../../user_guide/configuration/additional_config.md
msgid "`chunked_prefill_for_mla`"
msgstr "`chunked_prefill_for_mla`"
#: ../../user_guide/configuration/additional_config.md
msgid "`False`"
msgstr "`False`"
@@ -199,8 +195,8 @@ msgid ""
msgstr "是否将MLA的向量操作放到另一个流中。此选项仅对使用MLA的模型例如DeepSeek有效。"
#: ../../user_guide/configuration/additional_config.md
msgid "`enable_multistream_moe`"
msgstr "`enable_multistream_moe`"
msgid "`multistream_overlap_shared_expert`"
msgstr "`multistream_overlap_shared_expert`"
#: ../../user_guide/configuration/additional_config.md
msgid ""


@@ -8,6 +8,7 @@ single_npu_multimodal
single_npu_audio
single_npu_qwen3_embedding
single_npu_qwen3_quantization
multi_npu_qwen3_next
multi_npu
multi_npu_moge
multi_npu_qwen3_moe
@@ -15,4 +16,7 @@ multi_npu_quantization
single_node_300i
multi_node
multi_node_kimi
multi_node_qwen3vl
multi_node_pd_disaggregation
multi_node_ray
:::


@@ -0,0 +1,244 @@
# Prefill-Decode Disaggregation Verification (Qwen)
## Getting Started
vLLM-Ascend now supports prefill-decode (PD) disaggregation with EP (Expert Parallel) options. This guide walks step by step through verifying these features with constrained resources.
Taking the Qwen3-30B-A3B model as an example, we use vllm-ascend v0.10.1rc1 (with vLLM v0.10.1.1) on 3 Atlas 800T A2 servers to deploy the "1P2D" architecture. Assume the IP of the prefiller server is 192.0.0.1, and the decoder servers are 192.0.0.2 (decoder 1) and 192.0.0.3 (decoder 2). On each server, 2 NPUs are used to deploy one service instance.
## Verify Multi-Node Communication Environment
### Physical Layer Requirements
- The physical machines must be located on the same LAN, with network connectivity.
- All NPUs must be interconnected. Intra-node connectivity is via HCCS, and inter-node connectivity is via RDMA.
### Verification Process
1. Single Node Verification:
Execute the following commands on each node in sequence. The results must all be `success` and the status must be `UP`:
```bash
# Check the remote switch ports
for i in {0..7}; do hccn_tool -i $i -lldp -g | grep Ifname; done
# Get the link status of the Ethernet ports (UP or DOWN)
for i in {0..7}; do hccn_tool -i $i -link -g ; done
# Check the network health status
for i in {0..7}; do hccn_tool -i $i -net_health -g ; done
# View the network detected IP configuration
for i in {0..7}; do hccn_tool -i $i -netdetect -g ; done
# View gateway configuration
for i in {0..7}; do hccn_tool -i $i -gateway -g ; done
# View NPU network configuration
cat /etc/hccn.conf
```
2. Get NPU IP Addresses
```bash
for i in {0..7}; do hccn_tool -i $i -ip -g;done
```
3. Cross-Node PING Test
```bash
# Execute on the target node (replace 'x.x.x.x' with the actual NPU IP address)
for i in {0..7}; do hccn_tool -i $i -ping -g address x.x.x.x;done
```
## Generate Ranktable
The rank table is a JSON file that specifies the mapping of Ascend NPU ranks to nodes. For more details please refer to the [vllm-ascend examples](https://github.com/vllm-project/vllm-ascend/blob/main/examples/disaggregated_prefill_v1/README.md). Execute the following commands for reference.
```shell
cd vllm-ascend/examples/disaggregate_prefill_v1/
bash gen_ranktable.sh --ips <prefiller_node1_local_ip> <prefiller_node2_local_ip> <decoder_node1_local_ip> <decoder_node2_local_ip> \
--npus-per-node <npu_clips> --network-card-name <nic_name> --prefill-device-cnt <prefiller_npu_clips> --decode-device-cnt <decode_npu_clips> \
[--local-device-ids <id_1>,<id_2>,<id_3>...]
```
Assume we use devices 0 and 1 on the prefiller node and devices 6 and 7 on both decoder nodes. Take the following commands as an example (`--local-device-ids` is required when you use only specific NPU devices on the local server).
```shell
# On the prefiller node
cd vllm-ascend/examples/disaggregate_prefill_v1/
bash gen_ranktable.sh --ips 192.0.0.1 192.0.0.2 192.0.0.3 \
--npus-per-node 2 --network-card-name eth0 --prefill-device-cnt 2 --decode-device-cnt 4 --local-device-ids 0,1
# On the decoder 1
cd vllm-ascend/examples/disaggregate_prefill_v1/
bash gen_ranktable.sh --ips 192.0.0.1 192.0.0.2 192.0.0.3 \
--npus-per-node 2 --network-card-name eth0 --prefill-device-cnt 2 --decode-device-cnt 4 --local-device-ids 6,7
# On the decoder 2
cd vllm-ascend/examples/disaggregate_prefill_v1/
bash gen_ranktable.sh --ips 192.0.0.1 192.0.0.2 192.0.0.3 \
--npus-per-node 2 --network-card-name eth0 --prefill-device-cnt 2 --decode-device-cnt 4 --local-device-ids 6,7
```
The rank table will be generated at /vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1/ranktable.json
| Parameter | Meaning |
| --- | --- |
| --ips | Each node's local IP (prefiller nodes must come before decoder nodes) |
| --npus-per-node | Number of NPU chips on each node |
| --network-card-name | The physical machine's NIC name |
| --prefill-device-cnt | NPU chips used for prefill |
| --decode-device-cnt | NPU chips used for decode |
| --local-device-ids | Optional. Not needed if all devices on the local node are used. |
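Once generated, the rank table can be sanity-checked with a quick pretty-print (path taken from the note above):

```bash
python3 -m json.tool /vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1/ranktable.json
```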
## Prefiller / Decoder Deployment
We can run the following scripts to launch a server on the prefiller/decoder node respectively.
:::::{tab-set}
::::{tab-item} Prefiller node
```shell
export HCCL_IF_IP=192.0.0.1 # node ip
export GLOO_SOCKET_IFNAME="eth0" # network card name
export TP_SOCKET_IFNAME="eth0"
export HCCL_SOCKET_IFNAME="eth0"
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json"
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export VLLM_USE_V1=1
vllm serve /model/Qwen3-30B-A3B \
--host 0.0.0.0 \
--port 13700 \
--tensor-parallel-size 2 \
--no-enable-prefix-caching \
--seed 1024 \
--served-model-name qwen3-moe \
--max-model-len 6144 \
--max-num-batched-tokens 6144 \
--trust-remote-code \
--gpu-memory-utilization 0.9 \
--enable-expert-parallel \
--kv-transfer-config \
'{"kv_connector": "LLMDataDistCMgrConnector",
"kv_buffer_device": "npu",
"kv_role": "kv_producer",
"kv_parallel_size": 1,
"kv_port": "20001",
"engine_id": "0",
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}' \
--additional-config \
'{"torchair_graph_config": {"enabled":false, "enable_multistream_shared_expert":false}, "ascend_scheduler_config":{"enabled":true, "enable_chunked_prefill":false}}' \
--enforce-eager
```
::::
::::{tab-item} Decoder node 1
```shell
export HCCL_IF_IP=192.0.0.2 # node ip
export GLOO_SOCKET_IFNAME="eth0" # network card name
export TP_SOCKET_IFNAME="eth0"
export HCCL_SOCKET_IFNAME="eth0"
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json"
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export VLLM_USE_V1=1
vllm serve /model/Qwen3-30B-A3B \
--host 0.0.0.0 \
--port 13700 \
--no-enable-prefix-caching \
--tensor-parallel-size 2 \
--seed 1024 \
--served-model-name qwen3-moe \
--max-model-len 6144 \
--max-num-batched-tokens 6144 \
--trust-remote-code \
--gpu-memory-utilization 0.9 \
--enable-expert-parallel \
--kv-transfer-config \
'{"kv_connector": "LLMDataDistCMgrConnector",
"kv_buffer_device": "npu",
"kv_role": "kv_consumer",
"kv_parallel_size": 1,
"kv_port": "20001",
"engine_id": "0",
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}' \
--additional-config \
'{"torchair_graph_config": {"enabled":false, "enable_multistream_shared_expert":false}, "ascend_scheduler_config":{"enabled":true, "enable_chunked_prefill":false}}'
```
::::
::::{tab-item} Decoder node 2
```shell
export HCCL_IF_IP=192.0.0.3 # node ip
export GLOO_SOCKET_IFNAME="eth0" # network card name
export TP_SOCKET_IFNAME="eth0"
export HCCL_SOCKET_IFNAME="eth0"
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json"
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=10
export VLLM_USE_V1=1
vllm serve /model/Qwen3-30B-A3B \
--host 0.0.0.0 \
--port 13700 \
--no-enable-prefix-caching \
--tensor-parallel-size 2 \
--seed 1024 \
--served-model-name qwen3-moe \
--max-model-len 6144 \
--max-num-batched-tokens 6144 \
--trust-remote-code \
--gpu-memory-utilization 0.9 \
--enable-expert-parallel \
--kv-transfer-config \
'{"kv_connector": "LLMDataDistCMgrConnector",
"kv_buffer_device": "npu",
"kv_role": "kv_consumer",
"kv_parallel_size": 1,
"kv_port": "20001",
"engine_id": "0",
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}' \
--additional-config \
'{"torchair_graph_config": {"enabled":false, "enable_multistream_shared_expert":false}, "ascend_scheduler_config":{"enabled":true, "enable_chunked_prefill":false}}'
```
::::
:::::
## Example proxy for Deployment
Run a proxy server on the same node as the prefiller service instance. You can get the proxy program in the repository's examples: [load\_balance\_proxy\_server\_example.py](https://github.com/vllm-project/vllm-ascend/blob/main/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py)
```shell
python load_balance_proxy_server_example.py \
--host 192.0.0.1 \
--port 8080 \
--prefiller-hosts 192.0.0.1 \
--prefiller-port 13700 \
--decoder-hosts 192.0.0.2 192.0.0.3 \
--decoder-ports 13700 13700
```
## Verification
Check service health using the proxy server endpoint.
```shell
curl http://192.0.0.1:8080/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "qwen3-moe",
"prompt": "Who are you?",
"max_tokens": 100,
"temperature": 0
}'
```


@@ -0,0 +1,156 @@
# Multi-Node-DP (Qwen3-VL-235B-A22B)
## Verify Multi-Node Communication Environment
Refer to [multi_node.md](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_node.html#verification-process).
## Run with docker
Assume you have two Atlas 800 A3 (64G*16) nodes (or 2 * A2) and want to deploy the `Qwen3-VL-235B-A22B-Instruct` model across multiple nodes.
```{code-block} bash
:substitutions:
# Update the vllm-ascend image
export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|
docker run --rm \
--name vllm-ascend \
--net=host \
--device /dev/davinci0 \
--device /dev/davinci1 \
--device /dev/davinci2 \
--device /dev/davinci3 \
--device /dev/davinci4 \
--device /dev/davinci5 \
--device /dev/davinci6 \
--device /dev/davinci7 \
--device /dev/davinci8 \
--device /dev/davinci9 \
--device /dev/davinci10 \
--device /dev/davinci11 \
--device /dev/davinci12 \
--device /dev/davinci13 \
--device /dev/davinci14 \
--device /dev/davinci15 \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /root/.cache:/root/.cache \
-p 8000:8000 \
-it $IMAGE bash
```
Run the following scripts on two nodes respectively
:::{note}
Before launching the inference server, ensure the following environment variables are set for multi-node communication
:::
node0
```shell
#!/bin/sh
# Obtain these via ifconfig
# nic_name is the network interface name corresponding to local_ip
nic_name="xxxx"
local_ip="xxxx"
export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=1024
vllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \
--host 0.0.0.0 \
--port 8000 \
--data-parallel-size 2 \
--api-server-count 2 \
--data-parallel-size-local 1 \
--data-parallel-address $local_ip \
--data-parallel-rpc-port 13389 \
--seed 1024 \
--served-model-name qwen3vl \
--tensor-parallel-size 8 \
--enable-expert-parallel \
--max-num-seqs 16 \
--max-model-len 32768 \
--max-num-batched-tokens 4096 \
--trust-remote-code \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.8
```
node1
```shell
#!/bin/sh
nic_name="xxxx"
local_ip="xxxx"
node0_ip="xxxx"
export HCCL_IF_IP=$local_ip
export GLOO_SOCKET_IFNAME=$nic_name
export TP_SOCKET_IFNAME=$nic_name
export HCCL_SOCKET_IFNAME=$nic_name
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export HCCL_BUFFSIZE=1024
vllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \
--host 0.0.0.0 \
--port 8000 \
--headless \
--data-parallel-size 2 \
--data-parallel-size-local 1 \
--data-parallel-start-rank 1 \
--data-parallel-address $node0_ip \
--data-parallel-rpc-port 13389 \
--seed 1024 \
--tensor-parallel-size 8 \
--served-model-name qwen3vl \
--max-num-seqs 16 \
--max-model-len 32768 \
--max-num-batched-tokens 4096 \
--enable-expert-parallel \
--trust-remote-code \
--no-enable-prefix-caching \
--gpu-memory-utilization 0.8
```
If the service starts successfully, the following information will be displayed on node0:
```shell
INFO: Started server process [44610]
INFO: Waiting for application startup.
INFO: Application startup complete.
INFO: Started server process [44611]
INFO: Waiting for application startup.
INFO: Application startup complete.
```
Once your server is started, you can query the model with input prompts:
```shell
curl http://localhost:8000/v1/chat/completions \
-H "Content-Type: application/json" \
-d '{
"model": "qwen3vl",
"messages": [
{"role": "system", "content": "You are a helpful assistant."},
{"role": "user", "content": [
{"type": "image_url", "image_url": {"url": "https://modelscope.oss-cn-beijing.aliyuncs.com/resource/qwen.png"}},
{"type": "text", "text": "What is the text in the illustrate?"}
]}
]
}'
```


@@ -0,0 +1,182 @@
# Multi-Node-Ray (Qwen/Qwen3-235B-A22B)
Multi-node inference is suitable for scenarios where the model cannot be deployed on a single machine. In such cases, the model can be distributed using tensor parallelism or pipeline parallelism. The specific parallelism strategies will be covered in the following sections. To successfully deploy multi-node inference, the following three steps need to be completed:
* **Verify Multi-Node Communication Environment**
* **Set Up and Start the Ray Cluster**
* **Start the Online Inference Service on Multiple Nodes**
## Verify Multi-Node Communication Environment
### Physical Layer Requirements:
* The physical machines must be located on the same LAN, with network connectivity.
* All NPUs are connected with optical modules, and the connection status must be normal.
### Verification Process:
Execute the following commands on each node in sequence. The results must all be `success` and the status must be `UP`:
```bash
# Check the remote switch ports
for i in {0..7}; do hccn_tool -i $i -lldp -g | grep Ifname; done
# Get the link status of the Ethernet ports (UP or DOWN)
for i in {0..7}; do hccn_tool -i $i -link -g ; done
# Check the network health status
for i in {0..7}; do hccn_tool -i $i -net_health -g ; done
# View the network detected IP configuration
for i in {0..7}; do hccn_tool -i $i -netdetect -g ; done
# View gateway configuration
for i in {0..7}; do hccn_tool -i $i -gateway -g ; done
# View NPU network configuration
cat /etc/hccn.conf
```
### NPU Interconnect Verification:
#### 1. Get NPU IP Addresses
```bash
for i in {0..7}; do hccn_tool -i $i -ip -g | grep ipaddr; done
```
#### 2. Cross-Node PING Test
```bash
# Execute on the target node (replace with actual IP)
hccn_tool -i 0 -ping -g address 10.20.0.20
```
## Set Up and Start the Ray Cluster
### Setting Up the Basic Container
To ensure a consistent execution environment across all nodes, including the model path and Python environment, it is recommended to use Docker images.
For setting up a multi-node inference cluster with Ray, **containerized deployment** is the preferred approach. Containers should be started on both the master and worker nodes, with the `--net=host` option to enable proper network connectivity.
Below is an example container setup command, which should be executed on **all nodes**:
```{code-block} bash
:substitutions:
# Update the vllm-ascend image
export IMAGE=quay.nju.edu.cn/ascend/vllm-ascend:|vllm_ascend_version|
export NAME=vllm-ascend
# Run the container using the defined variables
# Note: if you are running a bridge network with docker, please expose the ports needed for multi-node communication in advance
# IMPORTANT: /path/to/shared/cache below must be a shared directory accessible by all nodes
docker run --rm \
--name $NAME \
--net=host \
--device /dev/davinci0 \
--device /dev/davinci1 \
--device /dev/davinci2 \
--device /dev/davinci3 \
--device /dev/davinci4 \
--device /dev/davinci5 \
--device /dev/davinci6 \
--device /dev/davinci7 \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /path/to/shared/cache:/root/.cache \
-it $IMAGE bash
```
### Start Ray Cluster
After setting up the containers and installing vllm-ascend on each node, follow the steps below to start the Ray cluster and execute inference tasks.
Choose one machine as the head node and the others as worker nodes. Before proceeding, use `ip addr` to check your `nic_name` (network interface name).
Set the `ASCEND_RT_VISIBLE_DEVICES` environment variable to specify the NPU devices to use. For Ray versions above 2.1, also set the `RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES` variable to avoid device recognition issues.
Below are the commands for the head and worker nodes:
**Head node**:
:::{note}
When starting a Ray cluster for multi-node inference, the environment variables on each node must be set **before** starting the Ray cluster for them to take effect.
Updating the environment variables requires restarting the Ray cluster.
:::
```shell
# Head node
export HCCL_IF_IP={local_ip}
export GLOO_SOCKET_IFNAME={nic_name}
export TP_SOCKET_IFNAME={nic_name}
export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
ray start --head
```
**Worker node**:
:::{note}
When starting a Ray cluster for multi-node inference, the environment variables on each node must be set **before** starting the Ray cluster for them to take effect. Updating the environment variables requires restarting the Ray cluster.
:::
```shell
# Worker node
export HCCL_IF_IP={local_ip}
export GLOO_SOCKET_IFNAME={nic_name}
export TP_SOCKET_IFNAME={nic_name}
export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
ray start --address='{head_node_ip}:6379' --node-ip-address={local_ip}
```
Once the cluster is started on multiple nodes, execute `ray status` and `ray list nodes` to verify the Ray cluster's status. You should see the correct number of nodes and NPUs listed.
## Start the Online Inference Service on Multiple Nodes
In the container, you can use vLLM as if all NPUs were on a single node. vLLM will utilize NPU resources across all nodes in the Ray cluster.
**You only need to run the vllm command on one node.**
To set up parallelism, the common practice is to set the `tensor-parallel-size` to the number of NPUs per node, and the `pipeline-parallel-size` to the number of nodes.
For example, with 16 NPUs across 2 nodes (8 NPUs per node), set the tensor parallel size to 8 and the pipeline parallel size to 2:
```shell
vllm serve Qwen/Qwen3-235B-A22B \
--distributed-executor-backend ray \
--pipeline-parallel-size 2 \
--tensor-parallel-size 8 \
--enable-expert-parallel \
--seed 1024 \
--max-model-len 8192 \
--max-num-seqs 25 \
--served-model-name qwen \
--trust-remote-code \
--gpu-memory-utilization 0.9
```
Alternatively, if you want to use only tensor parallelism, set the tensor parallel size to the total number of NPUs in the cluster. For example, with 16 NPUs across 2 nodes, set the tensor parallel size to 16:
```shell
vllm serve Qwen/Qwen3-235B-A22B \
--distributed-executor-backend ray \
--tensor-parallel-size 16 \
--enable-expert-parallel \
--seed 1024 \
--max-model-len 8192 \
--max-num-seqs 25 \
--served-model-name qwen \
--trust-remote-code \
--gpu-memory-utilization 0.9
```
Once your server is started, you can query the model with input prompts:
```bash
curl http://localhost:8000/v1/completions \
-H "Content-Type: application/json" \
-d '{
"model": "qwen",
"prompt": "tell me how to sleep well",
"max_tokens": 100,
"temperature": 0
}'
```


@@ -0,0 +1,156 @@
# Multi-NPU (Qwen3-Next)
```{note}
Qwen3 Next uses [Triton Ascend](https://gitee.com/ascend/triton-ascend), which is currently experimental. Future versions may bring behavioral changes around stability, accuracy, and performance.
```
## Run vllm-ascend on Multi-NPU with Qwen3 Next
Run docker container:
```{code-block} bash
:substitutions:
# Update the vllm-ascend image
export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|
docker run --rm \
--name vllm-ascend-qwen3 \
--device /dev/davinci0 \
--device /dev/davinci1 \
--device /dev/davinci2 \
--device /dev/davinci3 \
--device /dev/davinci_manager \
--device /dev/devmm_svm \
--device /dev/hisi_hdc \
-v /usr/local/dcmi:/usr/local/dcmi \
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
-v /etc/ascend_install.info:/etc/ascend_install.info \
-v /root/.cache:/root/.cache \
-p 8000:8000 \
-it $IMAGE bash
```
Setup environment variables:
```bash
# Load model from ModelScope to speed up download
export VLLM_USE_MODELSCOPE=True
```
### Install Triton Ascend
:::::{tab-set}
::::{tab-item} Linux (aarch64)
[Triton Ascend](https://gitee.com/ascend/triton-ascend) is required to run Qwen3 Next. Please follow the instructions below to install it and its dependencies.
Install the Ascend BiSheng toolkit:
```bash
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/Ascend-BiSheng-toolkit_aarch64.run
chmod a+x Ascend-BiSheng-toolkit_aarch64.run
./Ascend-BiSheng-toolkit_aarch64.run --install
source /usr/local/Ascend/8.3.RC1/bisheng_toolkit/set_env.sh
```
Install Triton Ascend:
```bash
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev20250914-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl
pip install triton_ascend-3.2.0.dev20250914-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl
```
::::
::::{tab-item} Linux (x86_64)
Coming soon ...
::::
:::::
### Inference on Multi-NPU
Please make sure you have already executed the following command:
```bash
source /usr/local/Ascend/8.3.RC1/bisheng_toolkit/set_env.sh
```
:::::{tab-set}
::::{tab-item} Online Inference
Run the following script to start the vLLM server on Multi-NPU:
For an Atlas A2 with 64 GB of memory per NPU card, `tensor-parallel-size` should be at least 4; with 32 GB per card, it should be at least 8.
```bash
vllm serve Qwen/Qwen3-Next-80B-A3B-Instruct --tensor-parallel-size 4 --max-model-len 4096 --gpu-memory-utilization 0.7 --enforce-eager
```
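As a rough sanity check of this sizing rule, you can estimate the minimum tensor parallel size from the weight footprint alone. This is a back-of-the-envelope sketch that assumes BF16 weights and ignores KV cache and activation overhead:
```python
# Back-of-the-envelope TP sizing (assumptions: BF16 weights, ~80B parameters,
# KV cache and activation memory ignored).
import math

params = 80e9                                    # Qwen3-Next-80B-A3B
weight_gib = params * 2 / 1024**3                # BF16 -> ~149 GiB of weights

for card_gib in (64, 32):
    usable = card_gib * 0.7                      # --gpu-memory-utilization 0.7
    min_cards = math.ceil(weight_gib / usable)
    tp = 2 ** math.ceil(math.log2(min_cards))    # TP is typically a power of two
    print(f"{card_gib} GiB cards -> tensor-parallel-size >= {tp}")
```
Running it reproduces the rule above: at least 4 cards with 64 GB each, and at least 8 with 32 GB each.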
Once your server is started, you can query the model with input prompts:
```bash
curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
"model": "Qwen/Qwen3-Next-80B-A3B-Instruct",
"messages": [
{"role": "user", "content": "Who are you?"}
],
"temperature": 0.6,
"top_p": 0.95,
"top_k": 20,
"max_tokens": 32
}'
```
::::
::::{tab-item} Offline Inference
Run the following script to execute offline inference on multi-NPU:
```python
import gc

import torch

from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import (destroy_distributed_environment,
                                             destroy_model_parallel)


def clean_up():
    destroy_model_parallel()
    destroy_distributed_environment()
    gc.collect()
    torch.npu.empty_cache()


if __name__ == '__main__':
    prompts = [
        "Who are you?",
    ]
    sampling_params = SamplingParams(temperature=0.6, top_p=0.95, top_k=40,
                                     max_tokens=32)
    llm = LLM(model="Qwen/Qwen3-Next-80B-A3B-Instruct",
              tensor_parallel_size=4,
              enforce_eager=True,
              distributed_executor_backend="mp",
              gpu_memory_utilization=0.7,
              max_model_len=4096)

    outputs = llm.generate(prompts, sampling_params)
    for output in outputs:
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")

    del llm
    clean_up()
```
If the script runs successfully, you should see output like the following:
```bash
Prompt: 'Who are you?', Generated text: ' What do you know about me?\n\nHello! I am Qwen, a large-scale language model independently developed by the Tongyi Lab under Alibaba Group. I am'
```
::::
:::::

View File

@@ -30,11 +30,18 @@ The following table lists the additional configuration options available in vLLM
| `ascend_scheduler_config` | dict | `{}` | The config options for ascend scheduler |
| `refresh` | bool | `false` | Whether to refresh global ascend config content. This value is usually used by rlhf or ut/e2e test case. |
| `expert_map_path` | str | `None` | When using expert load balancing for the MOE model, an expert map path needs to be passed in. |
| `chunked_prefill_for_mla` | bool | `False` | Whether to enable the fused operator-like chunked_prefill. |
| `enable_prefetch` | bool | `False` | Whether to enable weight prefetch. |
| `kv_cache_dtype` | str | `None` | When using the kv cache quantization method, kv cache dtype needs to be set, currently only int8 is supported. |
| `enable_shared_expert_dp` | bool | `False` | Whether to run the shared experts in data parallel. This gives better performance but consumes more memory. Currently only DeepSeek series models are supported. |
| `lmhead_tensor_parallel_size` | int | `None` | The custom tensor parallel size of lmhead. |
| `oproj_tensor_parallel_size` | int | `None` | The custom tensor parallel size of oproj. |
| `multistream_overlap_shared_expert` | bool | `False` | Whether to enable multistream shared expert. This option only takes effect on MoE models with shared experts. |
| `dynamic_eplb` | bool | `False` | Whether to enable dynamic EPLB. |
| `num_iterations_eplb_update` | int | `400` | The number of forward iterations after which EPLB begins. |
| `gate_eplb` | bool | `False` | Whether to enable EPLB only once. |
| `num_wait_worker_iterations` | int | `30` | The number of forward iterations in which the EPLB worker finishes its CPU tasks. In our tests, the default value of 30 covers most cases. |
| `expert_map_record_path` | str | `None` | When dynamic EPLB completes, save the current expert load heatmap to the specified path. |
| `init_redundancy_expert` | int | `0` | The number of redundant experts to specify during initialization. |
The details of each config option are as follows:
@@ -45,8 +52,8 @@ The details of each config option are as follows:
| `enabled` | bool | `False` | Whether to enable torchair graph mode. Currently only DeepSeek series models and PanguProMoE are supported to use torchair graph mode |
| `mode` | str | `None` | When using reduce-overhead mode for torchair, mode needs to be set |
| `enable_multistream_mla`| bool | `False` | Whether to put vector ops of MLA to another stream. This option only takes effect on models using MLA (e.g., DeepSeek). |
| `enable_multistream_moe`| bool | `False` | Whether to enable multistream shared expert. This option only takes effect on DeepSeek MoE models. |
| `enable_view_optimize` | bool | `True` | Whether to enable torchair view optimization |
| `enable_frozen_parameter` | bool | `True` | Whether to fix the memory address of weights during inference to reduce the input address refresh time during graph execution. |
| `use_cached_graph` | bool | `False` | Whether to use cached graph |
| `graph_batch_sizes` | list[int] | `[]` | The batch size for torchair graph cache |
| `graph_batch_sizes_init` | bool | `False` | Init graph batch size dynamically if `graph_batch_sizes` is empty |
@@ -57,6 +64,10 @@ The details of each config option are as follows:
| Name | Type | Default | Description |
| ---- | ---- | ------- | ----------- |
| `enabled` | bool | `False` | Whether to enable ascend scheduler for V1 engine|
| `enable_pd_transfer` | bool | `False` | Whether to enable pd transfer. When enabled, decode starts only after prefill of all requests is done. This option only takes effect on offline inference. |
| `decode_max_num_seqs` | int | `0` | The max_num_seqs to use in the decode phase when pd transfer is enabled. This option only takes effect when `enable_pd_transfer` is True. |
| `max_long_partial_prefills` | Union[int, float] | `float('inf')` | The maximum number of prompts longer than `long_prefill_token_threshold` that will be prefilled concurrently. |
| `long_prefill_token_threshold` | Union[int, float] | `float('inf')` | A request is considered long if its prompt is longer than this number of tokens. |
ascend_scheduler_config also supports the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `enable_chunked_prefill: True` to ascend_scheduler_config as well.
@@ -71,13 +82,15 @@ An example of additional configuration is as follows:
"use_cached_graph": True,
"graph_batch_sizes": [1, 2, 4, 8],
"graph_batch_sizes_init": False,
"enable_multistream_moe": False,
"enable_kv_nz": False
},
"ascend_scheduler_config": {
"enabled": True,
"enable_chunked_prefill": True,
"max_long_partial_prefills": 1,
"long_prefill_token_threshold": 4096,
},
"multistream_overlap_shared_expert": True,
"refresh": False,
}
```
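For the pd-transfer options introduced above (`enable_pd_transfer`, `decode_max_num_seqs`), a hypothetical offline setup might look like the following sketch; the model name and values are illustrative assumptions, not tuned recommendations:
```python
# Sketch: ascend scheduler with pd transfer for offline inference.
# Model name and values are illustrative assumptions.
from vllm import LLM, SamplingParams

llm = LLM(
    model="Qwen/Qwen3-8B",
    additional_config={
        "ascend_scheduler_config": {
            "enabled": True,
            "enable_pd_transfer": True,   # decode starts after all prefills finish
            "decode_max_num_seqs": 32,    # max_num_seqs used in the decode phase
        },
    },
)
print(llm.generate(["Hello"], SamplingParams(max_tokens=8))[0].outputs[0].text)
```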

View File

@@ -0,0 +1,94 @@
# Expert Load Balance (EPLB)
## Overview
Expert balancing for MoE models in LLM serving is essential for optimal performance. Dynamically changing experts during inference can negatively impact TTFT (Time To First Token) and TPOT (Time Per Output Token) due to stop-the-world operations. SwiftBalancer enables asynchronous expert load balancing with zero-overhead expert movement, ensuring seamless service continuity.
## EPLB Effects
- Reduced Latency: Dynamically balances expert loads to minimize TTFT and TPOT by distributing workloads evenly across experts.
- Enhanced Throughput: Optimizes NPU utilization, increasing token generation speed under high-concurrency scenarios.
- Zero-Overhead Movement: Expert redistribution occurs asynchronously without interrupting ongoing inference requests.
- Adaptive Scaling: Automatically adjusts to workload fluctuations while maintaining stable performance.
- Fault Tolerance: Redundant expert placement ensures system resilience during hardware failures.
## How to Use EPLB
### Dynamic EPLB
Enable dynamic balancing with auto-tuned parameters. Adjust `num_iterations_eplb_update` and `num_wait_worker_iterations` based on workload patterns.
```shell
vllm serve Qwen/Qwen3-235B-A22B \
--tensor-parallel-size 16 \
--enable-expert-parallel \
--additional-config '{
"dynamic_eplb": true,
"num_iterations_eplb_update": 400,
"gate_eplb": true,
"num_wait_worker_iterations": 30
}'
```
### Static EPLB
#### Initial Setup (Record Expert Map)
Generate the initial expert distribution map using expert_map_record_path. This creates a baseline configuration for future deployments.
```shell
vllm serve Qwen/Qwen3-235B-A22B \
--tensor-parallel-size 16 \
--enable-expert-parallel \
--additional-config '{
"expert_map_record_path": "/path/to/eplb.json",
"init_redundancy_expert": 16,
"dynamic_eplb": true,
"num_iterations_eplb_update": 400,
"gate_eplb": true,
"num_wait_worker_iterations": 30
}'
```
#### Subsequent Deployments (Use Recorded Map)
Load the pre-recorded expert map for consistent performance. This avoids recalculating distributions at runtime.
```shell
vllm serve Qwen/Qwen3-235B-A22B \
--tensor-parallel-size 16 \
--enable-expert-parallel \
--additional-config '{
"expert_map_path": "/path/to/eplb.json"
}'
```
## Critical Considerations
1. Parameter Tuning:
- num_iterations_eplb_update: Higher values (e.g., 400+) for stable workloads; lower values (e.g., 100-200) for fluctuating traffic.
- num_wait_worker_iterations: Should be ≥30 to avoid premature balancing during startup.
- init_redundancy_expert: Must match the tensor-parallel size (e.g., 16 for 16 NPUs) to ensure sufficient redundancy.
2. Hardware Requirements:
- Ensure all NPUs have identical memory capacity and compute capabilities.
- Network bandwidth must support expert redistribution traffic (≥10Gbps recommended).
3. Model Compatibility:
- Only MoE models with explicit expert parallelism support (e.g., Qwen3-235B-A22B) are compatible.
- Verify model architecture supports dynamic expert routing via --enable-expert-parallel.
4. Gating Configuration:
- When gate_eplb=true, validate that the gating mechanism can handle expert movement without routing errors.
- Test with synthetic workloads before production deployment.
5. Monitoring & Validation:
- Track metrics: expert_load_balance_ratio, ttft_p99, tpot_avg, and gpu_utilization.
- Use vllm monitor to detect imbalances during runtime.
- Always verify the expert map JSON structure before loading (validate with jq or similar tools; see the sketch after this list).
6. Startup Behavior:
- Initial requests may experience higher latency during the first balancing cycle (typically 1-2 minutes).
- Avoid sudden traffic spikes during warm-up phase.
7. Common Pitfalls:
- Incorrect tensor-parallel-size vs. actual NPU count → causes resource underutilization.
- Using expert_map_path without generating the map first → runtime errors.
- Setting init_redundancy_expert > available NPUs → system failure.
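For the map-validation step in item 5 above, a minimal Python sketch is shown below. The exact schema of the expert map JSON is not documented here, so the checks are illustrative assumptions; adapt them to the file your deployment actually records:
```python
# Illustrative sanity check for an expert map file before passing it to
# expert_map_path. The expected structure is an assumption; adjust as needed.
import json
import sys

def check_expert_map(path: str) -> None:
    with open(path) as f:
        data = json.load(f)        # fails loudly on malformed JSON
    if not data:
        sys.exit(f"{path}: expert map is empty")
    print(f"{path}: parsed OK, top-level keys: {list(data)[:8]}")

check_expert_map("/path/to/eplb.json")
```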

Binary file not shown.

After

Width:  |  Height:  |  Size: 55 KiB

View File

@@ -10,4 +10,5 @@ quantization
sleep_mode
structured_output
lora
eplb_swift_balancer
:::

View File

@@ -108,18 +108,19 @@ Please convert DeepSeek series models using `br_release_MindStudio_8.1.RC2_TR5_2
### 3. When converting deepseek series models with modelslim, what should you pay attention to?
When using the weight generated by modelslim with the `--dynamic` parameter, if torchair graph mode is enabled, please modify the configuration file in the CANN package to prevent incorrect inference results.
When the MLA portion of the weights uses `W8A8_DYNAMIC` quantization, if torchair graph mode is enabled, please modify the configuration file in the CANN package to prevent incorrect inference results.
The operation steps are as follows:
1. Search in the CANN package directory used, for example:
find /usr/local/Ascend/ -name fusion_config.json
2. Add `"AddRmsNormDynamicQuantFusionPass":"off",` to the fusion_config.json you find, the location is as follows:
2. Add `"AddRmsNormDynamicQuantFusionPass":"off",` and `"MultiAddRmsNormDynamicQuantFusionPass":"off",` to the fusion_config.json you find, the location is as follows:
```bash
{
"Switch":{
"GraphFusion":{
"AddRmsNormDynamicQuantFusionPass":"off",
"MultiAddRmsNormDynamicQuantFusionPass":"off",
```

View File

@@ -1,5 +1,70 @@
# Release note
## v0.11.0rc0 - 2025.09.30
This is the special release candidate of v0.11.0 for vLLM Ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to get started.
### Highlights
- DeepSeek V3.2 is supported now. [#3270](https://github.com/vllm-project/vllm-ascend/pull/3270)
- Qwen3-vl is supported now. [#3103](https://github.com/vllm-project/vllm-ascend/pull/3103)
### Core
- DeepSeek works with aclgraph now. [#2707](https://github.com/vllm-project/vllm-ascend/pull/2707)
- MTP works with aclgraph now. [#2932](https://github.com/vllm-project/vllm-ascend/pull/2932)
- EPLB is supported now. [#2956](https://github.com/vllm-project/vllm-ascend/pull/2956)
- Mooncake store kvcache connector is supported now. [#2913](https://github.com/vllm-project/vllm-ascend/pull/2913)
- CPU offload connector is supported now. [#1659](https://github.com/vllm-project/vllm-ascend/pull/1659)
### Other
- Qwen3-next is stable now. [#3007](https://github.com/vllm-project/vllm-ascend/pull/3007)
- Fixed a lot of bugs introduced in v0.10.2 by Qwen3-next. [#2964](https://github.com/vllm-project/vllm-ascend/pull/2964) [#2781](https://github.com/vllm-project/vllm-ascend/pull/2781) [#3070](https://github.com/vllm-project/vllm-ascend/pull/3070) [#3113](https://github.com/vllm-project/vllm-ascend/pull/3113)
- The LoRA feature is back now. [#3044](https://github.com/vllm-project/vllm-ascend/pull/3044)
- Eagle3 spec decode method is back now. [#2949](https://github.com/vllm-project/vllm-ascend/pull/2949)
## v0.10.2rc1 - 2025.09.16
This is the 1st release candidate of v0.10.2 for vLLM Ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to get started.
### Highlights
- Add support for Qwen3 Next. Please note that the expert parallel and MTP features don't work with this release. We'll make them work soon. Follow the [official guide](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_npu_qwen3_next.html) to get started. [#2917](https://github.com/vllm-project/vllm-ascend/pull/2917)
- Add quantization support for aclgraph [#2841](https://github.com/vllm-project/vllm-ascend/pull/2841)
### Core
- Aclgraph now works with Ray backend. [#2589](https://github.com/vllm-project/vllm-ascend/pull/2589)
- MTP now works when the number of speculative tokens is greater than 1. [#2708](https://github.com/vllm-project/vllm-ascend/pull/2708)
- Qwen2.5 VL now works with quantization. [#2778](https://github.com/vllm-project/vllm-ascend/pull/2778)
- Improved the performance with async scheduler enabled. [#2783](https://github.com/vllm-project/vllm-ascend/pull/2783)
- Fixed the performance regression with non-MLA models when using the default scheduler. [#2894](https://github.com/vllm-project/vllm-ascend/pull/2894)
### Other
- The performance of w8a8 quantization is improved. [#2275](https://github.com/vllm-project/vllm-ascend/pull/2275)
- The performance of moe model is improved. [#2689](https://github.com/vllm-project/vllm-ascend/pull/2689) [#2842](https://github.com/vllm-project/vllm-ascend/pull/2842)
- Fixed resources limit error when apply speculative decoding and aclgraph. [#2472](https://github.com/vllm-project/vllm-ascend/pull/2472)
- Fixed the git config error in docker images. [#2746](https://github.com/vllm-project/vllm-ascend/pull/2746)
- Fixed the sliding windows attention bug with prefill. [#2758](https://github.com/vllm-project/vllm-ascend/pull/2758)
- The official doc for Prefill Decode Disaggregation with Qwen3 is added. [#2751](https://github.com/vllm-project/vllm-ascend/pull/2751)
- `VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP` env works again. [#2740](https://github.com/vllm-project/vllm-ascend/pull/2740)
- A new improvement for oproj in deepseek is added. Set `oproj_tensor_parallel_size` to enable this feature. [#2167](https://github.com/vllm-project/vllm-ascend/pull/2167)
- Fix a bug that deepseek with torchair doesn't work as expected when `graph_batch_sizes` is set. [#2760](https://github.com/vllm-project/vllm-ascend/pull/2760)
- Avoid duplicate generation of sin_cos_cache in rope when kv_seqlen > 4k. [#2744](https://github.com/vllm-project/vllm-ascend/pull/2744)
- The performance of Qwen3 dense model is improved with flashcomm_v1. Set `VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE=1` and `VLLM_ASCEND_ENABLE_FLASHCOMM=1` to enable it. [#2779](https://github.com/vllm-project/vllm-ascend/pull/2779)
- The performance of Qwen3 dense model is improved with prefetch feature. Set `VLLM_ASCEND_ENABLE_PREFETCH_MLP=1` to enable it. [#2816](https://github.com/vllm-project/vllm-ascend/pull/2816)
- The performance of Qwen3 MoE model is improved with rope ops update. [#2571](https://github.com/vllm-project/vllm-ascend/pull/2571)
- Fix the weight load error for RLHF case. [#2756](https://github.com/vllm-project/vllm-ascend/pull/2756)
- Add warm_up_atb step to speed up the inference. [#2823](https://github.com/vllm-project/vllm-ascend/pull/2823)
- Fixed the aclgraph stream error for moe models. [#2827](https://github.com/vllm-project/vllm-ascend/pull/2827)
### Known issues
- The server will hang when running Prefill Decode Disaggregation with different TP sizes for P and D. It's fixed by a [vLLM commit](https://github.com/vllm-project/vllm/pull/23917) which is not included in v0.10.2. You can cherry-pick this commit to fix the issue.
- The HBM usage of Qwen3 Next is higher than expected. It's a [known issue](https://github.com/vllm-project/vllm-ascend/issues/2884) and we're working on it. You can set `max_model_len` and `gpu_memory_utilization` to suitable values based on your parallel config to avoid OOM errors.
- We noticed that LoRA doesn't work with this release due to the refactor of kv cache. We'll fix it soon. [#2941](https://github.com/vllm-project/vllm-ascend/issues/2941)
- Please do not enable chunked prefill with prefix cache when running with the Ascend scheduler; the performance is poor and the results may be incorrect. [#2943](https://github.com/vllm-project/vllm-ascend/issues/2943)
## v0.10.1rc1 - 2025.09.04
This is the 1st release candidate of v0.10.1 for vLLM Ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to get started.

View File

@@ -42,7 +42,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export VLLM_LLMDD_RPC_PORT=5559
export VLLM_ASCEND_LLMDD_RPC_PORT=5559
vllm serve /models/deepseek_r1_w8a8 \
--host 0.0.0.0 \
@@ -70,9 +70,7 @@ vllm serve /models/deepseek_r1_w8a8 \
"kv_port": "20001",
"engine_id": "0",
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}' \
--additional-config \
'{"chunked_prefill_for_mla":true}'
}'
```
Run prefill server P2 on second node:
@@ -85,7 +83,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export VLLM_LLMDD_RPC_PORT=5659
export VLLM_ASCEND_LLMDD_RPC_PORT=5659
vllm serve /models/deepseek_r1_w8a8 \
--host 0.0.0.0 \
@@ -114,9 +112,7 @@ vllm serve /models/deepseek_r1_w8a8 \
"kv_port": "20001",
"engine_id": "0",
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}' \
--additional-config \
'{"chunked_prefill_for_mla":true}'
}'
```
Run decode server d1 on third node:
@@ -131,7 +127,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export VLLM_LLMDD_RPC_PORT=5759
export VLLM_ASCEND_LLMDD_RPC_PORT=5759
vllm serve /models/deepseek_r1_w8a8 \
--host 0.0.0.0 \
@@ -173,7 +169,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
export OMP_PROC_BIND=false
export OMP_NUM_THREADS=100
export VLLM_USE_V1=1
export VLLM_LLMDD_RPC_PORT=5859
export VLLM_ASCEND_LLMDD_RPC_PORT=5859
vllm serve /models/deepseek_r1_w8a8 \
--host 0.0.0.0 \

View File

@@ -17,6 +17,10 @@ parser.add_argument("--decode-device-cnt",
type=int,
required=True,
help="number of decode devices")
parser.add_argument("--local-device-ids",
type=str,
required=False,
help="local device ids")
args = parser.parse_args()
local_host = args.local_host
prefill_device_cnt = args.prefill_device_cnt
@@ -54,39 +58,49 @@ chips_per_card = get_cmd_stdout("npu-smi info -l | grep \"Chip Count\"").split(
"\n")[0].split(":")[1].strip()
chips_per_card = int(chips_per_card)
if args.local_device_ids:
local_device_ids = args.local_device_ids.split(',')
else:
local_device_ids = []
for card_id in range(num_cards):
for chip_id in range(chips_per_card):
device_id = card_id * chips_per_card + chip_id
local_device_ids.append(device_id)
# generate local device list for local rank 0, and gather it to all ranks
local_device_list: list[dict[str, str]] = list()
if local_rank == "0":
super_pod_id = "0"
for card_id in range(num_cards):
for chip_id in range(chips_per_card):
device_id = card_id * chips_per_card + chip_id
if soc_info == AscendSocVersion.A3:
device_ip = get_cmd_stdout(
f"{hccn_tool_path} -i {device_id} -vnic -g | grep ipaddr"
).split(":")[1].strip()
super_device_id = get_cmd_stdout(
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep SDID"
).split(":")[1].strip()
super_pod_id = get_cmd_stdout(
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep \"Super Pod ID\""
).split(":")[1].strip()
else:
device_ip = get_cmd_stdout(
f"{hccn_tool_path} -i {device_id} -ip -g | grep ipaddr"
).split(":")[1].strip()
for idx in range(len(local_device_ids)):
device_id = local_device_ids[idx]
chip_id = device_id % chips_per_card
card_id = device_id // chips_per_card
if soc_info == AscendSocVersion.A3:
device_ip = get_cmd_stdout(
f"{hccn_tool_path} -i {device_id} -vnic -g | grep ipaddr"
).split(":")[1].strip()
super_device_id = get_cmd_stdout(
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep SDID"
).split(":")[1].strip()
super_pod_id = get_cmd_stdout(
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep \"Super Pod ID\""
).split(":")[1].strip()
else:
device_ip = get_cmd_stdout(
f"{hccn_tool_path} -i {device_id} -ip -g | grep ipaddr"
).split(":")[1].strip()
device_info = {
"server_id": local_host,
"device_id": str(device_id),
"device_ip": str(device_ip),
}
if soc_info == AscendSocVersion.A3:
device_info.update({
"super_pod_id": str(super_pod_id),
"super_device_id": str(super_device_id)
})
local_device_list.append(device_info)
device_info = {
"server_id": local_host,
"device_id": str(device_id),
"device_ip": str(device_ip),
}
if soc_info == AscendSocVersion.A3:
device_info.update({
"super_pod_id": str(super_pod_id),
"super_device_id": str(super_device_id)
})
local_device_list.append(device_info)
dist.init_process_group(backend=dist.Backend.GLOO)
global_device_list = [None] * dist.get_world_size()

View File

@@ -33,6 +33,11 @@ while [[ $# -gt 0 ]]; do
DECODE_DEVICE_CNT="$1"
shift
;;
--local-device-ids)
shift
LOCAL_DEVICE_IDS="$1"
shift
;;
esac
done
LOCAL_HOSTS=($(hostname -I))
@@ -68,6 +73,10 @@ echo "NNODES": $NNODES
echo "NODE_RANK": $NODE_RANK
echo "==============="
if [ -n "$LOCAL_DEVICE_IDS" ]; then
OPTIONAL_SECTION=" --local-device-ids $LOCAL_DEVICE_IDS"
fi
if [[ -n "${GEN_RANKTABLE}" || ! -e ${PWD}/ranktable.json ]]; then
GLOO_SOCKET_IFNAME=$NETWORK_CARD_NAME torchrun \
--nproc_per_node 1 \
@@ -75,5 +84,5 @@ if [[ -n "${GEN_RANKTABLE}" || ! -e ${PWD}/ranktable.json ]]; then
--node_rank ${NODE_RANK} \
--master_addr ${MASTER_ADDR} \
--master_port ${MASTER_PORT} \
gen_ranktable.py --local-host $LOCAL_HOST --prefill-device-cnt $PREFILL_DEVICE_CNT --decode-device-cnt $DECODE_DEVICE_CNT
gen_ranktable.py --local-host $LOCAL_HOST --prefill-device-cnt $PREFILL_DEVICE_CNT --decode-device-cnt $DECODE_DEVICE_CNT $OPTIONAL_SECTION
fi

View File

@@ -363,6 +363,7 @@ async def send_request_to_service(client: httpx.AsyncClient,
}
req_data["stream"] = False
req_data["max_tokens"] = 1
req_data["min_tokens"] = 1
if "stream_options" in req_data:
del req_data["stream_options"]
headers = {

View File

@@ -0,0 +1,272 @@
# Mooncake Store Deployment Guide
## Environmental Dependencies
* Software:
* Python >= 3.9, < 3.12
* CANN >= 8.2.rc1
* PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724
* vLLM: main branch
* vLLM-Ascend: main branch
* Mooncake: [AscendTransport/Mooncake at pooling-async-memcpy](https://github.com/AscendTransport/Mooncake/tree/pooling-async-memcpy) (the currently available branch; continuously updated)
Installation and compilation guide: https://github.com/AscendTransport/Mooncake/tree/pooling-async-memcpy?tab=readme-ov-file#build-and-use-binaries
## Run Mooncake Master
### 1. Configure mooncake.json
Set the environment variable **MOONCAKE_CONFIG_PATH** to the full path of mooncake.json.
```
{
"local_hostname": "xx.xx.xx.xx",
"metadata_server": "P2PHANDSHAKE",
"protocol": "ascend",
"device_name": "",
"master_server_address": "xx.xx.xx.xx:50088",
"global_segment_size": 30000000000
}
```
- **local_hostname**: the IP address of the current master node.
- **metadata_server**: set to **P2PHANDSHAKE**.
- **protocol**: set to `ascend` to use Mooncake's HCCL communication.
- **device_name**: leave empty (`""`).
- **master_server_address**: the IP and port of the master service.
- **global_segment_size**: the size of the kvcache that each PD node registers with the master.
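Before starting the master, it can help to sanity-check the file. The following is a small sketch; the field names are taken from the example above:
```python
# Sketch: verify mooncake.json parses and contains the fields described above.
import json
import os

path = os.environ["MOONCAKE_CONFIG_PATH"]
with open(path) as f:
    cfg = json.load(f)

required = ["local_hostname", "metadata_server", "protocol",
            "master_server_address", "global_segment_size"]
missing = [k for k in required if k not in cfg]
assert not missing, f"mooncake.json is missing fields: {missing}"
print("mooncake.json looks OK:", cfg["master_server_address"])
```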
### 2. Start mooncake_master
From the mooncake folder, run:
```
mooncake_master --port 50088
```
## Pooling and Prefill Decode Disaggregation Scenario
### 1. Run the `prefill` Node and `decode` Node
MultiConnector is used to combine the P2P connector and the pooling connector: P2P performs the KV transfer, while pooling provides a larger prefix cache.
`prefill` Node
```
bash multi_producer.sh
```
The content of the multi_producer.sh script:
```
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json"
export VLLM_USE_V1=1
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
export ASCEND_TRANSPORT_PRINT=1
export ACL_OP_INIT_MODE=1
# ASCEND_TRANSPORT_PRINT above enables Mooncake memory-transfer logging: 1 = on, 0 = off.
export ASCEND_AGGREGATE_ENABLE=1
# ASCEND_AGGREGATE_ENABLE above toggles the Mooncake aggregation function: 1 = on, 0 = off.
python3 -m vllm.entrypoints.openai.api_server \
--model /xxxxx/Qwen2.5-7B-Instruct \
--port 8100 \
--trust-remote-code \
--enforce-eager \
--no_enable_prefix_caching \
--tensor-parallel-size 1 \
--data-parallel-size 1 \
--max-model-len 10000 \
--block-size 128 \
--max-num-batched-tokens 4096 \
--kv-transfer-config \
'{
"kv_connector": "MultiConnector",
"kv_role": "kv_producer",
"kv_connector_extra_config": {
"use_layerwise": false,
"connectors": [
{
"kv_connector": "MooncakeConnectorV1",
"kv_role": "kv_producer",
"kv_port": "20001",
"kv_connector_extra_config": {
"prefill": {
"dp_size": 1,
"tp_size": 1
},
"decode": {
"dp_size": 1,
"tp_size": 1
}
}
},
{
"kv_connector": "MooncakeConnectorStoreV1",
"kv_role": "kv_producer",
"mooncake_rpc_port":"0"
}
]
}
}' > p.log 2>&1
```
`decode` Node
```
bash multi_consumer.sh
```
The content of multi_consumer.sh:
```
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
export MOONCAKE_CONFIG_PATH="/xxxxx/mooncake.json"
export VLLM_USE_V1=1
export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7
export ACL_OP_INIT_MODE=1
export ASCEND_TRANSPORT_PRINT=1
# ASCEND_TRANSPORT_PRINT above enables Mooncake memory-transfer logging: 1 = on, 0 = off.
export ASCEND_AGGREGATE_ENABLE=1
# ASCEND_AGGREGATE_ENABLE above toggles the Mooncake aggregation function: 1 = on, 0 = off.
python3 -m vllm.entrypoints.openai.api_server \
--model /xxxxx/Qwen2.5-7B-Instruct \
--port 8200 \
--trust-remote-code \
--enforce-eager \
--no_enable_prefix_caching \
--tensor-parallel-size 1 \
--data-parallel-size 1 \
--max-model-len 10000 \
--block-size 128 \
--max-num-batched-tokens 4096 \
--kv-transfer-config \
'{
"kv_connector": "MultiConnector",
"kv_role": "kv_consumer",
"kv_connector_extra_config": {
"use_layerwise": false,
"connectors": [
{
"kv_connector": "MooncakeConnectorV1",
"kv_role": "kv_consumer",
"kv_port": "20002",
"kv_connector_extra_config": {
"prefill": {
"dp_size": 1,
"tp_size": 1
},
"decode": {
"dp_size": 1,
"tp_size": 1
}
}
},
{
"kv_connector": "MooncakeConnectorStoreV1",
"kv_role": "kv_consumer",
"mooncake_rpc_port":"1"
}
]
}
}' > d.log 2>&1
```
### 2. Start proxy_server
```
bash proxy.sh
```
The content of proxy.sh (replace localhost with your actual IP address):
```
python vllm-ascend/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py \
--host localhost \
--prefiller-hosts localhost \
--prefiller-ports 8100 \
--decoder-hosts localhost \
--decoder-ports 8200
```
### 3. Run Inference
Replace localhost, the port, and the model weight path in the command with your own settings.
Short question:
```
curl -s http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ "model": "/xxxxx/Qwen2.5-7B-Instruct", "prompt": "Hello. I have a question. The president of the United States is", "max_tokens": 200, "temperature":0.0 }'
```
Long question:
```
curl -s http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ "model": "/xxxxx/Qwen2.5-7B-Instruct", "prompt": "Given the accelerating impacts of climate change—including rising sea levels, increasing frequency of extreme weather events, loss of biodiversity, and adverse effects on agriculture and human health—there is an urgent need for a robust, globally coordinated response. However, international efforts are complicated by a range of factors: economic disparities between high-income and low-income countries, differing levels of industrialization, varying access to clean energy technologies, and divergent political systems that influence climate policy implementation. In this context, how can global agreements like the Paris Accord be redesigned or strengthened to not only encourage but effectively enforce emission reduction targets? Furthermore, what mechanisms can be introduced to promote fair and transparent technology transfer, provide adequate financial support for climate adaptation in vulnerable regions, and hold nations accountable without exacerbating existing geopolitical tensions or disproportionately burdening those with historically lower emissions?", "max_tokens": 256, "temperature":0.0 }'
```
## Pooling and Mixed Deployment Scenario
### 1. Run the Mixed Deployment Script
The mixed deployment script is essentially a pure pooling setup on the P node.
```
bash mixed_department.sh
```
Content of mixed_department.sh:
```
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json"
export VLLM_USE_V1=1
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
export ACL_OP_INIT_MODE=1
export ASCEND_TRANSPORT_PRINT=1
# ASCEND_TRANSPORT_PRINT above enables Mooncake memory-transfer logging: 1 = on, 0 = off.
export ASCEND_AGGREGATE_ENABLE=1
# ASCEND_AGGREGATE_ENABLE above toggles the Mooncake aggregation function: 1 = on, 0 = off.
python3 -m vllm.entrypoints.openai.api_server \
--model /xxxxx/Qwen2.5-7B-Instruct \
--port 8100 \
--trust-remote-code \
--enforce-eager \
--no_enable_prefix_caching \
--tensor-parallel-size 1 \
--data-parallel-size 1 \
--max-model-len 10000 \
--block-size 128 \
--max-num-batched-tokens 4096 \
--kv-transfer-config \
'{
"kv_connector": "MooncakeConnectorStoreV1",
"kv_role": "kv_both",
"kv_connector_extra_config": {
"use_layerwise": false,
"mooncake_rpc_port":"0"
}
}' > mix.log 2>&1
```
### 2. Run Inference
Replace localhost, the port, and the model weight path in the command with your own settings. Requests go directly to the port served by the mixed deployment script; there is no need to start a separate proxy.
Short question:
```
curl -s http://localhost:8100/v1/completions -H "Content-Type: application/json" -d '{ "model": "/xxxxx/Qwen2.5-7B-Instruct", "prompt": "Hello. I have a question. The president of the United States is", "max_tokens": 200, "temperature":0.0 }'
```
Long question:
```
curl -s http://localhost:8100/v1/completions -H "Content-Type: application/json" -d '{ "model": "/xxxxx/Qwen2.5-7B-Instruct", "prompt": "Given the accelerating impacts of climate change—including rising sea levels, increasing frequency of extreme weather events, loss of biodiversity, and adverse effects on agriculture and human health—there is an urgent need for a robust, globally coordinated response. However, international efforts are complicated by a range of factors: economic disparities between high-income and low-income countries, differing levels of industrialization, varying access to clean energy technologies, and divergent political systems that influence climate policy implementation. In this context, how can global agreements like the Paris Accord be redesigned or strengthened to not only encourage but effectively enforce emission reduction targets? Furthermore, what mechanisms can be introduced to promote fair and transparent technology transfer, provide adequate financial support for climate adaptation in vulnerable regions, and hold nations accountable without exacerbating existing geopolitical tensions or disproportionately burdening those with historically lower emissions?", "max_tokens": 256, "temperature":0.0 }'
```

View File

@@ -43,4 +43,4 @@ vllm serve model_path \
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
}' \
--additional-config \
'{"ascend_scheduler_config": {"enabled": true}, "torchair_graph_config":{"enabled":true,"enable_kv_nz":false, "enable_multistream_moe":false, "graph_batch_size":[28]}, "enable_weight_nz_layout":true}'
'{"ascend_scheduler_config": {"enabled": true}, "torchair_graph_config":{"enabled":true,"enable_kv_nz":false, "graph_batch_size":[28]}, "enable_weight_nz_layout":true, "enable_multistream_moe":false}'

View File

@@ -79,7 +79,7 @@ def run_prefill(prefill_done, process_close):
def run_decode(prefill_done):
os.environ['VLLM_LLMDD_RPC_PORT'] = '6634'
os.environ['VLLM_ASCEND_LLMDD_RPC_PORT'] = '6634'
# ranktable.json needs be generated using gen_ranktable.sh
# from the examples/disaggregated_prefill_v1 module in the main branch.
os.environ['DISAGGREGATED_PREFILL_RANK_TABLE_PATH'] = "./ranktable.json"

View File

@@ -0,0 +1,326 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
# Adapted from vllm-project/vllm/examples/offline_inference/data_parallel.py
# Note: This script is designed to run with e2e test,
# please be careful to modify it.
"""
Usage:
Single node:
Dense models:
python examples/offline_weight_load.py \
--model="Qwen/Qwen2.5-0.5B-Instruct" \
--tp-size=1 \
--proc-per-node=2
MOE models:
python examples/offline_weight_load.py \
--model="Qwen/Qwen3-30B-A3B" \
--tp-size=2 \
--proc-per-node=2 \
--enable-expert-parallel
Multi-node:
Node 0 (assume the node has ip of 10.99.48.128):
python examples/offline_weight_load.py \
--model="Qwen/Qwen3-30B-A3B" \
--tp-size=2 \
--node-size=2 \
--node-rank=0 \
--proc-per-node=2 \
--enable-expert-parallel \
--master-addr=10.99.48.128 \
--master-port=13345
Node 1:
python examples/offline_weight_load.py \
--model="Qwen/Qwen3-30B-A3B" \
--tp-size=2 \
--node-size=2 \
--node-rank=1 \
--enable-expert-parallel \
--master-addr=10.99.48.128 \
--master-port=13345
"""
import argparse
import contextlib
import gc
import os
from multiprocessing import Process
from time import sleep
import torch
from vllm import LLM, SamplingParams
from vllm.distributed.parallel_state import ( # noqa E402
destroy_distributed_environment, destroy_model_parallel, get_tp_group)
from vllm.utils import get_open_port, GiB_bytes
from safetensors.torch import load_file
os.environ["VLLM_USE_MODELSCOPE"] = "True"
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
def patch_vllm_moe_model_weight_loader(model):
    # Define MLP attribute mapping for different model types
    model = getattr(model, "model", None) or getattr(model, "language_model", None)
    if model is None:
        raise ValueError("The provided model does not have a valid 'model' or 'language_model' attribute.")
    for layer in model.layers:
        mlp_attr = "mlp"
        mlp = getattr(layer, mlp_attr)
        param_dict = dict(mlp.named_parameters())
        for name, param in param_dict.items():
            if "w13_weight" in name or "w2_weight" in name:
                param.weight_loader = mlp.experts.weight_loader

def load_and_merge_safetensors(directory):
    merged_dict = {}
    if not os.path.isdir(directory):
        raise ValueError(f"directory does not exist: {directory}")
    for filename in os.listdir(directory):
        if filename.endswith('.safetensors'):
            file_path = os.path.join(directory, filename)
            print(f"loading file: {file_path}")
            f = load_file(file_path)
            merged_dict.update(f)
    return merged_dict

def parse_args():
    parser = argparse.ArgumentParser(description="External launcher Inference")
    parser.add_argument(
        "--model",
        type=str,
        default="Qwen/Qwen3-0.6B",
        help="Model name or path",
    )
    parser.add_argument("--tp-size",
                        type=int,
                        default=1,
                        help="Tensor parallel size")
    parser.add_argument("--node-size",
                        type=int,
                        default=1,
                        help="Total number of nodes")
    parser.add_argument("--node-rank",
                        type=int,
                        default=0,
                        help="Rank of the current node")
    parser.add_argument("--proc-per-node",
                        type=int,
                        default=1,
                        help="Number of processes per node")
    parser.add_argument("--master-addr",
                        type=str,
                        default="",
                        help="Master node IP address")
    parser.add_argument("--master-port",
                        type=int,
                        default=0,
                        help="Master node port")
    parser.add_argument("--enforce-eager",
                        action="store_true",
                        help="Enforce eager mode execution.")
    parser.add_argument("--trust-remote-code",
                        action="store_true",
                        help="Trust remote code.")
    parser.add_argument("--enable-expert-parallel",
                        action="store_true",
                        help="Enable expert parallel, used in MOE models.")
    parser.add_argument("--enable-sleep-mode",
                        action="store_true",
                        help="Enable sleep mode for the engine.")
    parser.add_argument("--temperature",
                        type=float,
                        default=0.8,
                        help="Float that controls the randomness of the sampling.")
    parser.add_argument("--model-weight-gib",
                        type=float,
                        default=None,
                        help="Model weight memory usage in GiB (e.g., 1.0 for 0.5B model).")
    args = parser.parse_args()

    if args.enable_sleep_mode:
        if args.model_weight_gib is None or args.temperature != 0:
            parser.error("model-weight-gib must be provided, and temperature must be zero when enable-sleep-mode is set.")
        if args.model_weight_gib <= 0:
            parser.error("model-weight-gib must be greater than 0 when enable-sleep-mode is set.")
        if args.model == parser.get_default("model") and args.model_weight_gib is None:
            parser.error("model-weight-gib must be provided for default model when enable-sleep-mode is set.")
    return args

def main(
    local_rank: int,
    rank: int,
    master_addr: str,
    master_port: int,
    model_weight_gib: float,
    model: str = "Qwen/Qwen3-30B-A3B",
    world_size: int = 4,
    tensor_parallel_size: int = 2,
    enable_expert_parallel: bool = False,
    enforce_eager: bool = True,
    trust_remote_code: bool = True,
    enable_sleep_mode: bool = False,
    temperature: float = 0.8,
):
    os.environ["MASTER_ADDR"] = master_addr
    os.environ["MASTER_PORT"] = str(master_port)
    os.environ["RANK"] = str(rank)
    os.environ["LOCAL_RANK"] = str(local_rank)
    os.environ["WORLD_SIZE"] = str(world_size)

    if not torch.distributed.is_initialized():
        torch.distributed.init_process_group(
            backend="cpu:gloo,npu:hccl",
            world_size=world_size,
            rank=rank,
        )

    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ] * 10
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=0.95,
        max_tokens=10,
    )
    llm = LLM(
        model=model,
        tensor_parallel_size=tensor_parallel_size,
        enable_expert_parallel=enable_expert_parallel,
        enforce_eager=enforce_eager,
        trust_remote_code=trust_remote_code,
        distributed_executor_backend="external_launcher",
        seed=0,
        gpu_memory_utilization=0.95,
        enable_sleep_mode=enable_sleep_mode,
    )
    model_path = model
    runmodel = llm.llm_engine.model_executor.driver_worker.worker.model_runner.model
    patch_vllm_moe_model_weight_loader(runmodel)
    sd = load_and_merge_safetensors(model_path)
    runmodel.load_weights(sd.items())
    print('load state dict done')

    tp_ranks = get_tp_group().ranks
    print(f'TP RANKS: {tp_ranks}')

    outputs = llm.generate(prompts, sampling_params)

    if enable_sleep_mode:
        if rank == 0:
            free_bytes_before_sleep, total = torch.npu.mem_get_info()
        llm.sleep(level=1)
        if rank == 0:
            free_bytes_after_sleep, total = torch.npu.mem_get_info()
            freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep
            print(f"Freed memory: {freed_bytes / 1024 ** 3:.2f} GiB")
            # now the freed memory should be larger than the model weights
            assert freed_bytes >= model_weight_gib / tensor_parallel_size * GiB_bytes
        llm.wake_up()
        outputs_after_wakeup = llm.generate(prompts, sampling_params)
        if rank == 0:
            # cmp output
            assert outputs[0].outputs[0].text == outputs_after_wakeup[0].outputs[0].text
            print("Sleep and wake up successfully!!")

    for i, output in enumerate(outputs):
        if i >= 5:
            # print only 5 outputs
            break
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Global rank: {rank}, Prompt: {prompt!r}, "
              f"Generated text: {generated_text!r}")

    # Give engines time to pause their processing loops before exiting.
    sleep(5)
    del llm
    cleanup_env_and_memory()

def cleanup_env_and_memory():
    destroy_model_parallel()
    destroy_distributed_environment()
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    gc.collect()
    torch.npu.empty_cache()
    torch.npu.reset_peak_memory_stats()

if __name__ == "__main__":
args = parse_args()
tp_size = args.tp_size
node_size = args.node_size
proc_per_node = args.proc_per_node
node_rank = args.node_rank
if node_size == 1:
master_addr = "127.0.0.1"
master_port = get_open_port()
else:
master_addr = args.master_addr
master_port = args.master_port
world_size = node_size * proc_per_node
procs = []
for local_rank, rank in enumerate(
range(proc_per_node * node_rank, proc_per_node * (node_rank + 1))):
proc = Process(target=main,
args=(
local_rank,
rank,
master_addr,
master_port,
args.model_weight_gib,
args.model,
world_size,
tp_size,
args.enable_expert_parallel,
args.enforce_eager,
args.trust_remote_code,
args.enable_sleep_mode,
args.temperature,
))
proc.start()
procs.append(proc)
exit_code = 0
for proc in procs:
proc.join(timeout=600)
if proc.exitcode is None:
print(
f"Killing process {proc.pid} that didn't stop within 30 minutes."
)
proc.kill()
exit_code = 1
elif proc.exitcode:
exit_code = proc.exitcode
exit(exit_code)

View File

@@ -29,4 +29,4 @@ vllm serve Qwen/Qwen1.5-MoE-A2.7B \
--gpu-memory-utilization 0.9 \
--trust-remote-code \
--enforce-eager \
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":false, "enable_multistream_moe":false, "use_cached_graph":false}}'
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":false, "use_cached_graph":false}}'

View File

@@ -5,7 +5,7 @@ openai
pytest >= 6.0
pytest-asyncio
pytest-mock
lm-eval==0.4.8
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
types-jsonschema
xgrammar
zmq

View File

@@ -14,7 +14,7 @@ _err() { _red "Error: $*" && exit 1; }
CURL_TIMEOUT=1
CURL_COOLDOWN=5
CURL_MAX_TRIES=180
CURL_MAX_TRIES=300
function wait_url_ready() {
local serve_name="$1"

View File

@@ -32,7 +32,14 @@ from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
BatchEncoding, BatchFeature)
from transformers.models.auto.auto_factory import _BaseAutoModelClass
from vllm import LLM, SamplingParams
from vllm.config import TaskOption, _get_and_verify_dtype
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.10.2"):
from vllm.config import TaskOption, _get_and_verify_dtype
else:
from vllm.config.model import TaskOption, _get_and_verify_dtype
from vllm.inputs import TextPrompt
from vllm.outputs import RequestOutput
from vllm.transformers_utils.utils import maybe_model_redirect

View File

@@ -57,8 +57,8 @@ function quickstart_online_test() {
}
_info "====> Start simple_test"
simple_test
time simple_test
_info "====> Start quickstart_offline_test"
quickstart_offline_test
time quickstart_offline_test
_info "====> Start quickstart_online_test"
quickstart_online_test
time quickstart_online_test

View File

@@ -59,4 +59,4 @@ function install_binary_test() {
}
_info "====> Start install_binary_test"
install_binary_test
time install_binary_test

View File

@@ -19,7 +19,12 @@
from typing import Dict, List, Optional, Sequence, Tuple, Union
from vllm.sequence import PromptLogprobs, SampleLogprobs
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.10.2"):
from vllm.sequence import PromptLogprobs, SampleLogprobs
else:
from vllm.logprobs import PromptLogprobs, SampleLogprobs
TokensText = Tuple[List[int], str]

View File

@@ -1,12 +1,16 @@
model_name: "deepseek-ai/DeepSeek-V2-Lite"
runner: "linux-aarch64-a2-2"
hardware: "Atlas A2 Series"
tasks:
- name: "gsm8k"
metrics:
- name: "exact_match,strict-match"
value: 0.375
value: 0.385
- name: "exact_match,flexible-extract"
value: 0.375
value: 0.385
tensor_parallel_size: 2
batch_size: 32
gpu_memory_utilization: 0.7
apply_chat_template: False
fewshot_as_multiturn: False
trust_remote_code: True

View File

@@ -1,4 +1,6 @@
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
runner: "linux-aarch64-a2-1"
hardware: "Atlas A2 Series"
model: "vllm-vlm"
tasks:
- name: "mmmu_val"

View File

@@ -1,4 +1,6 @@
model_name: "Qwen/Qwen3-30B-A3B"
runner: "linux-aarch64-a2-2"
hardware: "Atlas A2 Series"
tasks:
- name: "gsm8k"
metrics:

View File

@@ -1,4 +1,6 @@
model_name: "Qwen/Qwen3-8B-Base"
runner: "linux-aarch64-a2-1"
hardware: "Atlas A2 Series"
tasks:
- name: "gsm8k"
metrics:

View File

@@ -1,3 +1,4 @@
DeepSeek-V2-Lite.yaml
Qwen3-8B-Base.yaml
Qwen2.5-VL-7B-Instruct.yaml
Qwen3-30B-A3B.yaml

View File

@@ -2,16 +2,28 @@
- **vLLM Version**: vLLM: {{ vllm_version }} ([{{ vllm_commit[:7] }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})), **vLLM Ascend Version**: {{ vllm_ascend_version }} ([{{ vllm_ascend_commit[:7] }}](https://github.com/vllm-project/vllm-ascend/commit/{{ vllm_ascend_commit }}))
- **Software Environment**: **CANN**: {{ cann_version }}, **PyTorch**: {{ torch_version }}, **torch-npu**: {{ torch_npu_version }}
- **Hardware Environment**: Atlas A2 Series
- **Hardware Environment**: {{ hardware }}
- **Parallel mode**: {{ parallel_mode }}
- **Execution mode**: ACLGraph
- **Execution mode**: {{ execution_model }}
**Command**:
```bash
export MODEL_ARGS={{ model_args }}
lm_eval --model {{ model_type }} --model_args $MODEL_ARGS --tasks {{ datasets }} \
{% if apply_chat_template %} --apply_chat_template {{ apply_chat_template }} {% endif %} {% if fewshot_as_multiturn %} --fewshot_as_multiturn {{ fewshot_as_multiturn }} {% endif %} {% if num_fewshot is defined and num_fewshot != "N/A" %} --num_fewshot {{ num_fewshot }} {% endif %} {% if limit is defined and limit != "N/A" %} --limit {{ limit }} {% endif %} --batch_size {{ batch_size}}
{% if apply_chat_template is defined and (apply_chat_template|string|lower in ["true", "1"]) -%}
--apply_chat_template \
{%- endif %}
{% if fewshot_as_multiturn is defined and (fewshot_as_multiturn|string|lower in ["true", "1"]) -%}
--fewshot_as_multiturn \
{%- endif %}
{% if num_fewshot is defined and num_fewshot != "N/A" -%}
--num_fewshot {{ num_fewshot }} \
{%- endif %}
{% if limit is defined and limit != "N/A" -%}
--limit {{ limit }} \
{%- endif %}
--batch_size {{ batch_size }}
```
| Task | Metric | Value | Stderr |

View File

@@ -69,6 +69,8 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
if model_args.get('enable_expert_parallel', False):
parallel_mode += " + EP"
execution_model = f"{'Eager' if model_args.get('enforce_eager', False) else 'ACLGraph'}"
report_content = template.render(
vllm_version=env_config.vllm_version,
vllm_commit=env_config.vllm_commit,
@@ -77,6 +79,7 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
cann_version=env_config.cann_version,
torch_version=env_config.torch_version,
torch_npu_version=env_config.torch_npu_version,
hardware=eval_config.get("hardware", "unknown"),
model_name=eval_config["model_name"],
model_args=f"'{','.join(f'{k}={v}' for k, v in model_args.items())}'",
model_type=eval_config.get("model", "vllm"),
@@ -84,10 +87,11 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
apply_chat_template=eval_config.get("apply_chat_template", True),
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", True),
limit=eval_config.get("limit", "N/A"),
batch_size="auto",
batch_size=eval_config.get("batch_size", "auto"),
num_fewshot=eval_config.get("num_fewshot", "N/A"),
rows=report_data["rows"],
parallel_mode=parallel_mode)
parallel_mode=parallel_mode,
execution_model=execution_model)
report_output = os.path.join(
report_dir, f"{os.path.basename(eval_config['model_name'])}.md")
@@ -110,7 +114,7 @@ def test_lm_eval_correctness_param(config_filename, tp_size, report_dir,
"apply_chat_template": eval_config.get("apply_chat_template", True),
"fewshot_as_multiturn": eval_config.get("fewshot_as_multiturn", True),
"limit": eval_config.get("limit", None),
"batch_size": "auto",
"batch_size": eval_config.get("batch_size", "auto"),
}
for s in ["num_fewshot", "fewshot_as_multiturn", "apply_chat_template"]:
val = eval_config.get(s, None)

View File

@@ -14,14 +14,24 @@ def test_e2e_ep_correctness(model_name):
]
max_tokens = 5
with VllmRunner(model_name, tensor_parallel_size=2,
enforce_eager=True) as vllm_model:
# FIXME: Really strange that chunked prefill might lead to different results, investigate further
with VllmRunner(
model_name,
tensor_parallel_size=2,
additional_config={"ascend_scheduler_config": {
"enabled": True
}},
enforce_eager=True) as vllm_model:
tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
with VllmRunner(model_name,
tensor_parallel_size=2,
enable_expert_parallel=True,
enforce_eager=True) as vllm_model:
with VllmRunner(
model_name,
tensor_parallel_size=2,
enable_expert_parallel=True,
additional_config={"ascend_scheduler_config": {
"enabled": True
}},
enforce_eager=True) as vllm_model:
ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
check_outputs_equal(

View File

@@ -23,6 +23,7 @@ Run `pytest tests/test_offline_inference.py`.
import os
from unittest.mock import patch
import pytest
from modelscope import snapshot_download # type: ignore
from vllm import SamplingParams
@@ -30,6 +31,15 @@ from tests.e2e.conftest import VllmRunner
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
QWEN_DENSE_MODELS = [
"vllm-ascend/Qwen3-8B-W8A8", "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"
]
DEEPSEEK_W4A8_MODELS = [
"vllm-ascend/DeepSeek-V3-W4A8-Pruing",
"vllm-ascend/DeepSeek-V3.1-W4A8-puring"
]
def test_models_distributed_QwQ():
example_prompts = [
@@ -61,8 +71,8 @@ def test_models_distributed_DeepSeek_multistream_moe():
additional_config={
"torchair_graph_config": {
"enabled": True,
"enable_multistream_moe": True,
},
"enable_multistream_moe": True,
"ascend_scheduler_config": {
"enabled": True,
},
@@ -104,14 +114,15 @@ def test_models_distributed_Qwen3_W4A8DYNAMIC():
vllm_model.generate_greedy(example_prompts, max_tokens)
@pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_MLA_PA": "1"})
def test_models_distributed_DeepSeek_W4A8DYNAMIC():
def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
prompts = [
"Hello, my name is",
]
max_tokens = 5
with VllmRunner(
snapshot_download("vllm-ascend/DeepSeek-V3-W4A8-Pruing"),
snapshot_download(model),
dtype="auto",
tensor_parallel_size=2,
quantization="ascend",
@@ -150,3 +161,46 @@ def test_sp_for_qwen3_moe() -> None:
enable_expert_parallel=True,
enforce_eager=True) as vllm_model:
vllm_model.generate(example_prompts, sampling_params)
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1"})
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM": "1"})
def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model, enforce_eager):
example_prompts = [
"Hello, my name is",
]
max_tokens = 5
with VllmRunner(
snapshot_download(model),
max_model_len=8192,
enforce_eager=enforce_eager,
dtype="auto",
tensor_parallel_size=2,
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)
@pytest.mark.parametrize("enforce_eager", [True, False])
@pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1"})
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_PREFETCH_MLP": "1"})
def test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight(
model, enforce_eager):
example_prompts = [
"Hello, my name is",
]
max_tokens = 5
with VllmRunner(
snapshot_download(model),
max_model_len=8192,
enforce_eager=enforce_eager,
dtype="auto",
tensor_parallel_size=2,
quantization="ascend",
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

View File

@@ -116,20 +116,22 @@ def test_prefix_cache_with_ascend_scheduler(model: str,
prefix_cache_output = vllm_model.generate_greedy(
INPUT_PROMPTS, max_tokens)
with VllmRunner(model,
additional_config={
'ascend_scheduler_config': {
'enabled': True,
'enable_prefix_caching': True,
"enable_chunked_prefill": True,
},
},
enforce_eager=True,
max_model_len=2048,
tensor_parallel_size=2,
gpu_memory_utilization=0.7) as vllm_model:
chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
INPUT_PROMPTS, max_tokens)
# TODO: enabling APC and chunked prefill with the ascend scheduler leads to accuracy problems.
# Disabled for now. Fix it or drop the ascend scheduler in the future.
# with VllmRunner(model,
# additional_config={
# 'ascend_scheduler_config': {
# 'enabled': True,
# 'enable_prefix_caching': True,
# "enable_chunked_prefill": True,
# },
# },
# enforce_eager=True,
# max_model_len=2048,
# tensor_parallel_size=2,
# gpu_memory_utilization=0.7) as vllm_model:
# chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
# INPUT_PROMPTS, max_tokens)
check_outputs_equal(
outputs_0_lst=vllm_output,
@@ -138,9 +140,9 @@ def test_prefix_cache_with_ascend_scheduler(model: str,
name_1="prefix_cache_output",
)
check_outputs_equal(
outputs_0_lst=chunk_prefill_prefix_cache_output,
outputs_1_lst=prefix_cache_output,
name_0="chunk_prefill_prefix_cache_output",
name_1="prefix_cache_output",
)
# check_outputs_equal(
# outputs_0_lst=chunk_prefill_prefix_cache_output,
# outputs_1_lst=prefix_cache_output,
# name_0="chunk_prefill_prefix_cache_output",
# name_1="prefix_cache_output",
# )

View File

@@ -66,7 +66,6 @@ def test_models_distributed_Qwen3_MOE_W8A8():
max_model_len=8192,
tensor_parallel_size=2,
quantization="ascend",
enforce_eager=True,
) as vllm_model:
vllm_model.generate_greedy(example_prompts, max_tokens)

View File

@@ -22,6 +22,8 @@ Run `pytest tests/multicard/test_torchair_graph_mode.py`.
import os
from typing import Dict
import pytest
from tests.e2e.conftest import VllmRunner
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
@@ -153,6 +155,7 @@ def _pangu_torchair_test_fixture(
print(f"Generated text: {vllm_output[i][1]!r}")
@pytest.mark.skip("skipping test_e2e_pangu_with_torchair")
def test_e2e_pangu_with_torchair():
additional_config = {
"torchair_graph_config": {

View File

@@ -0,0 +1,188 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Exercise vLLM through the external launcher, with and without aclgraph.
Run `pytest tests/multicard/test_external_launcher.py`.
"""
import os
import subprocess
import sys
import pytest
import torch_npu
MOE_MODELS = ["Qwen/Qwen3-30B-A3B"]
MODELS = ["Qwen/Qwen3-8B"]
DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
@pytest.mark.parametrize("model", MOE_MODELS)
def test_external_launcher_eager(model):
    script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
env = os.environ.copy()
# TODO: Change to 2 when ci machine has 4 cards
cmd = [
sys.executable,
str(script),
"--model",
model,
"--tp-size",
"2",
"--proc-per-node",
"2",
"--trust-remote-code",
"--enforce-eager",
"--enable-expert-parallel",
"--enable-sleep-mode",
"--model-weight-gib",
"20",
]
print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(
cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=600,
)
output = proc.stdout.decode()
print(output)
assert "TP RANKS: [0]" in output
assert "TP RANKS: [1]" in output
assert "Generated text:" in output
assert proc.returncode == 0
@pytest.mark.parametrize("model", MOE_MODELS)
def test_external_launcher_aclgraph(model):
script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
env = os.environ.copy()
# TODO: Change to 2 when ci machine has 4 cards
cmd = [
sys.executable,
str(script),
"--model",
model,
"--tp-size",
"2",
"--proc-per-node",
"2",
"--trust-remote-code",
"--enable-expert-parallel",
"--enable-sleep-mode",
"--model-weight-gib",
"20",
]
print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(
cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=600,
)
output = proc.stdout.decode()
print(output)
assert "TP RANKS: [0]" in output
assert "TP RANKS: [1]" in output
assert "Generated text:" in output
assert proc.returncode == 0
@pytest.mark.parametrize("model", MODELS)
def test_external_launcher_dense(model):
script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
env = os.environ.copy()
# TODO: Change to 2 when ci machine has 4 cards
cmd = [
sys.executable,
str(script),
"--model",
model,
"--tp-size",
"2",
"--proc-per-node",
"2",
"--trust-remote-code",
"--enable-sleep-mode",
"--model-weight-gib",
"20",
]
print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(
cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=600,
)
output = proc.stdout.decode()
print(output)
assert "TP RANKS: [0]" in output
assert "TP RANKS: [1]" in output
assert "Generated text:" in output
assert proc.returncode == 0
@pytest.mark.parametrize("model", MODELS)
def test_external_launcher_dense_eager(model):
script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
env = os.environ.copy()
# TODO: Change to 2 when ci machine has 4 cards
cmd = [
sys.executable,
str(script),
"--model",
model,
"--tp-size",
"2",
"--proc-per-node",
"2",
"--trust-remote-code",
"--enforce-eager",
"--enable-sleep-mode",
"--model-weight-gib",
"20",
]
print(f"Running subprocess: {' '.join(cmd)}")
proc = subprocess.run(
cmd,
env=env,
stdout=subprocess.PIPE,
stderr=subprocess.STDOUT,
timeout=600,
)
output = proc.stdout.decode()
print(output)
assert "TP RANKS: [0]" in output
assert "TP RANKS: [1]" in output
assert "Generated text:" in output
assert proc.returncode == 0
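The four launcher tests above repeat the same subprocess invocation and assertions; a hedged sketch of a shared helper they could delegate to (hypothetical `_run_launcher_script`, assuming the same script path, flags, and expected output markers):

import os
import subprocess
import sys
from typing import List

def _run_launcher_script(script: str, extra_args: List[str]) -> None:
    # Run the external-launcher script and check that both TP ranks
    # came up and produced text.
    cmd = [
        sys.executable, script, "--tp-size", "2", "--proc-per-node", "2",
        "--trust-remote-code", "--enable-sleep-mode",
        "--model-weight-gib", "20", *extra_args
    ]
    proc = subprocess.run(cmd,
                          env=os.environ.copy(),
                          stdout=subprocess.PIPE,
                          stderr=subprocess.STDOUT,
                          timeout=600)
    output = proc.stdout.decode()
    assert "TP RANKS: [0]" in output
    assert "TP RANKS: [1]" in output
    assert "Generated text:" in output
    assert proc.returncode == 0

With such a helper, test_external_launcher_eager would reduce to a single call like _run_launcher_script(script, ["--model", model, "--enforce-eager", "--enable-expert-parallel"]).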

View File

@@ -70,7 +70,7 @@ run_tests_for_model() {
# Start prefill instance
PREFILL_PORT=8001
BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=0 VLLM_LLMDD_RPC_PORT=5559 vllm serve $model_name \
BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=0 VLLM_ASCEND_LLMDD_RPC_PORT=5559 vllm serve $model_name \
--port $PREFILL_PORT \
--seed 1024 \
--enforce-eager \
@@ -90,7 +90,7 @@ run_tests_for_model() {
DECODE_PORT=8002
# Build the command with or without model-specific args
BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=1 VLLM_LLMDD_RPC_PORT=6000 vllm serve $model_name \
BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=1 VLLM_ASCEND_LLMDD_RPC_PORT=6000 vllm serve $model_name \
--port $DECODE_PORT \
--seed 1024 \
--enforce-eager \

View File

@@ -22,7 +22,6 @@ set -eo errexit
. $(dirname "$0")/common.sh
export VLLM_USE_MODELSCOPE=true
export VLLM_LOGGING_LEVEL=ERROR
_info "====> Start Quickstart test"
. "${SCRIPT_DIR}/doctests/001-quickstart-test.sh"

View File

@@ -33,8 +33,8 @@ def test_bgmv_expand():
y_npu = y.npu()
y_out = bgmv_expand_cpu_impl(x, w, indices, y, 0, 128)
y_out_npu = torch.ops._C.bgmv_expand(x_npu, w_npu, indices_npu, y_npu, 0,
128)
y_out_npu = torch.ops._C_ascend.bgmv_expand(x_npu, w_npu, indices_npu,
y_npu, 0, 128)
# Compare the results.
torch.testing.assert_close(y_out_npu.cpu(),

View File

@@ -33,7 +33,7 @@ def test_bgmv_shrink():
y_npu = y.npu()
y = bgmv_shrink_cpu_impl(x, w, indices, y, 0.5)
torch.ops._C.bgmv_shrink(x_npu, w_npu, indices_npu, y_npu, 0.5)
torch.ops._C_ascend.bgmv_shrink(x_npu, w_npu, indices_npu, y_npu, 0.5)
# Compare the results.
torch.testing.assert_close(y_npu.cpu(),

View File

@@ -28,12 +28,12 @@ import torch
import torch_npu
from vllm.model_executor.layers.activation import SiluAndMul
from vllm_ascend.ops.layers.experts_selector import select_experts
from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \
TokenDispatcherWithAllGather
from vllm_ascend.ops.moe.experts_selector import select_experts
from vllm_ascend.ops.moe.moe_mlp import unified_apply_mlp
from vllm_ascend.ops.moe.token_dispatcher import TokenDispatcherWithAllGather
NUM_EXPERTS = [8, 64]
EP_SIZE = [1, 4]
EP_SIZE = [1]
TOP_KS = [2, 6]
DEVICE = ["npu"]
@@ -115,19 +115,6 @@ def test_token_dispatcher_with_all_gather(
w1_local = w1
w2_local = w2
if ep_size > 1:
local_e = e // ep_size
e_ids = torch.arange(local_e * 0,
local_e * (0 + 1),
device=device,
dtype=torch.int32)
expert_map = torch.full((e, ), -1, device=device, dtype=torch.int32)
expert_map[e_ids] = torch.arange(local_e,
device=device,
dtype=torch.int32)
w1_local = w1[e_ids]
w2_local = w2[e_ids]
score = torch.softmax(score, dim=-1, dtype=dtype)
topk_weights, topk_ids = torch.topk(score, topk)
topk_ids = topk_ids.to(torch.int32)
@@ -179,6 +166,87 @@ def test_token_dispatcher_with_all_gather(
torch.npu.reset_peak_memory_stats()
@pytest.mark.parametrize("m", [1, 33, 64])
@pytest.mark.parametrize("n", [128, 1024, 2048])
@pytest.mark.parametrize("k", [128, 511, 1024])
@pytest.mark.parametrize("e", NUM_EXPERTS)
@pytest.mark.parametrize("topk", TOP_KS)
@pytest.mark.parametrize("ep_size", EP_SIZE)
@pytest.mark.parametrize("dtype", [torch.bfloat16])
@pytest.mark.parametrize("device", DEVICE)
def test_token_dispatcher_with_all_gather_quant(
m: int,
n: int,
k: int,
e: int,
topk: int,
ep_size: int,
dtype: torch.dtype,
device: str,
):
context_mock = MagicMock()
context_mock.fused_moe_state = 0
with patch("vllm_ascend.ops.moe.moe_mlp.get_forward_context",
return_value=context_mock):
a = torch.randn((m, k), device=device, dtype=dtype) / 10
w1 = torch.randn((e, k, 2 * n), device=device, dtype=torch.int8)
w1_scale = torch.empty((e, 2 * n), device=device, dtype=dtype)
w2 = torch.randn((e, n, k), device=device, dtype=torch.int8)
w2_scale = torch.empty((e, k), device=device, dtype=dtype)
score = torch.randn((m, e), device=device, dtype=dtype)
expert_map = None
local_e = e
score = torch.softmax(score, dim=-1, dtype=dtype)
topk_weights, topk_ids = torch.topk(score, topk)
topk_ids = topk_ids.to(torch.int32)
row_idx = (torch.arange(
0,
m * topk,
device=device,
dtype=torch.int32,
).view(topk, -1).permute(1, 0).contiguous())
dispatcher_kwargs = {
"num_experts": e,
"top_k": topk,
"num_local_experts": local_e,
}
dispatcher = TokenDispatcherWithAllGather(**dispatcher_kwargs)
apply_router_weight_on_input = False
dispatch_output = dispatcher.token_dispatch(
hidden_states=a,
topk_weights=topk_weights,
topk_ids=topk_ids,
row_idx=row_idx,
expert_map=expert_map,
apply_router_weight_on_input=apply_router_weight_on_input,
with_quant=True)
sorted_hidden_states = dispatch_output["hidden_states"]
group_list = dispatch_output["group_list"]
group_list_type = dispatch_output.get("group_list_type", 1)
dynamic_scale = dispatch_output["dynamic_scale"]
expert_output = unified_apply_mlp(hidden_states=sorted_hidden_states,
w1=w1,
w1_scale=w1_scale,
w2=w2,
w2_scale=w2_scale,
group_list=group_list,
group_list_type=group_list_type,
dynamic_scale=dynamic_scale,
with_quant=True)
combined_output = dispatcher.token_combine(hidden_states=expert_output,
bias=None)
assert combined_output.shape == (m, k)
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()
@pytest.mark.parametrize("m", [1, 33, 64])
@pytest.mark.parametrize("n", [128, 1024, 2048])
@pytest.mark.parametrize("e", NUM_EXPERTS)
@@ -222,7 +290,7 @@ def test_select_experts(
dtype=torch.int32)
custom_routing_function.return_value = (mock_weights, mock_ids)
with patch("vllm_ascend.ops.layers.experts_selector._native_grouped_topk"
with patch("vllm_ascend.ops.moe.experts_selector._native_grouped_topk"
) as mock_native_grouped_topk:
mock_native_grouped_topk.side_effect = lambda x, num_groups, k: torch.randn_like(
x)

View File

@@ -1,175 +0,0 @@
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
import gc
from types import SimpleNamespace
import pytest
import torch
from vllm.model_executor.layers.fused_moe.config import ( # isort: skip
FusedMoEConfig, FusedMoEParallelConfig)
from vllm_ascend.distributed.moe_comm_method import ( # isort: skip
AllGatherCommImpl, NativeAllGatherCommImpl)
@pytest.mark.parametrize("num_tokens", [16, 128])
@pytest.mark.parametrize("hidden_size", [64, 128])
@pytest.mark.parametrize("global_num_experts", [8, 16])
@pytest.mark.parametrize("num_local_experts", [4, 8])
@pytest.mark.parametrize("top_k_num", [2, 4])
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
@pytest.mark.parametrize("ep_rank", [0, 1])
@pytest.mark.parametrize("apply_a8_quantization", [False])
def test_all_gather_comm_impl(
num_tokens,
hidden_size,
global_num_experts,
num_local_experts,
top_k_num,
dtype,
ep_rank,
apply_a8_quantization,
mocker,
):
"""
Tests the AllGatherCommImpl against the NativeAllGatherCommImpl.
This test compares the outputs of the NPU-optimized AllGatherCommImpl
with a native PyTorch implementation (NativeAllGatherCommImpl) to ensure
correctness across various configurations.
"""
if top_k_num > global_num_experts:
pytest.skip("top_k_num cannot be greater than global_num_experts")
if num_local_experts > global_num_experts:
pytest.skip(
"num_local_experts cannot be greater than global_num_experts")
device = torch.device("npu")
# mock get_tensor_model_parallel_rank to return ep_rank
mocker.patch(
"vllm.model_executor.layers.fused_moe.config.get_tensor_model_parallel_rank",
return_value=ep_rank,
)
# make moe config
parallel_config = SimpleNamespace(
enable_expert_parallel=num_local_experts < global_num_experts)
moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make(
tp_size_=max(2, global_num_experts // num_local_experts),
dp_size_=1,
vllm_parallel_config=parallel_config,
)
moe_config = FusedMoEConfig(
num_experts=global_num_experts,
experts_per_token=top_k_num,
hidden_dim=hidden_size,
num_local_experts=num_local_experts,
moe_parallel_config=moe_parallel_config,
in_dtype=dtype,
quant_config=None, # No quantization in this test
max_num_tokens=num_tokens,
)
# Instantiate implementations
native_impl = NativeAllGatherCommImpl(moe_config)
all_gather_impl = AllGatherCommImpl(moe_config)
# --- Input Data ---
hidden_states = torch.randn(num_tokens,
hidden_size,
device=device,
dtype=dtype)
topk_ids = torch.randint(0,
global_num_experts, (num_tokens, top_k_num),
device=device,
dtype=torch.int32)
topk_weights = torch.rand(num_tokens, top_k_num, device=device).to(dtype)
topk_weights = torch.nn.functional.softmax(topk_weights, dim=1)
num_experts = global_num_experts
expert_map = None
if num_local_experts < global_num_experts:
# Create a map where some experts are local and some are not
expert_map = torch.full((global_num_experts, ), -1, device=device)
expert_map[ep_rank * num_local_experts:(ep_rank + 1) *
num_local_experts] = torch.arange(num_local_experts,
device=device)
num_experts = num_local_experts
# --- Run Native Implementation (Golden Reference) ---
native_hidden_states_out = hidden_states.clone()
(
native_permuted_hidden,
native_expert_tokens,
_,
_,
) = native_impl.permute(hidden_states, topk_ids, topk_weights, expert_map,
num_experts, apply_a8_quantization)
# Simulate MLP output
native_mlp_output = torch.randn_like(native_permuted_hidden)
native_impl.unpermute(native_mlp_output, native_hidden_states_out)
# --- Run AllGather Implementation ---
all_gather_hidden_states_out = hidden_states.clone()
(
all_gather_permuted_hidden,
all_gather_expert_tokens,
_,
_,
) = all_gather_impl.permute(hidden_states, topk_ids, topk_weights,
expert_map, num_experts, apply_a8_quantization)
# Use the same simulated MLP output for a fair comparison
all_gather_mlp_output = native_mlp_output.clone()
all_gather_impl.unpermute(all_gather_mlp_output,
all_gather_hidden_states_out)
# --- Assertions ---
# Define tolerance based on dtype
atol = 1e-3 if dtype == torch.float16 else 1e-2
rtol = 1e-3 if dtype == torch.float16 else 1e-2
# 1. Compare expert_tokens from pre_process
assert torch.allclose(native_expert_tokens.to(
all_gather_expert_tokens.device),
all_gather_expert_tokens,
atol=atol,
rtol=rtol), "Expert tokens do not match."
# 2. Compare permuted_hidden_states from pre_process
num_valid_tokens = native_expert_tokens.sum()
assert torch.allclose(native_permuted_hidden[:num_valid_tokens].to(
all_gather_permuted_hidden.device),
all_gather_permuted_hidden[:num_valid_tokens],
atol=atol,
rtol=rtol), "Permuted hidden states do not match."
# 3. Compare final hidden_states from post_process
assert torch.allclose(native_hidden_states_out.to(
all_gather_hidden_states_out.device),
all_gather_hidden_states_out,
atol=atol,
rtol=rtol), "Final hidden states do not match."
gc.collect()
torch.npu.empty_cache()
torch.npu.reset_peak_memory_stats()

View File

@@ -182,7 +182,7 @@ def test_rotary_embedding_quant_with_leading_dim(
)
ref_query, ref_key = rope.forward_native(positions, query, key)
query, key = torch.ops._C.rotary_embedding(
query, key = torch.ops._C_ascend.rotary_embedding(
positions,
query,
key,
@@ -239,7 +239,7 @@ class ModelwithRotaryEmbedding(nn.Module):
        # we simulate a simple attention layer to test whether it can be seamlessly captured into an aclgraph
qkv = self.qkv_proj(hidden_states)
q, k, v = qkv.chunk(3, dim=-1)
query, key = torch.ops._C.rotary_embedding(
query, key = torch.ops._C_ascend.rotary_embedding(
positions,
q,
k,
@@ -299,7 +299,7 @@ def test_capture_rotary_embedding_in_aclgraph(
# Validate if the rotary_embedding custom kernel is indeed inside the graph by
# string match
graph = str(gm.graph)
assert "_C.rotary_embedding" in graph
assert "_C_ascend.rotary_embedding" in graph
return gm
static_positions = torch.randint(0, max_position_embeddings,

View File

@@ -72,7 +72,7 @@ def test_get_masked_input_and_mask(
# Get custom op result
print("input_tensor:", input_tensor)
custom_masked_input, custom_mask = torch.ops._C.get_masked_input_and_mask(
custom_masked_input, custom_mask = torch.ops._C_ascend.get_masked_input_and_mask(
input_tensor, test_case["org_start"], test_case["org_end"],
test_case["padding"], test_case["added_start"], test_case["added_end"])

View File

@@ -1,14 +1,10 @@
from __future__ import annotations
import os
import pytest
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
@pytest.fixture
def sampling_config():
@@ -20,9 +16,10 @@ def model_name():
return "wemaster/deepseek_mtp_main_random_bf16"
def test_mtp_correctness(
def mtp_correctness(
sampling_config: SamplingParams,
model_name: str,
num_speculative_tokens: int,
):
example_prompts = [
"Hello, my name is",
@@ -38,7 +35,7 @@ def test_mtp_correctness(
tensor_parallel_size=1,
gpu_memory_utilization=0.7,
max_model_len=256,
enforce_eager=True) as ref_llm:
enforce_eager=False) as ref_llm:
ref_outputs = ref_llm.generate(example_prompts, sampling_config)
with VllmRunner(
@@ -50,9 +47,9 @@ def test_mtp_correctness(
enable_expert_parallel=True,
speculative_config={
"method": "deepseek_mtp",
"num_speculative_tokens": 1,
"num_speculative_tokens": num_speculative_tokens,
},
enforce_eager=True,
enforce_eager=False,
max_model_len=2000,
additional_config={"ascend_scheduler_config": {
"enabled": False
@@ -74,3 +71,18 @@ def test_mtp_correctness(
# Heuristic: expect at least 66% of the prompts to match exactly
# Upon failure, inspect the outputs to check for inaccuracy.
assert matches > int(0.66 * len(ref_outputs))
del spec_llm
def test_mtp1_correctness(
sampling_config: SamplingParams,
model_name: str,
):
mtp_correctness(sampling_config, model_name, 1)
def test_mtp2_correctness(
sampling_config: SamplingParams,
model_name: str,
):
mtp_correctness(sampling_config, model_name, 2)
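Splitting `mtp_correctness` out of the test and wrapping it twice is one way to vary `num_speculative_tokens`; an equivalent, more idiomatic pytest sketch would parametrize it directly (a judgment call, since named wrappers give more readable CI entries):

import pytest

@pytest.mark.parametrize("num_speculative_tokens", [1, 2])
def test_mtp_correctness(sampling_config, model_name,
                         num_speculative_tokens):
    # Delegates to the shared helper defined above.
    mtp_correctness(sampling_config, model_name, num_speculative_tokens)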

View File

@@ -1,14 +1,10 @@
from __future__ import annotations
import os
import pytest
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
@pytest.fixture
def sampling_config():

View File

@@ -99,7 +99,6 @@ def test_ngram_correctness(
assert matches > int(0.7 * len(ref_outputs))
@pytest.mark.skipif(True, reason="oom in CI, fix me")
@pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
def test_eagle_correctness(
test_prompts: list[list[dict[str, Any]]],
@@ -111,8 +110,6 @@ def test_eagle_correctness(
    The outputs of the original LLM and the speculative LLM should be
    the same when using eagle speculative decoding.
'''
if not use_eagle3:
pytest.skip("Not current support for the test.")
ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True)
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
@@ -121,7 +118,6 @@ def test_eagle_correctness(
spec_model_name = eagle3_model_name() if use_eagle3 else eagle_model_name()
with VllmRunner(
model_name,
trust_remote_code=True,
enable_chunked_prefill=True,
max_num_seqs=1,
max_num_batched_tokens=2048,

View File

@@ -1,6 +1,7 @@
# SPDX-License-Identifier: Apache-2.0
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
import pytest
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
@@ -86,3 +87,25 @@ def test_chunked_prefill_with_ascend_scheduler(
name_0="vllm_output",
name_1="chunked_prefill_output",
)
def test_async_scheduling() -> None:
prompts = [
"Hello, my name is",
"The president of the United States is",
"The capital of France is",
"The future of AI is",
] * 10
sampling_params = SamplingParams(temperature=0.2,
max_tokens=10,
stop_token_ids=None)
with VllmRunner(
"Qwen/Qwen2.5-0.5B-Instruct",
max_model_len=4096,
max_num_seqs=50,
dtype="bfloat16",
gpu_memory_utilization=0.9,
async_scheduling=True,
) as vllm_model:
vllm_model.generate(prompts, sampling_params=sampling_params)

View File

@@ -17,17 +17,23 @@
# limitations under the License.
#
import json
import os
from typing import Any, Dict
import jsonschema
import pytest
import regex as re
from vllm_ascend.utils import vllm_version_is
if vllm_version_is("0.10.2"):
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
else:
from vllm.sampling_params import SamplingParams, StructuredOutputsParams
from vllm.outputs import RequestOutput
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
from tests.e2e.conftest import VllmRunner
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
MODEL_NAME = "Qwen/Qwen3-0.6B"
GuidedDecodingBackend = ["xgrammar", "guidance", "outlines"]
@@ -84,16 +90,29 @@ def sample_json_schema():
@pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend)
def test_guided_json_completion(guided_decoding_backend: str,
sample_json_schema):
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=500,
guided_decoding=GuidedDecodingParams(json=sample_json_schema))
with VllmRunner(
MODEL_NAME,
seed=0,
guided_decoding_backend=guided_decoding_backend,
) as vllm_model:
runner_kwargs: Dict[str, Any] = {}
if vllm_version_is("0.10.2"):
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=500,
guided_decoding=GuidedDecodingParams(json=sample_json_schema))
runner_kwargs = {
"seed": 0,
"guided_decoding_backend": guided_decoding_backend,
}
else:
sampling_params = SamplingParams(
temperature=1.0,
max_tokens=500,
structured_outputs=StructuredOutputsParams(
json=sample_json_schema))
runner_kwargs = {
"seed": 0,
"structured_outputs_config": {
"backend": guided_decoding_backend
},
}
with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model:
prompts = [
f"Give an example JSON for an employee profile "
f"that fits this schema: {sample_json_schema}"
@@ -121,17 +140,29 @@ def test_guided_json_completion(guided_decoding_backend: str,
def test_guided_regex(guided_decoding_backend: str, sample_regex):
if guided_decoding_backend == "outlines":
pytest.skip("Outlines doesn't support regex-based guided decoding.")
runner_kwargs: Dict[str, Any] = {}
if vllm_version_is("0.10.2"):
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
guided_decoding=GuidedDecodingParams(regex=sample_regex))
runner_kwargs = {
"seed": 0,
"guided_decoding_backend": guided_decoding_backend,
}
else:
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
structured_outputs=StructuredOutputsParams(regex=sample_regex))
runner_kwargs = {
"seed": 0,
"structured_outputs_config": {
"backend": guided_decoding_backend
},
}
sampling_params = SamplingParams(
temperature=0.8,
top_p=0.95,
guided_decoding=GuidedDecodingParams(regex=sample_regex))
with VllmRunner(
MODEL_NAME,
seed=0,
guided_decoding_backend=guided_decoding_backend,
) as vllm_model:
with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model:
prompts = [
f"Give an example IPv4 address with this regex: {sample_regex}"
] * 2
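The `vllm_version_is("0.10.2")` branch recurs in each structured-output test; a hedged sketch of a compatibility helper that centralizes it (hypothetical `build_structured_kwargs`, assuming only the two vLLM APIs shown above):

from typing import Any, Dict, Tuple

from vllm_ascend.utils import vllm_version_is

def build_structured_kwargs(backend: str,
                            **constraints: Any) -> Tuple[Dict, Dict]:
    """Return (sampling_params_kwargs, runner_kwargs) for either API."""
    if vllm_version_is("0.10.2"):
        from vllm.sampling_params import GuidedDecodingParams
        return ({"guided_decoding": GuidedDecodingParams(**constraints)},
                {"seed": 0, "guided_decoding_backend": backend})
    from vllm.sampling_params import StructuredOutputsParams
    return ({"structured_outputs": StructuredOutputsParams(**constraints)},
            {"seed": 0,
             "structured_outputs_config": {"backend": backend}})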

View File

@@ -0,0 +1,103 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
"""
Compare the outputs of vLLM with multistream_overlap_shared_expert
enabled and disabled.
Run `pytest tests/e2e/singlecard/test_multistream_overlap_shared_expert.py`.
"""
import pytest
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
from tests.e2e.model_utils import check_outputs_equal
MODELS = [
"Qwen/Qwen3-0.6B",
]
@pytest.mark.parametrize("model", MODELS)
@pytest.mark.parametrize("max_tokens", [32])
def test_models_with_multistream_overlap_shared_expert(
model: str,
max_tokens: int,
) -> None:
prompts = [
"Hello, my name is", "The president of the United States is",
"The capital of France is", "The future of AI is"
]
sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=True,
additional_config={
"multistream_overlap_shared_expert": True,
},
) as runner:
vllm_moe_ms_eager_outputs = runner.model.generate(
prompts, sampling_params)
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=False,
additional_config={
"multistream_overlap_shared_expert": True,
},
) as runner:
vllm_moe_ms_aclgraph_outputs = runner.model.generate(
prompts, sampling_params)
with VllmRunner(
model,
max_model_len=1024,
enforce_eager=True,
) as runner:
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
vllm_moe_ms_eager_outputs_list = []
for output in vllm_moe_ms_eager_outputs:
vllm_moe_ms_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_moe_ms_aclgraph_outputs_list = []
for output in vllm_moe_ms_aclgraph_outputs:
vllm_moe_ms_aclgraph_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
vllm_eager_outputs_list = []
for output in vllm_eager_outputs:
vllm_eager_outputs_list.append(
(output.outputs[0].index, output.outputs[0].text))
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,
outputs_1_lst=vllm_moe_ms_eager_outputs_list,
name_0="vllm_eager_outputs",
name_1="vllm_moe_ms_eager_outputs",
)
check_outputs_equal(
outputs_0_lst=vllm_eager_outputs_list,
outputs_1_lst=vllm_moe_ms_aclgraph_outputs_list,
name_0="vllm_eager_outputs",
name_1="vllm_moe_ms_aclgraph_outputs",
)
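The three list-building loops above perform the same transformation; a hedged one-liner helper (hypothetical `to_text_pairs`) would express it once:

def to_text_pairs(outputs):
    # Equivalent to each of the three loops above.
    return [(o.outputs[0].index, o.outputs[0].text) for o in outputs]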

View File

@@ -20,19 +20,14 @@
Run `pytest tests/test_offline_inference.py`.
"""
import os
import pytest
from vllm import SamplingParams
from vllm.assets.audio import AudioAsset
from vllm.assets.image import ImageAsset
from tests.e2e.conftest import VllmRunner
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
@pytest.mark.skip(reason="fix me")
def test_multimodal_vl(prompt_template):
image = ImageAsset("cherry_blossom") \
.pil_image.convert("RGB")
@@ -52,9 +47,12 @@ def test_multimodal_vl(prompt_template):
"fps": 1,
},
enforce_eager=True) as vllm_model:
vllm_model.generate_greedy(prompts=prompts,
images=images,
max_tokens=64)
outputs = vllm_model.generate_greedy(prompts=prompts,
images=images,
max_tokens=64)
assert len(outputs) == len(prompts)
for _, output_str in outputs:
assert output_str, "Generated output should not be empty."
def test_multimodal_audio():
@@ -86,4 +84,7 @@ def test_multimodal_audio():
dtype="bfloat16",
limit_mm_per_prompt={"audio": 2},
gpu_memory_utilization=0.9) as runner:
runner.generate(inputs, sampling_params=sampling_params)
outputs = runner.generate(inputs, sampling_params=sampling_params)
assert outputs is not None, "Generated outputs should not be None."
assert len(outputs) > 0, "Generated outputs should not be empty."

View File

@@ -0,0 +1,36 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# This file is a part of the vllm-ascend project.
# Adapted from vllm/tests/entrypoints/llm/test_guided_generate.py
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
from vllm import SamplingParams
from tests.e2e.conftest import VllmRunner
def test_models_topk() -> None:
example_prompts = [
"The capital of France is",
]
sampling_params = SamplingParams(max_tokens=10,
temperature=0.0,
top_k=10,
top_p=0.9)
with VllmRunner("Qwen/Qwen3-0.6B",
max_model_len=4096,
gpu_memory_utilization=0.7) as runner:
runner.generate(example_prompts, sampling_params)

View File

@@ -0,0 +1,2 @@
# Base docker image used to build the vllm-ascend e2e test image; that image is built in the vLLM repository
BASE_IMAGE_NAME="quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11"

View File

@@ -7,8 +7,7 @@ from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend,
AscendAttentionBackendImpl,
AscendAttentionMetadataBuilder,
AscendAttentionState,
AscendMetadata,
CommonAttentionState)
AscendMetadata)
from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
@@ -25,10 +24,6 @@ class TestAscendAttentionBackend(TestBase):
self.assertEqual(AscendAttentionBackend.get_metadata_cls(),
AscendMetadata)
def test_get_state_cls(self):
self.assertEqual(AscendAttentionBackend.get_state_cls(),
CommonAttentionState)
def test_get_builder_cls(self):
self.assertEqual(AscendAttentionBackend.get_builder_cls(),
AscendAttentionMetadataBuilder)
@@ -72,7 +67,8 @@ class TestAscendAttentionMetadataBuilder(TestBase):
self.mock_vllm_config.model_config.max_model_len = 640
self.mock_vllm_config.cache_config.block_size = 64
self.mock_device = 'cpu:0'
self.builder = AscendAttentionMetadataBuilder(self.mock_vllm_config,
self.builder = AscendAttentionMetadataBuilder(None, None,
self.mock_vllm_config,
self.mock_device)
def test_reorder_batch(self):
@@ -100,19 +96,21 @@ class TestAscendAttentionMetadataBuilder(TestBase):
max_query_len=5,
decode_token_per_req=torch.tensor([1, 1]),
block_table_tensor=torch.zeros((10, 10)),
slot_mapping_cpu=torch.tensor(range(20)),
slot_mapping=torch.tensor(range(20)),
actual_seq_lengths_q=torch.tensor([0, 1]),
positions=torch.tensor([10, 10]),
attn_mask=torch.ones((10, 10)),
spec_attn_mask=None,
attn_state=AscendAttentionState.PrefillNoCache)
attn_state=AscendAttentionState.PrefillNoCache,
num_computed_tokens_cpu=None,
seq_lens=None)
mock_nz_tensor = MagicMock()
mock_model = MagicMock()
mock_nd_to_nz_2d.return_value = mock_nz_tensor
mock_npu_format_cast.return_value = mock_nz_tensor
self.builder.build(common_attn_metadata, mock_model)
self.builder.build(1, common_attn_metadata, mock_model)
@patch('vllm_ascend.attention.attention_v1.AscendMetadata')
@patch('torch_npu.npu_format_cast')
@@ -131,12 +129,14 @@ class TestAscendAttentionMetadataBuilder(TestBase):
max_query_len=6,
decode_token_per_req=torch.tensor([1, 1, 1]),
block_table_tensor=torch.zeros((10, 10)),
slot_mapping_cpu=torch.tensor(range(20)),
slot_mapping=torch.tensor(range(20)),
actual_seq_lengths_q=torch.tensor([0, 1, 2]),
positions=torch.tensor([10, 10]),
attn_mask=torch.ones((15, 15)),
spec_attn_mask=None,
attn_state=AscendAttentionState.ChunkedPrefill)
attn_state=AscendAttentionState.ChunkedPrefill,
num_computed_tokens_cpu=None,
seq_lens=None)
mock_ascend_attention_state = MagicMock()
mock_ascend_attention_state.PrefillNoCache = 0
@@ -146,7 +146,7 @@ class TestAscendAttentionMetadataBuilder(TestBase):
mock_nd_to_nz_spec.return_value = mock_nz_tensor
mock_npu_format_cast.return_value = mock_nz_tensor
self.builder.build(common_attn_metadata, mock_model)
self.builder.build(1, common_attn_metadata, mock_model)
@patch('vllm_ascend.attention.attention_v1.AscendMetadata')
@patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False)
@@ -160,15 +160,17 @@ class TestAscendAttentionMetadataBuilder(TestBase):
max_query_len=6,
decode_token_per_req=torch.tensor([1, 1, 1]),
block_table_tensor=torch.zeros((10, 10)),
slot_mapping_cpu=torch.tensor(range(20)),
slot_mapping=torch.tensor(range(20)),
actual_seq_lengths_q=torch.tensor([0, 1, 2]),
positions=torch.tensor([10, 10]),
attn_mask=torch.ones((15, 15)),
spec_attn_mask=None,
attn_state=AscendAttentionState.ChunkedPrefill)
attn_state=AscendAttentionState.ChunkedPrefill,
num_computed_tokens_cpu=None,
seq_lens=None)
mock_model = MagicMock()
self.builder.build(common_attn_metadata, mock_model)
self.builder.build(1, common_attn_metadata, mock_model)
class TestAscendAttentionBackendImpl(TestBase):
@@ -341,36 +343,6 @@ class TestAscendAttentionBackendImpl(TestBase):
mock_flash_attention.assert_called_once()
assert output.shape == (10, 8 * 64)
@patch('torch_npu._npu_reshape_and_cache')
@patch('torch_npu._npu_flash_attention')
def test_forward_prefill_no_cache_swa(self, mock_flash_attention,
mock_reshape_cache):
"""Test forward pass in PrefillNoCache state"""
query = torch.randn(10, 8 * 64)
key = torch.randn(10, 8 * 64)
value = torch.randn(10, 8 * 64)
kv_cache = torch.empty(2, 5, 128, 8, 64)
metadata = self.attn_metadata
metadata.attn_state = AscendAttentionState.PrefillNoCache
metadata.attn_mask = torch.randn(1, 1, 10, 10)
metadata.seq_lens = torch.tensor([10])
metadata.num_actual_tokens = 10
metadata.slot_mapping = torch.zeros(10, dtype=torch.long)
layer = self.layer_no_quant
# layer.quant_method.apply.return_value = metadata
print(self.layer_no_quant._v_scale_float)
output = self.impl_swa.forward(layer,
query,
key,
value,
kv_cache,
metadata,
trace_flag=False)
mock_reshape_cache.assert_called_once()
mock_flash_attention.assert_called_once()
assert output.shape == (10, 8 * 64)
@patch('torch_npu._npu_reshape_and_cache')
@patch('torch_npu._npu_flash_attention_qlens')
def test_forward_prefill_cache_hit(self, mock_flash_attention_qlens,
@@ -401,10 +373,12 @@ class TestAscendAttentionBackendImpl(TestBase):
mock_flash_attention_qlens.assert_called_once()
assert output.shape == (10, 8 * 64)
@patch('vllm_ascend.attention.attention_v1.get_forward_context')
@patch('torch_npu._npu_reshape_and_cache')
@patch('torch_npu._npu_paged_attention')
def test_forward_decode_only(self, mock_paged_attention,
mock_npu_reshape_and_cache):
mock_npu_reshape_and_cache,
mock_get_forward_context):
"""Test forward pass in DecodeOnly state"""
query = torch.randn(10, 8 * 64)
key = torch.randn(10, 8 * 64)
@@ -418,6 +392,8 @@ class TestAscendAttentionBackendImpl(TestBase):
metadata.slot_mapping = torch.zeros(10, dtype=torch.long)
layer = self.layer_no_quant
mock_get_forward_context.return_value = MagicMock(capturing=False)
output = self.impl.forward(layer,
query,
key,
@@ -458,6 +434,44 @@ class TestAscendAttentionBackendImpl(TestBase):
mock_fused_infer_attention_score.assert_called_once()
assert output.shape == (10, 8 * 64)
@patch('vllm_ascend.attention.attention_v1.get_forward_context')
@patch('torch_npu._npu_reshape_and_cache')
@patch('torch_npu._npu_paged_attention')
@patch('torch_npu.npu_fused_infer_attention_score')
def test_forward_decode_only_swa_seq_len_mismatch(
self, mock_fused_infer_attention_score, mock_paged_attention,
mock_npu_reshape_and_cache, mock_get_forward_context):
"""Test forward pass in DecodeOnly state when seq)len_mismatch"""
query = torch.randn(10, 8 * 64)
key = torch.randn(10, 8 * 64)
value = torch.randn(10, 8 * 64)
kv_cache = torch.empty(2, 5, 128, 8, 64)
metadata = self.attn_metadata
metadata.attn_state = AscendAttentionState.DecodeOnly
metadata.seq_lens = torch.tensor([10]) # len == 1 != query.size(0)==10
metadata.block_tables = torch.zeros(1, 5, dtype=torch.long)
metadata.num_actual_tokens = 10
metadata.slot_mapping = torch.zeros(10, dtype=torch.long)
mock_fused_infer_attention_score.return_value = (torch.ones(10, 8,
64), 1)
mock_get_forward_context.return_value = MagicMock(capturing=False)
output = self.impl_swa.forward(self.layer_no_quant,
query,
key,
value,
kv_cache,
metadata,
trace_flag=False)
mock_paged_attention.assert_called_once()
mock_fused_infer_attention_score.assert_not_called()
assert output.shape == (10, 8 * 64)
@patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False)
@patch('torch_npu._npu_reshape_and_cache')
@patch('vllm_ascend.attention.attention_v1.vanilla_chunked_prefill')

View File

@@ -186,10 +186,39 @@ class TestAscendMLAMetadataBuilder(TestBase):
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
mock_vllm_config.speculative_config = None
ascend_config = MagicMock()
with patch("vllm_ascend.attention.mla_v1.get_ascend_config",
return_value=ascend_config):
builder = AscendMLAMetadataBuilder(mock_vllm_config, mock_device)
builder = AscendMLAMetadataBuilder(None, None, mock_vllm_config,
mock_device)
self.assertEqual(builder.block_size,
mock_vllm_config.cache_config.block_size)
self.assertEqual(
builder.chunked_prefill_enabled,
mock_vllm_config.scheduler_config.chunked_prefill_enabled)
def test_ascend_mla_metadata_builder_spec_decode(self):
mock_vllm_config = MagicMock()
mock_vllm_config.model_config.max_model_len = 1024
mock_vllm_config.model_config.get_head_size.return_value = 64
mock_vllm_config.model_config.dtype = torch.float16
mock_vllm_config.cache_config.block_size = 16
mock_vllm_config.scheduler_config.max_num_seqs = 4
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
mock_spec_config = MagicMock()
mock_spec_config.num_speculative_tokens = 3
mock_vllm_config.speculative_config = mock_spec_config
ascend_config = MagicMock()
with patch("vllm_ascend.attention.mla_v1.get_ascend_config",
return_value=ascend_config):
builder = AscendMLAMetadataBuilder(None, None, mock_vllm_config,
mock_device)
self.assertEqual(builder.block_size,
mock_vllm_config.cache_config.block_size)
@@ -207,9 +236,12 @@ class TestAscendMLAMetadataBuilder(TestBase):
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
mock_device = 'cpu'
mock_vllm_config.speculative_config = None
with patch("vllm_ascend.attention.mla_v1.get_ascend_config",
return_value=ascend_config):
builder = AscendMLAMetadataBuilder(mock_vllm_config, mock_device)
builder = AscendMLAMetadataBuilder(None, None, mock_vllm_config,
mock_device)
builder.decode_threshold = 1
input_batch = MagicMock()
@@ -522,7 +554,11 @@ class TestAscendMLAImpl(TestBase):
self.impl.num_kv_heads = self.impl.num_heads
decode_res, prefill_res = self.impl._mla_preprocess(
hidden_states, kv_cache, attn_metadata, need_gather_q_kv=False)
"mock_layer",
hidden_states,
kv_cache,
attn_metadata,
need_gather_q_kv=False)
self.assertIsNotNone(decode_res)
self.assertIsNotNone(prefill_res)

View File

@@ -0,0 +1,720 @@
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
#
from unittest.mock import MagicMock, Mock, patch
import torch
from vllm.compilation.cuda_graph import CUDAGraphOptions
from vllm.config import CUDAGraphMode, VllmConfig
from vllm.forward_context import BatchDescriptor, ForwardContext
from tests.ut.base import TestBase
from vllm_ascend.compilation.acl_graph import ACLGraphEntry, ACLGraphWrapper
class TestACLGraphEntry(TestBase):
def test_aclgraph_entry_initialization(self):
"""Test ACLGraphEntry initialization with default values"""
batch_descriptor = BatchDescriptor(
num_tokens=30,
uniform_decode=False,
)
entry = ACLGraphEntry(batch_descriptor=batch_descriptor)
self.assertEqual(entry.batch_descriptor, batch_descriptor)
self.assertIsNone(entry.aclgraph)
self.assertIsNone(entry.output)
self.assertIsNone(entry.input_addresses)
def test_aclgraph_entry_with_values(self):
"""Test ACLGraphEntry initialization with specified values"""
batch_descriptor = BatchDescriptor(
num_tokens=30,
uniform_decode=False,
)
mock_graph = MagicMock()
mock_output = MagicMock()
input_addresses = [12345, 67890]
entry = ACLGraphEntry(batch_descriptor=batch_descriptor,
aclgraph=mock_graph,
output=mock_output,
input_addresses=input_addresses)
self.assertEqual(entry.batch_descriptor, batch_descriptor)
self.assertEqual(entry.aclgraph, mock_graph)
self.assertEqual(entry.output, mock_output)
self.assertEqual(entry.input_addresses, input_addresses)
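The two tests above pin down the shape of `ACLGraphEntry`; consistent with the fields they exercise, it is presumably a small dataclass along these lines (a sketch, not the actual vllm-ascend definition):

from dataclasses import dataclass
from typing import Any, List, Optional

@dataclass
class ACLGraphEntrySketch:
    # One captured graph per batch shape: the descriptor is the cache
    # key; the remaining fields are filled in after the first capture.
    batch_descriptor: Any
    aclgraph: Optional[Any] = None
    output: Optional[Any] = None
    input_addresses: Optional[List[int]] = None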
class TestACLGraphWrapper(TestBase):
def setUp(self):
"""Set up test fixtures"""
super().setUp()
# Mock VllmConfig
self.mock_vllm_config = MagicMock(spec=VllmConfig)
self.mock_vllm_config.compilation_config = MagicMock()
# Mock runnable function
self.mock_runnable = MagicMock(return_value="test_output")
# Mock graph pool
self.mock_graph_pool = MagicMock()
# Mock CUDAGraphOptions
self.mock_cudagraph_options = MagicMock(spec=CUDAGraphOptions)
self.mock_cudagraph_options.debug_log_enable = False
self.mock_cudagraph_options.gc_disable = False
self.mock_cudagraph_options.weak_ref_output = False
# Mock BatchDescriptor
self.mock_batch_descriptor = BatchDescriptor(
num_tokens=30,
uniform_decode=False,
)
# Mock ForwardContext
self.mock_forward_context = MagicMock(spec=ForwardContext)
self.mock_forward_context.batch_descriptor = self.mock_batch_descriptor
self.mock_forward_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
@patch('vllm_ascend.compilation.acl_graph.current_platform')
@patch('vllm_ascend.compilation.acl_graph.envs')
def test_initialization_with_default_options(self, mock_envs,
mock_current_platform):
"""Test ACLGraphWrapper initialization with default CUDAGraphOptions"""
mock_envs.VLLM_LOGGING_LEVEL = "INFO"
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
wrapper = ACLGraphWrapper(runnable=self.mock_runnable,
vllm_config=self.mock_vllm_config,
runtime_mode=CUDAGraphMode.FULL,
graph_pool=self.mock_graph_pool)
self.assertEqual(wrapper.runnable, self.mock_runnable)
self.assertEqual(wrapper.vllm_config, self.mock_vllm_config)
self.assertEqual(wrapper.graph_pool, self.mock_graph_pool)
self.assertEqual(wrapper.runtime_mode, CUDAGraphMode.FULL)
self.assertFalse(wrapper.is_debugging_mode)
self.assertIsInstance(wrapper.aclgraph_options, CUDAGraphOptions)
self.assertEqual(wrapper.concrete_aclgraph_entries, {})
@patch('vllm_ascend.compilation.acl_graph.current_platform')
@patch('vllm_ascend.compilation.acl_graph.envs')
def test_initialization_with_custom_options(self, mock_envs,
mock_current_platform):
"""Test ACLGraphWrapper initialization with custom CUDAGraphOptions"""
mock_envs.VLLM_LOGGING_LEVEL = "DEBUG"
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
wrapper = ACLGraphWrapper(
runnable=self.mock_runnable,
vllm_config=self.mock_vllm_config,
runtime_mode=CUDAGraphMode.FULL,
graph_pool=self.mock_graph_pool,
cudagraph_options=self.mock_cudagraph_options)
self.assertEqual(wrapper.runnable, self.mock_runnable)
self.assertEqual(wrapper.vllm_config, self.mock_vllm_config)
self.assertEqual(wrapper.graph_pool, self.mock_graph_pool)
self.assertEqual(wrapper.runtime_mode, CUDAGraphMode.FULL)
self.assertTrue(wrapper.is_debugging_mode)
self.assertEqual(wrapper.aclgraph_options, self.mock_cudagraph_options)
self.assertEqual(wrapper.concrete_aclgraph_entries, {})
@patch('vllm_ascend.compilation.acl_graph.current_platform')
@patch('vllm_ascend.compilation.acl_graph.envs')
def test_initialization_assertion_error(self, mock_envs,
mock_current_platform):
"""Test ACLGraphWrapper initialization raises AssertionError for NONE mode"""
mock_envs.VLLM_LOGGING_LEVEL = "INFO"
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
with self.assertRaises(AssertionError):
ACLGraphWrapper(runnable=self.mock_runnable,
vllm_config=self.mock_vllm_config,
runtime_mode=CUDAGraphMode.NONE,
graph_pool=self.mock_graph_pool)
@patch('vllm_ascend.compilation.acl_graph.get_forward_context')
@patch('vllm_ascend.compilation.acl_graph.current_platform')
@patch('vllm_ascend.compilation.acl_graph.envs')
def test_call_with_none_runtime_mode(self, mock_envs,
mock_current_platform,
mock_get_forward_context):
"""Test __call__ method when runtime mode is NONE"""
mock_envs.VLLM_LOGGING_LEVEL = "INFO"
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
mock_get_forward_context.return_value = self.mock_forward_context
self.mock_forward_context.cudagraph_runtime_mode = CUDAGraphMode.NONE
wrapper = ACLGraphWrapper(
runnable=self.mock_runnable,
vllm_config=self.mock_vllm_config,
runtime_mode=CUDAGraphMode.FULL,
graph_pool=self.mock_graph_pool,
cudagraph_options=self.mock_cudagraph_options)
result = wrapper("arg1", "arg2")
# Should call the runnable directly without graph capture
self.mock_runnable.assert_called_once_with("arg1", "arg2")
self.assertEqual(result, "test_output")
@patch('vllm_ascend.compilation.acl_graph.get_forward_context')
@patch('vllm_ascend.compilation.acl_graph.current_platform')
@patch('vllm_ascend.compilation.acl_graph.envs')
def test_call_with_mismatched_runtime_mode(self, mock_envs,
mock_current_platform,
mock_get_forward_context):
"""Test __call__ method when runtime mode doesn't match wrapper mode"""
mock_envs.VLLM_LOGGING_LEVEL = "INFO"
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
mock_get_forward_context.return_value = self.mock_forward_context
self.mock_forward_context.cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE # Different from FULL
wrapper = ACLGraphWrapper(
runnable=self.mock_runnable,
vllm_config=self.mock_vllm_config,
runtime_mode=CUDAGraphMode.FULL,
graph_pool=self.mock_graph_pool,
cudagraph_options=self.mock_cudagraph_options)
result = wrapper("arg1", "arg2")
# Should call the runnable directly without graph capture
self.mock_runnable.assert_called_once_with("arg1", "arg2")
self.assertEqual(result, "test_output")
@patch('vllm_ascend.compilation.acl_graph.torch')
@patch(
'vllm_ascend.compilation.acl_graph.validate_cudagraph_capturing_enabled'
)
@patch('vllm_ascend.compilation.acl_graph.get_forward_context')
@patch('vllm_ascend.compilation.acl_graph.current_platform')
@patch('vllm_ascend.compilation.acl_graph.envs')
@patch('vllm_ascend.compilation.acl_graph.compilation_counter')
@patch('vllm_ascend.compilation.acl_graph.weak_ref_tensors')
def test_call_capture_graph_first_time(
self, mock_weak_ref_tensors, mock_compilation_counter, mock_envs,
mock_current_platform, mock_get_forward_context,
mock_validate_cudagraph_capturing_enabled, mock_torch):
"""Test __call__ method captures graph for the first time"""
mock_envs.VLLM_LOGGING_LEVEL = "INFO"
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
mock_get_forward_context.return_value = self.mock_forward_context
self.mock_forward_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
# Mock torch.npu.NPUGraph
mock_npu_graph = MagicMock()
mock_torch.npu.NPUGraph.return_value = mock_npu_graph
# Mock torch.npu.graph context manager
mock_graph_context = MagicMock()
mock_torch.npu.graph.return_value = mock_graph_context
mock_graph_context.__enter__ = Mock(return_value=None)
mock_graph_context.__exit__ = Mock(return_value=None)
# Mock weak_ref_tensors to return the same output
mock_weak_ref_tensors.return_value = "weak_ref_output"
# Ensure torch.Tensor can be correctly identified by isinstance
mock_torch.Tensor = torch.Tensor
# Set up the compilation counter mock
mock_compilation_counter.num_cudagraph_captured = 0
wrapper = ACLGraphWrapper(
runnable=self.mock_runnable,
vllm_config=self.mock_vllm_config,
runtime_mode=CUDAGraphMode.FULL,
graph_pool=self.mock_graph_pool,
cudagraph_options=self.mock_cudagraph_options)
# Create a real torch tensor for the test, not a mock
test_tensor = torch.tensor([1, 2, 3])
# Call the wrapper
result = wrapper(test_tensor, "arg2")
# Verify graph capture happened
mock_validate_cudagraph_capturing_enabled.assert_called_once()
mock_torch.npu.NPUGraph.assert_called_once()
mock_torch.npu.graph.assert_called_once_with(mock_npu_graph,
pool=self.mock_graph_pool)
self.mock_runnable.assert_called_once_with(test_tensor, "arg2")
# Verify the entry was created and updated
self.assertIn(self.mock_batch_descriptor,
wrapper.concrete_aclgraph_entries)
entry = wrapper.concrete_aclgraph_entries[self.mock_batch_descriptor]
self.assertEqual(entry.aclgraph, mock_npu_graph)
self.assertEqual(entry.output, "weak_ref_output")
# Verify compilation counter was incremented
self.assertEqual(mock_compilation_counter.num_cudagraph_captured, 1)
# Should return the original output (not weak ref)
self.assertEqual(result, "test_output")
@patch('vllm_ascend.compilation.acl_graph.torch')
@patch(
'vllm_ascend.compilation.acl_graph.validate_cudagraph_capturing_enabled'
)
@patch('vllm_ascend.compilation.acl_graph.get_forward_context')
@patch('vllm_ascend.compilation.acl_graph.current_platform')
@patch('vllm_ascend.compilation.acl_graph.envs')
@patch('vllm_ascend.compilation.acl_graph.compilation_counter')
@patch('vllm_ascend.compilation.acl_graph.weak_ref_tensors')
def test_call_replay_graph(self, mock_weak_ref_tensors,
mock_compilation_counter, mock_envs,
mock_current_platform, mock_get_forward_context,
mock_validate_cudagraph_capturing_enabled,
mock_torch):
"""Test __call__ method replays graph when already captured"""
mock_envs.VLLM_LOGGING_LEVEL = "INFO"
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
mock_get_forward_context.return_value = self.mock_forward_context
self.mock_forward_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
# Mock torch.npu.NPUGraph
mock_npu_graph = MagicMock()
mock_torch.npu.NPUGraph.return_value = mock_npu_graph
# Mock torch.npu.graph context manager
mock_graph_context = MagicMock()
mock_torch.npu.graph.return_value = mock_graph_context
mock_graph_context.__enter__ = Mock(return_value=None)
mock_graph_context.__exit__ = Mock(return_value=None)
# Mock weak_ref_tensors to return the same output
mock_weak_ref_tensors.return_value = "weak_ref_output"
# Ensure torch.Tensor can be correctly identified by isinstance
mock_torch.Tensor = torch.Tensor
# Set up the compilation counter mock
mock_compilation_counter.num_cudagraph_captured = 0
wrapper = ACLGraphWrapper(
runnable=self.mock_runnable,
vllm_config=self.mock_vllm_config,
runtime_mode=CUDAGraphMode.FULL,
graph_pool=self.mock_graph_pool,
cudagraph_options=self.mock_cudagraph_options)
# Create a real torch tensor for the test, not a mock
test_tensor = torch.tensor([1, 2, 3])
# First call to capture the graph
first_result = wrapper(test_tensor, "arg2")
# Verify graph capture happened during first call
mock_validate_cudagraph_capturing_enabled.assert_called_once()
mock_torch.npu.NPUGraph.assert_called_once()
mock_torch.npu.graph.assert_called_once()
# Reset mock to track second call
self.mock_runnable.reset_mock()
mock_npu_graph.reset_mock()
# Second call should replay the graph
second_result = wrapper(test_tensor, "arg2")
# Verify runnable was called only during capture (not during replay)
self.mock_runnable.assert_not_called()
# Verify graph replay happened
mock_npu_graph.replay.assert_called_once()
# Both calls should return the weak ref output
self.assertEqual(first_result, "test_output") # Original output
self.assertEqual(second_result, "weak_ref_output") # Weak ref output
@patch('vllm_ascend.compilation.acl_graph.torch')
@patch(
'vllm_ascend.compilation.acl_graph.validate_cudagraph_capturing_enabled'
)
@patch('vllm_ascend.compilation.acl_graph.get_forward_context')
@patch('vllm_ascend.compilation.acl_graph.current_platform')
@patch('vllm_ascend.compilation.acl_graph.envs')
@patch('vllm_ascend.compilation.acl_graph.weak_ref_tensors')
def test_call_with_debug_mode_input_address_check(
self, mock_weak_ref_tensors, mock_envs, mock_current_platform,
mock_get_forward_context,
mock_validate_cudagraph_capturing_enabled, mock_torch):
"""Test __call__ method with debug mode input address checking"""
mock_envs.VLLM_LOGGING_LEVEL = "DEBUG" # Enable debug mode
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
mock_get_forward_context.return_value = self.mock_forward_context
self.mock_forward_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
# Mock torch.npu.NPUGraph
mock_npu_graph = MagicMock()
mock_torch.npu.NPUGraph.return_value = mock_npu_graph
# Mock torch.npu.graph context manager
mock_graph_context = MagicMock()
mock_torch.npu.graph.return_value = mock_graph_context
mock_graph_context.__enter__ = Mock(return_value=None)
mock_graph_context.__exit__ = Mock(return_value=None)
# Mock weak_ref_tensors
mock_weak_ref_tensors.return_value = "weak_ref_output"
# Ensure torch.Tensor can be correctly identified by isinstance
mock_torch.Tensor = torch.Tensor
# Create a mock tensor as the output of runnable
mock_output_tensor = torch.tensor([4, 5, 6])
self.mock_runnable.return_value = mock_output_tensor
wrapper = ACLGraphWrapper(
runnable=self.mock_runnable,
vllm_config=self.mock_vllm_config,
runtime_mode=CUDAGraphMode.FULL,
graph_pool=self.mock_graph_pool,
cudagraph_options=self.mock_cudagraph_options)
# First call to capture the graph
tensor = torch.tensor([1, 2, 3]) # Create tensor once
_ = wrapper(tensor, "arg2")
# Second call with same tensor addresses should work
_ = wrapper(tensor, "arg2") # Use the same tensor object
        # Reaching this line means the debug address check accepted the
        # reused tensor; no AssertionError was raised.
        self.assertTrue(True)
@patch('vllm_ascend.compilation.acl_graph.torch')
@patch(
'vllm_ascend.compilation.acl_graph.validate_cudagraph_capturing_enabled'
)
@patch('vllm_ascend.compilation.acl_graph.get_forward_context')
@patch('vllm_ascend.compilation.acl_graph.current_platform')
@patch('vllm_ascend.compilation.acl_graph.envs')
@patch('vllm_ascend.compilation.acl_graph.weak_ref_tensors')
def test_call_with_debug_mode_input_address_mismatch(
self, mock_weak_ref_tensors, mock_envs, mock_current_platform,
mock_get_forward_context,
mock_validate_cudagraph_capturing_enabled, mock_torch):
"""Test __call__ method with debug mode input address mismatch raises AssertionError"""
mock_envs.VLLM_LOGGING_LEVEL = "DEBUG" # Enable debug mode
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
mock_get_forward_context.return_value = self.mock_forward_context
self.mock_forward_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
# Mock torch.npu.NPUGraph
mock_npu_graph = MagicMock()
mock_torch.npu.NPUGraph.return_value = mock_npu_graph
# Mock torch.npu.graph context manager
mock_graph_context = MagicMock()
mock_torch.npu.graph.return_value = mock_graph_context
mock_graph_context.__enter__ = Mock(return_value=None)
mock_graph_context.__exit__ = Mock(return_value=None)
# Mock weak_ref_tensors
mock_weak_ref_tensors.return_value = "weak_ref_output"
# Ensure torch.Tensor can be correctly identified by isinstance
mock_torch.Tensor = torch.Tensor
# Create a mock tensor as the output of runnable
mock_output_tensor = torch.tensor([4, 5, 6])
self.mock_runnable.return_value = mock_output_tensor
wrapper = ACLGraphWrapper(
runnable=self.mock_runnable,
vllm_config=self.mock_vllm_config,
runtime_mode=CUDAGraphMode.FULL,
graph_pool=self.mock_graph_pool,
cudagraph_options=self.mock_cudagraph_options)
# First call to capture the graph
tensor1 = torch.tensor([1, 2, 3])
_ = wrapper(tensor1, "arg2")
# Second call with different tensor addresses should raise AssertionError
tensor2 = torch.tensor([4, 5,
6]) # Different values, different address
with self.assertRaises(AssertionError) as context:
wrapper(tensor2, "arg2")
self.assertIn("Input addresses for aclgraphs are different",
str(context.exception))
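The address check exists because graph capture records device pointers and replay reuses them verbatim, so callers must feed the same buffers on every replay. A sketch of the usual remedy, under that assumption (`static_input` and `run_with_stable_address` are hypothetical names):

import torch

# Allocate the graph's input buffer once; the capture records its address.
static_input = torch.tensor([0, 0, 0])

def run_with_stable_address(wrapper, new_values):
    # Copy fresh data into the same storage so the recorded pointer
    # stays valid across replays.
    static_input.copy_(new_values)
    return wrapper(static_input, "arg2")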
@patch('vllm_ascend.compilation.acl_graph.torch')
@patch(
'vllm_ascend.compilation.acl_graph.validate_cudagraph_capturing_enabled'
)
@patch('vllm_ascend.compilation.acl_graph.get_forward_context')
@patch('vllm_ascend.compilation.acl_graph.current_platform')
@patch('vllm_ascend.compilation.acl_graph.envs')
@patch('vllm_ascend.compilation.acl_graph.compilation_counter')
@patch('vllm_ascend.compilation.acl_graph.weak_ref_tensors')
@patch('vllm_ascend.compilation.acl_graph.patch')
def test_call_capture_graph_with_gc_disable(
self, mock_patch, mock_weak_ref_tensors, mock_compilation_counter,
mock_envs, mock_current_platform, mock_get_forward_context,
mock_validate_cudagraph_capturing_enabled, mock_torch):
"""Test __call__ method captures graph with gc_disable option enabled"""
mock_envs.VLLM_LOGGING_LEVEL = "INFO"
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
mock_get_forward_context.return_value = self.mock_forward_context
self.mock_forward_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
# Enable gc_disable option
self.mock_cudagraph_options.gc_disable = True
# weak_ref_output is not enabled by default
# Mock torch.npu.NPUGraph
mock_npu_graph = MagicMock()
mock_torch.npu.NPUGraph.return_value = mock_npu_graph
# Mock torch.npu.graph context manager
mock_graph_context = MagicMock()
mock_torch.npu.graph.return_value = mock_graph_context
mock_graph_context.__enter__ = Mock(return_value=None)
mock_graph_context.__exit__ = Mock(return_value=None)
# Mock patch context manager
mock_exit_stack = MagicMock()
mock_patch.return_value = mock_exit_stack
mock_exit_stack.enter_context = Mock()
# Mock weak_ref_tensors to simulate the actual behavior:
# 1. First call (inside the graph context) should return "inner_output"
# 2. Second call (for entry.output) should return "weak_ref_output"
mock_weak_ref_tensors.side_effect = ["inner_output", "weak_ref_output"]
# Ensure torch.Tensor can be correctly identified by isinstance
mock_torch.Tensor = torch.Tensor
# Set up the compilation counter mock
mock_compilation_counter.num_cudagraph_captured = 0
wrapper = ACLGraphWrapper(
runnable=self.mock_runnable,
vllm_config=self.mock_vllm_config,
runtime_mode=CUDAGraphMode.FULL,
graph_pool=self.mock_graph_pool,
cudagraph_options=self.mock_cudagraph_options)
# Create a real torch tensor for the test, not a mock
test_tensor = torch.tensor([1, 2, 3])
# Call the wrapper
result = wrapper(test_tensor, "arg2")
# Verify patch was called to disable gc
self.assertTrue(mock_patch.called)
# Verify graph capture happened
mock_validate_cudagraph_capturing_enabled.assert_called_once()
mock_torch.npu.NPUGraph.assert_called_once()
mock_torch.npu.graph.assert_called_once_with(mock_npu_graph,
pool=self.mock_graph_pool)
# Should return the original output (not weak ref) since weak_ref_output is not enabled
self.assertEqual(result, "test_output")
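    # A plausible shape of the gc_disable path verified above (assumed from
    # the mocked `patch` calls, not taken from the actual source): garbage
    # collection is suppressed for the duration of graph capture so that
    # deallocations cannot run while the NPU stream is being recorded.
    def _sketch_capture_with_gc_disabled(self, runnable, graph, pool, options,
                                         *args):
        from contextlib import ExitStack
        from unittest.mock import patch as mock_patch
        with ExitStack() as stack:
            if options.gc_disable:
                # Neutralize gc.collect while capturing (hypothetical guard).
                stack.enter_context(mock_patch("gc.collect", lambda: None))
            with torch.npu.graph(graph, pool=pool):
                return runnable(*args)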
@patch('vllm_ascend.compilation.acl_graph.torch')
@patch(
'vllm_ascend.compilation.acl_graph.validate_cudagraph_capturing_enabled'
)
@patch('vllm_ascend.compilation.acl_graph.get_forward_context')
@patch('vllm_ascend.compilation.acl_graph.current_platform')
@patch('vllm_ascend.compilation.acl_graph.envs')
@patch('vllm_ascend.compilation.acl_graph.compilation_counter')
@patch('vllm_ascend.compilation.acl_graph.weak_ref_tensors')
def test_call_capture_graph_with_weak_ref_output(
self, mock_weak_ref_tensors, mock_compilation_counter, mock_envs,
mock_current_platform, mock_get_forward_context,
mock_validate_cudagraph_capturing_enabled, mock_torch):
"""Test __call__ method captures graph with weak_ref_output option enabled"""
mock_envs.VLLM_LOGGING_LEVEL = "INFO"
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
mock_get_forward_context.return_value = self.mock_forward_context
self.mock_forward_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
# Enable weak_ref_output option
self.mock_cudagraph_options.weak_ref_output = True
# Mock torch.npu.NPUGraph
mock_npu_graph = MagicMock()
mock_torch.npu.NPUGraph.return_value = mock_npu_graph
# Mock torch.npu.graph context manager
mock_graph_context = MagicMock()
mock_torch.npu.graph.return_value = mock_graph_context
mock_graph_context.__enter__ = Mock(return_value=None)
mock_graph_context.__exit__ = Mock(return_value=None)
# Mock weak_ref_tensors to simulate the actual behavior:
# 1. First call (inside the graph context with weak_ref_output=True) should return "weak_ref_output"
# 2. Second call (for entry.output) should return "weak_ref_output"
mock_weak_ref_tensors.side_effect = [
"weak_ref_output", "weak_ref_output"
]
# Ensure torch.Tensor can be correctly identified by isinstance
mock_torch.Tensor = torch.Tensor
# Set up the compilation counter mock
mock_compilation_counter.num_cudagraph_captured = 0
wrapper = ACLGraphWrapper(
runnable=self.mock_runnable,
vllm_config=self.mock_vllm_config,
runtime_mode=CUDAGraphMode.FULL,
graph_pool=self.mock_graph_pool,
cudagraph_options=self.mock_cudagraph_options)
# Create a real torch tensor for the test, not a mock
test_tensor = torch.tensor([1, 2, 3])
# Call the wrapper
result = wrapper(test_tensor, "arg2")
# Verify weak_ref_tensors was called twice (once for inner output, once for final output)
self.assertEqual(mock_weak_ref_tensors.call_count, 2)
# Verify graph capture happened
mock_validate_cudagraph_capturing_enabled.assert_called_once()
mock_torch.npu.NPUGraph.assert_called_once()
mock_torch.npu.graph.assert_called_once_with(mock_npu_graph,
pool=self.mock_graph_pool)
# Should return the weak ref output when weak_ref_output option is enabled
self.assertEqual(result, "weak_ref_output")
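    # Why weak_ref_output matters (an assumption drawn from the behavior
    # asserted above): returning a weak reference to the graph's output
    # tensor avoids holding an extra strong reference between replays, so
    # the captured output buffer can be reused instead of staying pinned.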
@patch('vllm_ascend.compilation.acl_graph.get_forward_context')
@patch('vllm_ascend.compilation.acl_graph.current_platform')
@patch('vllm_ascend.compilation.acl_graph.envs')
@patch('vllm_ascend.compilation.acl_graph.logger')
def test_call_capture_graph_with_debug_log(self, mock_logger, mock_envs,
mock_current_platform,
mock_get_forward_context):
"""Test __call__ method captures graph with debug logging enabled"""
mock_envs.VLLM_LOGGING_LEVEL = "INFO"
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
mock_get_forward_context.return_value = self.mock_forward_context
self.mock_forward_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
# Enable debug logging
self.mock_cudagraph_options.debug_log_enable = True
# weak_ref_output is not enabled by default
# Mock torch
with patch('vllm_ascend.compilation.acl_graph.torch') as mock_torch:
# Mock torch.npu.NPUGraph
mock_npu_graph = MagicMock()
mock_torch.npu.NPUGraph.return_value = mock_npu_graph
# Mock torch.npu.graph context manager
mock_graph_context = MagicMock()
mock_torch.npu.graph.return_value = mock_graph_context
mock_graph_context.__enter__ = Mock(return_value=None)
mock_graph_context.__exit__ = Mock(return_value=None)
# Ensure torch.Tensor can be correctly identified by isinstance
mock_torch.Tensor = torch.Tensor
# Mock weak_ref_tensors
with patch('vllm_ascend.compilation.acl_graph.weak_ref_tensors'
) as mock_weak_ref_tensors:
# Mock weak_ref_tensors to simulate the actual behavior:
# 1. First call (inside the graph context) should return "inner_output"
# 2. Second call (for entry.output) should return "weak_ref_output"
mock_weak_ref_tensors.side_effect = [
"inner_output", "weak_ref_output"
]
# Mock validate_cudagraph_capturing_enabled
with patch(
'vllm_ascend.compilation.acl_graph.validate_cudagraph_capturing_enabled'
):
wrapper = ACLGraphWrapper(
runnable=self.mock_runnable,
vllm_config=self.mock_vllm_config,
runtime_mode=CUDAGraphMode.FULL,
graph_pool=self.mock_graph_pool,
cudagraph_options=self.mock_cudagraph_options)
# Create a real torch tensor for the test, not a mock
test_tensor = torch.tensor([1, 2, 3])
# Call the wrapper
_ = wrapper(test_tensor, "arg2")
# Verify debug log was called
mock_logger.debug.assert_called_once()
def test_getattr_access_runnable_attributes(self):
"""Test __getattr__ method accesses runnable attributes"""
mock_runnable = MagicMock()
mock_runnable.test_attr = "test_value"
wrapper = ACLGraphWrapper(
runnable=mock_runnable,
vllm_config=self.mock_vllm_config,
runtime_mode=CUDAGraphMode.FULL,
graph_pool=self.mock_graph_pool,
cudagraph_options=self.mock_cudagraph_options)
# Should be able to access attributes of the runnable
self.assertEqual(wrapper.test_attr, "test_value")
def test_getattr_attribute_not_exists(self):
"""Test __getattr__ method raises AttributeError for non-existent attributes"""
# Create a simple object without any attributes
class EmptyRunnable:
pass
mock_runnable = EmptyRunnable()
wrapper = ACLGraphWrapper(
runnable=mock_runnable,
vllm_config=self.mock_vllm_config,
runtime_mode=CUDAGraphMode.FULL,
graph_pool=self.mock_graph_pool,
cudagraph_options=self.mock_cudagraph_options)
# Should raise AttributeError for non-existent attributes
with self.assertRaises(AttributeError) as context:
_ = wrapper.non_existent_attr
self.assertIn("Attribute non_existent_attr not exists",
str(context.exception))
def test_unwrap_method(self):
"""Test unwrap method returns the original runnable"""
wrapper = ACLGraphWrapper(
runnable=self.mock_runnable,
vllm_config=self.mock_vllm_config,
runtime_mode=CUDAGraphMode.FULL,
graph_pool=self.mock_graph_pool,
cudagraph_options=self.mock_cudagraph_options)
unwrapped = wrapper.unwrap()
self.assertEqual(unwrapped, self.mock_runnable)
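    # End-to-end usage sketch under the same assumptions as these tests:
    # the first FULL-mode call captures an NPU graph, later calls replay it,
    # and unknown attributes fall through to the wrapped runnable.
    def _sketch_wrapper_usage(self, model, vllm_config, graph_pool, options):
        wrapper = ACLGraphWrapper(runnable=model,
                                  vllm_config=vllm_config,
                                  runtime_mode=CUDAGraphMode.FULL,
                                  graph_pool=graph_pool,
                                  cudagraph_options=options)
        first = wrapper(torch.tensor([1, 2, 3]))   # first call: capture
        second = wrapper(torch.tensor([1, 2, 3]))  # later calls: replay
        return first, second, wrapper.unwrap() is model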

View File

@@ -27,7 +27,6 @@ class TestAscendSchedulerConfig(TestBase):
max_model_len=8192,
is_multimodal_model=False,
send_delta_data=False,
scheduler_delay_factor=0,
)
def test_initialize_from_config_with_default(self):
@@ -36,7 +35,6 @@ class TestAscendSchedulerConfig(TestBase):
self.basic_scheduler_config, {})
self.assertEqual(ascend_config.enable_chunked_prefill, False)
self.assertEqual(ascend_config.policy, "fcfs")
self.assertEqual(ascend_config.num_scheduler_steps, 1)
self.assertEqual(ascend_config.scheduler_cls,
"vllm_ascend.core.scheduler.AscendScheduler")
self.assertEqual(ascend_config.max_num_encoder_input_tokens, 8192)
@@ -49,19 +47,21 @@ class TestAscendSchedulerConfig(TestBase):
AscendSchedulerConfig(
enable_chunked_prefill=False,
policy="fcfs",
num_scheduler_steps=1,
scheduler_cls="vllm_ascend.core.scheduler.AscendScheduler",
max_num_batched_tokens=2048,
max_model_len=2048,
max_long_partial_prefills=1,
long_prefill_token_threshold=512,
),
)
self.assertEqual(ascend_config.enable_chunked_prefill, False)
self.assertEqual(ascend_config.policy, "fcfs")
self.assertEqual(ascend_config.num_scheduler_steps, 1)
self.assertEqual(ascend_config.scheduler_cls,
"vllm_ascend.core.scheduler.AscendScheduler")
self.assertEqual(ascend_config.max_num_batched_tokens, 2048)
self.assertEqual(ascend_config.encoder_cache_size, 2048)
self.assertEqual(ascend_config.max_long_partial_prefills, 1)
self.assertEqual(ascend_config.long_prefill_token_threshold, 512)
def test_not_implemented_policy(self):
with self.assertRaises(NotImplementedError) as context:
@@ -78,28 +78,6 @@ class TestAscendSchedulerConfig(TestBase):
str(context.exception),
)
def test_not_implemented_multimodal(self):
with self.assertRaises(NotImplementedError) as context:
AscendSchedulerConfig.initialize_from_config(
SchedulerConfig(is_multimodal_model=True), {})
self.assertIn("currently AscendScheduler only supports LLM models",
str(context.exception))
def test_not_implemented_multi_step(self):
with self.assertRaises(NotImplementedError) as context:
AscendSchedulerConfig.initialize_from_config(
self.basic_scheduler_config,
AscendSchedulerConfig(
num_scheduler_steps=2,
max_num_batched_tokens=2048,
max_model_len=2048,
),
)
self.assertIn(
"currently AscendScheduler doesn't support multi-step",
str(context.exception),
)
def test_not_implemented_send_delta_data(self):
with self.assertRaises(NotImplementedError) as context:
AscendSchedulerConfig.initialize_from_config(
@@ -115,27 +93,17 @@ class TestAscendSchedulerConfig(TestBase):
str(context.exception),
)
def test_not_implemented_delay_factor(self):
with self.assertRaises(NotImplementedError) as context:
AscendSchedulerConfig.initialize_from_config(
self.basic_scheduler_config,
AscendSchedulerConfig(
delay_factor=1,
max_num_batched_tokens=2048,
max_model_len=2048,
),
)
self.assertIn(
"currently AscendScheduler doesn't support scheduler_delay_factor",
str(context.exception),
)
def test_no_override(self):
ascend_config = AscendSchedulerConfig.initialize_from_config(
self.basic_scheduler_config, {})
self.assertEqual(ascend_config.max_num_encoder_input_tokens, 8192)
self.assertEqual(ascend_config.encoder_cache_size, 8192)
def test_valid_config_with_multimodal(self):
config = AscendSchedulerConfig.initialize_from_config(
SchedulerConfig(is_multimodal_model=True), {})
self.assertTrue(config.is_multimodal_model)
def test_valid_config_with_chunked_prefill(self):
ascend_config = AscendSchedulerConfig.initialize_from_config(
self.basic_scheduler_config,
@@ -165,3 +133,16 @@ class TestAscendSchedulerConfig(TestBase):
)
self.assertIn("max_num_batched_tokens (2048)", str(context.exception))
self.assertIn("max_model_len (4096)", str(context.exception))
def test_initialize_from_config_with_pd_transfer(self):
ascend_config = AscendSchedulerConfig.initialize_from_config(
self.basic_scheduler_config,
AscendSchedulerConfig(
enable_pd_transfer=True,
decode_max_num_seqs=48,
max_num_batched_tokens=4096,
max_model_len=4096,
),
)
self.assertEqual(ascend_config.enable_pd_transfer, True)
self.assertEqual(ascend_config.decode_max_num_seqs, 48)

View File

@@ -6,25 +6,21 @@ from unittest.mock import MagicMock, patch
import torch
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
SchedulerConfig, SpeculativeConfig, VllmConfig)
from vllm.multimodal.inputs import PlaceholderRange
from vllm.multimodal.inputs import (MultiModalFeatureSpec,
MultiModalKwargsItem, PlaceholderRange)
from vllm.sampling_params import SamplingParams
from vllm.utils import sha256
from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
init_none_hash)
from vllm.v1.core.sched.output import SchedulerOutput
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
KVCacheGroupSpec)
from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
from vllm.v1.request import Request, RequestStatus
from vllm.v1.structured_output import StructuredOutputManager
from tests.ut.base import TestBase
from vllm_ascend.core.scheduler import AscendScheduler
from vllm_ascend.utils import vllm_version_is
if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
from vllm.v1.outputs import DraftTokenIds
else:
DraftTokenIds = None
EOS_TOKEN_ID = 50256
MODEL = "Qwen3-0.6B"
@@ -44,7 +40,7 @@ def create_requests(
max_tokens: int = 16,
stop_token_ids: Optional[list[int]] = None,
block_size: int = 3,
hash_fn=hash,
hash_fn=sha256,
):
init_none_hash(hash_fn)
prompt_logprobs = PROMPT_LOGPROBS
@@ -54,25 +50,25 @@ def create_requests(
prompt_logprobs=prompt_logprobs)
requests = []
for i in range(num_requests):
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
request = Request(request_id=f"{i}",
prompt_token_ids=[i] * num_tokens,
sampling_params=sampling_params,
multi_modal_kwargs=None,
multi_modal_placeholders=None,
multi_modal_hashes=None,
eos_token_id=EOS_TOKEN_ID,
pooling_params=None,
block_hasher=get_request_block_hasher(
block_size, hash_fn))
else:
request = Request(request_id=f"{i}",
prompt_token_ids=[i] * num_tokens,
sampling_params=sampling_params,
eos_token_id=EOS_TOKEN_ID,
pooling_params=None,
block_hasher=get_request_block_hasher(
block_size, hash_fn))
mm_features = []
if mm_positions is not None:
mm_position = mm_positions[i]
for j, position in enumerate(mm_position):
identifier = f"hash{i}_{j}"
mm_feature = MultiModalFeatureSpec(
data=MultiModalKwargsItem.dummy("dummy_m"),
mm_position=position,
identifier=identifier,
modality="image")
mm_features.append(mm_feature)
request = Request(request_id=f"{i}",
prompt_token_ids=[i] * num_tokens,
sampling_params=sampling_params,
eos_token_id=EOS_TOKEN_ID,
pooling_params=None,
mm_features=mm_features if mm_features else None,
block_hasher=get_request_block_hasher(
block_size, hash_fn))
requests.append(request)
return requests
@@ -85,25 +81,15 @@ def make_output(scheduler):
}
sampled_token_ids = [[1000]] * len(scheduler.running)
logprobs = None
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
modelrunner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_id_to_index,
sampled_token_ids=sampled_token_ids,
spec_token_ids=None,
logprobs=logprobs,
prompt_logprobs_dict={},
pooler_output=[],
)
else:
modelrunner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_id_to_index,
sampled_token_ids=sampled_token_ids,
logprobs=logprobs,
prompt_logprobs_dict={},
pooler_output=[],
)
modelrunner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_id_to_index,
sampled_token_ids=sampled_token_ids,
logprobs=logprobs,
prompt_logprobs_dict={},
pooler_output=[],
)
return modelrunner_output
@@ -113,7 +99,7 @@ class TestAscendScheduler(TestBase):
@patch("vllm.config.VllmConfig.__post_init__", MagicMock())
@patch('vllm.v1.core.sched.scheduler.compute_encoder_budget')
def create_scheduler(self, mock_compute_encoder_budget):
mock_compute_encoder_budget.return_value = [10, 20]
mock_compute_encoder_budget.return_value = [100, 100]
use_kv_connector = False
block_size = 16
@@ -235,7 +221,7 @@ class TestAscendScheduler(TestBase):
len(requests) - i - 1)
def test_schedule(self):
        '''Test scheduling.
Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs
'''
scheduler = self.create_scheduler()
@@ -260,6 +246,60 @@ class TestAscendScheduler(TestBase):
for i, request in enumerate(requests):
self.assertEqual(scheduler.running[i], request)
def test_schedule_multimodal_requests(self):
scheduler = self.create_scheduler()
scheduler.scheduler_config.chunked_prefill_enabled = False
mm_positions = [[PlaceholderRange(offset=i, length=10)]
for i in range(10)]
requests = create_requests(
num_requests=10,
mm_positions=mm_positions,
)
for request in requests:
scheduler.add_request(request)
output = scheduler.schedule()
self.assertEqual(len(output.scheduled_new_reqs), len(requests))
self.assertEqual(output.scheduled_cached_reqs.num_reqs, 0)
self.assertEqual(len(output.finished_req_ids), 0)
        # Verify every request is scheduled in full.
        for req_id, num_tokens in output.num_scheduled_tokens.items():
            self.assertEqual(num_tokens,
                             len(requests[int(req_id)].prompt_token_ids))
self.assertEqual(len(output.scheduled_encoder_inputs), len(requests))
for req_id, encoder_input in output.scheduled_encoder_inputs.items():
assert len(encoder_input) == 1
# Verify requests moved from waiting to running
self.assertEqual(len(scheduler.waiting), 0)
self.assertEqual(len(scheduler.running), len(requests))
for i, request in enumerate(requests):
self.assertEqual(scheduler.running[i], request)
def test_concurrent_partial_prefills_schedule(self):
        '''Test concurrent partial prefills scheduling.
        Total requests = 10; every request has 20 tokens.
        With long_prefill_token_threshold = 1, the scheduler can
        schedule at most max_long_partial_prefills long requests.
        '''
scheduler = self.create_scheduler()
scheduler.scheduler_config.chunked_prefill_enabled = False
scheduler.scheduler_config.max_long_partial_prefills = 2
scheduler.scheduler_config.long_prefill_token_threshold = 1
requests = create_requests(num_requests=10, num_tokens=20)
for request in requests:
scheduler.add_request(request)
# Test initial scheduling
output = scheduler.schedule()
self.assertEqual(len(output.scheduled_new_reqs),
scheduler.scheduler_config.max_long_partial_prefills)
self.assertEqual(output.scheduled_cached_reqs.num_reqs, 0)
self.assertEqual(len(output.finished_req_ids), 0)
def test_schedule_enable_prefix_caching(self):
'''Test scheduling.
Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs
@@ -304,69 +344,34 @@ class TestAscendScheduler(TestBase):
scheduler.running.append(req)
req.status = RequestStatus.RUNNING
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[],
num_scheduled_tokens={
requests[0].request_id: 1,
requests[1].request_id: 2
},
total_num_scheduled_tokens=3,
scheduled_encoder_inputs={},
scheduled_spec_decode_tokens={
requests[0].request_id: [],
requests[1].request_id: [10]
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_input_ids=[],
structured_output_request_ids={},
grammar_bitmask=None)
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
req_id_to_index={
req.request_id: i
for i, req in enumerate(requests)
},
sampled_token_ids=[[EOS_TOKEN_ID], [
10, 11
]], # First request hits EOS, second continues
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
else:
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[],
num_scheduled_tokens={
requests[0].request_id: 1,
requests[1].request_id: 2
},
total_num_scheduled_tokens=3,
scheduled_encoder_inputs={},
scheduled_spec_decode_tokens={
requests[0].request_id: [],
requests[1].request_id: [10]
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_mm_hashes=[],
structured_output_request_ids={},
grammar_bitmask=None)
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
req_id_to_index={
req.request_id: i
for i, req in enumerate(requests)
},
sampled_token_ids=[[EOS_TOKEN_ID], [
10, 11
]], # First request hits EOS, second continues
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
scheduled_cached_reqs=[],
num_scheduled_tokens={
requests[0].request_id: 1,
requests[1].request_id: 2
},
total_num_scheduled_tokens=3,
scheduled_encoder_inputs={},
scheduled_spec_decode_tokens={
requests[0].request_id: [],
requests[1].request_id: [10]
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_mm_hashes=[],
structured_output_request_ids={},
grammar_bitmask=None)
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
req_id_to_index={
req.request_id: i
for i, req in enumerate(requests)
},
sampled_token_ids=[[EOS_TOKEN_ID], [10, 11]
], # First request hits EOS, second continues
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler.update_from_output(scheduler_output, model_output)
@@ -391,67 +396,35 @@ class TestAscendScheduler(TestBase):
scheduler.running.append(req)
req.status = RequestStatus.RUNNING
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[],
num_scheduled_tokens={
requests[0].request_id: 3,
requests[1].request_id: 2
},
total_num_scheduled_tokens=5,
scheduled_encoder_inputs={},
scheduled_spec_decode_tokens={
requests[0].request_id: [10, 42],
requests[1].request_id: [13]
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_input_ids=[],
structured_output_request_ids={},
grammar_bitmask=None)
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
req_id_to_index={
req.request_id: i
for i, req in enumerate(requests)
},
sampled_token_ids=[[10, 42, 12],
[13, 14]], # First request hits stop token
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
else:
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[],
num_scheduled_tokens={
requests[0].request_id: 3,
requests[1].request_id: 2
},
total_num_scheduled_tokens=5,
scheduled_encoder_inputs={},
scheduled_spec_decode_tokens={
requests[0].request_id: [10, 42],
requests[1].request_id: [13]
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_mm_hashes=[],
structured_output_request_ids={},
grammar_bitmask=None)
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
req_id_to_index={
req.request_id: i
for i, req in enumerate(requests)
},
sampled_token_ids=[[10, 42, 12],
[13, 14]], # First request hits stop token
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
scheduled_cached_reqs=[],
num_scheduled_tokens={
requests[0].request_id: 3,
requests[1].request_id: 2
},
total_num_scheduled_tokens=5,
scheduled_encoder_inputs={},
scheduled_spec_decode_tokens={
requests[0].request_id:
[10, 42],
requests[1].request_id: [13]
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_mm_hashes=[],
structured_output_request_ids={},
grammar_bitmask=None)
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
req_id_to_index={
req.request_id: i
for i, req in enumerate(requests)
},
sampled_token_ids=[[10, 42, 12],
[13, 14]], # First request hits stop token
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler.update_from_output(scheduler_output, model_output)
@@ -475,67 +448,35 @@ class TestAscendScheduler(TestBase):
scheduler.running.append(req)
req.status = RequestStatus.RUNNING
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[],
num_scheduled_tokens={
requests[0].request_id: 3,
requests[1].request_id: 1
},
total_num_scheduled_tokens=4,
scheduled_encoder_inputs={},
scheduled_spec_decode_tokens={
requests[0].request_id: [10, 11],
requests[1].request_id: []
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_input_ids=[],
structured_output_request_ids={},
grammar_bitmask=None)
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
req_id_to_index={
req.request_id: i
for i, req in enumerate(requests)
},
sampled_token_ids=[[10, 11, 12],
[13]], # First request exceeds max_tokens
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
else:
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[],
num_scheduled_tokens={
requests[0].request_id: 3,
requests[1].request_id: 1
},
total_num_scheduled_tokens=4,
scheduled_encoder_inputs={},
scheduled_spec_decode_tokens={
requests[0].request_id: [10, 11],
requests[1].request_id: []
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_mm_hashes=[],
structured_output_request_ids={},
grammar_bitmask=None)
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
req_id_to_index={
req.request_id: i
for i, req in enumerate(requests)
},
sampled_token_ids=[[10, 11, 12],
[13]], # First request exceeds max_tokens
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
scheduled_cached_reqs=[],
num_scheduled_tokens={
requests[0].request_id: 3,
requests[1].request_id: 1
},
total_num_scheduled_tokens=4,
scheduled_encoder_inputs={},
scheduled_spec_decode_tokens={
requests[0].request_id:
[10, 11],
requests[1].request_id: []
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_mm_hashes=[],
structured_output_request_ids={},
grammar_bitmask=None)
model_output = ModelRunnerOutput(
req_ids=[req.request_id for req in requests],
req_id_to_index={
req.request_id: i
for i, req in enumerate(requests)
},
sampled_token_ids=[[10, 11, 12],
[13]], # First request exceeds max_tokens
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler.update_from_output(scheduler_output, model_output)
# Verify first request stopped due to length
@@ -556,52 +497,27 @@ class TestAscendScheduler(TestBase):
scheduler.requests[requests[0].request_id] = requests[0]
scheduler.running.append(requests[0])
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[],
num_scheduled_tokens={requests[0].request_id: 3},
total_num_scheduled_tokens=3,
scheduled_encoder_inputs={},
scheduled_spec_decode_tokens={
requests[0].request_id: [EOS_TOKEN_ID, 10]
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_input_ids=[],
structured_output_request_ids={},
grammar_bitmask=None)
model_output = ModelRunnerOutput(
req_ids=[requests[0].request_id],
req_id_to_index={requests[0].request_id: 0},
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
else:
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[],
num_scheduled_tokens={requests[0].request_id: 3},
total_num_scheduled_tokens=3,
scheduled_encoder_inputs={},
scheduled_spec_decode_tokens={
requests[0].request_id: [EOS_TOKEN_ID, 10]
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_mm_hashes=[],
structured_output_request_ids={},
grammar_bitmask=None)
model_output = ModelRunnerOutput(
req_ids=[requests[0].request_id],
req_id_to_index={requests[0].request_id: 0},
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler_output = SchedulerOutput(
scheduled_new_reqs=[],
scheduled_cached_reqs=[],
num_scheduled_tokens={requests[0].request_id: 3},
total_num_scheduled_tokens=3,
scheduled_encoder_inputs={},
scheduled_spec_decode_tokens={
requests[0].request_id: [EOS_TOKEN_ID, 10]
},
num_common_prefix_blocks=0,
finished_req_ids=set(),
free_encoder_mm_hashes=[],
structured_output_request_ids={},
grammar_bitmask=None)
model_output = ModelRunnerOutput(
req_ids=[requests[0].request_id],
req_id_to_index={requests[0].request_id: 0},
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler.update_from_output(scheduler_output, model_output)
@@ -652,23 +568,13 @@ class TestAscendScheduler(TestBase):
512)
# Model output of the first request.
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
model_runner_output = ModelRunnerOutput(
req_ids=[requests[0].request_id],
req_id_to_index={requests[0].request_id: 0},
sampled_token_ids=[[0]],
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
else:
model_runner_output = ModelRunnerOutput(
req_ids=[requests[0].request_id],
req_id_to_index={requests[0].request_id: 0},
sampled_token_ids=[[0]],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
model_runner_output = ModelRunnerOutput(
req_ids=[requests[0].request_id],
req_id_to_index={requests[0].request_id: 0},
sampled_token_ids=[[0]],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler.update_from_output(scheduler_output0,
model_runner_output)
@@ -678,23 +584,13 @@ class TestAscendScheduler(TestBase):
# request is still running.
scheduler.schedule()
# Model output of the second request.
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
model_runner_output = ModelRunnerOutput(
req_ids=[requests[1].request_id],
req_id_to_index={requests[1].request_id: 0},
sampled_token_ids=[[0]],
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
else:
model_runner_output = ModelRunnerOutput(
req_ids=[requests[1].request_id],
req_id_to_index={requests[1].request_id: 0},
sampled_token_ids=[[0]],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
model_runner_output = ModelRunnerOutput(
req_ids=[requests[1].request_id],
req_id_to_index={requests[1].request_id: 0},
sampled_token_ids=[[0]],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
scheduler.update_from_output(scheduler_output1,
model_runner_output)
@@ -746,29 +642,19 @@ class TestAscendScheduler(TestBase):
req_id = requests[i].request_id
self.assertEqual(output.num_scheduled_tokens[req_id], 1)
self.assertNotIn(req_id, output.scheduled_spec_decode_tokens)
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_to_index,
sampled_token_ids=[[0] for _ in range(len(requests))],
logprobs=None,
prompt_logprobs_dict={},
spec_token_ids=spec_tokens,
pooler_output=[])
else:
model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_to_index,
sampled_token_ids=[[0] for _ in range(len(requests))],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
draft_token_ids = DraftTokenIds(req_ids, spec_tokens)
model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_to_index,
sampled_token_ids=[[0] for _ in range(len(requests))],
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
draft_token_ids = DraftTokenIds(req_ids, spec_tokens)
engine_core_outputs = scheduler.update_from_output(
output, model_runner_output)
if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
scheduler.update_draft_token_ids(draft_token_ids)
scheduler.update_draft_token_ids(draft_token_ids)
for i in range(len(requests)):
running_req = scheduler.running[i]
@@ -804,23 +690,14 @@ class TestAscendScheduler(TestBase):
else:
self.assertNotIn(req_id,
output.scheduled_spec_decode_tokens)
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_to_index,
sampled_token_ids=output_tokens,
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
else:
model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_to_index,
sampled_token_ids=output_tokens,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_to_index,
sampled_token_ids=output_tokens,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[])
engine_core_outputs = scheduler.update_from_output(
output, model_runner_output)
@@ -896,3 +773,34 @@ class TestAscendScheduler(TestBase):
# Confirm no memory leak.
self.assert_scheduler_empty(scheduler)
def test_scheduler_with_pd_transfer(self):
scheduler = self.create_scheduler()
scheduler.phase = "prefill"
requests = create_requests(num_requests=32)
for request in requests:
scheduler.add_request(request)
# 1st iteration, move 16 requests from waiting to running for prefill
scheduler_output = scheduler.schedule()
model_runner_output = make_output(scheduler)
scheduler.update_from_output(scheduler_output, model_runner_output)
first_iter_prefilled_req_num = len(scheduler.running)
self.assertEqual(len(scheduler_output.scheduled_new_reqs),
scheduler.max_num_running_reqs)
self.assertEqual(scheduler_output.scheduled_cached_reqs.num_reqs, 0)
self.assertEqual(len(scheduler_output.finished_req_ids), 0)
# 2nd iteration, move 16 prefilled requests to finished_prefill_reqs
# and move 16 requests from waiting to running for prefill
scheduler_output = scheduler.schedule()
model_runner_output = make_output(scheduler)
scheduler.update_from_output(scheduler_output, model_runner_output)
self.assertEqual(len(scheduler.finished_prefill_reqs),
first_iter_prefilled_req_num)
# 3rd iteration, all requests prefilled, change scheduler phase to decode
scheduler_output = scheduler.schedule()
model_runner_output = make_output(scheduler)
scheduler.update_from_output(scheduler_output, model_runner_output)
self.assertEqual(scheduler.phase, "decode")
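    # Phase transitions exercised above (inferred from the observed behavior,
    # not from the scheduler source): the scheduler stays in "prefill" while
    # unprefilled requests remain, parks completed prefills in
    # finished_prefill_reqs, and only flips to "decode" once every admitted
    # request has finished its prefill pass.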

View File

@@ -1,139 +0,0 @@
#
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
# Copyright 2023 The vLLM team.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# This file is a part of the vllm-ascend project.
import importlib
import pytest
import torch
from pytest_mock import MockerFixture
from tests.ut.base import PytestBase
from vllm_ascend.distributed.tensor_parallel import (
_gather_along_first_dim, _gather_along_last_dim,
_reduce_scatter_along_first_dim, _reduce_scatter_along_last_dim,
all_to_all_hp2sp, all_to_all_sp2hp)
class TestDistributedCommunication(PytestBase):
@pytest.fixture(autouse=True)
def context(self, mocker: MockerFixture):
mocker.patch("torch.npu.current_device", return_value="cpu")
mocker.patch("torch.distributed.get_world_size", return_value=4)
mocker.patch("torch.distributed.get_rank", return_value=0)
@pytest.mark.parametrize("world_size, test_tensor, expected",
[(1, torch.randn(8, 16), (8, 16)),
(4, torch.randn(8, 16), (32, 16))])
def test_gather_along_first_dim(self, test_tensor, expected, world_size,
mocker: MockerFixture):
"""test _gather_along_first_dim"""
mocker.patch("torch.distributed.get_world_size",
return_value=world_size)
result = _gather_along_first_dim(test_tensor, mocker.MagicMock())
assert result.shape == expected
@pytest.mark.parametrize("test_tensor, output_split_sizes, expected", [
(torch.randn(8, 16), [5, 10, 15, 2], (32, 16)),
])
def test_gather_along_first_dim_unequal_split(self, test_tensor, expected,
output_split_sizes,
mocker: MockerFixture):
"""test _gather_along_first_dim"""
result = _gather_along_first_dim(test_tensor, mocker.MagicMock(),
output_split_sizes)
assert result.shape == expected
@pytest.mark.parametrize("world_size, test_tensor, expected",
[(1, torch.randn(8, 16, 32), (8, 16, 32)),
(4, torch.randn(8, 16, 32), (8, 16, 32 * 4))])
def test_gather_along_last_dim(self, test_tensor, expected, world_size,
mocker: MockerFixture):
"""test _gather_along_last_dim"""
mocker.patch("torch.distributed.get_world_size",
return_value=world_size)
result = _gather_along_last_dim(test_tensor, mocker.MagicMock())
assert result.shape == expected
@pytest.mark.parametrize("input_shape,expected_shape", [
((32, 16), (8, 16)),
((40, 10), (10, 10)),
])
def test_reduce_scatter_along_first_dim(self, input_shape, expected_shape,
mocker: MockerFixture):
input_tensor = torch.randn(*input_shape)
result = _reduce_scatter_along_first_dim(input_tensor,
mocker.MagicMock())
assert result.shape == expected_shape
@pytest.mark.parametrize("input_shape,expected_shape", [
((8, 16, 32), (8, 16, 8)),
])
def test_reduce_scatter_along_last_dim(self, input_shape, expected_shape,
mocker: MockerFixture):
input_tensor = torch.randn(*input_shape)
result = _reduce_scatter_along_last_dim(input_tensor,
mocker.MagicMock())
assert result.shape == expected_shape
@pytest.mark.parametrize("func,input_shape,expected_shape", [
("all_gather_last_dim_from_tensor_parallel_region", (8, 16, 32),
(8, 16, 128)),
("reduce_scatter_to_sequence_parallel_region", (32, 16), (8, 16)),
("reduce_scatter_last_dim_to_tensor_parallel_region", (8, 16, 32),
(8, 16, 8)),
("gather_from_sequence_parallel_region", (8, 16), (32, 16)),
])
def test_wrapper_functions(self, func, input_shape, expected_shape,
mocker: MockerFixture):
"""test wrapper funcs"""
mod = importlib.import_module(
'vllm_ascend.distributed.tensor_parallel')
globals = mod.__dict__
test_func = globals[func]
input_tensor = torch.randn(*input_shape)
result = test_func(input_tensor, mocker.MagicMock())
assert result.shape == expected_shape
@pytest.mark.parametrize(
"input_shape,output_shape",
[
((8, 16), (32, 4)), # [num_tokens/TP, H] -> [num_tokens, H/TP]
])
def test_all_to_all_sp2hp(self, input_shape, output_shape,
mocker: MockerFixture):
input_tensor = torch.randn(*input_shape)
result = all_to_all_sp2hp(input_tensor, mocker.MagicMock())
assert result.shape == output_shape
@pytest.mark.parametrize(
"input_shape,output_shape",
[
((32, 4), (8, 16)), # [num_tokens, H/TP] -> [num_tokens/TP, H]
])
def test_all_to_all_hp2sp(self, input_shape, output_shape,
mocker: MockerFixture):
input_tensor = torch.randn(*input_shape)
result = all_to_all_hp2sp(input_tensor, mocker.MagicMock())
assert result.shape == output_shape

View File

@@ -4,8 +4,8 @@ import pytest
from vllm.config import ParallelConfig
from vllm_ascend.distributed.parallel_state import (
_LMTP, _MC2, destroy_ascend_model_parallel, get_lmhead_tp_group,
get_mc2_group, init_ascend_model_parallel)
_LMTP, _MC2, _OTP, destroy_ascend_model_parallel, get_lmhead_tp_group,
get_mc2_group, get_otp_group, init_ascend_model_parallel)
@pytest.fixture
@@ -29,16 +29,20 @@ def mock_distributed():
def test_init_ascend_model_parallel(mock_distributed, parallel_config):
mock_ascend_config = MagicMock()
mock_ascend_config.lmhead_tensor_parallel_size = 2
mock_ascend_config.oproj_tensor_parallel_size = 2
with patch('vllm_ascend.distributed.parallel_state.model_parallel_initialized', return_value=False), \
patch('vllm_ascend.distributed.parallel_state.init_model_parallel_group'), \
patch('vllm_ascend.distributed.parallel_state.get_ascend_config', return_value=mock_ascend_config):
init_ascend_model_parallel(parallel_config)
mc2_group = get_mc2_group()
assert mc2_group is not None
lmheadtp_group = get_lmhead_tp_group()
otp_group = get_otp_group()
assert mc2_group is not None
assert otp_group is not None
assert lmheadtp_group is not None
destroy_ascend_model_parallel()
assert _MC2 is None
assert _LMTP is None
assert _OTP is None

View File

@@ -0,0 +1,73 @@
import pytest
from vllm_ascend.eplb.adaptor.abstract_adaptor import EplbAdaptor
class DummyAdaptor(EplbAdaptor):
def __init__(self, **kwargs):
super().__init__(**kwargs)
self.args = kwargs
def get_rank_expert_workload(self):
return "workload"
def get_init_expert_map(self, num_moe_layers):
return {"layers": num_moe_layers}
def do_update_expert_map(self, layer_id, updated_expert_map):
return {"layer_id": layer_id, "map": updated_expert_map}
def do_update_expert_weight(self, layer_id, local_expert_to_replace,
buffer_tensor_id):
return {
"layer_id": layer_id,
"replace": local_expert_to_replace,
"buffer": buffer_tensor_id,
}
def test_base_class_methods_raise():
adaptor = EplbAdaptor()
with pytest.raises(NotImplementedError):
adaptor.get_rank_expert_workload()
with pytest.raises(NotImplementedError):
adaptor.get_init_expert_map(1)
with pytest.raises(NotImplementedError):
adaptor.do_update_expert_map(1, {})
with pytest.raises(NotImplementedError):
adaptor.do_update_expert_weight(1, "x", "y")
def test_dummy_adaptor_init_and_args():
adaptor = DummyAdaptor(test_arg=123)
assert adaptor.args["test_arg"] == 123
def test_get_rank_expert_workload():
adaptor = DummyAdaptor()
result = adaptor.get_rank_expert_workload()
assert result == "workload"
def test_get_init_expert_map():
adaptor = DummyAdaptor()
result = adaptor.get_init_expert_map(5)
assert isinstance(result, dict)
assert result["layers"] == 5
def test_do_update_expert_map():
adaptor = DummyAdaptor()
updated = {"expert": 1}
result = adaptor.do_update_expert_map(2, updated)
assert result["layer_id"] == 2
assert result["map"] == updated
def test_do_update_expert_weight():
adaptor = DummyAdaptor()
result = adaptor.do_update_expert_weight(1, "expertA", "bufferX")
assert result["layer_id"] == 1
assert result["replace"] == "expertA"
assert result["buffer"] == "bufferX"

View File

@@ -0,0 +1,31 @@
# test_policy_abstract.py
from vllm_ascend.eplb.core.policy.policy_abstract import (DynamicConfig,
EplbPolicy)
class DummyPolicy(EplbPolicy):
def rebalance_experts(self, current_expert_table, expert_workload):
return 1, current_expert_table
def test_dynamic_config_attributes():
config = DynamicConfig()
assert config.placement_policy is None
assert config.max_transferred_expert_per_layer == 100
assert config.ep_worldsize == 64
assert config.num_die_per_host == 8
def test_eplb_policy_init_and_method():
config = DynamicConfig()
policy = DummyPolicy(config)
assert policy.config == config
expert_table = [[0, 1, 2]]
workload = [10]
res, new_table = policy.rebalance_experts(expert_table, workload)
assert res == 1
assert new_table == expert_table

View File

@@ -0,0 +1,98 @@
from unittest.mock import patch
import numpy as np
import pytest
from vllm_ascend.eplb.core.policy.policy_dynamic_ep import DynamicEplb
class TestDynamicEplb:
def test_add_redundant_basic(self):
current_expert_table = np.array([[[0, 1], [1, 0]]])
expert_workload = np.array([[[2, 3], [4, 1]]])
num_original_expert = 2
result = DynamicEplb.add_redundant(current_expert_table,
expert_workload,
num_original_expert)
expected = np.array([[2 + 1, 3 + 4]])
assert np.array_equal(result, expected)
def test_get_redundant_num(self):
counts = np.array([2, 1, 3])
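        # Redundancy is the surplus over one replica per expert:
        # sum(counts - 1) = 1 + 0 + 2 = 3.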
assert DynamicEplb.get_redundant_num(3, counts) == 3
def test_calculate_max_heat_per_layer(self):
workload_table = np.array([[[1, 2], [3, 4]], [[2, 2], [1, 1]]])
max_heat = DynamicEplb.calculate_max_heat_per_layer(workload_table, 2)
assert max_heat == [7, 4]
def test_constraint_expert_local_exchange(self):
current = [[[0, 1], [2, 3]]]
global_dep = [[[1, 0], [3, 2]]]
new_dep = DynamicEplb.constraint_expert_local_exchange(
current, global_dep)
assert new_dep == [[[0, 1], [2, 3]]]
def test_compute_balanced_pack_redundancy_normal(self):
origin_weights = [(0, 10), (1, 20)]
result, boxes = DynamicEplb.compute_balanced_pack_redundancy(
origin_weights, 2, 1)
assert isinstance(result, list) and len(result) == 2
def test_compute_balanced_pack_redundancy_card0(self):
origin_weights = [(0, 10)]
with pytest.raises(RuntimeError):
DynamicEplb.compute_balanced_pack_redundancy(origin_weights, 0, 0)
def test_compute_balanced_pack_normal(self):
origin_weights = np.array([(0, 10), (1, 20)], dtype=object)
result, boxes = DynamicEplb.compute_balanced_pack(origin_weights, 2)
assert isinstance(result, list) and len(result) == 2
def test_compute_balanced_pack_card0(self):
origin_weights = np.array([(0, 10)], dtype=object)
with pytest.raises(RuntimeError):
DynamicEplb.compute_balanced_pack(origin_weights, 0)
def test_original_compute_balanced_pack_redundancy(self):
origin_weights = [(0, 5), (1, 10)]
result, boxes = DynamicEplb.original_compute_balanced_pack_redundancy(
origin_weights, 2, 1)
assert isinstance(result, list) and len(result) == 2
def test_rebalance_experts_normal(self):
expert_table = np.array([[[0, 1], [1, 0]]])
workload = np.array([[[2, 3], [4, 1]]])
policy = DynamicEplb(config=None)
change, priority, new_dep = policy.rebalance_experts(
expert_table, workload)
assert change in [0, 1]
assert isinstance(priority, np.ndarray)
assert isinstance(new_dep, list)
assert np.array(new_dep).shape == expert_table.shape
def test_rebalance_experts_exceptions(self):
policy = DynamicEplb(config=None)
# case1: num_original_expert != expert_num
expert_table = np.array([[[0, 1], [1, 0]]])
workload = np.array([[[2, 3], [4, 1]]])
with patch.object(DynamicEplb,
'add_redundant',
return_value=np.array([[1, 2, 3]])):
with pytest.raises(ValueError):
policy.rebalance_experts(expert_table, workload)
# case2: num_npus <= 0
expert_table_zero = np.array([[]]) # 1 layer, 0 NPU, 0 experts
workload_zero = np.array([[]])
with pytest.raises(ValueError):
policy.rebalance_experts(expert_table_zero, workload_zero)
# case3: num_npus < num_redundancy_expert
expert_table_small = np.array([[[0, 0]]]) # 1 layer, 1 NPU, 2 experts
workload_small = np.array([[[1, 1]]])
with patch.object(DynamicEplb, 'get_redundant_num', return_value=2):
with pytest.raises(ValueError):
policy.rebalance_experts(expert_table_small, workload_small)

View File

@@ -0,0 +1,99 @@
from typing import Dict, Set
import numpy as np
import pytest
from vllm_ascend.eplb.core.policy.policy_dynamic_ep_v2 import (DynamicConfig,
DynamicEplbV2)
@pytest.fixture
def config():
return DynamicConfig()
@pytest.fixture
def policy(config):
return DynamicEplbV2(config)
def test_safe_operations(policy):
# safe_divide
assert policy.safe_divide(10, 2) == 5
assert policy.safe_divide(1, 0) == 0
# safe_exact_divide
assert policy.safe_exact_divide(10, 3) == 3
assert policy.safe_exact_divide(1, 0) == 0
# safe_mod
assert policy.safe_mod(10, 3) == 1
assert policy.safe_mod(1, 0) == 0
def test_add_redundant():
workload = np.array([[[1, 2], [3, 4]]])
placement = np.array([[[0, 1], [0, 1]]])
result = DynamicEplbV2.add_redundant(placement, workload, 2)
assert result.shape == (1, 2)
    assert np.all(result[0] == [4, 6])  # expert 0: 1+3, expert 1: 2+4
def test_get_redundant_num():
counts = np.array([1, 2, 1])
assert DynamicEplbV2.get_redundant_num(3, counts) == 1 # sum(counts-1)
def test_calculate_max_heat_per_layer():
workload = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
result = DynamicEplbV2.calculate_max_heat_per_layer(workload, 2)
assert result == [7, 15]
def test_calculate_initial_imbalance(policy):
deployment = np.array([[[0, 1], [0, 1]]])
workloads = np.array([[1, 1]])
result = policy.calculate_initial_imbalance(deployment, workloads)
assert isinstance(result, list)
assert len(result) == 1
def test_compute_redundant_assignments(policy):
base_experts = [(0, 10), (1, 5)]
redundant, sorted_weights = policy.compute_redundant_assignments(
base_experts, num_redundant_experts=2, num_experts=2)
assert len(redundant) == 2
assert len(sorted_weights) == 2
def test_prepare_expert_list():
base_experts = [(0, 10), (1, 5)]
redundant_assignments = [[2], []]
result = DynamicEplbV2.prepare_expert_list(base_experts,
redundant_assignments, 1)
assert isinstance(result, list)
assert len(result) == 1
def test_non_redundant_expert_information():
origin_deployment = np.array([[0, 1]])
updated_weights = [(0, 10), (1, 5)]
rendun_pos: Dict[int, Set[int]] = {0: set()}
assignments, weights, loads, counts = DynamicEplbV2.non_redundant_expert_information(
origin_deployment, updated_weights, rendun_pos)
assert assignments[0] == [0, 1]
assert loads[0] == 15
def test_recomputing_initial_weight(policy):
layer_workloads = [10, 5]
device_assignments = [[0, 1]]
cur_layer_workload, num_all_experts = policy.recomputing_initial_weight(
layer_workloads, device_assignments)
assert cur_layer_workload[0] == 10
assert num_all_experts[0] == 1
def test_safe_divide_zero_edge_case(policy):
assert policy.safe_divide(0, 1) == 0
assert policy.safe_divide(0, 5) == 0

View File

@@ -0,0 +1,23 @@
import pytest
from vllm_ascend.eplb.core.policy.policy_abstract import DynamicConfig
from vllm_ascend.eplb.core.policy.policy_dynamic_ep import DynamicEplb
from vllm_ascend.eplb.core.policy.policy_dynamic_ep_v2 import DynamicEplbV2
from vllm_ascend.eplb.core.policy.policy_factory import PolicyFactory
from vllm_ascend.eplb.core.policy.policy_random import RandomLoadBalance
@pytest.fixture
def dummy_config():
return DynamicConfig()
@pytest.mark.parametrize("policy_type, expected_class", [
(0, RandomLoadBalance),
(1, DynamicEplb),
(2, DynamicEplbV2),
(999, RandomLoadBalance),
])
def test_generate_policy(policy_type, expected_class, dummy_config):
policy_instance = PolicyFactory.generate_policy(policy_type, dummy_config)
assert isinstance(policy_instance, expected_class)
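
# A factory sketch consistent with the parametrization above (an assumption,
# not copied from the source): unknown policy ids fall back to
# RandomLoadBalance.
def _sketch_generate_policy(policy_type: int, config: DynamicConfig):
    policy_map = {0: RandomLoadBalance, 1: DynamicEplb, 2: DynamicEplbV2}
    return policy_map.get(policy_type, RandomLoadBalance)(config)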

View File

@@ -0,0 +1,122 @@
from typing import Any
from unittest.mock import MagicMock, patch
import pytest
import torch
import vllm_ascend.eplb.core.eplb_device_transfer_loader as loader
@pytest.fixture
def mock_adaptor():
adaptor = MagicMock()
adaptor.expert_map_per_layer_cpu = {
0: {
10: torch.tensor(1),
20: torch.tensor(0)
}
}
adaptor.expert_param_per_layer = {
0: {
0: [[torch.tensor([1.0])]],
1: [[torch.tensor([2.0])]]
}
}
adaptor.buffer_tensor_list = [[[torch.tensor([3.0])],
[torch.tensor([4.0])]]]
return adaptor
def test_generate_task_and_state_flow(mock_adaptor):
loader_obj = loader.D2DExpertWeightLoader()
loader_obj.set_adator(mock_adaptor)
with patch("torch.distributed.P2POp") as mock_p2p, \
patch("torch.distributed.isend", return_value="isend_op"), \
patch("torch.distributed.irecv", return_value="irecv_op"):
mock_p2p.side_effect = lambda op, tensor, rank: (op, tensor, rank)
loader_obj.state = loader.ExpertWeightUpdateState.READY
loader_obj.generate_expert_d2d_transfer_task([(1, 10)], [(2, 20)],
{20: torch.tensor(0)}, 0)
assert loader_obj.comm_op_list is None
loader_obj.state = loader.ExpertWeightUpdateState.WAITING
loader_obj.generate_expert_d2d_transfer_task([], [], {}, 0)
assert loader_obj.comm_op_list is None
updated_map = {20: torch.tensor(0)}
loader_obj.generate_expert_d2d_transfer_task([(1, 10)], [(2, 20)],
updated_map, 0)
assert loader_obj.state == loader.ExpertWeightUpdateState.READY
assert loader_obj.comm_op_list
assert loader_obj.recv_expert_list
def test_asyn_transfer_and_update(mock_adaptor):
loader_obj = loader.D2DExpertWeightLoader()
loader_obj.set_adator(mock_adaptor)
loader_obj.comm_op_list = ["fake_op"]
loader_obj.state = loader.ExpertWeightUpdateState.READY
reqs: list[MagicMock] = []
with patch("torch.distributed.batch_isend_irecv",
return_value=[MagicMock(), MagicMock()]):
loader_obj.asyn_expert_weight_transfer(reqs)
assert loader_obj.state == loader.ExpertWeightUpdateState.TRANSFERRING
assert len(reqs) > 0
mock_req = MagicMock()
mock_req.wait.return_value = None
reqs = [mock_req]
loader_obj.recv_expert_list = [(0, 0)]
loader_obj.updated_expert_map = {20: torch.tensor(0)}
loader_obj.updated_log2phy_map = {"dummy": 1}
loader_obj.layer_id = 0
loader_obj.comm_op_list = ["op"]
loader_obj.update_expert_map_and_weight(reqs)
mock_adaptor.do_update_expert_map.assert_called_once()
mock_adaptor.do_update_log2phy_map.assert_called_once()
mock_adaptor.do_update_expert_weight.assert_called_once()
assert loader_obj.state == loader.ExpertWeightUpdateState.WAITING
assert loader_obj.recv_expert_list == []
def test_set_log2phy_map(mock_adaptor):
loader_obj = loader.D2DExpertWeightLoader()
loader_obj.set_adator(mock_adaptor)
loader_obj.set_log2phy_map({"a": 1})
assert loader_obj.updated_log2phy_map == {"a": 1}
def test_invalid_state_asyn_update(mock_adaptor):
loader_obj = loader.D2DExpertWeightLoader()
loader_obj.set_adator(mock_adaptor)
loader_obj.state = loader.ExpertWeightUpdateState.WAITING
reqs: list[Any] = []
loader_obj.asyn_expert_weight_transfer(reqs)
assert reqs == []
loader_obj.state = loader.ExpertWeightUpdateState.READY
loader_obj.update_expert_map_and_weight([])
assert not mock_adaptor.do_update_expert_map.called
def test_load_impl_not_implemented(mock_adaptor):
loader_obj = loader.D2DExpertWeightLoader()
loader_obj.set_adator(mock_adaptor)
with pytest.raises(NotImplementedError):
loader_obj.load_impl({}, {})

View File

@@ -0,0 +1,79 @@
import random
import torch
from vllm_ascend.eplb.core import eplb_utils
def test_determine_default_expert_map_single_world():
count, expert_map = eplb_utils.determine_default_expert_map(
global_expert_num=4,
world_size=1,
rank_id=0,
global_redundant_expert_num=0)
assert count == 4
assert torch.equal(expert_map, torch.arange(4, dtype=torch.int32))
def test_determine_default_expert_map_multiple_worlds_no_redundant():
count, expert_map = eplb_utils.determine_default_expert_map(
global_expert_num=8,
world_size=2,
rank_id=0,
global_redundant_expert_num=0)
assert count == 4
assert torch.all(expert_map[:4] >= 0)
assert torch.all(expert_map[4:] == -1)
def test_determine_default_expert_map_multiple_worlds_with_redundant():
count, expert_map = eplb_utils.determine_default_expert_map(
global_expert_num=5,
world_size=2,
rank_id=0,
global_redundant_expert_num=1)
assert count == 3
assert torch.all(expert_map[0:3] >= 0)
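# Worked example for the case above: 5 global experts plus 1 redundant slot
# split across world_size=2 gives ceil((5 + 1) / 2) = 3 local experts per
# rank, which is why count == 3 and the first three map entries are >= 0.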
def test_generate_log2phy_map_single_rank_holding():
expert_map = torch.tensor([[0, -1], [-1, 0]], dtype=torch.int32)
log2phy_map = eplb_utils.generate_log2phy_map(expert_map)
assert torch.all(log2phy_map[:, 0] == log2phy_map[0, 0])
assert torch.all(log2phy_map[:, 1] == log2phy_map[1, 1])
def test_generate_log2phy_map_multiple_rank_holding(monkeypatch):
expert_map = torch.tensor([[0], [0]], dtype=torch.int32)
monkeypatch.setattr(random, "choice", lambda x: x[0])
log2phy_map = eplb_utils.generate_log2phy_map(expert_map)
assert log2phy_map.shape == (2, 1)
assert (log2phy_map >= 0).all()
def test_determine_default_log2phy_map_world_size_1():
log2phy = eplb_utils.determine_default_log2phy_map(
global_expert_num=3,
world_size=1,
rank_id=0,
global_redundant_expert_num=0)
assert log2phy.shape == (3, )
assert (log2phy >= 0).all()
def test_determine_default_log2phy_map_world_size_multiple():
log2phy = eplb_utils.determine_default_log2phy_map(
global_expert_num=6,
world_size=2,
rank_id=1,
global_redundant_expert_num=1)
assert log2phy.shape == (6, )
assert (log2phy >= 0).all()

View File

@@ -7,6 +7,7 @@ import time
import types
import unittest
from collections import defaultdict, deque
from typing import OrderedDict
from unittest.mock import MagicMock, patch
import msgspec
@@ -34,7 +35,7 @@ class TestKVCacheTaskTrackerInit(unittest.TestCase):
tracker = KVCacheTaskTracker()
self.assertIsInstance(tracker.done_task_lock, type(threading.Lock()))
self.assertIsInstance(tracker.finished_requests, set)
self.assertIsInstance(tracker.delayed_free_requests, deque)
self.assertIsInstance(tracker.delayed_free_requests, OrderedDict)
class TestGetAndClearFinishedSingleRequests(unittest.TestCase):
@@ -495,18 +496,42 @@ class TestKVCacheTaskTracker(unittest.TestCase):
def test_update_done_task_count(self):
self.assertEqual(len(self.tracker.finished_requests), 0)
self.assertEqual(len(self.tracker.delayed_free_requests), 0)
self.assertEqual(len(self.tracker.record_finished_requests), 0)
current_time = time.time()
self.tracker.add_delayed_request("req_1", current_time)
result = self.tracker.delayed_free_requests
result_record = self.tracker.record_finished_requests
self.assertEqual(len(result), 1)
self.assertEqual(result[0], ("req_1", current_time))
self.assertEqual(result["req_1"], current_time)
self.assertEqual(len(result_record), 0)
self.tracker.update_done_task_count("req_1")
result_finished = self.tracker.finished_requests
result_delayed = self.tracker.delayed_free_requests
result_record = self.tracker.record_finished_requests
self.assertEqual(result_finished, {"req_1"})
self.assertEqual(len(result_delayed), 0)
self.assertEqual(len(result_record), 0)
self.tracker.update_done_task_count("req_2")
result_finished = self.tracker.finished_requests
result_delayed = self.tracker.delayed_free_requests
result_record = self.tracker.record_finished_requests
self.assertEqual(result_finished, {"req_1", "req_2"})
self.assertEqual(len(result_delayed), 0)
self.assertEqual(len(result_record), 1)
self.assertEqual(result_record, {"req_2"})
    def test_update_add_delayed_request(self) -> None:
self.tracker.update_done_task_count("req2")
result_start_record = self.tracker.record_finished_requests
self.assertEqual(len(result_start_record), 1)
self.tracker.add_delayed_request("req2", time.time())
result_delayed = self.tracker.delayed_free_requests
result_end_record = self.tracker.record_finished_requests
self.assertEqual(len(result_delayed), 0)
self.assertEqual(len(result_end_record), 0)
def test_retrieve_expired_requests(self):
current_time = time.time()
@@ -518,7 +543,7 @@ class TestKVCacheTaskTracker(unittest.TestCase):
})
result_delay = self.tracker.delayed_free_requests
self.assertEqual(len(result_delay), 1)
self.assertEqual(result_delay[0], ("req_2", current_time))
self.assertIn("req_2", result_delay)
def test_duplicate_task_update(self):
self.tracker.update_done_task_count("req1")
@@ -961,6 +986,46 @@ class TestMooncakeConnectorWorker(unittest.TestCase):
for p in self.patches:
p.stop() # type: ignore
def test_worker_use_ascend_direct(self):
test_case = [True, False]
for use_ascend_direct in test_case:
with self.subTest(use_ascend_direct=use_ascend_direct):
config = MagicMock()
config.kv_transfer_config = MagicMock()
config.kv_transfer_config.get_from_extra_config.side_effect = (
lambda k, d: {
"prefill": {
"tp_size": 2,
"dp_size": 1
},
"decode": {
"tp_size": 2,
"dp_size": 1
},
"use_ascend_direct": use_ascend_direct,
}.get(k, d))
config.parallel_config = MagicMock()
config.parallel_config.tensor_parallel_size = 2
config.parallel_config.data_parallel_rank_local = 0
config.parallel_config.data_parallel_size_local = 1
config.kv_transfer_config.kv_port = 8000
config.kv_transfer_config.kv_role = 'worker'
with patch(
"vllm_ascend.distributed.mooncake_connector.get_tensor_model_parallel_rank",
return_value=0):
with patch(
"vllm_ascend.distributed.mooncake_connector.get_tp_group",
return_value=None):
with patch(
"vllm_ascend.distributed.mooncake_connector.get_ip",
return_value="127.0.0.1"):
worker = MooncakeConnectorWorker(
config, self.engine_id)
self.assertIsNotNone(worker)
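As a readability aside, the triply nested with patch(...) blocks above can be flattened with contextlib.ExitStack; a sketch using the same patch targets (a style option, not part of this change):

import contextlib
from unittest.mock import patch

_PREFIX = "vllm_ascend.distributed.mooncake_connector"

def build_worker(config, engine_id):
    # Same three patches as the nested with-blocks above, flattened.
    with contextlib.ExitStack() as stack:
        stack.enter_context(
            patch(f"{_PREFIX}.get_tensor_model_parallel_rank",
                  return_value=0))
        stack.enter_context(patch(f"{_PREFIX}.get_tp_group",
                                  return_value=None))
        stack.enter_context(patch(f"{_PREFIX}.get_ip",
                                  return_value="127.0.0.1"))
        from vllm_ascend.distributed.mooncake_connector import (
            MooncakeConnectorWorker)
        return MooncakeConnectorWorker(config, engine_id)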
def test_register_kv_caches_producer(self):
worker = MooncakeConnectorWorker(self.vllm_config, self.engine_id)
worker.register_kv_caches(self.kv_caches)

View File

@@ -10,6 +10,7 @@ import torch
from vllm import SamplingParams
from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
ModelConfig, SchedulerConfig, VllmConfig)
from vllm.utils import sha256
from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
init_none_hash)
from vllm.v1.core.sched.scheduler import Scheduler
@@ -19,8 +20,6 @@ from vllm.v1.outputs import ModelRunnerOutput
from vllm.v1.request import Request
from vllm.v1.structured_output import StructuredOutputManager
from vllm_ascend.utils import vllm_version_is
EOS_TOKEN_ID = 50256
os.environ["VLLM_USE_V1"] = "1"
@@ -131,10 +130,10 @@ def create_request(
"""Make dummy request for testing."""
global _none_hash_initialized
if not _none_hash_initialized:
init_none_hash(hash)
init_none_hash(sha256)
_none_hash_initialized = True
block_hasher = get_request_block_hasher(block_size, hash)
block_hasher = get_request_block_hasher(block_size, sha256)
kv_transfer_params: Optional[dict[str, Any]] = None
@@ -160,27 +159,14 @@ def create_request(
else:
prompt_token_ids = [i * request_id for i in range(num_tokens)]
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
req = Request(
request_id=f"id-{request_id}",
prompt_token_ids=prompt_token_ids,
sampling_params=sampling_params,
multi_modal_kwargs=None,
multi_modal_placeholders=None,
multi_modal_hashes=None,
pooling_params=[],
eos_token_id=EOS_TOKEN_ID,
block_hasher=block_hasher,
)
else:
req = Request(
request_id=f"id-{request_id}",
prompt_token_ids=prompt_token_ids,
sampling_params=sampling_params,
pooling_params=[],
eos_token_id=EOS_TOKEN_ID,
block_hasher=block_hasher,
)
req = Request(
request_id=f"id-{request_id}",
prompt_token_ids=prompt_token_ids,
sampling_params=sampling_params,
pooling_params=[],
eos_token_id=EOS_TOKEN_ID,
block_hasher=block_hasher,
)
req.kv_transfer_params = kv_transfer_params
return req
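Note the hashing change above: seeding init_none_hash and get_request_block_hasher with vllm's sha256 instead of the builtin hash matters because Python salts hash() per process (PYTHONHASHSEED), so builtin-hash block hashes would not be reproducible across workers or runs. The relevant wiring, isolated (block_size=16 chosen only for illustration):

from vllm.utils import sha256
from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
                                         init_none_hash)

init_none_hash(sha256)  # stable across processes, unlike builtin hash()
block_hasher = get_request_block_hasher(16, sha256)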
@@ -208,26 +194,15 @@ def create_model_runner_output(
kv_connector_output = KVConnectorOutput(finished_sending=finished_sending,
finished_recving=finished_recving)
extra_args = {"kv_connector_output": kv_connector_output}
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_id_to_index,
sampled_token_ids=sampled_token_ids,
spec_token_ids=None,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[],
**extra_args,
)
else:
model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_id_to_index,
sampled_token_ids=sampled_token_ids,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[],
**extra_args,
)
model_runner_output = ModelRunnerOutput(
req_ids=req_ids,
req_id_to_index=req_id_to_index,
sampled_token_ids=sampled_token_ids,
logprobs=None,
prompt_logprobs_dict={},
pooler_output=[],
**extra_args,
)
return model_runner_output

114
tests/ut/models/conftest.py Normal file
View File

@@ -0,0 +1,114 @@
from types import SimpleNamespace
from unittest.mock import MagicMock, Mock, patch
import pytest
import torch
from transformers import PretrainedConfig
from vllm.config import CacheConfig, EPLBConfig, ParallelConfig
from vllm.distributed.parallel_state import GroupCoordinator
@pytest.fixture
def base_config():
config = PretrainedConfig(
hidden_size=128,
num_attention_heads=8,
num_hidden_layers=2,
intermediate_size=256,
hidden_act="silu",
rms_norm_eps=1e-6,
rope_theta=10000.0,
max_position_embeddings=2048,
n_routed_experts=4,
n_shared_experts=1,
moe_intermediate_size=256,
num_experts_per_tok=2,
routed_scaling_factor=1.0,
first_k_dense_replace=0,
moe_layer_freq=1,
kv_lora_rank=16,
qk_nope_head_dim=16,
qk_rope_head_dim=16,
v_head_dim=32,
topk_method="noaux_tc",
scoring_func="softmax",
norm_topk_prob=True,
n_group=1,
topk_group=1,
vocab_size=10000,
)
return config
@pytest.fixture
def vllm_config(base_config):
model_config = SimpleNamespace(
hf_config=base_config,
tensor_parallel_size=1,
dtype=torch.float32,
use_mla=True,
quant_config=None,
max_model_len=2048,
)
parallel_config = MagicMock(spec=ParallelConfig)
eplb_config = MagicMock(spec=EPLBConfig)
eplb_config.num_redundant_experts = 0
parallel_config.eplb_config = eplb_config
cache_config = CacheConfig()
vllm_config = Mock()
vllm_config.model_config = model_config
vllm_config.cache_config = cache_config
vllm_config.quant_config = None
vllm_config.parallel_config = parallel_config
return vllm_config
@pytest.fixture
def mock_distributed():
tp_group = Mock(spec=GroupCoordinator)
tp_group.rank_in_group = 0
tp_group.world_size = 1
tp_group.device_group = Mock()
dp_group = Mock(spec=GroupCoordinator)
dp_group.rank_in_group = 0
dp_group.world_size = 1
ep_group = Mock(spec=GroupCoordinator)
ep_group.rank_in_group = 0
ep_group.world_size = 1
ep_group.device_group = Mock()
ep_group.device_group.rank.return_value = 0
ep_group.device_group.size.return_value = 1
pp_group = Mock(spec=GroupCoordinator)
pp_group.rank_in_group = 0
pp_group.world_size = 1
pp_group.is_first_rank = False
pp_group.is_last_rank = False
mock_vllm_config = Mock()
mock_vllm_config.scheduler_config = Mock(max_num_seqs=256)
mock_vllm_config.model_config = Mock(max_model_len=2048, quant_config=None)
with patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_rank", return_value=0), \
patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_world_size", return_value=1), \
patch("vllm_ascend.models.deepseek_v2.get_tp_group", return_value=tp_group), \
patch("vllm_ascend.models.deepseek_v2.get_pp_group", return_value=pp_group), \
patch("vllm_ascend.models.deepseek_v2.get_pp_group",
return_value=Mock(is_first_rank=False, is_last_rank=False)), \
patch("vllm_ascend.ops.fused_moe.get_current_vllm_config", return_value=mock_vllm_config), \
patch("vllm_ascend.ops.moe.token_dispatcher.torch.distributed.get_rank", return_value=0), \
patch("vllm_ascend.ops.moe.token_dispatcher.get_ascend_soc_version", return_value=None), \
patch.dict("vllm.distributed.parallel_state.__dict__", _TP=tp_group, _EP=ep_group, _DP=dp_group,
_PP=pp_group), \
patch.dict("vllm_ascend.distributed.parallel_state.__dict__", _MC2=ep_group), \
patch("torch.npu.current_device", return_value=0):
yield
@pytest.fixture
def mock_forward_context():
forward_context = Mock(in_profile_run=False, with_prefill=False)
with patch("vllm_ascend.models.deepseek_v2.get_forward_context",
return_value=forward_context):
yield
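For orientation, these fixtures are consumed through pytest's name-based injection; a hypothetical test showing the wiring (illustrative, not part of this change):

def test_fixture_wiring(vllm_config, mock_distributed, mock_forward_context):
    # vllm_config carries the PretrainedConfig assembled in base_config
    hf_config = vllm_config.model_config.hf_config
    assert hf_config.n_routed_experts == 4
    assert hf_config.num_experts_per_tok == 2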

View File

@@ -13,10 +13,13 @@ from vllm_ascend.models.deepseek_mtp import (
class TestCustomDeepSeekMultiTokenPredictorLayer(PytestBase):
@pytest.fixture
def setup_mtp_layer(self, mocker: MockerFixture):
def setup_mtp_layer(self, mocker: MockerFixture, vllm_config: VllmConfig,
mock_distributed):
config = PretrainedConfig(vocab_size=1000,
hidden_size=768,
rms_norm_eps=1e-5)
mocker.patch("vllm_ascend.models.deepseek_mtp.get_current_vllm_config",
return_value=vllm_config)
mocker.patch(
"vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__",
return_value=None)
@@ -29,15 +32,15 @@ class TestCustomDeepSeekMultiTokenPredictorLayer(PytestBase):
"vllm_ascend.models.deepseek_mtp.CustomDeepSeekShareHead.__init__",
return_value=None)
mocker_deepseek_v2_decode_layer = mocker.patch(
"vllm_ascend.models.deepseek_v2.CustomDeepseekV2DecoderLayer.__init__",
"vllm.model_executor.models.deepseek_v2.DeepseekV2DecoderLayer.__init__",
return_value=None)
mocker.patch(
"vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
return_value=None)
mocker.patch("vllm_ascend.utils.get_ascend_config",
mocker.patch("vllm_ascend.models.deepseek_v2.get_ascend_config",
return_value=mocker.Mock())
mtp_layer = CustomDeepSeekMultiTokenPredictorLayer(config, "", None)
mtp_layer = CustomDeepSeekMultiTokenPredictorLayer(config, "0", None)
mocker_deepseek_v2_decode_layer.assert_called_once()
return mtp_layer
@@ -165,8 +168,6 @@ class TestCustomDeepSeekMTP(PytestBase):
mocker.patch(
"vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__call__",
return_value=None)
mocker.patch("vllm.model_executor.layers.sampler.get_sampler",
return_value=None)
mocker.patch(
"vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
return_value=None)

Some files were not shown because too many files have changed in this diff.