Merge pull request 'v0.11.0rc0' (#1) from v0.11.0rc0 into main
Reviewed-on: http://git.modelhub.org.cn:980/EngineX-Ascend/enginex-ascend-910-vllm/pulls/1
This commit is contained in:
@@ -15,7 +15,7 @@
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
|
||||
FROM git.modelhub.org.cn:9443/enginex-ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
||||
FROM quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11
|
||||
|
||||
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
||||
ARG COMPILE_CUSTOM_KERNELS=1
|
||||
@@ -37,7 +37,7 @@ RUN pip config set global.index-url ${PIP_INDEX_URL}
|
||||
|
||||
# Install vLLM
|
||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||
ARG VLLM_TAG=v0.10.1.1
|
||||
ARG VLLM_TAG=v0.11.0rc3
|
||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
|
||||
61
Dockerfile.310p
Normal file
61
Dockerfile.310p
Normal file
@@ -0,0 +1,61 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
|
||||
FROM quay.io/ascend/cann:8.2.rc1-310p-ubuntu22.04-py3.11
|
||||
|
||||
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
||||
ARG COMPILE_CUSTOM_KERNELS=1
|
||||
|
||||
# Define environments
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
|
||||
|
||||
RUN apt-get update -y && \
|
||||
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
|
||||
rm -rf /var/cache/apt/* && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
COPY . /vllm-workspace/vllm-ascend/
|
||||
|
||||
RUN pip config set global.index-url ${PIP_INDEX_URL}
|
||||
|
||||
# Install vLLM
|
||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||
ARG VLLM_TAG=v0.11.0rc3
|
||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip uninstall -y triton && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
# Install vllm-ascend
|
||||
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
|
||||
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
||||
export SOC_VERSION=ASCEND310P3 && \
|
||||
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
# Install modelscope (for fast download) and ray (for multinode)
|
||||
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
59
Dockerfile.310p.openEuler
Normal file
59
Dockerfile.310p.openEuler
Normal file
@@ -0,0 +1,59 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
|
||||
FROM quay.io/ascend/cann:8.2.rc1-310p-openeuler24.03-py3.11
|
||||
|
||||
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
||||
ARG COMPILE_CUSTOM_KERNELS=1
|
||||
|
||||
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
|
||||
|
||||
RUN yum update -y && \
|
||||
yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
|
||||
rm -rf /var/cache/yum
|
||||
|
||||
RUN pip config set global.index-url ${PIP_INDEX_URL}
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
COPY . /vllm-workspace/vllm-ascend/
|
||||
|
||||
# Install vLLM
|
||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||
ARG VLLM_TAG=v0.11.0rc3
|
||||
|
||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip uninstall -y triton && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
# Install vllm-ascend
|
||||
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
||||
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
|
||||
export SOC_VERSION=ASCEND310P3 && \
|
||||
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
# Install modelscope (for fast download) and ray (for multinode)
|
||||
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
60
Dockerfile.a3
Normal file
60
Dockerfile.a3
Normal file
@@ -0,0 +1,60 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
|
||||
FROM quay.io/ascend/cann:8.2.rc1-a3-ubuntu22.04-py3.11
|
||||
|
||||
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
||||
ARG COMPILE_CUSTOM_KERNELS=1
|
||||
|
||||
# Define environments
|
||||
ENV DEBIAN_FRONTEND=noninteractive
|
||||
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
|
||||
|
||||
RUN apt-get update -y && \
|
||||
apt-get install -y python3-pip git vim wget net-tools gcc g++ cmake libnuma-dev && \
|
||||
rm -rf /var/cache/apt/* && \
|
||||
rm -rf /var/lib/apt/lists/*
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
COPY . /vllm-workspace/vllm-ascend/
|
||||
|
||||
RUN pip config set global.index-url ${PIP_INDEX_URL}
|
||||
|
||||
# Install vLLM
|
||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||
ARG VLLM_TAG=v0.11.0rc3
|
||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -v -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip uninstall -y triton && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
# Install vllm-ascend
|
||||
# Append `libascend_hal.so` path (devlib) to LD_LIBRARY_PATH
|
||||
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
||||
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
# Install modelscope (for fast download) and ray (for multinode)
|
||||
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
58
Dockerfile.a3.openEuler
Normal file
58
Dockerfile.a3.openEuler
Normal file
@@ -0,0 +1,58 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
|
||||
FROM quay.io/ascend/cann:8.2.rc1-a3-openeuler24.03-py3.11
|
||||
|
||||
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
||||
ARG COMPILE_CUSTOM_KERNELS=1
|
||||
|
||||
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
|
||||
|
||||
RUN yum update -y && \
|
||||
yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
|
||||
rm -rf /var/cache/yum
|
||||
|
||||
RUN pip config set global.index-url ${PIP_INDEX_URL}
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
COPY . /vllm-workspace/vllm-ascend/
|
||||
|
||||
# Install vLLM
|
||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||
ARG VLLM_TAG=v0.11.0rc3
|
||||
|
||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip uninstall -y triton && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
# Install vllm-ascend
|
||||
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
||||
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
|
||||
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
# Install modelscope (for fast download) and ray (for multinode)
|
||||
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
58
Dockerfile.openEuler
Normal file
58
Dockerfile.openEuler
Normal file
@@ -0,0 +1,58 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
|
||||
FROM quay.io/ascend/cann:8.2.rc1-910b-openeuler24.03-py3.11
|
||||
|
||||
ARG PIP_INDEX_URL="https://mirrors.tuna.tsinghua.edu.cn/pypi/web/simple"
|
||||
ARG COMPILE_CUSTOM_KERNELS=1
|
||||
|
||||
ENV COMPILE_CUSTOM_KERNELS=${COMPILE_CUSTOM_KERNELS}
|
||||
|
||||
RUN yum update -y && \
|
||||
yum install -y python3-pip git vim wget net-tools gcc gcc-c++ make cmake numactl-devel && \
|
||||
rm -rf /var/cache/yum
|
||||
|
||||
RUN pip config set global.index-url ${PIP_INDEX_URL}
|
||||
|
||||
WORKDIR /workspace
|
||||
|
||||
COPY . /vllm-workspace/vllm-ascend/
|
||||
|
||||
# Install vLLM
|
||||
ARG VLLM_REPO=https://github.com/vllm-project/vllm.git
|
||||
ARG VLLM_TAG=v0.11.0rc3
|
||||
|
||||
RUN git clone --depth 1 $VLLM_REPO --branch $VLLM_TAG /vllm-workspace/vllm
|
||||
# In x86, triton will be installed by vllm. But in Ascend, triton doesn't work correctly. we need to uninstall it.
|
||||
RUN VLLM_TARGET_DEVICE="empty" python3 -m pip install -e /vllm-workspace/vllm/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip uninstall -y triton && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
# Install vllm-ascend
|
||||
RUN export PIP_EXTRA_INDEX_URL=https://mirrors.huaweicloud.com/ascend/repos/pypi && \
|
||||
source /usr/local/Ascend/ascend-toolkit/set_env.sh && \
|
||||
source /usr/local/Ascend/nnal/atb/set_env.sh && \
|
||||
export LD_LIBRARY_PATH=$LD_LIBRARY_PATH:/usr/local/Ascend/ascend-toolkit/latest/`uname -i`-linux/devlib && \
|
||||
export CPLUS_INCLUDE_PATH=$CPLUS_INCLUDE_PATH:/usr/include/c++/12:/usr/include/c++/12/`uname -i`-openEuler-linux && \
|
||||
python3 -m pip install -v -e /vllm-workspace/vllm-ascend/ --extra-index https://download.pytorch.org/whl/cpu/ && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
# Install modelscope (for fast download) and ray (for multinode)
|
||||
RUN python3 -m pip install modelscope 'ray>=2.47.1' 'protobuf>3.20.0' && \
|
||||
python3 -m pip cache purge
|
||||
|
||||
CMD ["/bin/bash"]
|
||||
@@ -4,7 +4,7 @@
|
||||
|
||||
## 镜像
|
||||
|
||||
Latest RC Version: git.modelhub.org.cn:9443/enginex-ascend/vllm-ascend:v0.10.0rc1
|
||||
Latest RC Version: git.modelhub.org.cn:9443/enginex-ascend/vllm-ascend:v0.11.0rc0
|
||||
|
||||
## 总览
|
||||
|
||||
@@ -77,5 +77,5 @@ curl -X POST http://localhost:10086/v1/chat/completions \
|
||||
|
||||
| Version | Release type | Doc |
|
||||
|------------|--------------|--------------------------------------|
|
||||
|v0.10.1rc1| 最新RC版本 |请查看[快速开始](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/latest/installation.html)了解更多|
|
||||
|v0.9.1| 最新正式/稳定版本 |[快速开始](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [安装指南](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html)了解更多|
|
||||
|v0.11.0rc0| 最新RC版本 |请查看[快速开始](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/latest/installation.html)了解更多|
|
||||
|v0.9.1| 最新正式/稳定版本 |[快速开始](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [安装指南](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html)了解更多|
|
||||
|
||||
@@ -42,7 +42,7 @@ By using vLLM Ascend plugin, popular open-source models, including Transformer-l
|
||||
- OS: Linux
|
||||
- Software:
|
||||
* Python >= 3.9, < 3.12
|
||||
* CANN >= 8.2.rc1
|
||||
* CANN >= 8.2.rc1 (Ascend HDK version refers to [here](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html))
|
||||
* PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724
|
||||
* vLLM (the same version as vllm-ascend)
|
||||
|
||||
@@ -52,7 +52,7 @@ Please use the following recommended versions to get started quickly:
|
||||
|
||||
| Version | Release type | Doc |
|
||||
|------------|--------------|--------------------------------------|
|
||||
|v0.10.1rc1|Latest release candidate|[QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details|
|
||||
|v0.11.0rc0|Latest release candidate|[QuickStart](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/latest/installation.html) for more details|
|
||||
|v0.9.1|Latest stable version|[QuickStart](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [Installation](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html) for more details|
|
||||
|
||||
## Contributing
|
||||
@@ -73,7 +73,7 @@ Below is maintained branches:
|
||||
|
||||
| Branch | Status | Note |
|
||||
|------------|--------------|--------------------------------------|
|
||||
| main | Maintained | CI commitment for vLLM main branch and vLLM 0.10.x branch |
|
||||
| main | Maintained | CI commitment for vLLM main branch and vLLM v0.11.0 tag |
|
||||
| v0.7.1-dev | Unmaintained | Only doc fixed is allowed |
|
||||
| v0.7.3-dev | Maintained | CI commitment for vLLM 0.7.3 version, only bug fix is allowed and no new release tag any more. |
|
||||
| v0.9.1-dev | Maintained | CI commitment for vLLM 0.9.1 version |
|
||||
|
||||
@@ -43,7 +43,7 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NP
|
||||
- 操作系统:Linux
|
||||
- 软件:
|
||||
* Python >= 3.9, < 3.12
|
||||
* CANN >= 8.2.rc1
|
||||
* CANN >= 8.2.rc1 (Ascend HDK 版本参考[这里](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html))
|
||||
* PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724
|
||||
* vLLM (与vllm-ascend版本一致)
|
||||
|
||||
@@ -53,7 +53,7 @@ vLLM 昇腾插件 (`vllm-ascend`) 是一个由社区维护的让vLLM在Ascend NP
|
||||
|
||||
| Version | Release type | Doc |
|
||||
|------------|--------------|--------------------------------------|
|
||||
|v0.10.1rc1| 最新RC版本 |请查看[快速开始](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/latest/installation.html)了解更多|
|
||||
|v0.11.0rc0| 最新RC版本 |请查看[快速开始](https://vllm-ascend.readthedocs.io/en/latest/quick_start.html)和[安装指南](https://vllm-ascend.readthedocs.io/en/latest/installation.html)了解更多|
|
||||
|v0.9.1| 最新正式/稳定版本 |[快速开始](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/quick_start.html) and [安装指南](https://vllm-ascend.readthedocs.io/en/v0.9.1-dev/installation.html)了解更多|
|
||||
|
||||
## 贡献
|
||||
@@ -73,7 +73,7 @@ vllm-ascend有主干分支和开发分支。
|
||||
|
||||
| 分支 | 状态 | 备注 |
|
||||
|------------|------------|---------------------|
|
||||
| main | Maintained | 基于vLLM main分支CI看护 |
|
||||
| main | Maintained | 基于vLLM main分支和vLLM最新版本(v0.11.0)CI看护 |
|
||||
| v0.7.1-dev | Unmaintained | 只允许文档修复 |
|
||||
| v0.7.3-dev | Maintained | 基于vLLM v0.7.3版本CI看护, 只允许Bug修复,不会再发布新版本 |
|
||||
| v0.9.1-dev | Maintained | 基于vLLM v0.9.1版本CI看护 |
|
||||
|
||||
@@ -112,7 +112,7 @@ def test_get_masked_input_and_mask(
|
||||
|
||||
# Define custom function
|
||||
def custom_fn():
|
||||
return torch.ops._C.get_masked_input_and_mask(
|
||||
return torch.ops._C_ascend.get_masked_input_and_mask(
|
||||
input_tensor,
|
||||
test_case["org_start"],
|
||||
test_case["org_end"],
|
||||
|
||||
@@ -78,7 +78,9 @@ kill_npu_processes() {
|
||||
ps -aux
|
||||
lsof -t -i:8000 | xargs -r kill -9
|
||||
pgrep python3 | xargs -r kill -9
|
||||
|
||||
# vLLM now names the process with VLLM prefix after https://github.com/vllm-project/vllm/pull/21445
|
||||
pgrep VLLM | xargs -r kill -9
|
||||
|
||||
sleep 4
|
||||
rm -rf ~/.config/vllm
|
||||
|
||||
|
||||
@@ -23,7 +23,8 @@
|
||||
"hf_split": "train",
|
||||
"endpoint": "/v1/chat/completions",
|
||||
"dataset_path": "lmarena-ai/vision-arena-bench-v0.1",
|
||||
"num_prompts": 200
|
||||
"num_prompts": 200,
|
||||
"no_stream": ""
|
||||
}
|
||||
},
|
||||
{
|
||||
|
||||
@@ -20,7 +20,6 @@
|
||||
#include <torch_npu/csrc/core/npu/NPUStream.h>
|
||||
#include <torch_npu/csrc/framework/OpCommand.h>
|
||||
#include <torch_npu/csrc/npu/Module.h>
|
||||
#include <pybind11/pybind11.h>
|
||||
#include "acl/acl.h"
|
||||
#include "ops.h"
|
||||
#include "utils.h"
|
||||
@@ -142,7 +141,7 @@ std::tuple<at::Tensor, at::Tensor> get_masked_input_and_mask(
|
||||
TP2, rank 1:
|
||||
|< -----------BASE----------- >|< -BASE PADDING- >|< -----------LORA PADDING----------- >|
|
||||
corresponding token_id: | 512 | 513 | 514 | ... | 1009 | -1 | ... | -1 | -1 | ... | -1 | -1 | ... | -1 |
|
||||
index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 519 | 520 | ... | 543 |
|
||||
index: | 0 | 1 | 2 | ... | 497 | 498 | ... | 511 | 512 | ... | 519 | 520 | ... | 543 |
|
||||
Parameters:
|
||||
org_vocab_start_index //base embeddings start
|
||||
org_vocab_end_index //base embeddings end
|
||||
@@ -165,22 +164,22 @@ std::tuple<at::Tensor, at::Tensor> get_masked_input_and_mask(
|
||||
// Create output tensors
|
||||
at::Tensor masked_input = at::empty_like(input);
|
||||
at::Tensor mask = at::empty_like(input).to(at::kBool);
|
||||
|
||||
|
||||
// Get data pointers
|
||||
void *input_ptr = input.data_ptr();
|
||||
void *masked_input_ptr = masked_input.data_ptr();
|
||||
void *mask_ptr = mask.data_ptr();
|
||||
|
||||
|
||||
// Get current stream
|
||||
aclrtStream stream = c10_npu::getCurrentNPUStream().stream();
|
||||
|
||||
|
||||
// Get scalar type
|
||||
at::ScalarType scalar_type = input.scalar_type();
|
||||
|
||||
|
||||
// Create and configure OpCommand
|
||||
at_npu::native::OpCommand cmd;
|
||||
cmd.Name("get_masked_input_and_mask");
|
||||
cmd.SetCustomHandler([scalar_type, size, stream,
|
||||
cmd.SetCustomHandler([scalar_type, size, stream,
|
||||
input_ptr, masked_input_ptr, mask_ptr,
|
||||
org_vocab_start_index, org_vocab_end_index,
|
||||
num_org_vocab_padding, added_vocab_start_index,
|
||||
@@ -194,7 +193,7 @@ std::tuple<at::Tensor, at::Tensor> get_masked_input_and_mask(
|
||||
get_masked_input_and_mask_impl(
|
||||
stream,
|
||||
input_ptr,
|
||||
masked_input_ptr,
|
||||
masked_input_ptr,
|
||||
mask_ptr,
|
||||
org_vocab_start_index,
|
||||
org_vocab_end_index,
|
||||
@@ -204,7 +203,7 @@ std::tuple<at::Tensor, at::Tensor> get_masked_input_and_mask(
|
||||
size,
|
||||
loop_cnt,
|
||||
aiv_num);
|
||||
|
||||
|
||||
return 0;
|
||||
});
|
||||
cmd.Run();
|
||||
@@ -321,8 +320,8 @@ void sgmv_shrink(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indices, at
|
||||
aclrtStream stream = c10_npu::getCurrentNPUStream().stream();
|
||||
at_npu::native::OpCommand cmd;
|
||||
cmd.Name("sgmv_shrink");
|
||||
cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size,
|
||||
seq_len_ptr, seq_len_size, y_ptr,
|
||||
cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size,
|
||||
seq_len_ptr, seq_len_size, y_ptr,
|
||||
batch_size, input_hidden_token, lora_rank, scale_f]() -> int {
|
||||
auto dtype = get_dtype_from_torch(scalar_type);
|
||||
int device_id = 0;
|
||||
@@ -331,7 +330,7 @@ void sgmv_shrink(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indices, at
|
||||
int num_tokens_per_core = (batch_size + aiv_num - 1) / aiv_num;
|
||||
TORCH_CHECK("num_tokens_per_core != 0", "num_tokens_per_core should not be 0");
|
||||
sgmv_shrink_impl(dtype, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr, seq_len_size,
|
||||
y_ptr, batch_size,
|
||||
y_ptr, batch_size,
|
||||
num_tokens_per_core, input_hidden_token, lora_rank, scale_f);
|
||||
return 0;
|
||||
});
|
||||
@@ -368,7 +367,7 @@ at::Tensor sgmv_expand(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indic
|
||||
aclrtStream stream = c10_npu::getCurrentNPUStream().stream();
|
||||
at_npu::native::OpCommand cmd;
|
||||
cmd.Name("sgmv_expand");
|
||||
cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr, seq_len_size, y_ptr, y_out_ptr,
|
||||
cmd.SetCustomHandler([scalar_type, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr, seq_len_size, y_ptr, y_out_ptr,
|
||||
batch_size, lora_rank, slice_offset, slice_size, output_full_dim]() -> int {
|
||||
auto dtype = get_dtype_from_torch(scalar_type);
|
||||
int device_id = 0;
|
||||
@@ -376,7 +375,7 @@ at::Tensor sgmv_expand(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indic
|
||||
TORCH_CHECK(aclGetDeviceCapability(device_id, ACL_DEVICE_INFO_VECTOR_CORE_NUM, &aiv_num) == ACL_SUCCESS);
|
||||
int num_tokens_per_core = (batch_size + aiv_num - 1) / aiv_num;
|
||||
TORCH_CHECK("num_tokens_per_core != 0", "num_tokens_per_core should not be 0");
|
||||
sgmv_expand_impl(dtype, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr, seq_len_size, y_ptr, y_out_ptr,
|
||||
sgmv_expand_impl(dtype, stream, x_ptr, weight_ptr, lora_indices_ptr, lora_indices_size, seq_len_ptr, seq_len_size, y_ptr, y_out_ptr,
|
||||
batch_size, num_tokens_per_core, lora_rank, slice_size, slice_offset, output_full_dim);
|
||||
return 0;
|
||||
});
|
||||
@@ -385,7 +384,7 @@ at::Tensor sgmv_expand(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_indic
|
||||
}
|
||||
} // namespace vllm_ascend
|
||||
|
||||
TORCH_LIBRARY_EXPAND(_C, ops)
|
||||
TORCH_LIBRARY_EXPAND(CONCAT(_C, _ascend), ops)
|
||||
{
|
||||
// vLLM-Ascend custom ops
|
||||
ops.def("weak_ref_tensor(Tensor input) -> Tensor");
|
||||
@@ -424,5 +423,3 @@ TORCH_LIBRARY_EXPAND(_C, ops)
|
||||
" int slice_offset, int slice_size) -> Tensor");
|
||||
ops.impl("sgmv_expand", torch::kPrivateUse1, &vllm_ascend::sgmv_expand);
|
||||
}
|
||||
|
||||
REGISTER_EXTENSION(_C)
|
||||
|
||||
@@ -40,7 +40,7 @@ std::tuple<at::Tensor, at::Tensor> rotary_embedding_meta(
|
||||
at::Tensor &positions,
|
||||
at::Tensor &query,
|
||||
at::Tensor &key,
|
||||
int64_t head_size,
|
||||
int64_t head_size,
|
||||
at::Tensor &cos_sin_cache,
|
||||
bool is_neox) {
|
||||
auto num_tokens = positions.sym_numel();
|
||||
@@ -86,9 +86,9 @@ at::Tensor sgmv_expand_meta(at::Tensor &x, at::Tensor &weight, at::Tensor &lora_
|
||||
} // namespace vllm_ascend
|
||||
|
||||
namespace {
|
||||
// Register the meta implementations of the custom kernels for symbolic tracing, this will also
|
||||
// Register the meta implementations of the custom kernels for symbolic tracing, this will also
|
||||
// the custom kernel been captured into aclgraph
|
||||
TORCH_LIBRARY_IMPL_EXPAND(_C, Meta, ops) {
|
||||
TORCH_LIBRARY_IMPL_EXPAND(CONCAT(_C, _ascend), Meta, ops) {
|
||||
// Rotary embedding meta implementation
|
||||
ops.impl("rotary_embedding", &vllm_ascend::meta::rotary_embedding_meta);
|
||||
// Masked input and mask meta implementation
|
||||
@@ -99,4 +99,4 @@ namespace {
|
||||
ops.impl("sgmv_expand", &vllm_ascend::meta::sgmv_expand_meta);
|
||||
|
||||
}
|
||||
}
|
||||
}
|
||||
|
||||
@@ -22,6 +22,8 @@ Following is the Release Compatibility Matrix for vLLM Ascend Plugin:
|
||||
|
||||
| vLLM Ascend | vLLM | Python | Stable CANN | PyTorch/torch_npu | MindIE Turbo |
|
||||
|-------------|--------------|------------------|-------------|--------------------|--------------|
|
||||
| v0.11.0rc0 | v0.11.0rc3 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | |
|
||||
| v0.10.2rc1 | v0.10.2 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | |
|
||||
| v0.10.1rc1 | v0.10.1/v0.10.1.1 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | |
|
||||
| v0.10.0rc1 | v0.10.0 | >= 3.9, < 3.12 | 8.2.RC1 | 2.7.1 / 2.7.1.dev20250724 | |
|
||||
| v0.9.2rc1 | v0.9.2 | >= 3.9, < 3.12 | 8.1.RC1 | 2.5.1 / 2.5.1.post1.dev20250619 | |
|
||||
@@ -42,6 +44,8 @@ Following is the Release Compatibility Matrix for vLLM Ascend Plugin:
|
||||
|
||||
| Date | Event |
|
||||
|------------|-------------------------------------------|
|
||||
| 2025.09.30 | Release candidates, v0.11.0rc0 |
|
||||
| 2025.09.16 | Release candidates, v0.10.2rc1 |
|
||||
| 2025.09.04 | Release candidates, v0.10.1rc1 |
|
||||
| 2025.09.03 | v0.9.1 Final release |
|
||||
| 2025.08.22 | Release candidates, v0.9.1rc3 |
|
||||
|
||||
@@ -65,19 +65,19 @@ myst_substitutions = {
|
||||
# the branch of vllm, used in vllm clone
|
||||
# - main branch: 'main'
|
||||
# - vX.Y.Z branch: 'vX.Y.Z'
|
||||
'vllm_version': 'v0.10.1.1',
|
||||
'vllm_version': 'v0.11.0rc3',
|
||||
# the branch of vllm-ascend, used in vllm-ascend clone and image tag
|
||||
# - main branch: 'main'
|
||||
# - vX.Y.Z branch: latest vllm-ascend release tag
|
||||
'vllm_ascend_version': 'v0.10.1rc1',
|
||||
'vllm_ascend_version': 'v0.11.0rc0',
|
||||
# the newest release version of vllm-ascend and matched vLLM, used in pip install.
|
||||
# This value should be updated when cut down release.
|
||||
'pip_vllm_ascend_version': "0.10.1rc1",
|
||||
'pip_vllm_version': "0.10.1.1",
|
||||
'pip_vllm_ascend_version': "0.11.0rc0",
|
||||
'pip_vllm_version': "0.11.0",
|
||||
# CANN image tag
|
||||
'cann_image_tag': "8.2.rc1-910b-ubuntu22.04-py3.11",
|
||||
# vllm version in ci
|
||||
'ci_vllm_version': 'v0.10.1.1',
|
||||
'ci_vllm_version': 'v0.11.0rc3',
|
||||
}
|
||||
|
||||
# Add any paths that contain templates here, relative to this directory.
|
||||
|
||||
@@ -0,0 +1,20 @@
|
||||
# deepseek-ai/DeepSeek-V2-Lite
|
||||
|
||||
- **vLLM Version**: vLLM: 0.10.1.1 ([1da94e6](https://github.com/vllm-project/vllm/commit/1da94e6)), **vLLM Ascend Version**: v0.10.1rc1 ([7e16b4a](https://github.com/vllm-project/vllm-ascend/commit/7e16b4a))
|
||||
- **Software Environment**: **CANN**: 8.2.RC1, **PyTorch**: 2.7.1, **torch-npu**: 2.7.1.dev20250724
|
||||
- **Hardware Environment**: Atlas A2 Series
|
||||
- **Parallel mode**: TP2
|
||||
- **Execution mode**: ACLGraph
|
||||
|
||||
**Command**:
|
||||
|
||||
```bash
|
||||
export MODEL_ARGS='pretrained=deepseek-ai/DeepSeek-V2-Lite,tensor_parallel_size=2,dtype=auto,trust_remote_code=True,max_model_len=4096,enforce_eager=True'
|
||||
lm_eval --model vllm --model_args $MODEL_ARGS --tasks gsm8k \
|
||||
--batch_size auto
|
||||
```
|
||||
|
||||
| Task | Metric | Value | Stderr |
|
||||
|-----------------------|-------------|----------:|-------:|
|
||||
| gsm8k | exact_match,strict-match | ✅0.3813 | ± 0.0134 |
|
||||
| gsm8k | exact_match,flexible-extract | ✅0.3836 | ± 0.0134 |
|
||||
@@ -0,0 +1,19 @@
|
||||
# Qwen/Qwen2.5-VL-7B-Instruct
|
||||
|
||||
- **vLLM Version**: vLLM: 0.10.1.1 ([1da94e6](https://github.com/vllm-project/vllm/commit/1da94e6)), **vLLM Ascend Version**: v0.10.1rc1 ([7e16b4a](https://github.com/vllm-project/vllm-ascend/commit/7e16b4a))
|
||||
- **Software Environment**: **CANN**: 8.2.RC1, **PyTorch**: 2.7.1, **torch-npu**: 2.7.1.dev20250724
|
||||
- **Hardware Environment**: Atlas A2 Series
|
||||
- **Parallel mode**: TP1
|
||||
- **Execution mode**: ACLGraph
|
||||
|
||||
**Command**:
|
||||
|
||||
```bash
|
||||
export MODEL_ARGS='pretrained=Qwen/Qwen2.5-VL-7B-Instruct,tensor_parallel_size=1,dtype=auto,trust_remote_code=False,max_model_len=8192'
|
||||
lm_eval --model vllm-vlm --model_args $MODEL_ARGS --tasks mmmu_val \
|
||||
--apply_chat_template True --fewshot_as_multiturn True --batch_size auto
|
||||
```
|
||||
|
||||
| Task | Metric | Value | Stderr |
|
||||
|-----------------------|-------------|----------:|-------:|
|
||||
| mmmu_val | acc,none | ✅0.52 | ± 0.0162 |
|
||||
@@ -0,0 +1,21 @@
|
||||
# Qwen/Qwen3-30B-A3B
|
||||
|
||||
- **vLLM Version**: vLLM: 0.10.1.1 ([1da94e6](https://github.com/vllm-project/vllm/commit/1da94e6)), **vLLM Ascend Version**: v0.10.1rc1 ([7e16b4a](https://github.com/vllm-project/vllm-ascend/commit/7e16b4a))
|
||||
- **Software Environment**: **CANN**: 8.2.RC1, **PyTorch**: 2.7.1, **torch-npu**: 2.7.1.dev20250724
|
||||
- **Hardware Environment**: Atlas A2 Series
|
||||
- **Parallel mode**: TP2 + EP
|
||||
- **Execution mode**: ACLGraph
|
||||
|
||||
**Command**:
|
||||
|
||||
```bash
|
||||
export MODEL_ARGS='pretrained=Qwen/Qwen3-30B-A3B,tensor_parallel_size=2,dtype=auto,trust_remote_code=False,max_model_len=4096,gpu_memory_utilization=0.6,enable_expert_parallel=True'
|
||||
lm_eval --model vllm --model_args $MODEL_ARGS --tasks gsm8k,ceval-valid \
|
||||
--num_fewshot 5 --batch_size auto
|
||||
```
|
||||
|
||||
| Task | Metric | Value | Stderr |
|
||||
|-----------------------|-------------|----------:|-------:|
|
||||
| gsm8k | exact_match,strict-match | ✅0.8923 | ± 0.0085 |
|
||||
| gsm8k | exact_match,flexible-extract | ✅0.8506 | ± 0.0098 |
|
||||
| ceval-valid | acc,none | ✅0.8358 | ± 0.0099 |
|
||||
@@ -0,0 +1,21 @@
|
||||
# Qwen/Qwen3-8B-Base
|
||||
|
||||
- **vLLM Version**: vLLM: 0.10.1.1 ([1da94e6](https://github.com/vllm-project/vllm/commit/1da94e6)), **vLLM Ascend Version**: v0.10.1rc1 ([7e16b4a](https://github.com/vllm-project/vllm-ascend/commit/7e16b4a))
|
||||
- **Software Environment**: **CANN**: 8.2.RC1, **PyTorch**: 2.7.1, **torch-npu**: 2.7.1.dev20250724
|
||||
- **Hardware Environment**: Atlas A2 Series
|
||||
- **Parallel mode**: TP1
|
||||
- **Execution mode**: ACLGraph
|
||||
|
||||
**Command**:
|
||||
|
||||
```bash
|
||||
export MODEL_ARGS='pretrained=Qwen/Qwen3-8B-Base,tensor_parallel_size=1,dtype=auto,trust_remote_code=False,max_model_len=4096'
|
||||
lm_eval --model vllm --model_args $MODEL_ARGS --tasks gsm8k,ceval-valid \
|
||||
--apply_chat_template True --fewshot_as_multiturn True --num_fewshot 5 --batch_size auto
|
||||
```
|
||||
|
||||
| Task | Metric | Value | Stderr |
|
||||
|-----------------------|-------------|----------:|-------:|
|
||||
| gsm8k | exact_match,strict-match | ✅0.8271 | ± 0.0104 |
|
||||
| gsm8k | exact_match,flexible-extract | ✅0.8294 | ± 0.0104 |
|
||||
| ceval-valid | acc,none | ✅0.815 | ± 0.0103 |
|
||||
@@ -3,4 +3,8 @@
|
||||
:::{toctree}
|
||||
:caption: Accuracy Report
|
||||
:maxdepth: 1
|
||||
DeepSeek-V2-Lite
|
||||
Qwen2.5-VL-7B-Instruct
|
||||
Qwen3-30B-A3B
|
||||
Qwen3-8B-Base
|
||||
:::
|
||||
|
||||
@@ -61,7 +61,6 @@ from torch import nn
|
||||
from vllm.attention import Attention
|
||||
from vllm.config import VllmConfig
|
||||
from vllm.sequence import IntermediateTensors
|
||||
from vllm.model_executor.sampling_metadata import SamplingMetadata
|
||||
|
||||
class CustomAttention(nn.Module):
|
||||
def __init__(self, vllm_config: VllmConfig, prefix: str):
|
||||
|
||||
@@ -3,7 +3,7 @@
|
||||
## Version Specific FAQs
|
||||
|
||||
- [[v0.9.1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/2643)
|
||||
- [[v0.10.1rc1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/2630)
|
||||
- [[v0.11.0rc1] FAQ & Feedback](https://github.com/vllm-project/vllm-ascend/issues/3222)
|
||||
|
||||
## General FAQs
|
||||
|
||||
@@ -196,3 +196,21 @@ export ATB_LLM_LCOC_ENABLE=0
|
||||
### 19. How to fix the error "ImportError: Please install vllm[audio] for audio support" for Qwen2.5-Omni model?
|
||||
The `Qwen2.5-Omni` model requires the `librosa` package to be installed. You need to install the `qwen-omni-utils` package to ensure all dependencies are met (`pip install qwen-omni-utils`);
|
||||
this package will install `librosa` and its related dependencies, resolving the `ImportError: No module named 'librosa'` issue and ensuring audio processing functionality works correctly.
|
||||
|
||||
### 20. How to troubleshoot and resolve size capture failures resulting from stream resource exhaustion, and what are the underlying causes?
|
||||
|
||||
```
|
||||
error example in detail:
|
||||
ERROR 09-26 10:48:07 [model_runner_v1.py:3029] ACLgraph sizes capture fail: RuntimeError:
|
||||
ERROR 09-26 10:48:07 [model_runner_v1.py:3029] ACLgraph has insufficient available streams to capture the configured number of sizes.Please verify both the availability of adequate streams and the appropriateness of the configured size count.
|
||||
```
|
||||
|
||||
Recommended mitigation strategies:
|
||||
1. Manually configure the compilation_config parameter with a reduced size set: '{"cudagraph_capture_sizes":[size1, size2, size3, ...]}'.
|
||||
2. Employ ACLgraph's full graph mode as an alternative to the piece-wise approach.
|
||||
|
||||
Root cause analysis:
|
||||
The current stream requirement calculation for size captures only accounts for measurable factors including: data parallel size, tensor parallel size, expert parallel configuration, piece graph count, multistream overlap shared expert settings, and HCCL communication mode (AIV/AICPU). However, numerous unquantifiable elements - such as operator characteristics and specific hardware features - consume additional streams outside of this calculation framework, resulting in stream resource exhaustion during size capture operations.
|
||||
|
||||
### 21. Installing vllm-ascend will overwrite the existing torch-npu package?
|
||||
Installing vllm-ascend will overwrite the existing torch-npu package. If you need to install a specific version of torch-npu, you can manually install the specified version of torch-npu after installing vllm-ascend.
|
||||
|
||||
@@ -11,6 +11,7 @@ This document describes how to install vllm-ascend manually.
|
||||
|
||||
| Software | Supported version | Note |
|
||||
|---------------|----------------------------------|-------------------------------------------|
|
||||
| Ascend HDK | Refer to [here](https://www.hiascend.com/document/detail/zh/canncommercial/82RC1/releasenote/releasenote_0000.html) | Required for CANN |
|
||||
| CANN | >= 8.2.RC1 | Required for vllm-ascend and torch-npu |
|
||||
| torch-npu | >= 2.7.1.dev20250724 | Required for vllm-ascend, No need to install manually, it will be auto installed in below steps |
|
||||
| torch | >= 2.7.1 | Required for torch-npu and vllm |
|
||||
|
||||
@@ -148,10 +148,6 @@ msgid ""
|
||||
" to be passed in."
|
||||
msgstr "在为MOE模型使用专家负载均衡时,需要传入专家映射路径。"
|
||||
|
||||
#: ../../user_guide/configuration/additional_config.md
|
||||
msgid "`chunked_prefill_for_mla`"
|
||||
msgstr "`chunked_prefill_for_mla`"
|
||||
|
||||
#: ../../user_guide/configuration/additional_config.md
|
||||
msgid "`False`"
|
||||
msgstr "`False`"
|
||||
@@ -199,8 +195,8 @@ msgid ""
|
||||
msgstr "是否将MLA的向量操作放到另一个流中。此选项仅对使用MLA的模型(例如,DeepSeek)有效。"
|
||||
|
||||
#: ../../user_guide/configuration/additional_config.md
|
||||
msgid "`enable_multistream_moe`"
|
||||
msgstr "`enable_multistream_moe`"
|
||||
msgid "`multistream_overlap_shared_expert`"
|
||||
msgstr "`multistream_overlap_shared_expert`"
|
||||
|
||||
#: ../../user_guide/configuration/additional_config.md
|
||||
msgid ""
|
||||
|
||||
@@ -8,6 +8,7 @@ single_npu_multimodal
|
||||
single_npu_audio
|
||||
single_npu_qwen3_embedding
|
||||
single_npu_qwen3_quantization
|
||||
multi_npu_qwen3_next
|
||||
multi_npu
|
||||
multi_npu_moge
|
||||
multi_npu_qwen3_moe
|
||||
@@ -15,4 +16,7 @@ multi_npu_quantization
|
||||
single_node_300i
|
||||
multi_node
|
||||
multi_node_kimi
|
||||
multi_node_qwen3vl
|
||||
multi_node_pd_disaggregation
|
||||
multi_node_ray
|
||||
:::
|
||||
|
||||
244
docs/source/tutorials/multi_node_pd_disaggregation.md
Normal file
244
docs/source/tutorials/multi_node_pd_disaggregation.md
Normal file
@@ -0,0 +1,244 @@
|
||||
# Prefill-Decode Disaggregation Verification (Qwen)
|
||||
|
||||
## Getting Started
|
||||
|
||||
vLLM-Ascend now supports prefill-decode (PD) disaggregation with EP (Expert Parallel) options. This guide provides step-by-step instructions to verify these features with constrained resources.
|
||||
|
||||
Take the Qwen3-30B-A3B model as an example, use vllm-ascend v0.10.1rc1 (with vLLM v0.10.1.1) on 3 Atlas 800T A2 servers to deploy the "1P2D" architecture. Assume the ip of the prefiller server is 192.0.0.1, and the decoder servers are 192.0.0.2 (decoder 1) and 192.0.0.3 (decoder 2). On each server, use 2 NPUs to deploy one service instance.
|
||||
|
||||
## Verify Multi-Node Communication Environment
|
||||
|
||||
### Physical Layer Requirements
|
||||
|
||||
- The physical machines must be located on the same LAN, with network connectivity.
|
||||
- All NPUs must be interconnected. Intra-node connectivity is via HCCS, and inter-node connectivity is via RDMA.
|
||||
|
||||
### Verification Process
|
||||
|
||||
1. Single Node Verification:
|
||||
|
||||
Execute the following commands on each node in sequence. The results must all be `success` and the status must be `UP`:
|
||||
|
||||
```bash
|
||||
# Check the remote switch ports
|
||||
for i in {0..7}; do hccn_tool -i $i -lldp -g | grep Ifname; done
|
||||
# Get the link status of the Ethernet ports (UP or DOWN)
|
||||
for i in {0..7}; do hccn_tool -i $i -link -g ; done
|
||||
# Check the network health status
|
||||
for i in {0..7}; do hccn_tool -i $i -net_health -g ; done
|
||||
# View the network detected IP configuration
|
||||
for i in {0..7}; do hccn_tool -i $i -netdetect -g ; done
|
||||
# View gateway configuration
|
||||
for i in {0..7}; do hccn_tool -i $i -gateway -g ; done
|
||||
# View NPU network configuration
|
||||
cat /etc/hccn.conf
|
||||
```
|
||||
|
||||
2. Get NPU IP Addresses
|
||||
|
||||
```bash
|
||||
for i in {0..7}; do hccn_tool -i $i -ip -g;done
|
||||
```
|
||||
|
||||
3. Cross-Node PING Test
|
||||
|
||||
```bash
|
||||
# Execute on the target node (replace 'x.x.x.x' with actual npu ip address)
|
||||
for i in {0..7}; do hccn_tool -i $i -ping -g address x.x.x.x;done
|
||||
```
|
||||
|
||||
## Generate Ranktable
|
||||
|
||||
The rank table is a JSON file that specifies the mapping of Ascend NPU ranks to nodes. For more details please refer to the [vllm-ascend examples](https://github.com/vllm-project/vllm-ascend/blob/main/examples/disaggregated_prefill_v1/README.md). Execute the following commands for reference.
|
||||
|
||||
```shell
|
||||
cd vllm-ascend/examples/disaggregate_prefill_v1/
|
||||
bash gen_ranktable.sh --ips <prefiller_node1_local_ip> <prefiller_node2_local_ip> <decoder_node1_local_ip> <decoder_node2_local_ip> \
|
||||
--npus-per-node <npu_clips> --network-card-name <nic_name> --prefill-device-cnt <prefiller_npu_clips> --decode-device-cnt <decode_npu_clips> \
|
||||
[--local-device-ids <id_1>,<id_2>,<id_3>...]
|
||||
```
|
||||
|
||||
Assume that we use device 0,1 on the prefiller server node and device 6,7 on both of the decoder server nodes. Take the following commands as an example. (`--local-device-ids` is necessary if you specify certain NPU devices on the local server.)
|
||||
|
||||
```shell
|
||||
# On the prefiller node
|
||||
cd vllm-ascend/examples/disaggregate_prefill_v1/
|
||||
bash gen_ranktable.sh --ips 192.0.0.1 192.0.0.2 192.0.0.3 \
|
||||
--npus-per-node 2 --network-card-name eth0 --prefill-device-cnt 2 --decode-device-cnt 4 --local-device-ids 0,1
|
||||
|
||||
# On the decoder 1
|
||||
cd vllm-ascend/examples/disaggregate_prefill_v1/
|
||||
bash gen_ranktable.sh --ips 192.0.0.1 192.0.0.2 192.0.0.3 \
|
||||
--npus-per-node 2 --network-card-name eth0 --prefill-device-cnt 2 --decode-device-cnt 4 --local-device-ids 6,7
|
||||
|
||||
# On the decoder 2
|
||||
cd vllm-ascend/examples/disaggregate_prefill_v1/
|
||||
bash gen_ranktable.sh --ips 192.0.0.1 192.0.0.2 192.0.0.3 \
|
||||
--npus-per-node 2 --network-card-name eth0 --prefill-device-cnt 2 --decode-device-cnt 4 --local-device-ids 6,7
|
||||
```
|
||||
|
||||
The rank table will be generated at /vllm-workspace/vllm-ascend/examples/disaggregate_prefill_v1/ranktable.json
|
||||
|
||||
|Parameter | meaning |
|
||||
| --- | --- |
|
||||
| --ips | Each node's local IP (prefiller nodes should be listed before decoder nodes) |
|
||||
| --npus-per-node | Each node's NPU chips |
|
||||
| --network-card-name | The physical machines' NIC |
|
||||
|--prefill-device-cnt | NPU chips used for prefill |
|
||||
|--decode-device-cnt | NPU chips used for decode |
|
||||
|--local-device-ids |Optional. No need if using all devices on the local node. |
|
||||
|
||||
## Prefiller / Decoder Deployment
|
||||
|
||||
We can run the following scripts to launch a server on the prefiller/decoder node respectively.
|
||||
|
||||
:::::{tab-set}
|
||||
|
||||
::::{tab-item} Prefiller node
|
||||
|
||||
```shell
|
||||
export HCCL_IF_IP=192.0.0.1 # node ip
|
||||
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
||||
export TP_SOCKET_IFNAME="eth0"
|
||||
export HCCL_SOCKET_IFNAME="eth0"
|
||||
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json"
|
||||
export OMP_PROC_BIND=false
|
||||
export OMP_NUM_THREADS=10
|
||||
export VLLM_USE_V1=1
|
||||
|
||||
vllm serve /model/Qwen3-30B-A3B \
|
||||
--host 0.0.0.0 \
|
||||
--port 13700 \
|
||||
--tensor-parallel-size 2 \
|
||||
--no-enable-prefix-caching \
|
||||
--seed 1024 \
|
||||
--served-model-name qwen3-moe \
|
||||
--max-model-len 6144 \
|
||||
--max-num-batched-tokens 6144 \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--enable-expert-parallel \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "LLMDataDistCMgrConnector",
|
||||
"kv_buffer_device": "npu",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_parallel_size": 1,
|
||||
"kv_port": "20001",
|
||||
"engine_id": "0",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||
}' \
|
||||
--additional-config \
|
||||
'{"torchair_graph_config": {"enabled":false, "enable_multistream_shared_expert":false}, "ascend_scheduler_config":{"enabled":true, "enable_chunked_prefill":false}}' \
|
||||
--enforce-eager
|
||||
```
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} Decoder node 1
|
||||
|
||||
```shell
|
||||
export HCCL_IF_IP=192.0.0.2 # node ip
|
||||
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
||||
export TP_SOCKET_IFNAME="eth0"
|
||||
export HCCL_SOCKET_IFNAME="eth0"
|
||||
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json"
|
||||
export OMP_PROC_BIND=false
|
||||
export OMP_NUM_THREADS=10
|
||||
export VLLM_USE_V1=1
|
||||
|
||||
vllm serve /model/Qwen3-30B-A3B \
|
||||
--host 0.0.0.0 \
|
||||
--port 13700 \
|
||||
--no-enable-prefix-caching \
|
||||
--tensor-parallel-size 2 \
|
||||
--seed 1024 \
|
||||
--served-model-name qwen3-moe \
|
||||
--max-model-len 6144 \
|
||||
--max-num-batched-tokens 6144 \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--enable-expert-parallel \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "LLMDataDistCMgrConnector",
|
||||
"kv_buffer_device": "npu",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_parallel_size": 1,
|
||||
"kv_port": "20001",
|
||||
"engine_id": "0",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||
}' \
|
||||
--additional-config \
|
||||
'{"torchair_graph_config": {"enabled":false, "enable_multistream_shared_expert":false}, "ascend_scheduler_config":{"enabled":true, "enable_chunked_prefill":false}}'
|
||||
```
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} Decoder node 2
|
||||
|
||||
```shell
|
||||
export HCCL_IF_IP=192.0.0.3 # node ip
|
||||
export GLOO_SOCKET_IFNAME="eth0" # network card name
|
||||
export TP_SOCKET_IFNAME="eth0"
|
||||
export HCCL_SOCKET_IFNAME="eth0"
|
||||
export DISAGGREGATED_PREFILL_RANK_TABLE_PATH="/path/to/your/generated/ranktable.json"
|
||||
export OMP_PROC_BIND=false
|
||||
export OMP_NUM_THREADS=10
|
||||
export VLLM_USE_V1=1
|
||||
|
||||
vllm serve /model/Qwen3-30B-A3B \
|
||||
--host 0.0.0.0 \
|
||||
--port 13700 \
|
||||
--no-enable-prefix-caching \
|
||||
--tensor-parallel-size 2 \
|
||||
--seed 1024 \
|
||||
--served-model-name qwen3-moe \
|
||||
--max-model-len 6144 \
|
||||
--max-num-batched-tokens 6144 \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--enable-expert-parallel \
|
||||
--kv-transfer-config \
|
||||
'{"kv_connector": "LLMDataDistCMgrConnector",
|
||||
"kv_buffer_device": "npu",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_parallel_size": 1,
|
||||
"kv_port": "20001",
|
||||
"engine_id": "0",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||
}' \
|
||||
--additional-config \
|
||||
'{"torchair_graph_config": {"enabled":false, "enable_multistream_shared_expert":false}, "ascend_scheduler_config":{"enabled":true, "enable_chunked_prefill":false}}'
|
||||
```
|
||||
|
||||
::::
|
||||
|
||||
:::::
|
||||
|
||||
## Example proxy for Deployment
|
||||
|
||||
Run a proxy server on the same node with prefiller service instance. You can get the proxy program in the repository's examples: [load\_balance\_proxy\_server\_example.py](https://github.com/vllm-project/vllm-ascend/blob/main/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py)
|
||||
|
||||
```shell
|
||||
python load_balance_proxy_server_example.py \
|
||||
--host 192.0.0.1 \
|
||||
--port 8080 \
|
||||
--prefiller-hosts 192.0.0.1 \
|
||||
--prefiller-port 13700 \
|
||||
--decoder-hosts 192.0.0.2 192.0.0.3 \
|
||||
--decoder-ports 13700 13700
|
||||
```
|
||||
|
||||
## Verification
|
||||
|
||||
Check service health using the proxy server endpoint.
|
||||
|
||||
```shell
|
||||
curl http://192.0.0.1:8080/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "qwen3-moe",
|
||||
"prompt": "Who are you?",
|
||||
"max_tokens": 100,
|
||||
"temperature": 0
|
||||
}'
|
||||
```
|
||||
156
docs/source/tutorials/multi_node_qwen3vl.md
Normal file
156
docs/source/tutorials/multi_node_qwen3vl.md
Normal file
@@ -0,0 +1,156 @@
|
||||
# Multi-Node-DP (Qwen3-VL-235B-A22B)
|
||||
|
||||
## Verify Multi-Node Communication Environment
|
||||
|
||||
Refer to [multi_node.md](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_node.html#verification-process).
|
||||
|
||||
## Run with docker
|
||||
Assume you have Atlas 800 A3 (64G*16) nodes (or 2 * A2), and want to deploy the `Qwen3-VL-235B-A22B-Instruct` model across multiple nodes.
|
||||
|
||||
```{code-block} bash
|
||||
:substitutions:
|
||||
# Update the vllm-ascend image
|
||||
export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|
|
||||
docker run --rm \
|
||||
--name vllm-ascend \
|
||||
--net=host \
|
||||
--device /dev/davinci0 \
|
||||
--device /dev/davinci1 \
|
||||
--device /dev/davinci2 \
|
||||
--device /dev/davinci3 \
|
||||
--device /dev/davinci4 \
|
||||
--device /dev/davinci5 \
|
||||
--device /dev/davinci6 \
|
||||
--device /dev/davinci7 \
|
||||
--device /dev/davinci8 \
|
||||
--device /dev/davinci9 \
|
||||
--device /dev/davinci10 \
|
||||
--device /dev/davinci11 \
|
||||
--device /dev/davinci12 \
|
||||
--device /dev/davinci13 \
|
||||
--device /dev/davinci14 \
|
||||
--device /dev/davinci15 \
|
||||
--device /dev/davinci_manager \
|
||||
--device /dev/devmm_svm \
|
||||
--device /dev/hisi_hdc \
|
||||
-v /usr/local/dcmi:/usr/local/dcmi \
|
||||
-v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
|
||||
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
|
||||
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
||||
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
||||
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
||||
-v /root/.cache:/root/.cache \
|
||||
-p 8000:8000 \
|
||||
-it $IMAGE bash
|
||||
```
|
||||
|
||||
Run the following scripts on two nodes respectively
|
||||
|
||||
:::{note}
|
||||
Before launching the inference server, ensure the following environment variables are set for multi-node communication
|
||||
:::
|
||||
|
||||
node0
|
||||
|
||||
```shell
|
||||
#!/bin/sh
|
||||
# this obtained through ifconfig
|
||||
# nic_name is the network interface name corresponding to local_ip
|
||||
nic_name="xxxx"
|
||||
local_ip="xxxx"
|
||||
|
||||
export HCCL_IF_IP=$local_ip
|
||||
export GLOO_SOCKET_IFNAME=$nic_name
|
||||
export TP_SOCKET_IFNAME=$nic_name
|
||||
export HCCL_SOCKET_IFNAME=$nic_name
|
||||
export OMP_PROC_BIND=false
|
||||
export OMP_NUM_THREADS=100
|
||||
export VLLM_USE_V1=1
|
||||
export HCCL_BUFFSIZE=1024
|
||||
|
||||
vllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \
|
||||
--host 0.0.0.0 \
|
||||
--port 8000 \
|
||||
--data-parallel-size 2 \
|
||||
--api-server-count 2 \
|
||||
--data-parallel-size-local 1 \
|
||||
--data-parallel-address $local_ip \
|
||||
--data-parallel-rpc-port 13389 \
|
||||
--seed 1024 \
|
||||
--served-model-name qwen3vl \
|
||||
--tensor-parallel-size 8 \
|
||||
--enable-expert-parallel \
|
||||
--max-num-seqs 16 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-batched-tokens 4096 \
|
||||
--trust-remote-code \
|
||||
--no-enable-prefix-caching \
|
||||
--gpu-memory-utilization 0.8 \
|
||||
```
|
||||
|
||||
node1
|
||||
|
||||
```shell
|
||||
#!/bin/sh
|
||||
|
||||
nic_name="xxxx"
|
||||
local_ip="xxxx"
|
||||
node0_ip="xxxx"
|
||||
|
||||
export HCCL_IF_IP=$local_ip
|
||||
export GLOO_SOCKET_IFNAME=$nic_name
|
||||
export TP_SOCKET_IFNAME=$nic_name
|
||||
export HCCL_SOCKET_IFNAME=$nic_name
|
||||
export OMP_PROC_BIND=false
|
||||
export OMP_NUM_THREADS=100
|
||||
export VLLM_USE_V1=1
|
||||
export HCCL_BUFFSIZE=1024
|
||||
|
||||
vllm serve Qwen/Qwen3-VL-235B-A22B-Instruct \
|
||||
--host 0.0.0.0 \
|
||||
--port 8000 \
|
||||
--headless \
|
||||
--data-parallel-size 2 \
|
||||
--data-parallel-size-local 1 \
|
||||
--data-parallel-start-rank 1 \
|
||||
--data-parallel-address $node0_ip \
|
||||
--data-parallel-rpc-port 13389 \
|
||||
--seed 1024 \
|
||||
--tensor-parallel-size 8 \
|
||||
--served-model-name qwen3vl \
|
||||
--max-num-seqs 16 \
|
||||
--max-model-len 32768 \
|
||||
--max-num-batched-tokens 4096 \
|
||||
--enable-expert-parallel \
|
||||
--trust-remote-code \
|
||||
--no-enable-prefix-caching \
|
||||
--gpu-memory-utilization 0.8 \
|
||||
```
|
||||
|
||||
If the service starts successfully, the following information will be displayed on node0:
|
||||
|
||||
```shell
|
||||
INFO: Started server process [44610]
|
||||
INFO: Waiting for application startup.
|
||||
INFO: Application startup complete.
|
||||
INFO: Started server process [44611]
|
||||
INFO: Waiting for application startup.
|
||||
INFO: Application startup complete.
|
||||
```
|
||||
|
||||
Once your server is started, you can query the model with input prompts:
|
||||
|
||||
```shell
|
||||
curl http://localhost:8000/v1/chat/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "qwen3vl",
|
||||
"messages": [
|
||||
{"role": "system", "content": "You are a helpful assistant."},
|
||||
{"role": "user", "content": [
|
||||
{"type": "image_url", "image_url": {"url": "https://modelscope.oss-cn-beijing.aliyuncs.com/resource/qwen.png"}},
|
||||
{"type": "text", "text": "What is the text in the illustrate?"}
|
||||
]}
|
||||
]
|
||||
}'
|
||||
```
|
||||
182
docs/source/tutorials/multi_node_ray.md
Normal file
182
docs/source/tutorials/multi_node_ray.md
Normal file
@@ -0,0 +1,182 @@
|
||||
# Multi-Node-Ray (Qwen/Qwen3-235B-A22B)
|
||||
|
||||
Multi-node inference is suitable for scenarios where the model cannot be deployed on a single machine. In such cases, the model can be distributed using tensor parallelism or pipeline parallelism. The specific parallelism strategies will be covered in the following sections. To successfully deploy multi-node inference, the following three steps need to be completed:
|
||||
|
||||
* **Verify Multi-Node Communication Environment**
|
||||
* **Set Up and Start the Ray Cluster**
|
||||
* **Start the Online Inference Service on multinode**
|
||||
|
||||
## Verify Multi-Node Communication Environment
|
||||
|
||||
### Physical Layer Requirements:
|
||||
|
||||
* The physical machines must be located on the same LAN, with network connectivity.
|
||||
* All NPUs are connected with optical modules, and the connection status must be normal.
|
||||
|
||||
### Verification Process:
|
||||
|
||||
Execute the following commands on each node in sequence. The results must all be `success` and the status must be `UP`:
|
||||
|
||||
```bash
|
||||
# Check the remote switch ports
|
||||
for i in {0..7}; do hccn_tool -i $i -lldp -g | grep Ifname; done
|
||||
# Get the link status of the Ethernet ports (UP or DOWN)
|
||||
for i in {0..7}; do hccn_tool -i $i -link -g ; done
|
||||
# Check the network health status
|
||||
for i in {0..7}; do hccn_tool -i $i -net_health -g ; done
|
||||
# View the network detected IP configuration
|
||||
for i in {0..7}; do hccn_tool -i $i -netdetect -g ; done
|
||||
# View gateway configuration
|
||||
for i in {0..7}; do hccn_tool -i $i -gateway -g ; done
|
||||
# View NPU network configuration
|
||||
cat /etc/hccn.conf
|
||||
```
|
||||
|
||||
### NPU Interconnect Verification:
|
||||
#### 1. Get NPU IP Addresses
|
||||
|
||||
```bash
|
||||
for i in {0..7}; do hccn_tool -i $i -ip -g | grep ipaddr; done
|
||||
```
|
||||
|
||||
#### 2. Cross-Node PING Test
|
||||
|
||||
```bash
|
||||
# Execute on the target node (replace with actual IP)
|
||||
hccn_tool -i 0 -ping -g address 10.20.0.20
|
||||
```
|
||||
|
||||
## Set Up and Start the Ray Cluster
|
||||
### Setting Up the Basic Container
|
||||
To ensure a consistent execution environment across all nodes, including the model path and Python environment, it is recommended to use Docker images.
|
||||
|
||||
For setting up a multi-node inference cluster with Ray, **containerized deployment** is the preferred approach. Containers should be started on both the master and worker nodes, with the `--net=host` option to enable proper network connectivity.
|
||||
|
||||
Below is the example container setup command, which should be executed on **all nodes** :
|
||||
|
||||
```{code-block} bash
|
||||
:substitutions:
|
||||
# Update the vllm-ascend image
|
||||
export IMAGE=quay.nju.edu.cn/ascend/vllm-ascend:|vllm_ascend_version|
|
||||
export NAME=vllm-ascend
|
||||
|
||||
# Run the container using the defined variables
|
||||
# Note if you are running bridge network with docker, Please expose available ports for multiple nodes communication in advance
|
||||
docker run --rm \
|
||||
--name $NAME \
|
||||
--net=host \
|
||||
--device /dev/davinci0 \
|
||||
--device /dev/davinci1 \
|
||||
--device /dev/davinci2 \
|
||||
--device /dev/davinci3 \
|
||||
--device /dev/davinci4 \
|
||||
--device /dev/davinci5 \
|
||||
--device /dev/davinci6 \
|
||||
--device /dev/davinci7 \
|
||||
--device /dev/davinci_manager \
|
||||
--device /dev/devmm_svm \
|
||||
--device /dev/hisi_hdc \
|
||||
-v /usr/local/dcmi:/usr/local/dcmi \
|
||||
-v /usr/local/Ascend/driver/tools/hccn_tool:/usr/local/Ascend/driver/tools/hccn_tool \
|
||||
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
|
||||
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
||||
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
||||
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
||||
-v /path/to/shared/cache:/root/.cache \ # IMPORTANT: This must be a shared directory accessible by all nodes
|
||||
-it $IMAGE bash
|
||||
```
|
||||
|
||||
### Start Ray Cluster
|
||||
After setting up the containers and installing vllm-ascend on each node, follow the steps below to start the Ray cluster and execute inference tasks.
|
||||
|
||||
Choose one machine as the head node and the others as worker nodes. Before proceeding, use `ip addr` to check your `nic_name` (network interface name).
|
||||
|
||||
Set the `ASCEND_RT_VISIBLE_DEVICES` environment variable to specify the NPU devices to use. For Ray versions above 2.1, also set the `RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES` variable to avoid device recognition issues.
|
||||
|
||||
Below are the commands for the head and worker nodes:
|
||||
|
||||
**Head node**:
|
||||
|
||||
:::{note}
|
||||
When starting a Ray cluster for multi-node inference, the environment variables on each node must be set **before** starting the Ray cluster for them to take effect.
|
||||
Updating the environment variables requires restarting the Ray cluster.
|
||||
:::
|
||||
|
||||
```shell
|
||||
# Head node
|
||||
export HCCL_IF_IP={local_ip}
|
||||
export GLOO_SOCKET_IFNAME={nic_name}
|
||||
export TP_SOCKET_IFNAME={nic_name}
|
||||
export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
|
||||
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
|
||||
ray start --head
|
||||
```
|
||||
|
||||
**Worker node**:
|
||||
|
||||
:::{note}
|
||||
When starting a Ray cluster for multi-node inference, the environment variables on each node must be set **before** starting the Ray cluster for them to take effect. Updating the environment variables requires restarting the Ray cluster.
|
||||
:::
|
||||
|
||||
```shell
|
||||
# Worker node
|
||||
export HCCL_IF_IP={local_ip}
|
||||
export GLOO_SOCKET_IFNAME={nic_name}
|
||||
export TP_SOCKET_IFNAME={nic_name}
|
||||
export RAY_EXPERIMENTAL_NOSET_ASCEND_RT_VISIBLE_DEVICES=1
|
||||
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
|
||||
ray start --address='{head_node_ip}:6379' --node-ip-address={local_ip}
|
||||
```
|
||||
|
||||
Once the cluster is started on multiple nodes, execute `ray status` and `ray list nodes` to verify the Ray cluster's status. You should see the correct number of nodes and NPUs listed.
|
||||
|
||||
## Start the Online Inference Service on multinode scenario
|
||||
In the container, you can use vLLM as if all NPUs were on a single node. vLLM will utilize NPU resources across all nodes in the Ray cluster.
|
||||
|
||||
**You only need to run the vllm command on one node.**
|
||||
|
||||
To set up parallelism, the common practice is to set the `tensor-parallel-size` to the number of NPUs per node, and the `pipeline-parallel-size` to the number of nodes.
|
||||
|
||||
For example, with 16 NPUs across 2 nodes (8 NPUs per node), set the tensor parallel size to 8 and the pipeline parallel size to 2:
|
||||
|
||||
```shell
|
||||
vllm serve Qwen/Qwen3-235B-A22B \
|
||||
--distributed-executor-backend ray \
|
||||
--pipeline-parallel-size 2 \
|
||||
--tensor-parallel-size 8 \
|
||||
--enable-expert-parallel \
|
||||
--seed 1024 \
|
||||
--max-model-len 8192 \
|
||||
--max-num-seqs 25 \
|
||||
--served-model-name qwen \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9
|
||||
```
|
||||
|
||||
Alternatively, if you want to use only tensor parallelism, set the tensor parallel size to the total number of NPUs in the cluster. For example, with 16 NPUs across 2 nodes, set the tensor parallel size to 16:
|
||||
|
||||
```shell
|
||||
vllm serve Qwen/Qwen3-235B-A22B \
|
||||
--distributed-executor-backend ray \
|
||||
--tensor-parallel-size 16 \
|
||||
--enable-expert-parallel \
|
||||
--seed 1024 \
|
||||
--max-model-len 8192 \
|
||||
--max-num-seqs 25 \
|
||||
--served-model-name qwen \
|
||||
--trust-remote-code \
|
||||
--gpu-memory-utilization 0.9
|
||||
```
|
||||
|
||||
Once your server is started, you can query the model with input prompts:
|
||||
|
||||
```bash
|
||||
curl http://localhost:8000/v1/completions \
|
||||
-H "Content-Type: application/json" \
|
||||
-d '{
|
||||
"model": "qwen",
|
||||
"prompt": "tell me how to sleep well",
|
||||
"max_tokens": 100,
|
||||
"temperature": 0
|
||||
}'
|
||||
```
|
||||
156
docs/source/tutorials/multi_npu_qwen3_next.md
Normal file
156
docs/source/tutorials/multi_npu_qwen3_next.md
Normal file
@@ -0,0 +1,156 @@
|
||||
# Multi-NPU (Qwen3-Next)
|
||||
|
||||
```{note}
|
||||
The Qwen3 Next models use [Triton Ascend](https://gitee.com/ascend/triton-ascend), which is currently experimental. In future versions, there may be behavioral changes related to stability, accuracy and performance improvements.
|
||||
```
|
||||
|
||||
## Run vllm-ascend on Multi-NPU with Qwen3 Next
|
||||
|
||||
Run docker container:
|
||||
|
||||
```{code-block} bash
|
||||
:substitutions:
|
||||
# Update the vllm-ascend image
|
||||
export IMAGE=quay.io/ascend/vllm-ascend:|vllm_ascend_version|
|
||||
docker run --rm \
|
||||
--name vllm-ascend-qwen3 \
|
||||
--device /dev/davinci0 \
|
||||
--device /dev/davinci1 \
|
||||
--device /dev/davinci2 \
|
||||
--device /dev/davinci3 \
|
||||
--device /dev/davinci_manager \
|
||||
--device /dev/devmm_svm \
|
||||
--device /dev/hisi_hdc \
|
||||
-v /usr/local/dcmi:/usr/local/dcmi \
|
||||
-v /usr/local/bin/npu-smi:/usr/local/bin/npu-smi \
|
||||
-v /usr/local/Ascend/driver/lib64/:/usr/local/Ascend/driver/lib64/ \
|
||||
-v /usr/local/Ascend/driver/version.info:/usr/local/Ascend/driver/version.info \
|
||||
-v /etc/ascend_install.info:/etc/ascend_install.info \
|
||||
-v /root/.cache:/root/.cache \
|
||||
-p 8000:8000 \
|
||||
-it $IMAGE bash
|
||||
```
|
||||
|
||||
Setup environment variables:
|
||||
|
||||
```bash
|
||||
# Load model from ModelScope to speed up download
|
||||
export VLLM_USE_MODELSCOPE=True
|
||||
```
|
||||
|
||||
### Install Triton Ascend
|
||||
|
||||
:::::{tab-set}
|
||||
::::{tab-item} Linux (aarch64)
|
||||
|
||||
The [Triton Ascend](https://gitee.com/ascend/triton-ascend) is required when you run Qwen3 Next, please follow the instructions below to install it and its dependency.
|
||||
|
||||
Install the Ascend BiSheng toolkit:
|
||||
|
||||
```bash
|
||||
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/Ascend-BiSheng-toolkit_aarch64.run
|
||||
chmod a+x Ascend-BiSheng-toolkit_aarch64.run
|
||||
./Ascend-BiSheng-toolkit_aarch64.run --install
|
||||
source /usr/local/Ascend/8.3.RC1/bisheng_toolkit/set_env.sh
|
||||
```
|
||||
|
||||
Install Triton Ascend:
|
||||
|
||||
```bash
|
||||
wget https://vllm-ascend.obs.cn-north-4.myhuaweicloud.com/vllm-ascend/triton_ascend-3.2.0.dev20250914-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl
|
||||
pip install triton_ascend-3.2.0.dev20250914-cp311-cp311-manylinux_2_27_aarch64.manylinux_2_28_aarch64.whl
|
||||
```
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} Linux (x86_64)
|
||||
|
||||
Coming soon ...
|
||||
|
||||
::::
|
||||
:::::
|
||||
|
||||
### Inference on Multi-NPU
|
||||
|
||||
Please make sure you already executed the command:
|
||||
|
||||
```bash
|
||||
source /usr/local/Ascend/8.3.RC1/bisheng_toolkit/set_env.sh
|
||||
```
|
||||
|
||||
:::::{tab-set}
|
||||
::::{tab-item} Online Inference
|
||||
|
||||
Run the following script to start the vLLM server on Multi-NPU:
|
||||
|
||||
For an Atlas A2 with 64GB of NPU card memory, tensor-parallel-size should be at least 4, and for 32GB of memory, tensor-parallel-size should be at least 8.
|
||||
|
||||
```bash
|
||||
vllm serve Qwen/Qwen3-Next-80B-A3B-Instruct --tensor-parallel-size 4 --max-model-len 4096 --gpu-memory-utilization 0.7 --enforce-eager
|
||||
```
|
||||
|
||||
Once your server is started, you can query the model with input prompts:
|
||||
|
||||
```bash
|
||||
curl http://localhost:8000/v1/chat/completions -H "Content-Type: application/json" -d '{
|
||||
"model": "Qwen/Qwen3-Next-80B-A3B-Instruct",
|
||||
"messages": [
|
||||
{"role": "user", "content": "Who are you?"}
|
||||
],
|
||||
"temperature": 0.6,
|
||||
"top_p": 0.95,
|
||||
"top_k": 20,
|
||||
"max_tokens": 32
|
||||
}'
|
||||
```
|
||||
|
||||
::::
|
||||
|
||||
::::{tab-item} Offline Inference
|
||||
|
||||
Run the following script to execute offline inference on multi-NPU:
|
||||
|
||||
```python
|
||||
import gc
|
||||
import torch
|
||||
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.distributed.parallel_state import (destroy_distributed_environment,
|
||||
destroy_model_parallel)
|
||||
|
||||
def clean_up():
|
||||
destroy_model_parallel()
|
||||
destroy_distributed_environment()
|
||||
gc.collect()
|
||||
torch.npu.empty_cache()
|
||||
|
||||
if __name__ == '__main__':
|
||||
prompts = [
|
||||
"Who are you?",
|
||||
]
|
||||
sampling_params = SamplingParams(temperature=0.6, top_p=0.95, top_k=40, max_tokens=32)
|
||||
llm = LLM(model="Qwen/Qwen3-Next-80B-A3B-Instruct",
|
||||
tensor_parallel_size=4,
|
||||
enforce_eager=True,
|
||||
distributed_executor_backend="mp",
|
||||
gpu_memory_utilization=0.7,
|
||||
max_model_len=4096)
|
||||
|
||||
outputs = llm.generate(prompts, sampling_params)
|
||||
for output in outputs:
|
||||
prompt = output.prompt
|
||||
generated_text = output.outputs[0].text
|
||||
print(f"Prompt: {prompt!r}, Generated text: {generated_text!r}")
|
||||
|
||||
del llm
|
||||
clean_up()
|
||||
```
|
||||
|
||||
If you run this script successfully, you can see the info shown below:
|
||||
|
||||
```bash
|
||||
Prompt: 'Who are you?', Generated text: ' What do you know about me?\n\nHello! I am Qwen, a large-scale language model independently developed by the Tongyi Lab under Alibaba Group. I am'
|
||||
```
|
||||
|
||||
::::
|
||||
:::::
|
||||
@@ -30,11 +30,18 @@ The following table lists the additional configuration options available in vLLM
|
||||
| `ascend_scheduler_config` | dict | `{}` | The config options for ascend scheduler |
|
||||
| `refresh` | bool | `false` | Whether to refresh global ascend config content. This value is usually used by rlhf or ut/e2e test case. |
|
||||
| `expert_map_path` | str | `None` | When using expert load balancing for the MOE model, an expert map path needs to be passed in. |
|
||||
| `chunked_prefill_for_mla` | bool | `False` | Whether to enable the fused operator-like chunked_prefill. |
|
||||
| `enable_prefetch` | bool | `False` | Whether to enable weight prefetch. |
|
||||
| `kv_cache_dtype` | str | `None` | When using the kv cache quantization method, kv cache dtype needs to be set, currently only int8 is supported. |
|
||||
| `enable_shared_expert_dp` | bool | `False` | When the shared expert in DP, it has better performance but consumes more memory. Currently only DeepSeek series models are supported to use. |
|
||||
| `lmhead_tensor_parallel_size` | int | `None` | The custom tensor parallel size of lmhead. |
|
||||
| `oproj_tensor_parallel_size` | int | `None` | The custom tensor parallel size of oproj. |
|
||||
| `multistream_overlap_shared_expert`| bool | `False` | Whether to enable multistream shared expert. This option only takes effects on moe models with shared experts. |
|
||||
| `dynamic_eplb` | bool | `False` | Whether to enable dynamic eplb |
|
||||
|`num_iterations_eplb_update`| int | `400` | Forward iterations when eplb would begin |
|
||||
|`gate_eplb`| bool | `False` | Whether to enable eplb only once. |
|
||||
|`num_wait_worker_iterations`| int | `30` | The forward iterations when eplb worker will finish cpu task. In our test default value 30 would cover most cases. |
|
||||
|`expert_map_record_path`| str | `None` | When dynamic eplb is completed, save the current expert load heatmap to the specified path. |
|
||||
|`init_redundancy_expert`| int | `0` |Specify redundant experts during initialization.|
|
||||
|
||||
The details of each config option are as follows:
|
||||
|
||||
@@ -45,8 +52,8 @@ The details of each config option are as follows:
|
||||
| `enabled` | bool | `False` | Whether to enable torchair graph mode. Currently only DeepSeek series models and PanguProMoE are supported to use torchair graph mode |
|
||||
| `mode` | str | `None` | When using reduce-overhead mode for torchair, mode needs to be set |
|
||||
| `enable_multistream_mla`| bool | `False` | Whether to put vector ops of MLA to another stream. This option only takes effects on models using MLA (e.g., DeepSeek). |
|
||||
| `enable_multistream_moe`| bool | `False` | Whether to enable multistream shared expert. This option only takes effects on DeepSeek moe models. |
|
||||
| `enable_view_optimize` | bool | `True` | Whether to enable torchair view optimization |
|
||||
| `enable_frozen_parameter` | bool | `True` | Whether to fix the memory address of weights during inference to reduce the input address refresh time during graph execution. |
|
||||
| `use_cached_graph` | bool | `False` | Whether to use cached graph |
|
||||
| `graph_batch_sizes` | list[int] | `[]` | The batch size for torchair graph cache |
|
||||
| `graph_batch_sizes_init` | bool | `False` | Init graph batch size dynamically if `graph_batch_sizes` is empty |
|
||||
@@ -57,6 +64,10 @@ The details of each config option are as follows:
|
||||
| Name | Type | Default | Description |
|
||||
| ---- | ---- | ------- | ----------- |
|
||||
| `enabled` | bool | `False` | Whether to enable ascend scheduler for V1 engine|
|
||||
| `enable_pd_transfer` | bool | `False` | Whether to enable pd transfer. When using it, decode is started only when prefill of all requests is done. This option only takes effects on offline inference. |
|
||||
| `decode_max_num_seqs` | int | `0` | Whether to change max_num_seqs of decode phase when enable pd transfer. This option only takes effects when enable_pd_transfer is True. |
|
||||
| `max_long_partial_prefills` | Union[int, float] | `float('inf')` | the maximum number of prompts longer than long_prefill_token_threshold that will be prefilled concurrently. |
|
||||
| `long_prefill_token_threshold` | Union[int, float] | `float('inf')` | a request is considered long if the prompt is longer than this number of tokens. |
|
||||
|
||||
ascend_scheduler_config also support the options from [vllm scheduler config](https://docs.vllm.ai/en/stable/api/vllm/config.html#vllm.config.SchedulerConfig). For example, you can add `enable_chunked_prefill: True` to ascend_scheduler_config as well.
|
||||
|
||||
@@ -71,13 +82,15 @@ An example of additional configuration is as follows:
|
||||
"use_cached_graph": True,
|
||||
"graph_batch_sizes": [1, 2, 4, 8],
|
||||
"graph_batch_sizes_init": False,
|
||||
"enable_multistream_moe": False,
|
||||
"enable_kv_nz": False
|
||||
},
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
"enable_chunked_prefill": True,
|
||||
"max_long_partial_prefills": 1,
|
||||
"long_prefill_token_threshold": 4096,
|
||||
},
|
||||
"multistream_overlap_shared_expert": True,
|
||||
"refresh": False,
|
||||
}
|
||||
```
|
||||
|
||||
94
docs/source/user_guide/feature_guide/eplb_swift_balancer.md
Normal file
94
docs/source/user_guide/feature_guide/eplb_swift_balancer.md
Normal file
@@ -0,0 +1,94 @@
|
||||
# Expert Load Balance (EPLB)
|
||||
|
||||
## Overview
|
||||
|
||||
Expert balancing for MoE models in LLM serving is essential for optimal performance. Dynamically changing experts during inference can negatively impact TTFT (Time To First Token) and TPOT (Tokens Per Output Token) due to stop-the-world operations. SwiftBalancer enables asynchronous expert load balancing with zero-overhead expert movement, ensuring seamless service continuity.
|
||||
|
||||
## EPLB Effects
|
||||
|
||||
- Reduced Latency: Dynamically balances expert loads to minimize TTFT and TPOT by distributing workloads evenly across experts.
|
||||
- Enhanced Throughput: Optimizes GPU utilization, increasing token generation speed under high-concurrency scenarios.
|
||||
- Zero-Overhead Movement: Expert redistribution occurs asynchronously without interrupting ongoing inference requests.
|
||||
- Adaptive Scaling: Automatically adjusts to workload fluctuations while maintaining stable performance.
|
||||
- Fault Tolerance: Redundant expert placement ensures system resilience during hardware failures.
|
||||
|
||||
## How to Use EPLB
|
||||
|
||||
### Dynamic EPLB
|
||||
|
||||
Enable dynamic balancing with auto-tuned parameters. Adjust num_iterations_eplb_update and num_wait_worker_iterations based on workload patterns.
|
||||
|
||||
```shell
|
||||
vllm serve Qwen/Qwen3-235B-A22B \
|
||||
--tensor-parallel-size 16 \
|
||||
--enable-expert-parallel \
|
||||
--additional-config '{
|
||||
"dynamic_eplb": true,
|
||||
"num_iterations_eplb_update": 400,
|
||||
"gate_eplb": true,
|
||||
"num_wait_worker_iterations": 30
|
||||
}'
|
||||
```
|
||||
|
||||
### Static EPLB
|
||||
#### Initial Setup (Record Expert Map)
|
||||
|
||||
Generate the initial expert distribution map using expert_map_record_path. This creates a baseline configuration for future deployments.
|
||||
|
||||
```shell
|
||||
vllm serve Qwen/Qwen3-235B-A22B \
|
||||
--tensor-parallel-size 16 \
|
||||
--enable-expert-parallel \
|
||||
--additional-config '{
|
||||
"expert_map_record_path": "/path/to/eplb.json",
|
||||
"init_redundancy_expert": 16,
|
||||
"dynamic_eplb": true,
|
||||
"num_iterations_eplb_update": 400,
|
||||
"gate_eplb": true,
|
||||
"num_wait_worker_iterations": 30
|
||||
}'
|
||||
```
|
||||
|
||||
#### Subsequent Deployments (Use Recorded Map)
|
||||
Load the pre-recorded expert map for consistent performance. This avoids recalculating distributions at runtime.
|
||||
|
||||
```shell
|
||||
vllm serve Qwen/Qwen3-235B-A22B \
|
||||
--tensor-parallel-size 16 \
|
||||
--enable-expert-parallel \
|
||||
--additional-config '{
|
||||
"expert_map_path": "/path/to/eplb.json"
|
||||
}'
|
||||
```
|
||||
|
||||
## Critical Considerations
|
||||
1. Parameter Tuning:
|
||||
- num_iterations_eplb_update: Higher values (e.g., 400+) for stable workloads; lower values (e.g., 100-200) for fluctuating traffic.
|
||||
- num_wait_worker_iterations: Should be ≥30 to avoid premature balancing during startup.
|
||||
- init_redundancy_expert: Must match tensor-parallel size (e.g., 16 for 16 GPUs) to ensure sufficient redundancy.
|
||||
|
||||
2. Hardware Requirements:
|
||||
- Ensure all GPUs have identical memory capacity and compute capabilities.
|
||||
- Network bandwidth must support expert redistribution traffic (≥10Gbps recommended).
|
||||
|
||||
3. Model Compatibility:
|
||||
- Only MoE models with explicit expert parallelism support (e.g., Qwen3-235B-A22B) are compatible.
|
||||
- Verify model architecture supports dynamic expert routing via --enable-expert-parallel.
|
||||
|
||||
4. Gating Configuration:
|
||||
- When gate_eplb=true, validate that the gating mechanism can handle expert movement without routing errors.
|
||||
- Test with synthetic workloads before production deployment.
|
||||
|
||||
5. Monitoring & Validation:
|
||||
- Track metrics: expert_load_balance_ratio, ttft_p99, tpot_avg, and gpu_utilization.
|
||||
- Use vllm monitor to detect imbalances during runtime.
|
||||
- Always verify expert map JSON structure before loading (validate with jq or similar tools).
|
||||
|
||||
6. Startup Behavior:
|
||||
- Initial requests may experience higher latency during the first balancing cycle (typically 1-2 minutes).
|
||||
- Avoid sudden traffic spikes during warm-up phase.
|
||||
|
||||
7. Common Pitfalls:
|
||||
- Incorrect tensor-parallel-size vs. actual GPU count → causes resource underutilization.
|
||||
- Using expert_map_path without generating the map first → runtime errors.
|
||||
- Setting init_redundancy_expert > available GPUs → system failure.
|
||||
BIN
docs/source/user_guide/feature_guide/images/eplb_img.png
Normal file
BIN
docs/source/user_guide/feature_guide/images/eplb_img.png
Normal file
Binary file not shown.
|
After Width: | Height: | Size: 55 KiB |
@@ -10,4 +10,5 @@ quantization
|
||||
sleep_mode
|
||||
structured_output
|
||||
lora
|
||||
eplb_swift_balancer
|
||||
:::
|
||||
|
||||
@@ -108,18 +108,19 @@ Please convert DeepSeek series models using `br_release_MindStudio_8.1.RC2_TR5_2
|
||||
|
||||
### 3. When converting deepseek series models with modelslim, what should you pay attention to?
|
||||
|
||||
When using the weight generated by modelslim with the `--dynamic` parameter, if torchair graph mode is enabled, please modify the configuration file in the CANN package to prevent incorrect inference results.
|
||||
When the mla portion of the weights used `W8A8_DYNAMIC` quantization, if torchair graph mode is enabled, please modify the configuration file in the CANN package to prevent incorrect inference results.
|
||||
|
||||
The operation steps are as follows:
|
||||
|
||||
1. Search in the CANN package directory used, for example:
|
||||
find /usr/local/Ascend/ -name fusion_config.json
|
||||
|
||||
2. Add `"AddRmsNormDynamicQuantFusionPass":"off",` to the fusion_config.json you find, the location is as follows:
|
||||
2. Add `"AddRmsNormDynamicQuantFusionPass":"off",` and `"MultiAddRmsNormDynamicQuantFusionPass":"off",` to the fusion_config.json you find, the location is as follows:
|
||||
|
||||
```bash
|
||||
{
|
||||
"Switch":{
|
||||
"GraphFusion":{
|
||||
"AddRmsNormDynamicQuantFusionPass":"off",
|
||||
"MultiAddRmsNormDynamicQuantFusionPass":"off",
|
||||
```
|
||||
|
||||
@@ -1,5 +1,70 @@
|
||||
# Release note
|
||||
|
||||
## v0.11.0rc0 - 2025.09.30
|
||||
|
||||
This is the special release candidate of v0.11.0 for vLLM Ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to get started.
|
||||
|
||||
### Highlights
|
||||
|
||||
- DeepSeek V3.2 is supported now. [#3270](https://github.com/vllm-project/vllm-ascend/pull/3270)
|
||||
- Qwen3-vl is supported now. [#3103](https://github.com/vllm-project/vllm-ascend/pull/3103)
|
||||
|
||||
### Core
|
||||
|
||||
- DeepSeek works with aclgraph now. [#2707](https://github.com/vllm-project/vllm-ascend/pull/2707)
|
||||
- MTP works with aclgraph now. [#2932](https://github.com/vllm-project/vllm-ascend/pull/2932)
|
||||
- EPLB is supported now. [#2956](https://github.com/vllm-project/vllm-ascend/pull/2956)
|
||||
- Mooncake store kvcache connector is supported now. [#2913](https://github.com/vllm-project/vllm-ascend/pull/2913)
|
||||
- CPU offload connector is supported now. [#1659](https://github.com/vllm-project/vllm-ascend/pull/1659)
|
||||
|
||||
### Other
|
||||
|
||||
- Qwen3-next is stable now. [#3007](https://github.com/vllm-project/vllm-ascend/pull/3007)
|
||||
- Fixed a lot of bugs introduced in v0.10.2 by Qwen3-next. [#2964](https://github.com/vllm-project/vllm-ascend/pull/2964) [#2781](https://github.com/vllm-project/vllm-ascend/pull/2781) [#3070](https://github.com/vllm-project/vllm-ascend/pull/3070) [#3113](https://github.com/vllm-project/vllm-ascend/pull/3113)
|
||||
- The LoRA feature is back now. [#3044](https://github.com/vllm-project/vllm-ascend/pull/3044)
|
||||
- Eagle3 spec decode method is back now. [#2949](https://github.com/vllm-project/vllm-ascend/pull/2949)
|
||||
|
||||
## v0.10.2rc1 - 2025.09.16
|
||||
|
||||
This is the 1st release candidate of v0.10.2 for vLLM Ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to get started.
|
||||
|
||||
### Highlights
|
||||
|
||||
- Add support for Qwen3 Next. Please note that expert parallel and MTP feature doesn't work with this release. We'll make it work enough soon. Follow the [official guide](https://vllm-ascend.readthedocs.io/en/latest/tutorials/multi_npu_qwen3_next.html) to get started [#2917](https://github.com/vllm-project/vllm-ascend/pull/2917)
|
||||
- Add quantization support for aclgraph [#2841](https://github.com/vllm-project/vllm-ascend/pull/2841)
|
||||
|
||||
### Core
|
||||
|
||||
- Aclgraph now works with Ray backend. [#2589](https://github.com/vllm-project/vllm-ascend/pull/2589)
|
||||
- MTP now works with the token > 1. [#2708](https://github.com/vllm-project/vllm-ascend/pull/2708)
|
||||
- Qwen2.5 VL now works with quantization. [#2778](https://github.com/vllm-project/vllm-ascend/pull/2778)
|
||||
- Improved the performance with async scheduler enabled. [#2783](https://github.com/vllm-project/vllm-ascend/pull/2783)
|
||||
- Fixed the performance regression with non MLA model when use default scheduler. [#2894](https://github.com/vllm-project/vllm-ascend/pull/2894)
|
||||
|
||||
### Other
|
||||
- The performance of w8a8 quantization is improved. [#2275](https://github.com/vllm-project/vllm-ascend/pull/2275)
|
||||
- The performance of moe model is improved. [#2689](https://github.com/vllm-project/vllm-ascend/pull/2689) [#2842](https://github.com/vllm-project/vllm-ascend/pull/2842)
|
||||
- Fixed resources limit error when apply speculative decoding and aclgraph. [#2472](https://github.com/vllm-project/vllm-ascend/pull/2472)
|
||||
- Fixed the git config error in docker images. [#2746](https://github.com/vllm-project/vllm-ascend/pull/2746)
|
||||
- Fixed the sliding windows attention bug with prefill. [#2758](https://github.com/vllm-project/vllm-ascend/pull/2758)
|
||||
- The official doc for Prefill Decode Disaggregation with Qwen3 is added. [#2751](https://github.com/vllm-project/vllm-ascend/pull/2751)
|
||||
- `VLLM_ENABLE_FUSED_EXPERTS_ALLGATHER_EP` env works again. [#2740](https://github.com/vllm-project/vllm-ascend/pull/2740)
|
||||
- A new improvement for oproj in deepseek is added. Set `oproj_tensor_parallel_size` to enable this feature. [#2167](https://github.com/vllm-project/vllm-ascend/pull/2167)
|
||||
- Fix a bug that deepseek with torchair doesn't work as expect when `graph_batch_sizes` is set. [#2760](https://github.com/vllm-project/vllm-ascend/pull/2760)
|
||||
- Avoid duplicate generation of sin_cos_cache in rope when kv_seqlen > 4k. [#2744](https://github.com/vllm-project/vllm-ascend/pull/2744)
|
||||
- The performance of Qwen3 dense model is improved with flashcomm_v1. Set `VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE=1` and `VLLM_ASCEND_ENABLE_FLASHCOMM=1` to enable it. [#2779](https://github.com/vllm-project/vllm-ascend/pull/2779)
|
||||
- The performance of Qwen3 dense model is improved with prefetch feature. Set `VLLM_ASCEND_ENABLE_PREFETCH_MLP=1` to enable it. [#2816](https://github.com/vllm-project/vllm-ascend/pull/2816)
|
||||
- The performance of Qwen3 MoE model is improved with rope ops update. [#2571](https://github.com/vllm-project/vllm-ascend/pull/2571)
|
||||
- Fix the weight load error for RLHF case. [#2756](https://github.com/vllm-project/vllm-ascend/pull/2756)
|
||||
- Add warm_up_atb step to speed up the inference. [#2823](https://github.com/vllm-project/vllm-ascend/pull/2823)
|
||||
- Fixed the aclgraph stream error for moe model. [#2827](https://github.com/vllm-project/vllm-ascend/pull/2827)
|
||||
|
||||
### Known issue
|
||||
- The server will hang when running Prefill Decode Disaggregation with different TP sizes for P and D. It's fixed by [vLLM commit](https://github.com/vllm-project/vllm/pull/23917) which is not included in v0.10.2. You can pick this commit to fix the issue.
|
||||
- The HBM usage of Qwen3 Next is higher than expected. It's a [known issue](https://github.com/vllm-project/vllm-ascend/issues/2884) and we're working on it. You can set `max_model_len` and `gpu_memory_utilization` to suitable values based on your parallel config to avoid OOM errors.
|
||||
- We noticed that LoRA doesn't work with this release due to the refactor of kv cache. We'll fix it soon. [#2941](https://github.com/vllm-project/vllm-ascend/issues/2941)
|
||||
- Please do not enable chunked prefill with prefix cache when running with Ascend scheduler. The performance and accuracy is not good/correct. [#2943](https://github.com/vllm-project/vllm-ascend/issues/2943)
|
||||
|
||||
## v0.10.1rc1 - 2025.09.04
|
||||
|
||||
This is the 1st release candidate of v0.10.1 for vLLM Ascend. Please follow the [official doc](https://vllm-ascend.readthedocs.io/en/) to get started.
|
||||
|
||||
@@ -42,7 +42,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
|
||||
export OMP_PROC_BIND=false
|
||||
export OMP_NUM_THREADS=100
|
||||
export VLLM_USE_V1=1
|
||||
export VLLM_LLMDD_RPC_PORT=5559
|
||||
export VLLM_ASCEND_LLMDD_RPC_PORT=5559
|
||||
|
||||
vllm serve /models/deepseek_r1_w8a8 \
|
||||
--host 0.0.0.0 \
|
||||
@@ -70,9 +70,7 @@ vllm serve /models/deepseek_r1_w8a8 \
|
||||
"kv_port": "20001",
|
||||
"engine_id": "0",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||
}' \
|
||||
--additional-config \
|
||||
'{"chunked_prefill_for_mla":true}'
|
||||
}'
|
||||
```
|
||||
|
||||
Run prefill server P2 on second node:
|
||||
@@ -85,7 +83,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
|
||||
export OMP_PROC_BIND=false
|
||||
export OMP_NUM_THREADS=100
|
||||
export VLLM_USE_V1=1
|
||||
export VLLM_LLMDD_RPC_PORT=5659
|
||||
export VLLM_ASCEND_LLMDD_RPC_PORT=5659
|
||||
|
||||
vllm serve /models/deepseek_r1_w8a8 \
|
||||
--host 0.0.0.0 \
|
||||
@@ -114,9 +112,7 @@ vllm serve /models/deepseek_r1_w8a8 \
|
||||
"kv_port": "20001",
|
||||
"engine_id": "0",
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||
}' \
|
||||
--additional-config \
|
||||
'{"chunked_prefill_for_mla":true}'
|
||||
}'
|
||||
```
|
||||
|
||||
Run decode server d1 on third node:
|
||||
@@ -131,7 +127,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
|
||||
export OMP_PROC_BIND=false
|
||||
export OMP_NUM_THREADS=100
|
||||
export VLLM_USE_V1=1
|
||||
export VLLM_LLMDD_RPC_PORT=5759
|
||||
export VLLM_ASCEND_LLMDD_RPC_PORT=5759
|
||||
|
||||
vllm serve /models/deepseek_r1_w8a8 \
|
||||
--host 0.0.0.0 \
|
||||
@@ -173,7 +169,7 @@ export DISAGGREGATED_PREFILL_RANK_TABLE_PATH=/vllm-workspace/vllm-ascend/example
|
||||
export OMP_PROC_BIND=false
|
||||
export OMP_NUM_THREADS=100
|
||||
export VLLM_USE_V1=1
|
||||
export VLLM_LLMDD_RPC_PORT=5859
|
||||
export VLLM_ASCEND_LLMDD_RPC_PORT=5859
|
||||
|
||||
vllm serve /models/deepseek_r1_w8a8 \
|
||||
--host 0.0.0.0 \
|
||||
|
||||
@@ -17,6 +17,10 @@ parser.add_argument("--decode-device-cnt",
|
||||
type=int,
|
||||
required=True,
|
||||
help="number of decode devices")
|
||||
parser.add_argument("--local-device-ids",
|
||||
type=str,
|
||||
required=False,
|
||||
help="local device ids")
|
||||
args = parser.parse_args()
|
||||
local_host = args.local_host
|
||||
prefill_device_cnt = args.prefill_device_cnt
|
||||
@@ -54,39 +58,49 @@ chips_per_card = get_cmd_stdout("npu-smi info -l | grep \"Chip Count\"").split(
|
||||
"\n")[0].split(":")[1].strip()
|
||||
chips_per_card = int(chips_per_card)
|
||||
|
||||
if args.local_device_ids:
|
||||
local_device_ids = args.local_device_ids.split(',')
|
||||
else:
|
||||
local_device_ids = []
|
||||
for card_id in range(num_cards):
|
||||
for chip_id in range(chips_per_card):
|
||||
device_id = card_id * chips_per_card + chip_id
|
||||
local_device_ids.append(device_id)
|
||||
|
||||
# generate local device list for local rank 0, and gather it to all ranks
|
||||
local_device_list: list[dict[str, str]] = list()
|
||||
if local_rank == "0":
|
||||
super_pod_id = "0"
|
||||
for card_id in range(num_cards):
|
||||
for chip_id in range(chips_per_card):
|
||||
device_id = card_id * chips_per_card + chip_id
|
||||
if soc_info == AscendSocVersion.A3:
|
||||
device_ip = get_cmd_stdout(
|
||||
f"{hccn_tool_path} -i {device_id} -vnic -g | grep ipaddr"
|
||||
).split(":")[1].strip()
|
||||
super_device_id = get_cmd_stdout(
|
||||
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep SDID"
|
||||
).split(":")[1].strip()
|
||||
super_pod_id = get_cmd_stdout(
|
||||
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep \"Super Pod ID\""
|
||||
).split(":")[1].strip()
|
||||
else:
|
||||
device_ip = get_cmd_stdout(
|
||||
f"{hccn_tool_path} -i {device_id} -ip -g | grep ipaddr"
|
||||
).split(":")[1].strip()
|
||||
for idx in range(len(local_device_ids)):
|
||||
device_id = local_device_ids[idx]
|
||||
chip_id = device_id % chips_per_card
|
||||
card_id = device_id // chips_per_card
|
||||
if soc_info == AscendSocVersion.A3:
|
||||
device_ip = get_cmd_stdout(
|
||||
f"{hccn_tool_path} -i {device_id} -vnic -g | grep ipaddr"
|
||||
).split(":")[1].strip()
|
||||
super_device_id = get_cmd_stdout(
|
||||
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep SDID"
|
||||
).split(":")[1].strip()
|
||||
super_pod_id = get_cmd_stdout(
|
||||
f"npu-smi info -t spod-info -i {card_id} -c {chip_id} | grep \"Super Pod ID\""
|
||||
).split(":")[1].strip()
|
||||
else:
|
||||
device_ip = get_cmd_stdout(
|
||||
f"{hccn_tool_path} -i {device_id} -ip -g | grep ipaddr"
|
||||
).split(":")[1].strip()
|
||||
|
||||
device_info = {
|
||||
"server_id": local_host,
|
||||
"device_id": str(device_id),
|
||||
"device_ip": str(device_ip),
|
||||
}
|
||||
if soc_info == AscendSocVersion.A3:
|
||||
device_info.update({
|
||||
"super_pod_id": str(super_pod_id),
|
||||
"super_device_id": str(super_device_id)
|
||||
})
|
||||
local_device_list.append(device_info)
|
||||
device_info = {
|
||||
"server_id": local_host,
|
||||
"device_id": str(device_id),
|
||||
"device_ip": str(device_ip),
|
||||
}
|
||||
if soc_info == AscendSocVersion.A3:
|
||||
device_info.update({
|
||||
"super_pod_id": str(super_pod_id),
|
||||
"super_device_id": str(super_device_id)
|
||||
})
|
||||
local_device_list.append(device_info)
|
||||
|
||||
dist.init_process_group(backend=dist.Backend.GLOO)
|
||||
global_device_list = [None] * dist.get_world_size()
|
||||
|
||||
@@ -33,6 +33,11 @@ while [[ $# -gt 0 ]]; do
|
||||
DECODE_DEVICE_CNT="$1"
|
||||
shift
|
||||
;;
|
||||
--local-device-ids)
|
||||
shift
|
||||
LOCAL_DEVICE_IDS="$1"
|
||||
shift
|
||||
;;
|
||||
esac
|
||||
done
|
||||
LOCAL_HOSTS=($(hostname -I))
|
||||
@@ -68,6 +73,10 @@ echo "NNODES": $NNODES
|
||||
echo "NODE_RANK": $NODE_RANK
|
||||
echo "==============="
|
||||
|
||||
if [ -n "$LOCAL_DEVICE_IDS" ]; then
|
||||
OPTIONAL_SECTION=" --local-device-ids $LOCAL_DEVICE_IDS"
|
||||
fi
|
||||
|
||||
if [[ -n "${GEN_RANKTABLE}" || ! -e ${PWD}/ranktable.json ]]; then
|
||||
GLOO_SOCKET_IFNAME=$NETWORK_CARD_NAME torchrun \
|
||||
--nproc_per_node 1 \
|
||||
@@ -75,5 +84,5 @@ if [[ -n "${GEN_RANKTABLE}" || ! -e ${PWD}/ranktable.json ]]; then
|
||||
--node_rank ${NODE_RANK} \
|
||||
--master_addr ${MASTER_ADDR} \
|
||||
--master_port ${MASTER_PORT} \
|
||||
gen_ranktable.py --local-host $LOCAL_HOST --prefill-device-cnt $PREFILL_DEVICE_CNT --decode-device-cnt $DECODE_DEVICE_CNT
|
||||
gen_ranktable.py --local-host $LOCAL_HOST --prefill-device-cnt $PREFILL_DEVICE_CNT --decode-device-cnt $DECODE_DEVICE_CNT $OPTIONAL_SECTION
|
||||
fi
|
||||
|
||||
@@ -363,6 +363,7 @@ async def send_request_to_service(client: httpx.AsyncClient,
|
||||
}
|
||||
req_data["stream"] = False
|
||||
req_data["max_tokens"] = 1
|
||||
req_data["min_tokens"] = 1
|
||||
if "stream_options" in req_data:
|
||||
del req_data["stream_options"]
|
||||
headers = {
|
||||
|
||||
@@ -0,0 +1,272 @@
|
||||
# Mooncake Store Deployment Guide
|
||||
|
||||
## Environmental Dependencies
|
||||
|
||||
* Software:
|
||||
* Python >= 3.9, < 3.12
|
||||
* CANN >= 8.2.rc1
|
||||
* PyTorch >= 2.7.1, torch-npu >= 2.7.1.dev20250724
|
||||
* vLLM:main branch
|
||||
* vLLM-Ascend:main branch
|
||||
* Mooncake:[AscendTransport/Mooncake at pooling-async-memcpy](https://github.com/AscendTransport/Mooncake/tree/pooling-async-memcpy)(Currently available branch code, continuously updated.)
|
||||
Installation and Compilation Guide: https://github.com/AscendTransport/Mooncake/tree/pooling-async-memcpy?tab=readme-ov-file#build-and-use-binaries
|
||||
|
||||
## run mooncake master
|
||||
|
||||
### 1.Configure mooncake.json
|
||||
|
||||
Set the environment variable **MOONCAKE_CONFIG_PATH** to the full path of the mooncake.json file.
|
||||
|
||||
```
|
||||
{
|
||||
"local_hostname": "xx.xx.xx.xx",
|
||||
"metadata_server": "P2PHANDSHAKE",
|
||||
"protocol": "ascend",
|
||||
"device_name": "",
|
||||
"master_server_address": "xx.xx.xx.xx:50088",
|
||||
"global_segment_size": 30000000000
|
||||
}
|
||||
```
|
||||
|
||||
**local_hostname**: Configured as the IP address of the current master node,
|
||||
**metadata_server**: Configured as **P2PHANDSHAKE**,
|
||||
**protocol:** Configured for Ascend to use Mooncake's HCCL communication,
|
||||
**device_name**: ""
|
||||
**master_server_address**: Configured with the IP and port of the master service
|
||||
**global_segment_size**: Expands the kvcache size registered by the PD node to the master
|
||||
|
||||
### 2. Start mooncake_master
|
||||
|
||||
Under the mooncake folder:
|
||||
|
||||
```
|
||||
mooncake_master --port 50088
|
||||
```
|
||||
|
||||
## Pooling and Prefill Decode Disaggregate Scenario
|
||||
|
||||
### 1.Run `prefill` Node and `decode` Node
|
||||
|
||||
MultiConnector is used to simultaneously utilize both P2P and pooling connectors: P2P performs kv_transfer, while pooling creates a larger prefix cache.
|
||||
|
||||
`prefill` Node:
|
||||
|
||||
```
|
||||
bash multi_producer.sh
|
||||
```
|
||||
|
||||
The content of the multi_producer.sh script:
|
||||
|
||||
```
|
||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
||||
export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
|
||||
export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json"
|
||||
export VLLM_USE_V1=1
|
||||
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
|
||||
export ASCEND_TRANSPORT_PRINT=1
|
||||
export ACL_OP_INIT_MODE=1
|
||||
# ASCEND_TRANSPORT_PRINT above toggles Mooncake memory-transfer logging: 1 = enabled, 0 = disabled.
|
||||
export ASCEND_AGGREGATE_ENABLE=1
|
||||
# The upper-level environment variable is the switch for enabling the mooncake aggregation function, where 1 means on and 0 means off.
|
||||
|
||||
python3 -m vllm.entrypoints.openai.api_server \
|
||||
--model /xxxxx/Qwen2.5-7B-Instruct \
|
||||
--port 8100 \
|
||||
--trust-remote-code \
|
||||
--enforce-eager \
|
||||
--no_enable_prefix_caching \
|
||||
--tensor-parallel-size 1 \
|
||||
--data-parallel-size 1 \
|
||||
--max-model-len 10000 \
|
||||
--block-size 128 \
|
||||
--max-num-batched-tokens 4096 \
|
||||
--kv-transfer-config \
|
||||
'{
|
||||
"kv_connector": "MultiConnector",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_connector_extra_config": {
|
||||
"use_layerwise": false,
|
||||
"connectors": [
|
||||
{
|
||||
"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_producer",
|
||||
"kv_port": "20001",
|
||||
"kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 1,
|
||||
"tp_size": 1
|
||||
},
|
||||
"decode": {
|
||||
"dp_size": 1,
|
||||
"tp_size": 1
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"kv_connector": "MooncakeConnectorStoreV1",
|
||||
"kv_role": "kv_producer",
|
||||
"mooncake_rpc_port":"0"
|
||||
}
|
||||
]
|
||||
}
|
||||
}' > p.log 2>&1
|
||||
```
|
||||
|
||||
`decode` Node:
|
||||
|
||||
```
|
||||
bash multi_consumer.sh
|
||||
```
|
||||
|
||||
The content of multi_consumer.sh:
|
||||
|
||||
```
|
||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
||||
export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
|
||||
export MOONCAKE_CONFIG_PATH="/xxxxx/mooncake.json"
|
||||
export VLLM_USE_V1=1
|
||||
export ASCEND_RT_VISIBLE_DEVICES=4,5,6,7
|
||||
export ACL_OP_INIT_MODE=1
|
||||
export ASCEND_TRANSPORT_PRINT=1
|
||||
# ASCEND_TRANSPORT_PRINT above toggles Mooncake memory-transfer logging: 1 = enabled, 0 = disabled.
|
||||
export ASCEND_AGGREGATE_ENABLE=1
|
||||
# The upper-level environment variable is the switch for enabling the mooncake aggregation function, where 1 means on and 0 means off.
|
||||
|
||||
python3 -m vllm.entrypoints.openai.api_server \
|
||||
--model /xxxxx/Qwen2.5-7B-Instruct \
|
||||
--port 8200 \
|
||||
--trust-remote-code \
|
||||
--enforce-eager \
|
||||
--no_enable_prefix_caching \
|
||||
--tensor-parallel-size 1 \
|
||||
--data-parallel-size 1 \
|
||||
--max-model-len 10000 \
|
||||
--block-size 128 \
|
||||
--max-num-batched-tokens 4096 \
|
||||
--kv-transfer-config \
|
||||
'{
|
||||
"kv_connector": "MultiConnector",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_connector_extra_config": {
|
||||
"use_layerwise": false,
|
||||
"connectors": [
|
||||
{
|
||||
"kv_connector": "MooncakeConnectorV1",
|
||||
"kv_role": "kv_consumer",
|
||||
"kv_port": "20002",
|
||||
"kv_connector_extra_config": {
|
||||
"prefill": {
|
||||
"dp_size": 1,
|
||||
"tp_size": 1
|
||||
},
|
||||
"decode": {
|
||||
"dp_size": 1,
|
||||
"tp_size": 1
|
||||
}
|
||||
}
|
||||
},
|
||||
{
|
||||
"kv_connector": "MooncakeConnectorStoreV1",
|
||||
"kv_role": "kv_consumer",
|
||||
"mooncake_rpc_port":"1"
|
||||
}
|
||||
]
|
||||
}
|
||||
}' > d.log 2>&1
|
||||
```
|
||||
|
||||
### 2. Start proxy_server
|
||||
|
||||
```
|
||||
bash proxy.sh
|
||||
```
|
||||
|
||||
proxy.sh content:
|
||||
Change localhost to your actual IP address.
|
||||
|
||||
```
|
||||
python vllm-ascend/examples/disaggregated_prefill_v1/load_balance_proxy_server_example.py \
|
||||
--host localhost\
|
||||
--prefiller-hosts localhost \
|
||||
--prefiller-ports 8100 \
|
||||
--decoder-hosts localhost\
|
||||
--decoder-ports 8200 \
|
||||
```
|
||||
|
||||
### 3. Run Inference
|
||||
|
||||
Configure the localhost, port, and model weight path in the command to your own settings.
|
||||
|
||||
Short question:
|
||||
|
||||
```
|
||||
curl -s http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ "model": "/xxxxx/Qwen2.5-7B-Instruct", "prompt": "Hello. I have a question. The president of the United States is", "max_tokens": 200, "temperature":0.0 }'
|
||||
```
|
||||
|
||||
Long question:
|
||||
|
||||
```
|
||||
curl -s http://localhost:8000/v1/completions -H "Content-Type: application/json" -d '{ "model": "/xxxxx/Qwen2.5-7B-Instruct", "prompt": "Given the accelerating impacts of climate change—including rising sea levels, increasing frequency of extreme weather events, loss of biodiversity, and adverse effects on agriculture and human health—there is an urgent need for a robust, globally coordinated response. However, international efforts are complicated by a range of factors: economic disparities between high-income and low-income countries, differing levels of industrialization, varying access to clean energy technologies, and divergent political systems that influence climate policy implementation. In this context, how can global agreements like the Paris Accord be redesigned or strengthened to not only encourage but effectively enforce emission reduction targets? Furthermore, what mechanisms can be introduced to promote fair and transparent technology transfer, provide adequate financial support for climate adaptation in vulnerable regions, and hold nations accountable without exacerbating existing geopolitical tensions or disproportionately burdening those with historically lower emissions?", "max_tokens": 256, "temperature":0.0 }'
|
||||
```
|
||||
|
||||
## Pooling and Mixed Deployment Scenario
|
||||
|
||||
### 1. Run the Mixed Deployment Script
|
||||
|
||||
The mixed script is essentially a pure pooling scenario for the P node.
|
||||
|
||||
```
|
||||
bash mixed_department.sh
|
||||
```
|
||||
|
||||
Content of mixed_department.sh:
|
||||
|
||||
```
|
||||
export LD_LIBRARY_PATH=/usr/local/Ascend/ascend-toolkit/latest/python/site-packages:$LD_LIBRARY_PATH
|
||||
export PYTHONPATH=$PYTHONPATH:/xxxxx/vllm
|
||||
export MOONCAKE_CONFIG_PATH="/xxxxxx/mooncake.json"
|
||||
export VLLM_USE_V1=1
|
||||
export ASCEND_RT_VISIBLE_DEVICES=0,1,2,3
|
||||
export ACL_OP_INIT_MODE=1
|
||||
export ASCEND_TRANSPORT_PRINT=1
|
||||
# ASCEND_TRANSPORT_PRINT above toggles Mooncake memory-transfer logging: 1 = enabled, 0 = disabled.
|
||||
export ASCEND_AGGREGATE_ENABLE=1
|
||||
# The upper-level environment variable is the switch for enabling the mooncake aggregation function, where 1 means on and 0 means off.
|
||||
|
||||
python3 -m vllm.entrypoints.openai.api_server \
|
||||
--model /xxxxx/Qwen2.5-7B-Instruct \
|
||||
--port 8100 \
|
||||
--trust-remote-code \
|
||||
--enforce-eager \
|
||||
--no_enable_prefix_caching \
|
||||
--tensor-parallel-size 1 \
|
||||
--data-parallel-size 1 \
|
||||
--max-model-len 10000 \
|
||||
--block-size 128 \
|
||||
--max-num-batched-tokens 4096 \
|
||||
--kv-transfer-config \
|
||||
'{
|
||||
"kv_connector": "MooncakeConnectorStoreV1",
|
||||
"kv_role": "kv_both",
|
||||
"kv_connector_extra_config": {
|
||||
"use_layerwise": false,
|
||||
"mooncake_rpc_port":"0"
|
||||
}
|
||||
}' > mix.log 2>&1
|
||||
```
|
||||
|
||||
### 2. Run Inference
|
||||
|
||||
Configure the localhost, port, and model weight path in the command to your own settings. The requests sent will only go to the port where the mixed deployment script is located, and there is no need to start a separate proxy.
|
||||
|
||||
Short question:
|
||||
|
||||
```
|
||||
curl -s http://localhost:8100/v1/completions -H "Content-Type: application/json" -d '{ "model": "/xxxxx/Qwen2.5-7B-Instruct", "prompt": "Hello. I have a question. The president of the United States is", "max_tokens": 200, "temperature":0.0 }'
|
||||
```
|
||||
|
||||
Long question:
|
||||
|
||||
```
|
||||
curl -s http://localhost:8100/v1/completions -H "Content-Type: application/json" -d '{ "model": "/xxxxx/Qwen2.5-7B-Instruct", "prompt": "Given the accelerating impacts of climate change—including rising sea levels, increasing frequency of extreme weather events, loss of biodiversity, and adverse effects on agriculture and human health—there is an urgent need for a robust, globally coordinated response. However, international efforts are complicated by a range of factors: economic disparities between high-income and low-income countries, differing levels of industrialization, varying access to clean energy technologies, and divergent political systems that influence climate policy implementation. In this context, how can global agreements like the Paris Accord be redesigned or strengthened to not only encourage but effectively enforce emission reduction targets? Furthermore, what mechanisms can be introduced to promote fair and transparent technology transfer, provide adequate financial support for climate adaptation in vulnerable regions, and hold nations accountable without exacerbating existing geopolitical tensions or disproportionately burdening those with historically lower emissions?", "max_tokens": 256, "temperature":0.0 }'
|
||||
```
|
||||
@@ -43,4 +43,4 @@ vllm serve model_path \
|
||||
"kv_connector_module_path": "vllm_ascend.distributed.llmdatadist_c_mgr_connector"
|
||||
}' \
|
||||
--additional-config \
|
||||
'{"ascend_scheduler_config": {"enabled": true}, "torchair_graph_config":{"enabled":true,"enable_kv_nz":false, "enable_multistream_moe":false, "graph_batch_size":[28]}, "enable_weight_nz_layout":true}'
|
||||
'{"ascend_scheduler_config": {"enabled": true}, "torchair_graph_config":{"enabled":true,"enable_kv_nz":false, "graph_batch_size":[28]}, "enable_weight_nz_layout":true, "enable_multistream_moe":false}'
|
||||
@@ -79,7 +79,7 @@ def run_prefill(prefill_done, process_close):
|
||||
|
||||
|
||||
def run_decode(prefill_done):
|
||||
os.environ['VLLM_LLMDD_RPC_PORT'] = '6634'
|
||||
os.environ['VLLM_ASCEND_LLMDD_RPC_PORT'] = '6634'
|
||||
# ranktable.json needs be generated using gen_ranktable.sh
|
||||
# from the examples/disaggregated_prefill_v1 module in the main branch.
|
||||
os.environ['DISAGGREGATED_PREFILL_RANK_TABLE_PATH'] = "./ranktable.json"
|
||||
|
||||
326
examples/offline_weight_load.py
Normal file
326
examples/offline_weight_load.py
Normal file
@@ -0,0 +1,326 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
# Adapted from vllm-project/vllm/examples/offline_inference/data_parallel.py
|
||||
|
||||
# Note: This script is designed to run with e2e test,
|
||||
# please be careful to modify it.
|
||||
"""
|
||||
Usage:
|
||||
Single node:
|
||||
Dense models:
|
||||
python examples/offline_weight_load.py \
|
||||
--model="Qwen/Qwen2.5-0.5B-Instruct" \
|
||||
--tp-size=1 \
|
||||
--proc-per-node=2
|
||||
MOE models:
|
||||
python examples/offline_weight_load.py \
|
||||
--model="Qwen/Qwen3-30B-A3B" \
|
||||
--tp-size=2 \
|
||||
--proc-per-node=2 \
|
||||
--enable-expert-parallel
|
||||
|
||||
Multi-node:
|
||||
Node 0 (assume the node has ip of 10.99.48.128):
|
||||
python examples/offline_weight_load.py \
|
||||
--model="Qwen/Qwen3-30B-A3B" \
|
||||
--tp-size=2 \
|
||||
--node-size=2 \
|
||||
--node-rank=0 \
|
||||
--proc-per-node=2 \
|
||||
--enable-expert-parallel \
|
||||
--master-addr=10.99.48.128 \
|
||||
--master-port=13345
|
||||
Node 1:
|
||||
python examples/offline_weight_load.py \
|
||||
--model="Qwen/Qwen3-30B-A3B" \
|
||||
--tp-size=2 \
|
||||
--node-size=2 \
|
||||
--node-rank=1 \
|
||||
--enable-expert-parallel \
|
||||
--master-addr=10.99.48.128 \
|
||||
--master-port=13345
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import contextlib
|
||||
import gc
|
||||
import os
|
||||
from multiprocessing import Process
|
||||
from time import sleep
|
||||
|
||||
import torch
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.distributed.parallel_state import ( # noqa E402
|
||||
destroy_distributed_environment, destroy_model_parallel, get_tp_group)
|
||||
from vllm.utils import get_open_port, GiB_bytes
|
||||
from safetensors.torch import load_file
|
||||
|
||||
os.environ["VLLM_USE_MODELSCOPE"] = "True"
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
def patch_vllm_moe_model_weight_loader(model):
    """Point fused MoE expert parameters at the experts' own weight loader.

    Unwraps the top-level vLLM model (``model`` or ``language_model``
    attribute), then for every decoder layer's ``mlp`` rebinds
    ``weight_loader`` on parameters whose name contains ``w13_weight`` or
    ``w2_weight`` to ``mlp.experts.weight_loader`` so that merged
    safetensors checkpoints load into the fused expert tensors.

    Raises:
        ValueError: If neither ``model`` nor ``language_model`` is present.
    """
    inner = getattr(model, "model", None) or getattr(model, "language_model", None)
    if inner is None:
        raise ValueError("The provided model does not have a valid 'model' or 'language_model' attribute.")

    for layer in inner.layers:
        mlp = getattr(layer, "mlp")
        for param_name, param in dict(mlp.named_parameters()).items():
            # Only the fused expert weights need the specialized loader.
            if any(key in param_name for key in ("w13_weight", "w2_weight")):
                param.weight_loader = mlp.experts.weight_loader
|
||||
|
||||
def load_and_merge_safetensors(directory):
    """Load every ``*.safetensors`` file in ``directory`` and merge the tensors.

    Files are processed in sorted order so that, when several shards happen to
    define the same key, the winning value is deterministic across runs
    (``os.listdir`` returns entries in arbitrary order).

    Args:
        directory: Path to a directory containing safetensors shards.

    Returns:
        dict: Mapping of tensor name -> tensor, merged across all shards.

    Raises:
        ValueError: If ``directory`` is not an existing directory.
    """
    merged_dict = {}

    if not os.path.isdir(directory):
        raise ValueError(f"directory is not exist : {directory}")

    # Fix: sort for a reproducible merge order when shards share keys.
    for filename in sorted(os.listdir(directory)):
        if filename.endswith('.safetensors'):
            file_path = os.path.join(directory, filename)
            print(f"loading file: {file_path}")

            # load_file comes from safetensors.torch (module-level import).
            f = load_file(file_path)
            merged_dict.update(f)

    return merged_dict
|
||||
|
||||
def parse_args():
    """Build and validate the CLI arguments for the external-launcher demo.

    Returns:
        argparse.Namespace: Parsed options. When ``--enable-sleep-mode`` is
        set, ``--model-weight-gib`` must be a positive value and
        ``--temperature`` must be 0; violations abort via ``parser.error``.
    """
    ap = argparse.ArgumentParser(description="External launcher Inference")
    ap.add_argument("--model",
                    type=str,
                    default="Qwen/Qwen3-0.6B",
                    help="Model name or path")
    ap.add_argument("--tp-size", type=int, default=1,
                    help="Tensor parallel size")
    ap.add_argument("--node-size", type=int, default=1,
                    help="Total number of nodes")
    ap.add_argument("--node-rank", type=int, default=0,
                    help="Rank of the current node")
    ap.add_argument("--proc-per-node", type=int, default=1,
                    help="Number of processes per node")
    ap.add_argument("--master-addr", type=str, default="",
                    help="Master node IP address")
    ap.add_argument("--master-port", type=int, default=0,
                    help="Master node port")
    ap.add_argument("--enforce-eager", action="store_true",
                    help="Enforce eager mode execution.")
    ap.add_argument("--trust-remote-code", action="store_true",
                    help="Trust remote code.")
    ap.add_argument("--enable-expert-parallel", action="store_true",
                    help="Enable expert parallel, used in MOE models.")
    ap.add_argument("--enable-sleep-mode", action="store_true",
                    help="Enable sleep mode for the engine.")
    ap.add_argument("--temperature", type=float, default=0.8,
                    help="Float that controls the randomness of the sampling.")
    ap.add_argument("--model-weight-gib", type=float, default=None,
                    help="Model weight memory usage in GiB (e.g., 1.0 for 0.5B model).")

    parsed = ap.parse_args()
    if parsed.enable_sleep_mode:
        # Sleep mode compares freed memory against the declared weight size,
        # so the size must be known and sampling must be deterministic.
        if parsed.model_weight_gib is None or parsed.temperature != 0:
            ap.error("model-weight-gib must be provided, and temperature must be zero when enable-sleep-mode is set.")
        if parsed.model_weight_gib <= 0:
            ap.error("model-weight-gib must be greater than 0 when enable-sleep-mode is set.")
        if parsed.model == ap.get_default("model") and parsed.model_weight_gib is None:
            ap.error("model-weight-gib must be provided for default model when enable-sleep-mode is set.")

    return parsed
|
||||
|
||||
|
||||
def main(
    local_rank: int,
    rank: int,
    master_addr: str,
    master_port: int,
    model_weight_gib: float,
    model: str = "Qwen/Qwen3-30B-A3B",
    world_size: int = 4,
    tensor_parallel_size: int = 2,
    enable_expert_parallel: bool = False,
    enforce_eager: bool = True,
    trust_remote_code: bool = True,
    enable_sleep_mode: bool = False,
    temperature: float = 0.8,
):
    """Per-rank worker: set up the distributed environment, build the vLLM
    engine, reload weights from merged safetensors, generate, and optionally
    exercise sleep/wake-up with a freed-memory check.

    Launched via ``multiprocessing.Process`` from the ``__main__`` block,
    one process per (local_rank, rank) pair.

    Args:
        local_rank: Rank of this process on its node.
        rank: Global rank across all nodes.
        master_addr: Rendezvous master IP (exported as MASTER_ADDR).
        master_port: Rendezvous master port (exported as MASTER_PORT).
        model_weight_gib: Declared model weight size in GiB; only used for the
            freed-memory assertion when ``enable_sleep_mode`` is set.
        model: Model name or local path; also passed to
            ``load_and_merge_safetensors`` as the shard directory.
        world_size: Total process count (node_size * proc_per_node).
        tensor_parallel_size: vLLM tensor-parallel degree.
        enable_expert_parallel: Enable expert parallelism for MoE models.
        enforce_eager: Disable graph/compiled execution.
        trust_remote_code: Forwarded to vLLM.
        enable_sleep_mode: Run the sleep()/wake_up() validation path.
        temperature: Sampling temperature.
    """
    # torch.distributed reads the rendezvous configuration from these
    # environment variables when external_launcher mode is used.
    os.environ["MASTER_ADDR"] = master_addr
    os.environ["MASTER_PORT"] = str(master_port)
    os.environ["RANK"] = str(rank)
    os.environ["LOCAL_RANK"] = str(local_rank)
    os.environ["WORLD_SIZE"] = str(world_size)
    if not torch.distributed.is_initialized():
        torch.distributed.init_process_group(
            backend="cpu:gloo,npu:hccl",
            world_size=world_size,
            rank=rank,
        )
    # 40 prompts total (4 distinct, repeated) to batch some work.
    prompts = [
        "Hello, my name is",
        "The president of the United States is",
        "The capital of France is",
        "The future of AI is",
    ] * 10
    sampling_params = SamplingParams(
        temperature=temperature,
        top_p=0.95,
        max_tokens=10,
    )
    # external_launcher: this script owns process creation and rendezvous,
    # vLLM joins the already-initialized distributed group.
    llm = LLM(
        model=model,
        tensor_parallel_size=tensor_parallel_size,
        enable_expert_parallel=enable_expert_parallel,
        enforce_eager=enforce_eager,
        trust_remote_code=trust_remote_code,
        distributed_executor_backend="external_launcher",
        seed=0,
        gpu_memory_utilization = 0.95,
        enable_sleep_mode=enable_sleep_mode,
    )
    model_path = model
    # Reach the underlying nn.Module to demonstrate manual weight reloading.
    # NOTE(review): this attribute chain is vLLM-version specific — verify
    # against the pinned vLLM release.
    runmodel = llm.llm_engine.model_executor.driver_worker.worker.model_runner.model
    patch_vllm_moe_model_weight_loader(runmodel)
    sd = load_and_merge_safetensors(model_path)
    runmodel.load_weights(sd.items())
    print('load state dict done')
    tp_ranks = get_tp_group().ranks
    print(f'TP RANKS: {tp_ranks}')

    outputs = llm.generate(prompts, sampling_params)

    if enable_sleep_mode:
        # Measure free device memory around sleep(level=1) on rank 0 only.
        if rank == 0:
            free_bytes_before_sleep, total = torch.npu.mem_get_info()
        llm.sleep(level=1)
        if rank == 0:
            free_bytes_after_sleep, total = torch.npu.mem_get_info()
            freed_bytes = free_bytes_after_sleep - free_bytes_before_sleep
            print(f"Freed memory: {freed_bytes / 1024 ** 3:.2f} GiB")
            # now the freed memory should be larger than the model weights
            # (weights are sharded across TP ranks, hence the division).
            assert freed_bytes >= model_weight_gib / tensor_parallel_size * GiB_bytes

        llm.wake_up()
        outputs_after_wakeup = llm.generate(prompts, sampling_params)
        if rank == 0:
            # Greedy-style determinism check: sleep/wake must not change the
            # first completion (temperature is required to be 0 in this mode).
            assert outputs[0].outputs[0].text == outputs_after_wakeup[0].outputs[0].text
            print("Sleep and wake up successfully!!")

    for i, output in enumerate(outputs):
        if i >= 5:
            # print only 5 outputs
            break
        prompt = output.prompt
        generated_text = output.outputs[0].text
        print(f"Global rank: {rank}, Prompt: {prompt!r}, "
              f"Generated text: {generated_text!r}")

    # Give engines time to pause their processing loops before exiting.
    sleep(5)
    del llm
    cleanup_env_and_memory()
|
||||
|
||||
|
||||
def cleanup_env_and_memory():
    """Tear down vLLM parallel state, the torch.distributed group, and NPU
    memory bookkeeping before the worker process exits.
    """
    destroy_model_parallel()
    destroy_distributed_environment()
    # NOTE(review): AssertionError is suppressed here — presumably
    # destroy_process_group asserts when the default group is already gone;
    # verify against the pinned torch version.
    with contextlib.suppress(AssertionError):
        torch.distributed.destroy_process_group()
    gc.collect()
    torch.npu.empty_cache()
    torch.npu.reset_peak_memory_stats()
|
||||
|
||||
|
||||
if __name__ == "__main__":
    args = parse_args()

    tp_size = args.tp_size
    node_size = args.node_size
    proc_per_node = args.proc_per_node
    node_rank = args.node_rank

    # Single-node runs rendezvous on loopback with a free port; multi-node
    # runs require --master-addr/--master-port from the CLI.
    if node_size == 1:
        master_addr = "127.0.0.1"
        master_port = get_open_port()
    else:
        master_addr = args.master_addr
        master_port = args.master_port

    world_size = node_size * proc_per_node

    # Spawn one worker per local rank; global ranks are contiguous per node.
    procs = []
    for local_rank, rank in enumerate(
            range(proc_per_node * node_rank, proc_per_node * (node_rank + 1))):
        proc = Process(target=main,
                       args=(
                           local_rank,
                           rank,
                           master_addr,
                           master_port,
                           args.model_weight_gib,
                           args.model,
                           world_size,
                           tp_size,
                           args.enable_expert_parallel,
                           args.enforce_eager,
                           args.trust_remote_code,
                           args.enable_sleep_mode,
                           args.temperature,
                       ))

        proc.start()
        procs.append(proc)

    # Join every worker; kill stragglers so CI cannot hang forever.
    # Bug fix: the old message claimed "30 minutes" while the join timeout
    # was 600 s; the message is now derived from the actual timeout.
    JOIN_TIMEOUT_S = 600
    exit_code = 0
    for proc in procs:
        proc.join(timeout=JOIN_TIMEOUT_S)
        if proc.exitcode is None:
            print(f"Killing process {proc.pid} that didn't stop "
                  f"within {JOIN_TIMEOUT_S // 60} minutes.")
            proc.kill()
            exit_code = 1
        elif proc.exitcode:
            # Propagate the first non-zero worker exit code.
            exit_code = proc.exitcode

    exit(exit_code)
|
||||
@@ -29,4 +29,4 @@ vllm serve Qwen/Qwen1.5-MoE-A2.7B \
|
||||
--gpu-memory-utilization 0.9 \
|
||||
--trust-remote-code \
|
||||
--enforce-eager \
|
||||
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":false, "enable_multistream_moe":false, "use_cached_graph":false}}'
|
||||
--additional-config '{"ascend_scheduler_config":{"enabled":true},"torchair_graph_config":{"enabled":false, "use_cached_graph":false}}'
|
||||
|
||||
@@ -5,7 +5,7 @@ openai
|
||||
pytest >= 6.0
|
||||
pytest-asyncio
|
||||
pytest-mock
|
||||
lm-eval==0.4.8
|
||||
lm-eval[api] @ git+https://github.com/EleutherAI/lm-evaluation-harness.git@206b7722158f58c35b7ffcd53b035fdbdda5126d
|
||||
types-jsonschema
|
||||
xgrammar
|
||||
zmq
|
||||
|
||||
@@ -14,7 +14,7 @@ _err() { _red "Error: $*" && exit 1; }
|
||||
|
||||
CURL_TIMEOUT=1
|
||||
CURL_COOLDOWN=5
|
||||
CURL_MAX_TRIES=180
|
||||
CURL_MAX_TRIES=300
|
||||
|
||||
function wait_url_ready() {
|
||||
local serve_name="$1"
|
||||
|
||||
@@ -32,7 +32,14 @@ from transformers import (AutoConfig, AutoModelForCausalLM, AutoTokenizer,
|
||||
BatchEncoding, BatchFeature)
|
||||
from transformers.models.auto.auto_factory import _BaseAutoModelClass
|
||||
from vllm import LLM, SamplingParams
|
||||
from vllm.config import TaskOption, _get_and_verify_dtype
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.10.2"):
|
||||
from vllm.config import TaskOption, _get_and_verify_dtype
|
||||
else:
|
||||
from vllm.config.model import TaskOption, _get_and_verify_dtype
|
||||
|
||||
from vllm.inputs import TextPrompt
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.transformers_utils.utils import maybe_model_redirect
|
||||
|
||||
@@ -57,8 +57,8 @@ function quickstart_online_test() {
|
||||
}
|
||||
|
||||
_info "====> Start simple_test"
|
||||
simple_test
|
||||
time simple_test
|
||||
_info "====> Start quickstart_offline_test"
|
||||
quickstart_offline_test
|
||||
time quickstart_offline_test
|
||||
_info "====> Start quickstart_online_test"
|
||||
quickstart_online_test
|
||||
time quickstart_online_test
|
||||
|
||||
@@ -59,4 +59,4 @@ function install_binary_test() {
|
||||
}
|
||||
|
||||
_info "====> Start install_binary_test"
|
||||
install_binary_test
|
||||
time install_binary_test
|
||||
|
||||
@@ -19,7 +19,12 @@
|
||||
|
||||
from typing import Dict, List, Optional, Sequence, Tuple, Union
|
||||
|
||||
from vllm.sequence import PromptLogprobs, SampleLogprobs
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.10.2"):
|
||||
from vllm.sequence import PromptLogprobs, SampleLogprobs
|
||||
else:
|
||||
from vllm.logprobs import PromptLogprobs, SampleLogprobs
|
||||
|
||||
TokensText = Tuple[List[int], str]
|
||||
|
||||
|
||||
@@ -1,12 +1,16 @@
|
||||
model_name: "deepseek-ai/DeepSeek-V2-Lite"
|
||||
runner: "linux-aarch64-a2-2"
|
||||
hardware: "Atlas A2 Series"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
- name: "exact_match,strict-match"
|
||||
value: 0.375
|
||||
value: 0.385
|
||||
- name: "exact_match,flexible-extract"
|
||||
value: 0.375
|
||||
value: 0.385
|
||||
tensor_parallel_size: 2
|
||||
batch_size: 32
|
||||
gpu_memory_utilization: 0.7
|
||||
apply_chat_template: False
|
||||
fewshot_as_multiturn: False
|
||||
trust_remote_code: True
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
model_name: "Qwen/Qwen2.5-VL-7B-Instruct"
|
||||
runner: "linux-aarch64-a2-1"
|
||||
hardware: "Atlas A2 Series"
|
||||
model: "vllm-vlm"
|
||||
tasks:
|
||||
- name: "mmmu_val"
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
model_name: "Qwen/Qwen3-30B-A3B"
|
||||
runner: "linux-aarch64-a2-2"
|
||||
hardware: "Atlas A2 Series"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
|
||||
@@ -1,4 +1,6 @@
|
||||
model_name: "Qwen/Qwen3-8B-Base"
|
||||
runner: "linux-aarch64-a2-1"
|
||||
hardware: "Atlas A2 Series"
|
||||
tasks:
|
||||
- name: "gsm8k"
|
||||
metrics:
|
||||
|
||||
@@ -1,3 +1,4 @@
|
||||
DeepSeek-V2-Lite.yaml
|
||||
Qwen3-8B-Base.yaml
|
||||
Qwen2.5-VL-7B-Instruct.yaml
|
||||
Qwen3-30B-A3B.yaml
|
||||
@@ -2,16 +2,28 @@
|
||||
|
||||
- **vLLM Version**: vLLM: {{ vllm_version }} ([{{ vllm_commit[:7] }}](https://github.com/vllm-project/vllm/commit/{{ vllm_commit }})), **vLLM Ascend Version**: {{ vllm_ascend_version }} ([{{ vllm_ascend_commit[:7] }}](https://github.com/vllm-project/vllm-ascend/commit/{{ vllm_ascend_commit }}))
|
||||
- **Software Environment**: **CANN**: {{ cann_version }}, **PyTorch**: {{ torch_version }}, **torch-npu**: {{ torch_npu_version }}
|
||||
- **Hardware Environment**: Atlas A2 Series
|
||||
- **Hardware Environment**: {{ hardware }}
|
||||
- **Parallel mode**: {{ parallel_mode }}
|
||||
- **Execution mode**: ACLGraph
|
||||
- **Execution mode**: {{ execution_model }}
|
||||
|
||||
**Command**:
|
||||
|
||||
```bash
|
||||
export MODEL_ARGS={{ model_args }}
|
||||
lm_eval --model {{ model_type }} --model_args $MODEL_ARGS --tasks {{ datasets }} \
|
||||
{% if apply_chat_template %} --apply_chat_template {{ apply_chat_template }} {% endif %} {% if fewshot_as_multiturn %} --fewshot_as_multiturn {{ fewshot_as_multiturn }} {% endif %} {% if num_fewshot is defined and num_fewshot != "N/A" %} --num_fewshot {{ num_fewshot }} {% endif %} {% if limit is defined and limit != "N/A" %} --limit {{ limit }} {% endif %} --batch_size {{ batch_size}}
|
||||
{% if apply_chat_template is defined and (apply_chat_template|string|lower in ["true", "1"]) -%}
|
||||
--apply_chat_template \
|
||||
{%- endif %}
|
||||
{% if fewshot_as_multiturn is defined and (fewshot_as_multiturn|string|lower in ["true", "1"]) -%}
|
||||
--fewshot_as_multiturn \
|
||||
{%- endif %}
|
||||
{% if num_fewshot is defined and num_fewshot != "N/A" -%}
|
||||
--num_fewshot {{ num_fewshot }} \
|
||||
{%- endif %}
|
||||
{% if limit is defined and limit != "N/A" -%}
|
||||
--limit {{ limit }} \
|
||||
{%- endif %}
|
||||
--batch_size {{ batch_size }}
|
||||
```
|
||||
|
||||
| Task | Metric | Value | Stderr |
|
||||
|
||||
@@ -69,6 +69,8 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
|
||||
if model_args.get('enable_expert_parallel', False):
|
||||
parallel_mode += " + EP"
|
||||
|
||||
execution_model = f"{'Eager' if model_args.get('enforce_eager', False) else 'ACLGraph'}"
|
||||
|
||||
report_content = template.render(
|
||||
vllm_version=env_config.vllm_version,
|
||||
vllm_commit=env_config.vllm_commit,
|
||||
@@ -77,6 +79,7 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
|
||||
cann_version=env_config.cann_version,
|
||||
torch_version=env_config.torch_version,
|
||||
torch_npu_version=env_config.torch_npu_version,
|
||||
hardware=eval_config.get("hardware", "unknown"),
|
||||
model_name=eval_config["model_name"],
|
||||
model_args=f"'{','.join(f'{k}={v}' for k, v in model_args.items())}'",
|
||||
model_type=eval_config.get("model", "vllm"),
|
||||
@@ -84,10 +87,11 @@ def generate_report(tp_size, eval_config, report_data, report_dir, env_config):
|
||||
apply_chat_template=eval_config.get("apply_chat_template", True),
|
||||
fewshot_as_multiturn=eval_config.get("fewshot_as_multiturn", True),
|
||||
limit=eval_config.get("limit", "N/A"),
|
||||
batch_size="auto",
|
||||
batch_size=eval_config.get("batch_size", "auto"),
|
||||
num_fewshot=eval_config.get("num_fewshot", "N/A"),
|
||||
rows=report_data["rows"],
|
||||
parallel_mode=parallel_mode)
|
||||
parallel_mode=parallel_mode,
|
||||
execution_model=execution_model)
|
||||
|
||||
report_output = os.path.join(
|
||||
report_dir, f"{os.path.basename(eval_config['model_name'])}.md")
|
||||
@@ -110,7 +114,7 @@ def test_lm_eval_correctness_param(config_filename, tp_size, report_dir,
|
||||
"apply_chat_template": eval_config.get("apply_chat_template", True),
|
||||
"fewshot_as_multiturn": eval_config.get("fewshot_as_multiturn", True),
|
||||
"limit": eval_config.get("limit", None),
|
||||
"batch_size": "auto",
|
||||
"batch_size": eval_config.get("batch_size", "auto"),
|
||||
}
|
||||
for s in ["num_fewshot", "fewshot_as_multiturn", "apply_chat_template"]:
|
||||
val = eval_config.get(s, None)
|
||||
|
||||
@@ -14,14 +14,24 @@ def test_e2e_ep_correctness(model_name):
|
||||
]
|
||||
max_tokens = 5
|
||||
|
||||
with VllmRunner(model_name, tensor_parallel_size=2,
|
||||
enforce_eager=True) as vllm_model:
|
||||
# FIXME: Really strange that chunked prefill might lead to different results, investigate further
|
||||
with VllmRunner(
|
||||
model_name,
|
||||
tensor_parallel_size=2,
|
||||
additional_config={"ascend_scheduler_config": {
|
||||
"enabled": True
|
||||
}},
|
||||
enforce_eager=True) as vllm_model:
|
||||
tp_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
with VllmRunner(model_name,
|
||||
tensor_parallel_size=2,
|
||||
enable_expert_parallel=True,
|
||||
enforce_eager=True) as vllm_model:
|
||||
with VllmRunner(
|
||||
model_name,
|
||||
tensor_parallel_size=2,
|
||||
enable_expert_parallel=True,
|
||||
additional_config={"ascend_scheduler_config": {
|
||||
"enabled": True
|
||||
}},
|
||||
enforce_eager=True) as vllm_model:
|
||||
ep_output = vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
|
||||
@@ -23,6 +23,7 @@ Run `pytest tests/test_offline_inference.py`.
|
||||
import os
|
||||
from unittest.mock import patch
|
||||
|
||||
import pytest
|
||||
from modelscope import snapshot_download # type: ignore
|
||||
from vllm import SamplingParams
|
||||
|
||||
@@ -30,6 +31,15 @@ from tests.e2e.conftest import VllmRunner
|
||||
|
||||
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
|
||||
|
||||
QWEN_DENSE_MODELS = [
|
||||
"vllm-ascend/Qwen3-8B-W8A8", "vllm-ascend/Qwen2.5-0.5B-Instruct-W8A8"
|
||||
]
|
||||
|
||||
DEEPSEEK_W4A8_MODELS = [
|
||||
"vllm-ascend/DeepSeek-V3-W4A8-Pruing",
|
||||
"vllm-ascend/DeepSeek-V3.1-W4A8-puring"
|
||||
]
|
||||
|
||||
|
||||
def test_models_distributed_QwQ():
|
||||
example_prompts = [
|
||||
@@ -61,8 +71,8 @@ def test_models_distributed_DeepSeek_multistream_moe():
|
||||
additional_config={
|
||||
"torchair_graph_config": {
|
||||
"enabled": True,
|
||||
"enable_multistream_moe": True,
|
||||
},
|
||||
"enable_multistream_moe": True,
|
||||
"ascend_scheduler_config": {
|
||||
"enabled": True,
|
||||
},
|
||||
@@ -104,14 +114,15 @@ def test_models_distributed_Qwen3_W4A8DYNAMIC():
|
||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", DEEPSEEK_W4A8_MODELS)
|
||||
@patch.dict(os.environ, {"VLLM_ASCEND_MLA_PA": "1"})
|
||||
def test_models_distributed_DeepSeek_W4A8DYNAMIC():
|
||||
def test_models_distributed_DeepSeek_W4A8DYNAMIC(model):
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
]
|
||||
max_tokens = 5
|
||||
with VllmRunner(
|
||||
snapshot_download("vllm-ascend/DeepSeek-V3-W4A8-Pruing"),
|
||||
snapshot_download(model),
|
||||
dtype="auto",
|
||||
tensor_parallel_size=2,
|
||||
quantization="ascend",
|
||||
@@ -150,3 +161,46 @@ def test_sp_for_qwen3_moe() -> None:
|
||||
enable_expert_parallel=True,
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_model.generate(example_prompts, sampling_params)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("enforce_eager", [True, False])
|
||||
@pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
|
||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1"})
|
||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_FLASHCOMM": "1"})
|
||||
def test_models_distributed_Qwen_Dense_with_flashcomm_v1(model, enforce_eager):
|
||||
example_prompts = [
|
||||
"Hello, my name is",
|
||||
]
|
||||
max_tokens = 5
|
||||
|
||||
with VllmRunner(
|
||||
snapshot_download(model),
|
||||
max_model_len=8192,
|
||||
enforce_eager=enforce_eager,
|
||||
dtype="auto",
|
||||
tensor_parallel_size=2,
|
||||
quantization="ascend",
|
||||
) as vllm_model:
|
||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("enforce_eager", [True, False])
|
||||
@pytest.mark.parametrize("model", QWEN_DENSE_MODELS)
|
||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_DENSE_OPTIMIZE": "1"})
|
||||
@patch.dict(os.environ, {"VLLM_ASCEND_ENABLE_PREFETCH_MLP": "1"})
|
||||
def test_models_distributed_Qwen_Dense_with_prefetch_mlp_weight(
|
||||
model, enforce_eager):
|
||||
example_prompts = [
|
||||
"Hello, my name is",
|
||||
]
|
||||
max_tokens = 5
|
||||
|
||||
with VllmRunner(
|
||||
snapshot_download(model),
|
||||
max_model_len=8192,
|
||||
enforce_eager=enforce_eager,
|
||||
dtype="auto",
|
||||
tensor_parallel_size=2,
|
||||
quantization="ascend",
|
||||
) as vllm_model:
|
||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
@@ -116,20 +116,22 @@ def test_prefix_cache_with_ascend_scheduler(model: str,
|
||||
prefix_cache_output = vllm_model.generate_greedy(
|
||||
INPUT_PROMPTS, max_tokens)
|
||||
|
||||
with VllmRunner(model,
|
||||
additional_config={
|
||||
'ascend_scheduler_config': {
|
||||
'enabled': True,
|
||||
'enable_prefix_caching': True,
|
||||
"enable_chunked_prefill": True,
|
||||
},
|
||||
},
|
||||
enforce_eager=True,
|
||||
max_model_len=2048,
|
||||
tensor_parallel_size=2,
|
||||
gpu_memory_utilization=0.7) as vllm_model:
|
||||
chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
|
||||
INPUT_PROMPTS, max_tokens)
|
||||
# TODO: enable apc and chunked prefill with ascend scheduler will lead accuracy problem.
|
||||
# Disable it now. Fix it or drop the ascend scheduler in the future.
|
||||
# with VllmRunner(model,
|
||||
# additional_config={
|
||||
# 'ascend_scheduler_config': {
|
||||
# 'enabled': True,
|
||||
# 'enable_prefix_caching': True,
|
||||
# "enable_chunked_prefill": True,
|
||||
# },
|
||||
# },
|
||||
# enforce_eager=True,
|
||||
# max_model_len=2048,
|
||||
# tensor_parallel_size=2,
|
||||
# gpu_memory_utilization=0.7) as vllm_model:
|
||||
# chunk_prefill_prefix_cache_output = vllm_model.generate_greedy(
|
||||
# INPUT_PROMPTS, max_tokens)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=vllm_output,
|
||||
@@ -138,9 +140,9 @@ def test_prefix_cache_with_ascend_scheduler(model: str,
|
||||
name_1="prefix_cache_output",
|
||||
)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=chunk_prefill_prefix_cache_output,
|
||||
outputs_1_lst=prefix_cache_output,
|
||||
name_0="chunk_prefill_prefix_cache_output",
|
||||
name_1="prefix_cache_output",
|
||||
)
|
||||
# check_outputs_equal(
|
||||
# outputs_0_lst=chunk_prefill_prefix_cache_output,
|
||||
# outputs_1_lst=prefix_cache_output,
|
||||
# name_0="chunk_prefill_prefix_cache_output",
|
||||
# name_1="prefix_cache_output",
|
||||
# )
|
||||
|
||||
@@ -66,7 +66,6 @@ def test_models_distributed_Qwen3_MOE_W8A8():
|
||||
max_model_len=8192,
|
||||
tensor_parallel_size=2,
|
||||
quantization="ascend",
|
||||
enforce_eager=True,
|
||||
) as vllm_model:
|
||||
vllm_model.generate_greedy(example_prompts, max_tokens)
|
||||
|
||||
|
||||
@@ -22,6 +22,8 @@ Run `pytest tests/multicard/test_torchair_graph_mode.py`.
|
||||
import os
|
||||
from typing import Dict
|
||||
|
||||
import pytest
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
|
||||
@@ -153,6 +155,7 @@ def _pangu_torchair_test_fixture(
|
||||
print(f"Generated text: {vllm_output[i][1]!r}")
|
||||
|
||||
|
||||
@pytest.mark.skip("skipping test_e2e_pangu_with_torchair")
|
||||
def test_e2e_pangu_with_torchair():
|
||||
additional_config = {
|
||||
"torchair_graph_config": {
|
||||
|
||||
188
tests/e2e/multicard/test_weight_loader.py
Normal file
188
tests/e2e/multicard/test_weight_loader.py
Normal file
@@ -0,0 +1,188 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
"""
|
||||
Compare the outputs of vLLM with and without aclgraph.
|
||||
|
||||
Run `pytest tests/multicard/test_external_launcher.py`.
|
||||
"""
|
||||
|
||||
import os
|
||||
import subprocess
|
||||
import sys
|
||||
|
||||
import pytest
|
||||
import torch_npu
|
||||
|
||||
MOE_MODELS = ["Qwen/Qwen3-30B-A3B"]
|
||||
MODELS = ["Qwen/Qwen3-8B"]
|
||||
DEVICE_NAME = torch_npu.npu.get_device_name(0)[:10]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MOE_MODELS)
|
||||
def test_external_launcher_eager(model):
|
||||
script = script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
|
||||
env = os.environ.copy()
|
||||
# TODO: Change to 2 when ci machine has 4 cards
|
||||
cmd = [
|
||||
sys.executable,
|
||||
str(script),
|
||||
"--model",
|
||||
model,
|
||||
"--tp-size",
|
||||
"2",
|
||||
"--proc-per-node",
|
||||
"2",
|
||||
"--trust-remote-code",
|
||||
"--enforce-eager",
|
||||
"--enable-expert-parallel",
|
||||
"--enable-sleep-mode",
|
||||
"--model-weight-gib",
|
||||
"20",
|
||||
]
|
||||
|
||||
print(f"Running subprocess: {' '.join(cmd)}")
|
||||
proc = subprocess.run(
|
||||
cmd,
|
||||
env=env,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
timeout=600,
|
||||
)
|
||||
output = proc.stdout.decode()
|
||||
|
||||
print(output)
|
||||
|
||||
assert "TP RANKS: [0]" in output
|
||||
assert "TP RANKS: [1]" in output
|
||||
assert "Generated text:" in output
|
||||
assert proc.returncode == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MOE_MODELS)
|
||||
def test_external_launcher_aclgraph(model):
|
||||
script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
|
||||
env = os.environ.copy()
|
||||
# TODO: Change to 2 when ci machine has 4 cards
|
||||
cmd = [
|
||||
sys.executable,
|
||||
str(script),
|
||||
"--model",
|
||||
model,
|
||||
"--tp-size",
|
||||
"2",
|
||||
"--proc-per-node",
|
||||
"2",
|
||||
"--trust-remote-code",
|
||||
"--enable-expert-parallel",
|
||||
"--enable-sleep-mode",
|
||||
"--model-weight-gib",
|
||||
"20",
|
||||
]
|
||||
|
||||
print(f"Running subprocess: {' '.join(cmd)}")
|
||||
proc = subprocess.run(
|
||||
cmd,
|
||||
env=env,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
timeout=600,
|
||||
)
|
||||
output = proc.stdout.decode()
|
||||
|
||||
print(output)
|
||||
|
||||
assert "TP RANKS: [0]" in output
|
||||
assert "TP RANKS: [1]" in output
|
||||
assert "Generated text:" in output
|
||||
assert proc.returncode == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
def test_external_launcher_dense(model):
|
||||
script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
|
||||
env = os.environ.copy()
|
||||
# TODO: Change to 2 when ci machine has 4 cards
|
||||
cmd = [
|
||||
sys.executable,
|
||||
str(script),
|
||||
"--model",
|
||||
model,
|
||||
"--tp-size",
|
||||
"2",
|
||||
"--proc-per-node",
|
||||
"2",
|
||||
"--trust-remote-code",
|
||||
"--enable-sleep-mode",
|
||||
"--model-weight-gib",
|
||||
"20",
|
||||
]
|
||||
|
||||
print(f"Running subprocess: {' '.join(cmd)}")
|
||||
proc = subprocess.run(
|
||||
cmd,
|
||||
env=env,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
timeout=600,
|
||||
)
|
||||
output = proc.stdout.decode()
|
||||
|
||||
print(output)
|
||||
|
||||
assert "TP RANKS: [0]" in output
|
||||
assert "TP RANKS: [1]" in output
|
||||
assert "Generated text:" in output
|
||||
assert proc.returncode == 0
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
def test_external_launcher_dense_eager(model):
|
||||
script = "/usr/local/python3.11.13/bin/python3.11/__w/vllm-ascend/tests/examples/test_weight_loader.py"
|
||||
env = os.environ.copy()
|
||||
# TODO: Change to 2 when ci machine has 4 cards
|
||||
cmd = [
|
||||
sys.executable,
|
||||
str(script),
|
||||
"--model",
|
||||
model,
|
||||
"--tp-size",
|
||||
"2",
|
||||
"--proc-per-node",
|
||||
"2",
|
||||
"--trust-remote-code",
|
||||
"--enforce-eager",
|
||||
"--enable-sleep-mode",
|
||||
"--model-weight-gib",
|
||||
"20",
|
||||
]
|
||||
|
||||
print(f"Running subprocess: {' '.join(cmd)}")
|
||||
proc = subprocess.run(
|
||||
cmd,
|
||||
env=env,
|
||||
stdout=subprocess.PIPE,
|
||||
stderr=subprocess.STDOUT,
|
||||
timeout=600,
|
||||
)
|
||||
output = proc.stdout.decode()
|
||||
|
||||
print(output)
|
||||
|
||||
assert "TP RANKS: [0]" in output
|
||||
assert "TP RANKS: [1]" in output
|
||||
assert "Generated text:" in output
|
||||
assert proc.returncode == 0
|
||||
@@ -70,7 +70,7 @@ run_tests_for_model() {
|
||||
# Start prefill instance
|
||||
PREFILL_PORT=8001
|
||||
|
||||
BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=0 VLLM_LLMDD_RPC_PORT=5559 vllm serve $model_name \
|
||||
BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=0 VLLM_ASCEND_LLMDD_RPC_PORT=5559 vllm serve $model_name \
|
||||
--port $PREFILL_PORT \
|
||||
--seed 1024 \
|
||||
--enforce-eager \
|
||||
@@ -90,7 +90,7 @@ run_tests_for_model() {
|
||||
DECODE_PORT=8002
|
||||
|
||||
# Build the command with or without model-specific args
|
||||
BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=1 VLLM_LLMDD_RPC_PORT=6000 vllm serve $model_name \
|
||||
BASE_CMD="ASCEND_RT_VISIBLE_DEVICES=1 VLLM_ASCEND_LLMDD_RPC_PORT=6000 vllm serve $model_name \
|
||||
--port $DECODE_PORT \
|
||||
--seed 1024 \
|
||||
--enforce-eager \
|
||||
|
||||
@@ -22,7 +22,6 @@ set -eo errexit
|
||||
. $(dirname "$0")/common.sh
|
||||
|
||||
export VLLM_USE_MODELSCOPE=true
|
||||
export VLLM_LOGGING_LEVEL=ERROR
|
||||
|
||||
_info "====> Start Quickstart test"
|
||||
. "${SCRIPT_DIR}/doctests/001-quickstart-test.sh"
|
||||
|
||||
@@ -33,8 +33,8 @@ def test_bgmv_expand():
|
||||
y_npu = y.npu()
|
||||
|
||||
y_out = bgmv_expand_cpu_impl(x, w, indices, y, 0, 128)
|
||||
y_out_npu = torch.ops._C.bgmv_expand(x_npu, w_npu, indices_npu, y_npu, 0,
|
||||
128)
|
||||
y_out_npu = torch.ops._C_ascend.bgmv_expand(x_npu, w_npu, indices_npu,
|
||||
y_npu, 0, 128)
|
||||
|
||||
# Compare the results.
|
||||
torch.testing.assert_close(y_out_npu.cpu(),
|
||||
|
||||
@@ -33,7 +33,7 @@ def test_bgmv_shrink():
|
||||
y_npu = y.npu()
|
||||
|
||||
y = bgmv_shrink_cpu_impl(x, w, indices, y, 0.5)
|
||||
torch.ops._C.bgmv_shrink(x_npu, w_npu, indices_npu, y_npu, 0.5)
|
||||
torch.ops._C_ascend.bgmv_shrink(x_npu, w_npu, indices_npu, y_npu, 0.5)
|
||||
|
||||
# Compare the results.
|
||||
torch.testing.assert_close(y_npu.cpu(),
|
||||
|
||||
@@ -28,12 +28,12 @@ import torch
|
||||
import torch_npu
|
||||
from vllm.model_executor.layers.activation import SiluAndMul
|
||||
|
||||
from vllm_ascend.ops.layers.experts_selector import select_experts
|
||||
from vllm_ascend.ops.moe_dispatcher.token_dispatcher import \
|
||||
TokenDispatcherWithAllGather
|
||||
from vllm_ascend.ops.moe.experts_selector import select_experts
|
||||
from vllm_ascend.ops.moe.moe_mlp import unified_apply_mlp
|
||||
from vllm_ascend.ops.moe.token_dispatcher import TokenDispatcherWithAllGather
|
||||
|
||||
NUM_EXPERTS = [8, 64]
|
||||
EP_SIZE = [1, 4]
|
||||
EP_SIZE = [1]
|
||||
TOP_KS = [2, 6]
|
||||
DEVICE = ["npu"]
|
||||
|
||||
@@ -115,19 +115,6 @@ def test_token_dispatcher_with_all_gather(
|
||||
w1_local = w1
|
||||
w2_local = w2
|
||||
|
||||
if ep_size > 1:
|
||||
local_e = e // ep_size
|
||||
e_ids = torch.arange(local_e * 0,
|
||||
local_e * (0 + 1),
|
||||
device=device,
|
||||
dtype=torch.int32)
|
||||
expert_map = torch.full((e, ), -1, device=device, dtype=torch.int32)
|
||||
expert_map[e_ids] = torch.arange(local_e,
|
||||
device=device,
|
||||
dtype=torch.int32)
|
||||
w1_local = w1[e_ids]
|
||||
w2_local = w2[e_ids]
|
||||
|
||||
score = torch.softmax(score, dim=-1, dtype=dtype)
|
||||
topk_weights, topk_ids = torch.topk(score, topk)
|
||||
topk_ids = topk_ids.to(torch.int32)
|
||||
@@ -179,6 +166,87 @@ def test_token_dispatcher_with_all_gather(
|
||||
torch.npu.reset_peak_memory_stats()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("m", [1, 33, 64])
|
||||
@pytest.mark.parametrize("n", [128, 1024, 2048])
|
||||
@pytest.mark.parametrize("k", [128, 511, 1024])
|
||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||
@pytest.mark.parametrize("topk", TOP_KS)
|
||||
@pytest.mark.parametrize("ep_size", EP_SIZE)
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16])
|
||||
@pytest.mark.parametrize("device", DEVICE)
|
||||
def test_token_dispatcher_with_all_gather_quant(
|
||||
m: int,
|
||||
n: int,
|
||||
k: int,
|
||||
e: int,
|
||||
topk: int,
|
||||
ep_size: int,
|
||||
dtype: torch.dtype,
|
||||
device: str,
|
||||
):
|
||||
context_mock = MagicMock()
|
||||
context_mock.fused_moe_state = 0
|
||||
with patch("vllm_ascend.ops.moe.moe_mlp.get_forward_context",
|
||||
return_value=context_mock):
|
||||
a = torch.randn((m, k), device=device, dtype=dtype) / 10
|
||||
w1 = torch.randn((e, k, 2 * n), device=device, dtype=torch.int8)
|
||||
w1_scale = torch.empty((e, 2 * n), device=device, dtype=dtype)
|
||||
w2 = torch.randn((e, n, k), device=device, dtype=torch.int8)
|
||||
w2_scale = torch.empty((e, k), device=device, dtype=dtype)
|
||||
|
||||
score = torch.randn((m, e), device=device, dtype=dtype)
|
||||
expert_map = None
|
||||
local_e = e
|
||||
|
||||
score = torch.softmax(score, dim=-1, dtype=dtype)
|
||||
topk_weights, topk_ids = torch.topk(score, topk)
|
||||
topk_ids = topk_ids.to(torch.int32)
|
||||
row_idx = (torch.arange(
|
||||
0,
|
||||
m * topk,
|
||||
device=device,
|
||||
dtype=torch.int32,
|
||||
).view(topk, -1).permute(1, 0).contiguous())
|
||||
|
||||
dispatcher_kwargs = {
|
||||
"num_experts": e,
|
||||
"top_k": topk,
|
||||
"num_local_experts": local_e,
|
||||
}
|
||||
dispatcher = TokenDispatcherWithAllGather(**dispatcher_kwargs)
|
||||
|
||||
apply_router_weight_on_input = False
|
||||
dispatch_output = dispatcher.token_dispatch(
|
||||
hidden_states=a,
|
||||
topk_weights=topk_weights,
|
||||
topk_ids=topk_ids,
|
||||
row_idx=row_idx,
|
||||
expert_map=expert_map,
|
||||
apply_router_weight_on_input=apply_router_weight_on_input,
|
||||
with_quant=True)
|
||||
|
||||
sorted_hidden_states = dispatch_output["hidden_states"]
|
||||
group_list = dispatch_output["group_list"]
|
||||
group_list_type = dispatch_output.get("group_list_type", 1)
|
||||
dynamic_scale = dispatch_output["dynamic_scale"]
|
||||
|
||||
expert_output = unified_apply_mlp(hidden_states=sorted_hidden_states,
|
||||
w1=w1,
|
||||
w1_scale=w1_scale,
|
||||
w2=w2,
|
||||
w2_scale=w2_scale,
|
||||
group_list=group_list,
|
||||
group_list_type=group_list_type,
|
||||
dynamic_scale=dynamic_scale,
|
||||
with_quant=True)
|
||||
combined_output = dispatcher.token_combine(hidden_states=expert_output,
|
||||
bias=None)
|
||||
assert combined_output.shape == (m, k)
|
||||
gc.collect()
|
||||
torch.npu.empty_cache()
|
||||
torch.npu.reset_peak_memory_stats()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("m", [1, 33, 64])
|
||||
@pytest.mark.parametrize("n", [128, 1024, 2048])
|
||||
@pytest.mark.parametrize("e", NUM_EXPERTS)
|
||||
@@ -222,7 +290,7 @@ def test_select_experts(
|
||||
dtype=torch.int32)
|
||||
custom_routing_function.return_value = (mock_weights, mock_ids)
|
||||
|
||||
with patch("vllm_ascend.ops.layers.experts_selector._native_grouped_topk"
|
||||
with patch("vllm_ascend.ops.moe.experts_selector._native_grouped_topk"
|
||||
) as mock_native_grouped_topk:
|
||||
mock_native_grouped_topk.side_effect = lambda x, num_groups, k: torch.randn_like(
|
||||
x)
|
||||
|
||||
@@ -1,175 +0,0 @@
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
|
||||
import gc
|
||||
from types import SimpleNamespace
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
from vllm.model_executor.layers.fused_moe.config import ( # isort: skip
|
||||
FusedMoEConfig, FusedMoEParallelConfig)
|
||||
|
||||
from vllm_ascend.distributed.moe_comm_method import ( # isort: skip
|
||||
AllGatherCommImpl, NativeAllGatherCommImpl)
|
||||
|
||||
|
||||
@pytest.mark.parametrize("num_tokens", [16, 128])
|
||||
@pytest.mark.parametrize("hidden_size", [64, 128])
|
||||
@pytest.mark.parametrize("global_num_experts", [8, 16])
|
||||
@pytest.mark.parametrize("num_local_experts", [4, 8])
|
||||
@pytest.mark.parametrize("top_k_num", [2, 4])
|
||||
@pytest.mark.parametrize("dtype", [torch.bfloat16, torch.float16])
|
||||
@pytest.mark.parametrize("ep_rank", [0, 1])
|
||||
@pytest.mark.parametrize("apply_a8_quantization", [False])
|
||||
def test_all_gather_comm_impl(
|
||||
num_tokens,
|
||||
hidden_size,
|
||||
global_num_experts,
|
||||
num_local_experts,
|
||||
top_k_num,
|
||||
dtype,
|
||||
ep_rank,
|
||||
apply_a8_quantization,
|
||||
mocker,
|
||||
):
|
||||
"""
|
||||
Tests the AllGatherCommImpl against the NativeAllGatherCommImpl.
|
||||
|
||||
This test compares the outputs of the NPU-optimized AllGatherCommImpl
|
||||
with a native PyTorch implementation (NativeAllGatherCommImpl) to ensure
|
||||
correctness across various configurations.
|
||||
"""
|
||||
if top_k_num > global_num_experts:
|
||||
pytest.skip("top_k_num cannot be greater than global_num_experts")
|
||||
if num_local_experts > global_num_experts:
|
||||
pytest.skip(
|
||||
"num_local_experts cannot be greater than global_num_experts")
|
||||
|
||||
device = torch.device("npu")
|
||||
|
||||
# mock get_tensor_model_parallel_rank to return ep_rank
|
||||
mocker.patch(
|
||||
"vllm.model_executor.layers.fused_moe.config.get_tensor_model_parallel_rank",
|
||||
return_value=ep_rank,
|
||||
)
|
||||
|
||||
# make moe config
|
||||
parallel_config = SimpleNamespace(
|
||||
enable_expert_parallel=num_local_experts < global_num_experts)
|
||||
moe_parallel_config: FusedMoEParallelConfig = FusedMoEParallelConfig.make(
|
||||
tp_size_=max(2, global_num_experts // num_local_experts),
|
||||
dp_size_=1,
|
||||
vllm_parallel_config=parallel_config,
|
||||
)
|
||||
|
||||
moe_config = FusedMoEConfig(
|
||||
num_experts=global_num_experts,
|
||||
experts_per_token=top_k_num,
|
||||
hidden_dim=hidden_size,
|
||||
num_local_experts=num_local_experts,
|
||||
moe_parallel_config=moe_parallel_config,
|
||||
in_dtype=dtype,
|
||||
quant_config=None, # No quantization in this test
|
||||
max_num_tokens=num_tokens,
|
||||
)
|
||||
|
||||
# Instantiate implementations
|
||||
native_impl = NativeAllGatherCommImpl(moe_config)
|
||||
|
||||
all_gather_impl = AllGatherCommImpl(moe_config)
|
||||
|
||||
# --- Input Data ---
|
||||
hidden_states = torch.randn(num_tokens,
|
||||
hidden_size,
|
||||
device=device,
|
||||
dtype=dtype)
|
||||
topk_ids = torch.randint(0,
|
||||
global_num_experts, (num_tokens, top_k_num),
|
||||
device=device,
|
||||
dtype=torch.int32)
|
||||
topk_weights = torch.rand(num_tokens, top_k_num, device=device).to(dtype)
|
||||
topk_weights = torch.nn.functional.softmax(topk_weights, dim=1)
|
||||
|
||||
num_experts = global_num_experts
|
||||
|
||||
expert_map = None
|
||||
if num_local_experts < global_num_experts:
|
||||
# Create a map where some experts are local and some are not
|
||||
expert_map = torch.full((global_num_experts, ), -1, device=device)
|
||||
expert_map[ep_rank * num_local_experts:(ep_rank + 1) *
|
||||
num_local_experts] = torch.arange(num_local_experts,
|
||||
device=device)
|
||||
num_experts = num_local_experts
|
||||
|
||||
# --- Run Native Implementation (Golden Reference) ---
|
||||
native_hidden_states_out = hidden_states.clone()
|
||||
(
|
||||
native_permuted_hidden,
|
||||
native_expert_tokens,
|
||||
_,
|
||||
_,
|
||||
) = native_impl.permute(hidden_states, topk_ids, topk_weights, expert_map,
|
||||
num_experts, apply_a8_quantization)
|
||||
# Simulate MLP output
|
||||
native_mlp_output = torch.randn_like(native_permuted_hidden)
|
||||
native_impl.unpermute(native_mlp_output, native_hidden_states_out)
|
||||
|
||||
# --- Run AllGather Implementation ---
|
||||
all_gather_hidden_states_out = hidden_states.clone()
|
||||
(
|
||||
all_gather_permuted_hidden,
|
||||
all_gather_expert_tokens,
|
||||
_,
|
||||
_,
|
||||
) = all_gather_impl.permute(hidden_states, topk_ids, topk_weights,
|
||||
expert_map, num_experts, apply_a8_quantization)
|
||||
|
||||
# Use the same simulated MLP output for a fair comparison
|
||||
all_gather_mlp_output = native_mlp_output.clone()
|
||||
|
||||
all_gather_impl.unpermute(all_gather_mlp_output,
|
||||
all_gather_hidden_states_out)
|
||||
|
||||
# --- Assertions ---
|
||||
# Define tolerance based on dtype
|
||||
atol = 1e-3 if dtype == torch.float16 else 1e-2
|
||||
rtol = 1e-3 if dtype == torch.float16 else 1e-2
|
||||
|
||||
# 1. Compare expert_tokens from pre_process
|
||||
assert torch.allclose(native_expert_tokens.to(
|
||||
all_gather_expert_tokens.device),
|
||||
all_gather_expert_tokens,
|
||||
atol=atol,
|
||||
rtol=rtol), "Expert tokens do not match."
|
||||
|
||||
# 2. Compare permuted_hidden_states from pre_process
|
||||
num_valid_tokens = native_expert_tokens.sum()
|
||||
assert torch.allclose(native_permuted_hidden[:num_valid_tokens].to(
|
||||
all_gather_permuted_hidden.device),
|
||||
all_gather_permuted_hidden[:num_valid_tokens],
|
||||
atol=atol,
|
||||
rtol=rtol), "Permuted hidden states do not match."
|
||||
|
||||
# 3. Compare final hidden_states from post_process
|
||||
assert torch.allclose(native_hidden_states_out.to(
|
||||
all_gather_hidden_states_out.device),
|
||||
all_gather_hidden_states_out,
|
||||
atol=atol,
|
||||
rtol=rtol), "Final hidden states do not match."
|
||||
gc.collect()
|
||||
torch.npu.empty_cache()
|
||||
torch.npu.reset_peak_memory_stats()
|
||||
@@ -182,7 +182,7 @@ def test_rotary_embedding_quant_with_leading_dim(
|
||||
)
|
||||
|
||||
ref_query, ref_key = rope.forward_native(positions, query, key)
|
||||
query, key = torch.ops._C.rotary_embedding(
|
||||
query, key = torch.ops._C_ascend.rotary_embedding(
|
||||
positions,
|
||||
query,
|
||||
key,
|
||||
@@ -239,7 +239,7 @@ class ModelwithRotaryEmbedding(nn.Module):
|
||||
# we simulated a simple attention layer to test if it can be seamlessly captured into aclgraph
|
||||
qkv = self.qkv_proj(hidden_states)
|
||||
q, k, v = qkv.chunk(3, dim=-1)
|
||||
query, key = torch.ops._C.rotary_embedding(
|
||||
query, key = torch.ops._C_ascend.rotary_embedding(
|
||||
positions,
|
||||
q,
|
||||
k,
|
||||
@@ -299,7 +299,7 @@ def test_capture_rotary_embedding_in_aclgraph(
|
||||
# Validate if the rotary_embedding custom kernel is indeed inside the graph by
|
||||
# string match
|
||||
graph = str(gm.graph)
|
||||
assert "_C.rotary_embedding" in graph
|
||||
assert "_C_ascend.rotary_embedding" in graph
|
||||
return gm
|
||||
|
||||
static_positions = torch.randint(0, max_position_embeddings,
|
||||
|
||||
@@ -72,7 +72,7 @@ def test_get_masked_input_and_mask(
|
||||
|
||||
# Get custom op result
|
||||
print("input_tensor:", input_tensor)
|
||||
custom_masked_input, custom_mask = torch.ops._C.get_masked_input_and_mask(
|
||||
custom_masked_input, custom_mask = torch.ops._C_ascend.get_masked_input_and_mask(
|
||||
input_tensor, test_case["org_start"], test_case["org_end"],
|
||||
test_case["padding"], test_case["added_start"], test_case["added_end"])
|
||||
|
||||
|
||||
@@ -1,14 +1,10 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from vllm import SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sampling_config():
|
||||
@@ -20,9 +16,10 @@ def model_name():
|
||||
return "wemaster/deepseek_mtp_main_random_bf16"
|
||||
|
||||
|
||||
def test_mtp_correctness(
|
||||
def mtp_correctness(
|
||||
sampling_config: SamplingParams,
|
||||
model_name: str,
|
||||
num_speculative_tokens: int,
|
||||
):
|
||||
example_prompts = [
|
||||
"Hello, my name is",
|
||||
@@ -38,7 +35,7 @@ def test_mtp_correctness(
|
||||
tensor_parallel_size=1,
|
||||
gpu_memory_utilization=0.7,
|
||||
max_model_len=256,
|
||||
enforce_eager=True) as ref_llm:
|
||||
enforce_eager=False) as ref_llm:
|
||||
ref_outputs = ref_llm.generate(example_prompts, sampling_config)
|
||||
|
||||
with VllmRunner(
|
||||
@@ -50,9 +47,9 @@ def test_mtp_correctness(
|
||||
enable_expert_parallel=True,
|
||||
speculative_config={
|
||||
"method": "deepseek_mtp",
|
||||
"num_speculative_tokens": 1,
|
||||
"num_speculative_tokens": num_speculative_tokens,
|
||||
},
|
||||
enforce_eager=True,
|
||||
enforce_eager=False,
|
||||
max_model_len=2000,
|
||||
additional_config={"ascend_scheduler_config": {
|
||||
"enabled": False
|
||||
@@ -74,3 +71,18 @@ def test_mtp_correctness(
|
||||
# Heuristic: expect at least 66% of the prompts to match exactly
|
||||
# Upon failure, inspect the outputs to check for inaccuracy.
|
||||
assert matches > int(0.66 * len(ref_outputs))
|
||||
del spec_llm
|
||||
|
||||
|
||||
def test_mtp1_correctness(
|
||||
sampling_config: SamplingParams,
|
||||
model_name: str,
|
||||
):
|
||||
mtp_correctness(sampling_config, model_name, 1)
|
||||
|
||||
|
||||
def test_mtp2_correctness(
|
||||
sampling_config: SamplingParams,
|
||||
model_name: str,
|
||||
):
|
||||
mtp_correctness(sampling_config, model_name, 2)
|
||||
|
||||
@@ -1,14 +1,10 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from vllm import SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
os.environ["VLLM_WORKER_MULTIPROC_METHOD"] = "spawn"
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def sampling_config():
|
||||
|
||||
@@ -99,7 +99,6 @@ def test_ngram_correctness(
|
||||
assert matches > int(0.7 * len(ref_outputs))
|
||||
|
||||
|
||||
@pytest.mark.skipif(True, reason="oom in CI, fix me")
|
||||
@pytest.mark.parametrize("use_eagle3", [False, True], ids=["eagle", "eagle3"])
|
||||
def test_eagle_correctness(
|
||||
test_prompts: list[list[dict[str, Any]]],
|
||||
@@ -111,8 +110,6 @@ def test_eagle_correctness(
|
||||
Compare the outputs of a original LLM and a speculative LLM
|
||||
should be the same when using eagle speculative decoding.
|
||||
'''
|
||||
if not use_eagle3:
|
||||
pytest.skip("Not current support for the test.")
|
||||
|
||||
ref_llm = LLM(model=model_name, max_model_len=2048, enforce_eager=True)
|
||||
ref_outputs = ref_llm.chat(test_prompts, sampling_config)
|
||||
@@ -121,7 +118,6 @@ def test_eagle_correctness(
|
||||
spec_model_name = eagle3_model_name() if use_eagle3 else eagle_model_name()
|
||||
with VllmRunner(
|
||||
model_name,
|
||||
trust_remote_code=True,
|
||||
enable_chunked_prefill=True,
|
||||
max_num_seqs=1,
|
||||
max_num_batched_tokens=2048,
|
||||
|
||||
@@ -1,6 +1,7 @@
|
||||
# SPDX-License-Identifier: Apache-2.0
|
||||
# SPDX-FileCopyrightText: Copyright contributors to the vLLM project
|
||||
import pytest
|
||||
from vllm import SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
from tests.e2e.model_utils import check_outputs_equal
|
||||
@@ -86,3 +87,25 @@ def test_chunked_prefill_with_ascend_scheduler(
|
||||
name_0="vllm_output",
|
||||
name_1="chunked_prefill_output",
|
||||
)
|
||||
|
||||
|
||||
def test_async_scheduling() -> None:
|
||||
prompts = [
|
||||
"Hello, my name is",
|
||||
"The president of the United States is",
|
||||
"The capital of France is",
|
||||
"The future of AI is",
|
||||
] * 10
|
||||
sampling_params = SamplingParams(temperature=0.2,
|
||||
max_tokens=10,
|
||||
stop_token_ids=None)
|
||||
|
||||
with VllmRunner(
|
||||
"Qwen/Qwen2.5-0.5B-Instruct",
|
||||
max_model_len=4096,
|
||||
max_num_seqs=50,
|
||||
dtype="bfloat16",
|
||||
gpu_memory_utilization=0.9,
|
||||
async_scheduling=True,
|
||||
) as vllm_model:
|
||||
vllm_model.generate(prompts, sampling_params=sampling_params)
|
||||
|
||||
@@ -17,17 +17,23 @@
|
||||
# limitations under the License.
|
||||
#
|
||||
import json
|
||||
import os
|
||||
from typing import Any, Dict
|
||||
|
||||
import jsonschema
|
||||
import pytest
|
||||
import regex as re
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if vllm_version_is("0.10.2"):
|
||||
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||
else:
|
||||
from vllm.sampling_params import SamplingParams, StructuredOutputsParams
|
||||
|
||||
from vllm.outputs import RequestOutput
|
||||
from vllm.sampling_params import GuidedDecodingParams, SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
|
||||
MODEL_NAME = "Qwen/Qwen3-0.6B"
|
||||
|
||||
GuidedDecodingBackend = ["xgrammar", "guidance", "outlines"]
|
||||
@@ -84,16 +90,29 @@ def sample_json_schema():
|
||||
@pytest.mark.parametrize("guided_decoding_backend", GuidedDecodingBackend)
|
||||
def test_guided_json_completion(guided_decoding_backend: str,
|
||||
sample_json_schema):
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=500,
|
||||
guided_decoding=GuidedDecodingParams(json=sample_json_schema))
|
||||
|
||||
with VllmRunner(
|
||||
MODEL_NAME,
|
||||
seed=0,
|
||||
guided_decoding_backend=guided_decoding_backend,
|
||||
) as vllm_model:
|
||||
runner_kwargs: Dict[str, Any] = {}
|
||||
if vllm_version_is("0.10.2"):
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=500,
|
||||
guided_decoding=GuidedDecodingParams(json=sample_json_schema))
|
||||
runner_kwargs = {
|
||||
"seed": 0,
|
||||
"guided_decoding_backend": guided_decoding_backend,
|
||||
}
|
||||
else:
|
||||
sampling_params = SamplingParams(
|
||||
temperature=1.0,
|
||||
max_tokens=500,
|
||||
structured_outputs=StructuredOutputsParams(
|
||||
json=sample_json_schema))
|
||||
runner_kwargs = {
|
||||
"seed": 0,
|
||||
"structured_outputs_config": {
|
||||
"backend": guided_decoding_backend
|
||||
},
|
||||
}
|
||||
with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model:
|
||||
prompts = [
|
||||
f"Give an example JSON for an employee profile "
|
||||
f"that fits this schema: {sample_json_schema}"
|
||||
@@ -121,17 +140,29 @@ def test_guided_json_completion(guided_decoding_backend: str,
|
||||
def test_guided_regex(guided_decoding_backend: str, sample_regex):
|
||||
if guided_decoding_backend == "outlines":
|
||||
pytest.skip("Outlines doesn't support regex-based guided decoding.")
|
||||
runner_kwargs: Dict[str, Any] = {}
|
||||
if vllm_version_is("0.10.2"):
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=GuidedDecodingParams(regex=sample_regex))
|
||||
runner_kwargs = {
|
||||
"seed": 0,
|
||||
"guided_decoding_backend": guided_decoding_backend,
|
||||
}
|
||||
else:
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
structured_outputs=StructuredOutputsParams(regex=sample_regex))
|
||||
runner_kwargs = {
|
||||
"seed": 0,
|
||||
"structured_outputs_config": {
|
||||
"backend": guided_decoding_backend
|
||||
},
|
||||
}
|
||||
|
||||
sampling_params = SamplingParams(
|
||||
temperature=0.8,
|
||||
top_p=0.95,
|
||||
guided_decoding=GuidedDecodingParams(regex=sample_regex))
|
||||
|
||||
with VllmRunner(
|
||||
MODEL_NAME,
|
||||
seed=0,
|
||||
guided_decoding_backend=guided_decoding_backend,
|
||||
) as vllm_model:
|
||||
with VllmRunner(MODEL_NAME, **runner_kwargs) as vllm_model:
|
||||
prompts = [
|
||||
f"Give an example IPv4 address with this regex: {sample_regex}"
|
||||
] * 2
|
||||
|
||||
103
tests/e2e/singlecard/test_multistream_overlap_shared_expert.py
Normal file
103
tests/e2e/singlecard/test_multistream_overlap_shared_expert.py
Normal file
@@ -0,0 +1,103 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
"""
|
||||
Compare the outputs of vLLM with multistream_overlap_shared_expert
|
||||
enabled and disabled.
|
||||
|
||||
Run `pytest tests/e2e/singlecard/test_multistream_overlap_shared_expert.py`.
|
||||
"""
|
||||
|
||||
import pytest
|
||||
from vllm import SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
from tests.e2e.model_utils import check_outputs_equal
|
||||
|
||||
MODELS = [
|
||||
"Qwen/Qwen3-0.6B",
|
||||
]
|
||||
|
||||
|
||||
@pytest.mark.parametrize("model", MODELS)
|
||||
@pytest.mark.parametrize("max_tokens", [32])
|
||||
def test_models_with_multistream_overlap_shared_expert(
|
||||
model: str,
|
||||
max_tokens: int,
|
||||
) -> None:
|
||||
prompts = [
|
||||
"Hello, my name is", "The president of the United States is",
|
||||
"The capital of France is", "The future of AI is"
|
||||
]
|
||||
|
||||
sampling_params = SamplingParams(max_tokens=max_tokens, temperature=0.0)
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=True,
|
||||
additional_config={
|
||||
"multistream_overlap_shared_expert": True,
|
||||
},
|
||||
) as runner:
|
||||
vllm_moe_ms_eager_outputs = runner.model.generate(
|
||||
prompts, sampling_params)
|
||||
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=False,
|
||||
additional_config={
|
||||
"multistream_overlap_shared_expert": True,
|
||||
},
|
||||
) as runner:
|
||||
vllm_moe_ms_aclgraph_outputs = runner.model.generate(
|
||||
prompts, sampling_params)
|
||||
|
||||
with VllmRunner(
|
||||
model,
|
||||
max_model_len=1024,
|
||||
enforce_eager=True,
|
||||
) as runner:
|
||||
vllm_eager_outputs = runner.model.generate(prompts, sampling_params)
|
||||
|
||||
vllm_moe_ms_eager_outputs_list = []
|
||||
for output in vllm_moe_ms_eager_outputs:
|
||||
vllm_moe_ms_eager_outputs_list.append(
|
||||
(output.outputs[0].index, output.outputs[0].text))
|
||||
|
||||
vllm_moe_ms_aclgraph_outputs_list = []
|
||||
for output in vllm_moe_ms_aclgraph_outputs:
|
||||
vllm_moe_ms_aclgraph_outputs_list.append(
|
||||
(output.outputs[0].index, output.outputs[0].text))
|
||||
|
||||
vllm_eager_outputs_list = []
|
||||
for output in vllm_eager_outputs:
|
||||
vllm_eager_outputs_list.append(
|
||||
(output.outputs[0].index, output.outputs[0].text))
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=vllm_eager_outputs_list,
|
||||
outputs_1_lst=vllm_moe_ms_eager_outputs_list,
|
||||
name_0="vllm_eager_outputs",
|
||||
name_1="vllm_moe_ms_eager_outputs",
|
||||
)
|
||||
|
||||
check_outputs_equal(
|
||||
outputs_0_lst=vllm_eager_outputs_list,
|
||||
outputs_1_lst=vllm_moe_ms_aclgraph_outputs_list,
|
||||
name_0="vllm_eager_outputs",
|
||||
name_1="vllm_moe_ms_aclgraph_outputs",
|
||||
)
|
||||
@@ -20,19 +20,14 @@
|
||||
|
||||
Run `pytest tests/test_offline_inference.py`.
|
||||
"""
|
||||
import os
|
||||
|
||||
import pytest
|
||||
from vllm import SamplingParams
|
||||
from vllm.assets.audio import AudioAsset
|
||||
from vllm.assets.image import ImageAsset
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
os.environ["PYTORCH_NPU_ALLOC_CONF"] = "max_split_size_mb:256"
|
||||
|
||||
|
||||
@pytest.mark.skip(reason="fix me")
|
||||
def test_multimodal_vl(prompt_template):
|
||||
image = ImageAsset("cherry_blossom") \
|
||||
.pil_image.convert("RGB")
|
||||
@@ -52,9 +47,12 @@ def test_multimodal_vl(prompt_template):
|
||||
"fps": 1,
|
||||
},
|
||||
enforce_eager=True) as vllm_model:
|
||||
vllm_model.generate_greedy(prompts=prompts,
|
||||
images=images,
|
||||
max_tokens=64)
|
||||
outputs = vllm_model.generate_greedy(prompts=prompts,
|
||||
images=images,
|
||||
max_tokens=64)
|
||||
assert len(outputs) == len(prompts)
|
||||
for _, output_str in outputs:
|
||||
assert output_str, "Generated output should not be empty."
|
||||
|
||||
|
||||
def test_multimodal_audio():
|
||||
@@ -86,4 +84,7 @@ def test_multimodal_audio():
|
||||
dtype="bfloat16",
|
||||
limit_mm_per_prompt={"audio": 2},
|
||||
gpu_memory_utilization=0.9) as runner:
|
||||
runner.generate(inputs, sampling_params=sampling_params)
|
||||
outputs = runner.generate(inputs, sampling_params=sampling_params)
|
||||
|
||||
assert outputs is not None, "Generated outputs should not be None."
|
||||
assert len(outputs) > 0, "Generated outputs should not be empty."
|
||||
|
||||
36
tests/e2e/vllm_interface/singlecard/test_sampler.py
Normal file
36
tests/e2e/vllm_interface/singlecard/test_sampler.py
Normal file
@@ -0,0 +1,36 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
# Adapted from vllm/tests/entrypoints/llm/test_guided_generate.py
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
#
|
||||
from vllm import SamplingParams
|
||||
|
||||
from tests.e2e.conftest import VllmRunner
|
||||
|
||||
|
||||
def test_models_topk() -> None:
|
||||
example_prompts = [
|
||||
"The capital of France is",
|
||||
]
|
||||
sampling_params = SamplingParams(max_tokens=10,
|
||||
temperature=0.0,
|
||||
top_k=10,
|
||||
top_p=0.9)
|
||||
|
||||
with VllmRunner("Qwen/Qwen3-0.6B",
|
||||
max_model_len=4096,
|
||||
gpu_memory_utilization=0.7) as runner:
|
||||
runner.generate(example_prompts, sampling_params)
|
||||
2
tests/e2e/vllm_interface/vllm_test.cfg
Normal file
2
tests/e2e/vllm_interface/vllm_test.cfg
Normal file
@@ -0,0 +1,2 @@
|
||||
# Base docker image used to build the vllm-ascend e2e test image, which is built in the vLLM repository
|
||||
BASE_IMAGE_NAME="quay.io/ascend/cann:8.2.rc1-910b-ubuntu22.04-py3.11"
|
||||
@@ -7,8 +7,7 @@ from vllm_ascend.attention.attention_v1 import (AscendAttentionBackend,
|
||||
AscendAttentionBackendImpl,
|
||||
AscendAttentionMetadataBuilder,
|
||||
AscendAttentionState,
|
||||
AscendMetadata,
|
||||
CommonAttentionState)
|
||||
AscendMetadata)
|
||||
from vllm_ascend.attention.utils import AscendCommonAttentionMetadata
|
||||
|
||||
|
||||
@@ -25,10 +24,6 @@ class TestAscendAttentionBackend(TestBase):
|
||||
self.assertEqual(AscendAttentionBackend.get_metadata_cls(),
|
||||
AscendMetadata)
|
||||
|
||||
def test_get_state_cls(self):
|
||||
self.assertEqual(AscendAttentionBackend.get_state_cls(),
|
||||
CommonAttentionState)
|
||||
|
||||
def test_get_builder_cls(self):
|
||||
self.assertEqual(AscendAttentionBackend.get_builder_cls(),
|
||||
AscendAttentionMetadataBuilder)
|
||||
@@ -72,7 +67,8 @@ class TestAscendAttentionMetadataBuilder(TestBase):
|
||||
self.mock_vllm_config.model_config.max_model_len = 640
|
||||
self.mock_vllm_config.cache_config.block_size = 64
|
||||
self.mock_device = 'cpu:0'
|
||||
self.builder = AscendAttentionMetadataBuilder(self.mock_vllm_config,
|
||||
self.builder = AscendAttentionMetadataBuilder(None, None,
|
||||
self.mock_vllm_config,
|
||||
self.mock_device)
|
||||
|
||||
def test_reorder_batch(self):
|
||||
@@ -100,19 +96,21 @@ class TestAscendAttentionMetadataBuilder(TestBase):
|
||||
max_query_len=5,
|
||||
decode_token_per_req=torch.tensor([1, 1]),
|
||||
block_table_tensor=torch.zeros((10, 10)),
|
||||
slot_mapping_cpu=torch.tensor(range(20)),
|
||||
slot_mapping=torch.tensor(range(20)),
|
||||
actual_seq_lengths_q=torch.tensor([0, 1]),
|
||||
positions=torch.tensor([10, 10]),
|
||||
attn_mask=torch.ones((10, 10)),
|
||||
spec_attn_mask=None,
|
||||
attn_state=AscendAttentionState.PrefillNoCache)
|
||||
attn_state=AscendAttentionState.PrefillNoCache,
|
||||
num_computed_tokens_cpu=None,
|
||||
seq_lens=None)
|
||||
|
||||
mock_nz_tensor = MagicMock()
|
||||
mock_model = MagicMock()
|
||||
mock_nd_to_nz_2d.return_value = mock_nz_tensor
|
||||
mock_npu_format_cast.return_value = mock_nz_tensor
|
||||
|
||||
self.builder.build(common_attn_metadata, mock_model)
|
||||
self.builder.build(1, common_attn_metadata, mock_model)
|
||||
|
||||
@patch('vllm_ascend.attention.attention_v1.AscendMetadata')
|
||||
@patch('torch_npu.npu_format_cast')
|
||||
@@ -131,12 +129,14 @@ class TestAscendAttentionMetadataBuilder(TestBase):
|
||||
max_query_len=6,
|
||||
decode_token_per_req=torch.tensor([1, 1, 1]),
|
||||
block_table_tensor=torch.zeros((10, 10)),
|
||||
slot_mapping_cpu=torch.tensor(range(20)),
|
||||
slot_mapping=torch.tensor(range(20)),
|
||||
actual_seq_lengths_q=torch.tensor([0, 1, 2]),
|
||||
positions=torch.tensor([10, 10]),
|
||||
attn_mask=torch.ones((15, 15)),
|
||||
spec_attn_mask=None,
|
||||
attn_state=AscendAttentionState.ChunkedPrefill)
|
||||
attn_state=AscendAttentionState.ChunkedPrefill,
|
||||
num_computed_tokens_cpu=None,
|
||||
seq_lens=None)
|
||||
|
||||
mock_ascend_attention_state = MagicMock()
|
||||
mock_ascend_attention_state.PrefillNoCache = 0
|
||||
@@ -146,7 +146,7 @@ class TestAscendAttentionMetadataBuilder(TestBase):
|
||||
mock_nd_to_nz_spec.return_value = mock_nz_tensor
|
||||
mock_npu_format_cast.return_value = mock_nz_tensor
|
||||
|
||||
self.builder.build(common_attn_metadata, mock_model)
|
||||
self.builder.build(1, common_attn_metadata, mock_model)
|
||||
|
||||
@patch('vllm_ascend.attention.attention_v1.AscendMetadata')
|
||||
@patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False)
|
||||
@@ -160,15 +160,17 @@ class TestAscendAttentionMetadataBuilder(TestBase):
|
||||
max_query_len=6,
|
||||
decode_token_per_req=torch.tensor([1, 1, 1]),
|
||||
block_table_tensor=torch.zeros((10, 10)),
|
||||
slot_mapping_cpu=torch.tensor(range(20)),
|
||||
slot_mapping=torch.tensor(range(20)),
|
||||
actual_seq_lengths_q=torch.tensor([0, 1, 2]),
|
||||
positions=torch.tensor([10, 10]),
|
||||
attn_mask=torch.ones((15, 15)),
|
||||
spec_attn_mask=None,
|
||||
attn_state=AscendAttentionState.ChunkedPrefill)
|
||||
attn_state=AscendAttentionState.ChunkedPrefill,
|
||||
num_computed_tokens_cpu=None,
|
||||
seq_lens=None)
|
||||
mock_model = MagicMock()
|
||||
|
||||
self.builder.build(common_attn_metadata, mock_model)
|
||||
self.builder.build(1, common_attn_metadata, mock_model)
|
||||
|
||||
|
||||
class TestAscendAttentionBackendImpl(TestBase):
|
||||
@@ -341,36 +343,6 @@ class TestAscendAttentionBackendImpl(TestBase):
|
||||
mock_flash_attention.assert_called_once()
|
||||
assert output.shape == (10, 8 * 64)
|
||||
|
||||
@patch('torch_npu._npu_reshape_and_cache')
|
||||
@patch('torch_npu._npu_flash_attention')
|
||||
def test_forward_prefill_no_cache_swa(self, mock_flash_attention,
|
||||
mock_reshape_cache):
|
||||
"""Test forward pass in PrefillNoCache state"""
|
||||
query = torch.randn(10, 8 * 64)
|
||||
key = torch.randn(10, 8 * 64)
|
||||
value = torch.randn(10, 8 * 64)
|
||||
kv_cache = torch.empty(2, 5, 128, 8, 64)
|
||||
metadata = self.attn_metadata
|
||||
metadata.attn_state = AscendAttentionState.PrefillNoCache
|
||||
metadata.attn_mask = torch.randn(1, 1, 10, 10)
|
||||
metadata.seq_lens = torch.tensor([10])
|
||||
metadata.num_actual_tokens = 10
|
||||
metadata.slot_mapping = torch.zeros(10, dtype=torch.long)
|
||||
layer = self.layer_no_quant
|
||||
# layer.quant_method.apply.return_value = metadata
|
||||
print(self.layer_no_quant._v_scale_float)
|
||||
output = self.impl_swa.forward(layer,
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
kv_cache,
|
||||
metadata,
|
||||
trace_flag=False)
|
||||
|
||||
mock_reshape_cache.assert_called_once()
|
||||
mock_flash_attention.assert_called_once()
|
||||
assert output.shape == (10, 8 * 64)
|
||||
|
||||
@patch('torch_npu._npu_reshape_and_cache')
|
||||
@patch('torch_npu._npu_flash_attention_qlens')
|
||||
def test_forward_prefill_cache_hit(self, mock_flash_attention_qlens,
|
||||
@@ -401,10 +373,12 @@ class TestAscendAttentionBackendImpl(TestBase):
|
||||
mock_flash_attention_qlens.assert_called_once()
|
||||
assert output.shape == (10, 8 * 64)
|
||||
|
||||
@patch('vllm_ascend.attention.attention_v1.get_forward_context')
|
||||
@patch('torch_npu._npu_reshape_and_cache')
|
||||
@patch('torch_npu._npu_paged_attention')
|
||||
def test_forward_decode_only(self, mock_paged_attention,
|
||||
mock_npu_reshape_and_cache):
|
||||
mock_npu_reshape_and_cache,
|
||||
mock_get_forward_context):
|
||||
"""Test forward pass in DecodeOnly state"""
|
||||
query = torch.randn(10, 8 * 64)
|
||||
key = torch.randn(10, 8 * 64)
|
||||
@@ -418,6 +392,8 @@ class TestAscendAttentionBackendImpl(TestBase):
|
||||
metadata.slot_mapping = torch.zeros(10, dtype=torch.long)
|
||||
layer = self.layer_no_quant
|
||||
|
||||
mock_get_forward_context.return_value = MagicMock(capturing=False)
|
||||
|
||||
output = self.impl.forward(layer,
|
||||
query,
|
||||
key,
|
||||
@@ -458,6 +434,44 @@ class TestAscendAttentionBackendImpl(TestBase):
|
||||
mock_fused_infer_attention_score.assert_called_once()
|
||||
assert output.shape == (10, 8 * 64)
|
||||
|
||||
@patch('vllm_ascend.attention.attention_v1.get_forward_context')
|
||||
@patch('torch_npu._npu_reshape_and_cache')
|
||||
@patch('torch_npu._npu_paged_attention')
|
||||
@patch('torch_npu.npu_fused_infer_attention_score')
|
||||
def test_forward_decode_only_swa_seq_len_mismatch(
|
||||
self, mock_fused_infer_attention_score, mock_paged_attention,
|
||||
mock_npu_reshape_and_cache, mock_get_forward_context):
|
||||
"""Test forward pass in DecodeOnly state when seq)len_mismatch"""
|
||||
query = torch.randn(10, 8 * 64)
|
||||
key = torch.randn(10, 8 * 64)
|
||||
value = torch.randn(10, 8 * 64)
|
||||
kv_cache = torch.empty(2, 5, 128, 8, 64)
|
||||
|
||||
metadata = self.attn_metadata
|
||||
metadata.attn_state = AscendAttentionState.DecodeOnly
|
||||
metadata.seq_lens = torch.tensor([10]) # len == 1 != query.size(0)==10
|
||||
metadata.block_tables = torch.zeros(1, 5, dtype=torch.long)
|
||||
metadata.num_actual_tokens = 10
|
||||
metadata.slot_mapping = torch.zeros(10, dtype=torch.long)
|
||||
|
||||
mock_fused_infer_attention_score.return_value = (torch.ones(10, 8,
|
||||
64), 1)
|
||||
|
||||
mock_get_forward_context.return_value = MagicMock(capturing=False)
|
||||
|
||||
output = self.impl_swa.forward(self.layer_no_quant,
|
||||
query,
|
||||
key,
|
||||
value,
|
||||
kv_cache,
|
||||
metadata,
|
||||
trace_flag=False)
|
||||
|
||||
mock_paged_attention.assert_called_once()
|
||||
mock_fused_infer_attention_score.assert_not_called()
|
||||
|
||||
assert output.shape == (10, 8 * 64)
|
||||
|
||||
@patch('vllm_ascend.attention.attention_v1.is_310p', return_value=False)
|
||||
@patch('torch_npu._npu_reshape_and_cache')
|
||||
@patch('vllm_ascend.attention.attention_v1.vanilla_chunked_prefill')
|
||||
|
||||
@@ -186,10 +186,39 @@ class TestAscendMLAMetadataBuilder(TestBase):
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_device = 'cpu'
|
||||
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
ascend_config = MagicMock()
|
||||
with patch("vllm_ascend.attention.mla_v1.get_ascend_config",
|
||||
return_value=ascend_config):
|
||||
builder = AscendMLAMetadataBuilder(mock_vllm_config, mock_device)
|
||||
builder = AscendMLAMetadataBuilder(None, None, mock_vllm_config,
|
||||
mock_device)
|
||||
|
||||
self.assertEqual(builder.block_size,
|
||||
mock_vllm_config.cache_config.block_size)
|
||||
self.assertEqual(
|
||||
builder.chunked_prefill_enabled,
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled)
|
||||
|
||||
def test_ascend_mla_metadata_builder_spec_decode(self):
|
||||
mock_vllm_config = MagicMock()
|
||||
mock_vllm_config.model_config.max_model_len = 1024
|
||||
mock_vllm_config.model_config.get_head_size.return_value = 64
|
||||
mock_vllm_config.model_config.dtype = torch.float16
|
||||
mock_vllm_config.cache_config.block_size = 16
|
||||
mock_vllm_config.scheduler_config.max_num_seqs = 4
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_device = 'cpu'
|
||||
|
||||
mock_spec_config = MagicMock()
|
||||
mock_spec_config.num_speculative_tokens = 3
|
||||
mock_vllm_config.speculative_config = mock_spec_config
|
||||
|
||||
ascend_config = MagicMock()
|
||||
with patch("vllm_ascend.attention.mla_v1.get_ascend_config",
|
||||
return_value=ascend_config):
|
||||
builder = AscendMLAMetadataBuilder(None, None, mock_vllm_config,
|
||||
mock_device)
|
||||
|
||||
self.assertEqual(builder.block_size,
|
||||
mock_vllm_config.cache_config.block_size)
|
||||
@@ -207,9 +236,12 @@ class TestAscendMLAMetadataBuilder(TestBase):
|
||||
mock_vllm_config.scheduler_config.chunked_prefill_enabled = False
|
||||
mock_device = 'cpu'
|
||||
|
||||
mock_vllm_config.speculative_config = None
|
||||
|
||||
with patch("vllm_ascend.attention.mla_v1.get_ascend_config",
|
||||
return_value=ascend_config):
|
||||
builder = AscendMLAMetadataBuilder(mock_vllm_config, mock_device)
|
||||
builder = AscendMLAMetadataBuilder(None, None, mock_vllm_config,
|
||||
mock_device)
|
||||
builder.decode_threshold = 1
|
||||
|
||||
input_batch = MagicMock()
|
||||
@@ -522,7 +554,11 @@ class TestAscendMLAImpl(TestBase):
|
||||
self.impl.num_kv_heads = self.impl.num_heads
|
||||
|
||||
decode_res, prefill_res = self.impl._mla_preprocess(
|
||||
hidden_states, kv_cache, attn_metadata, need_gather_q_kv=False)
|
||||
"mock_layer",
|
||||
hidden_states,
|
||||
kv_cache,
|
||||
attn_metadata,
|
||||
need_gather_q_kv=False)
|
||||
|
||||
self.assertIsNotNone(decode_res)
|
||||
self.assertIsNotNone(prefill_res)
|
||||
|
||||
720
tests/ut/compilation/test_acl_graph.py
Normal file
720
tests/ut/compilation/test_acl_graph.py
Normal file
@@ -0,0 +1,720 @@
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
#
|
||||
|
||||
from unittest.mock import MagicMock, Mock, patch
|
||||
|
||||
import torch
|
||||
from vllm.compilation.cuda_graph import CUDAGraphOptions
|
||||
from vllm.config import CUDAGraphMode, VllmConfig
|
||||
from vllm.forward_context import BatchDescriptor, ForwardContext
|
||||
|
||||
from tests.ut.base import TestBase
|
||||
from vllm_ascend.compilation.acl_graph import ACLGraphEntry, ACLGraphWrapper
|
||||
|
||||
|
||||
class TestACLGraphEntry(TestBase):
|
||||
|
||||
def test_aclgraph_entry_initialization(self):
|
||||
"""Test ACLGraphEntry initialization with default values"""
|
||||
batch_descriptor = BatchDescriptor(
|
||||
num_tokens=30,
|
||||
uniform_decode=False,
|
||||
)
|
||||
|
||||
entry = ACLGraphEntry(batch_descriptor=batch_descriptor)
|
||||
|
||||
self.assertEqual(entry.batch_descriptor, batch_descriptor)
|
||||
self.assertIsNone(entry.aclgraph)
|
||||
self.assertIsNone(entry.output)
|
||||
self.assertIsNone(entry.input_addresses)
|
||||
|
||||
def test_aclgraph_entry_with_values(self):
|
||||
"""Test ACLGraphEntry initialization with specified values"""
|
||||
batch_descriptor = BatchDescriptor(
|
||||
num_tokens=30,
|
||||
uniform_decode=False,
|
||||
)
|
||||
|
||||
mock_graph = MagicMock()
|
||||
mock_output = MagicMock()
|
||||
input_addresses = [12345, 67890]
|
||||
|
||||
entry = ACLGraphEntry(batch_descriptor=batch_descriptor,
|
||||
aclgraph=mock_graph,
|
||||
output=mock_output,
|
||||
input_addresses=input_addresses)
|
||||
|
||||
self.assertEqual(entry.batch_descriptor, batch_descriptor)
|
||||
self.assertEqual(entry.aclgraph, mock_graph)
|
||||
self.assertEqual(entry.output, mock_output)
|
||||
self.assertEqual(entry.input_addresses, input_addresses)
|
||||
|
||||
|
||||
class TestACLGraphWrapper(TestBase):
|
||||
|
||||
def setUp(self):
|
||||
"""Set up test fixtures"""
|
||||
super().setUp()
|
||||
|
||||
# Mock VllmConfig
|
||||
self.mock_vllm_config = MagicMock(spec=VllmConfig)
|
||||
self.mock_vllm_config.compilation_config = MagicMock()
|
||||
|
||||
# Mock runnable function
|
||||
self.mock_runnable = MagicMock(return_value="test_output")
|
||||
|
||||
# Mock graph pool
|
||||
self.mock_graph_pool = MagicMock()
|
||||
|
||||
# Mock CUDAGraphOptions
|
||||
self.mock_cudagraph_options = MagicMock(spec=CUDAGraphOptions)
|
||||
self.mock_cudagraph_options.debug_log_enable = False
|
||||
self.mock_cudagraph_options.gc_disable = False
|
||||
self.mock_cudagraph_options.weak_ref_output = False
|
||||
|
||||
# Mock BatchDescriptor
|
||||
self.mock_batch_descriptor = BatchDescriptor(
|
||||
num_tokens=30,
|
||||
uniform_decode=False,
|
||||
)
|
||||
|
||||
# Mock ForwardContext
|
||||
self.mock_forward_context = MagicMock(spec=ForwardContext)
|
||||
self.mock_forward_context.batch_descriptor = self.mock_batch_descriptor
|
||||
self.mock_forward_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
|
||||
|
||||
@patch('vllm_ascend.compilation.acl_graph.current_platform')
|
||||
@patch('vllm_ascend.compilation.acl_graph.envs')
|
||||
def test_initialization_with_default_options(self, mock_envs,
|
||||
mock_current_platform):
|
||||
"""Test ACLGraphWrapper initialization with default CUDAGraphOptions"""
|
||||
mock_envs.VLLM_LOGGING_LEVEL = "INFO"
|
||||
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
|
||||
|
||||
wrapper = ACLGraphWrapper(runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool)
|
||||
|
||||
self.assertEqual(wrapper.runnable, self.mock_runnable)
|
||||
self.assertEqual(wrapper.vllm_config, self.mock_vllm_config)
|
||||
self.assertEqual(wrapper.graph_pool, self.mock_graph_pool)
|
||||
self.assertEqual(wrapper.runtime_mode, CUDAGraphMode.FULL)
|
||||
self.assertFalse(wrapper.is_debugging_mode)
|
||||
self.assertIsInstance(wrapper.aclgraph_options, CUDAGraphOptions)
|
||||
self.assertEqual(wrapper.concrete_aclgraph_entries, {})
|
||||
|
||||
@patch('vllm_ascend.compilation.acl_graph.current_platform')
|
||||
@patch('vllm_ascend.compilation.acl_graph.envs')
|
||||
def test_initialization_with_custom_options(self, mock_envs,
|
||||
mock_current_platform):
|
||||
"""Test ACLGraphWrapper initialization with custom CUDAGraphOptions"""
|
||||
mock_envs.VLLM_LOGGING_LEVEL = "DEBUG"
|
||||
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
|
||||
|
||||
wrapper = ACLGraphWrapper(
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
self.assertEqual(wrapper.runnable, self.mock_runnable)
|
||||
self.assertEqual(wrapper.vllm_config, self.mock_vllm_config)
|
||||
self.assertEqual(wrapper.graph_pool, self.mock_graph_pool)
|
||||
self.assertEqual(wrapper.runtime_mode, CUDAGraphMode.FULL)
|
||||
self.assertTrue(wrapper.is_debugging_mode)
|
||||
self.assertEqual(wrapper.aclgraph_options, self.mock_cudagraph_options)
|
||||
self.assertEqual(wrapper.concrete_aclgraph_entries, {})
|
||||
|
||||
@patch('vllm_ascend.compilation.acl_graph.current_platform')
|
||||
@patch('vllm_ascend.compilation.acl_graph.envs')
|
||||
def test_initialization_assertion_error(self, mock_envs,
|
||||
mock_current_platform):
|
||||
"""Test ACLGraphWrapper initialization raises AssertionError for NONE mode"""
|
||||
mock_envs.VLLM_LOGGING_LEVEL = "INFO"
|
||||
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
|
||||
|
||||
with self.assertRaises(AssertionError):
|
||||
ACLGraphWrapper(runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.NONE,
|
||||
graph_pool=self.mock_graph_pool)
|
||||
|
||||
@patch('vllm_ascend.compilation.acl_graph.get_forward_context')
|
||||
@patch('vllm_ascend.compilation.acl_graph.current_platform')
|
||||
@patch('vllm_ascend.compilation.acl_graph.envs')
|
||||
def test_call_with_none_runtime_mode(self, mock_envs,
|
||||
mock_current_platform,
|
||||
mock_get_forward_context):
|
||||
"""Test __call__ method when runtime mode is NONE"""
|
||||
mock_envs.VLLM_LOGGING_LEVEL = "INFO"
|
||||
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
|
||||
mock_get_forward_context.return_value = self.mock_forward_context
|
||||
self.mock_forward_context.cudagraph_runtime_mode = CUDAGraphMode.NONE
|
||||
|
||||
wrapper = ACLGraphWrapper(
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
result = wrapper("arg1", "arg2")
|
||||
|
||||
# Should call the runnable directly without graph capture
|
||||
self.mock_runnable.assert_called_once_with("arg1", "arg2")
|
||||
self.assertEqual(result, "test_output")
|
||||
|
||||
@patch('vllm_ascend.compilation.acl_graph.get_forward_context')
|
||||
@patch('vllm_ascend.compilation.acl_graph.current_platform')
|
||||
@patch('vllm_ascend.compilation.acl_graph.envs')
|
||||
def test_call_with_mismatched_runtime_mode(self, mock_envs,
|
||||
mock_current_platform,
|
||||
mock_get_forward_context):
|
||||
"""Test __call__ method when runtime mode doesn't match wrapper mode"""
|
||||
mock_envs.VLLM_LOGGING_LEVEL = "INFO"
|
||||
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
|
||||
mock_get_forward_context.return_value = self.mock_forward_context
|
||||
self.mock_forward_context.cudagraph_runtime_mode = CUDAGraphMode.PIECEWISE # Different from FULL
|
||||
|
||||
wrapper = ACLGraphWrapper(
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
result = wrapper("arg1", "arg2")
|
||||
|
||||
# Should call the runnable directly without graph capture
|
||||
self.mock_runnable.assert_called_once_with("arg1", "arg2")
|
||||
self.assertEqual(result, "test_output")
|
||||
|
||||
@patch('vllm_ascend.compilation.acl_graph.torch')
|
||||
@patch(
|
||||
'vllm_ascend.compilation.acl_graph.validate_cudagraph_capturing_enabled'
|
||||
)
|
||||
@patch('vllm_ascend.compilation.acl_graph.get_forward_context')
|
||||
@patch('vllm_ascend.compilation.acl_graph.current_platform')
|
||||
@patch('vllm_ascend.compilation.acl_graph.envs')
|
||||
@patch('vllm_ascend.compilation.acl_graph.compilation_counter')
|
||||
@patch('vllm_ascend.compilation.acl_graph.weak_ref_tensors')
|
||||
def test_call_capture_graph_first_time(
|
||||
self, mock_weak_ref_tensors, mock_compilation_counter, mock_envs,
|
||||
mock_current_platform, mock_get_forward_context,
|
||||
mock_validate_cudagraph_capturing_enabled, mock_torch):
|
||||
"""Test __call__ method captures graph for the first time"""
|
||||
mock_envs.VLLM_LOGGING_LEVEL = "INFO"
|
||||
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
|
||||
mock_get_forward_context.return_value = self.mock_forward_context
|
||||
self.mock_forward_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
|
||||
|
||||
# Mock torch.npu.NPUGraph
|
||||
mock_npu_graph = MagicMock()
|
||||
mock_torch.npu.NPUGraph.return_value = mock_npu_graph
|
||||
|
||||
# Mock torch.npu.graph context manager
|
||||
mock_graph_context = MagicMock()
|
||||
mock_torch.npu.graph.return_value = mock_graph_context
|
||||
mock_graph_context.__enter__ = Mock(return_value=None)
|
||||
mock_graph_context.__exit__ = Mock(return_value=None)
|
||||
|
||||
# Mock weak_ref_tensors to return the same output
|
||||
mock_weak_ref_tensors.return_value = "weak_ref_output"
|
||||
|
||||
# Ensure torch.Tensor can be correctly identified by isinstance
|
||||
mock_torch.Tensor = torch.Tensor
|
||||
|
||||
# Set up the compilation counter mock
|
||||
mock_compilation_counter.num_cudagraph_captured = 0
|
||||
|
||||
wrapper = ACLGraphWrapper(
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# Create a real torch tensor for the test, not a mock
|
||||
test_tensor = torch.tensor([1, 2, 3])
|
||||
|
||||
# Call the wrapper
|
||||
result = wrapper(test_tensor, "arg2")
|
||||
|
||||
# Verify graph capture happened
|
||||
mock_validate_cudagraph_capturing_enabled.assert_called_once()
|
||||
mock_torch.npu.NPUGraph.assert_called_once()
|
||||
mock_torch.npu.graph.assert_called_once_with(mock_npu_graph,
|
||||
pool=self.mock_graph_pool)
|
||||
self.mock_runnable.assert_called_once_with(test_tensor, "arg2")
|
||||
|
||||
# Verify the entry was created and updated
|
||||
self.assertIn(self.mock_batch_descriptor,
|
||||
wrapper.concrete_aclgraph_entries)
|
||||
entry = wrapper.concrete_aclgraph_entries[self.mock_batch_descriptor]
|
||||
self.assertEqual(entry.aclgraph, mock_npu_graph)
|
||||
self.assertEqual(entry.output, "weak_ref_output")
|
||||
|
||||
# Verify compilation counter was incremented
|
||||
self.assertEqual(mock_compilation_counter.num_cudagraph_captured, 1)
|
||||
|
||||
# Should return the original output (not weak ref)
|
||||
self.assertEqual(result, "test_output")
|
||||
|
||||
@patch('vllm_ascend.compilation.acl_graph.torch')
|
||||
@patch(
|
||||
'vllm_ascend.compilation.acl_graph.validate_cudagraph_capturing_enabled'
|
||||
)
|
||||
@patch('vllm_ascend.compilation.acl_graph.get_forward_context')
|
||||
@patch('vllm_ascend.compilation.acl_graph.current_platform')
|
||||
@patch('vllm_ascend.compilation.acl_graph.envs')
|
||||
@patch('vllm_ascend.compilation.acl_graph.compilation_counter')
|
||||
@patch('vllm_ascend.compilation.acl_graph.weak_ref_tensors')
|
||||
def test_call_replay_graph(self, mock_weak_ref_tensors,
|
||||
mock_compilation_counter, mock_envs,
|
||||
mock_current_platform, mock_get_forward_context,
|
||||
mock_validate_cudagraph_capturing_enabled,
|
||||
mock_torch):
|
||||
"""Test __call__ method replays graph when already captured"""
|
||||
mock_envs.VLLM_LOGGING_LEVEL = "INFO"
|
||||
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
|
||||
mock_get_forward_context.return_value = self.mock_forward_context
|
||||
self.mock_forward_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
|
||||
|
||||
# Mock torch.npu.NPUGraph
|
||||
mock_npu_graph = MagicMock()
|
||||
mock_torch.npu.NPUGraph.return_value = mock_npu_graph
|
||||
|
||||
# Mock torch.npu.graph context manager
|
||||
mock_graph_context = MagicMock()
|
||||
mock_torch.npu.graph.return_value = mock_graph_context
|
||||
mock_graph_context.__enter__ = Mock(return_value=None)
|
||||
mock_graph_context.__exit__ = Mock(return_value=None)
|
||||
|
||||
# Mock weak_ref_tensors to return the same output
|
||||
mock_weak_ref_tensors.return_value = "weak_ref_output"
|
||||
|
||||
# Ensure torch.Tensor can be correctly identified by isinstance
|
||||
mock_torch.Tensor = torch.Tensor
|
||||
|
||||
# Set up the compilation counter mock
|
||||
mock_compilation_counter.num_cudagraph_captured = 0
|
||||
|
||||
wrapper = ACLGraphWrapper(
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# Create a real torch tensor for the test, not a mock
|
||||
test_tensor = torch.tensor([1, 2, 3])
|
||||
|
||||
# First call to capture the graph
|
||||
first_result = wrapper(test_tensor, "arg2")
|
||||
|
||||
# Verify graph capture happened during first call
|
||||
mock_validate_cudagraph_capturing_enabled.assert_called_once()
|
||||
mock_torch.npu.NPUGraph.assert_called_once()
|
||||
mock_torch.npu.graph.assert_called_once()
|
||||
|
||||
# Reset mock to track second call
|
||||
self.mock_runnable.reset_mock()
|
||||
mock_npu_graph.reset_mock()
|
||||
|
||||
# Second call should replay the graph
|
||||
second_result = wrapper(test_tensor, "arg2")
|
||||
|
||||
# Verify runnable was called only during capture (not during replay)
|
||||
self.mock_runnable.assert_not_called()
|
||||
|
||||
# Verify graph replay happened
|
||||
mock_npu_graph.replay.assert_called_once()
|
||||
|
||||
# Both calls should return the weak ref output
|
||||
self.assertEqual(first_result, "test_output") # Original output
|
||||
self.assertEqual(second_result, "weak_ref_output") # Weak ref output
|
||||
|
||||
@patch('vllm_ascend.compilation.acl_graph.torch')
|
||||
@patch(
|
||||
'vllm_ascend.compilation.acl_graph.validate_cudagraph_capturing_enabled'
|
||||
)
|
||||
@patch('vllm_ascend.compilation.acl_graph.get_forward_context')
|
||||
@patch('vllm_ascend.compilation.acl_graph.current_platform')
|
||||
@patch('vllm_ascend.compilation.acl_graph.envs')
|
||||
@patch('vllm_ascend.compilation.acl_graph.weak_ref_tensors')
|
||||
def test_call_with_debug_mode_input_address_check(
|
||||
self, mock_weak_ref_tensors, mock_envs, mock_current_platform,
|
||||
mock_get_forward_context,
|
||||
mock_validate_cudagraph_capturing_enabled, mock_torch):
|
||||
"""Test __call__ method with debug mode input address checking"""
|
||||
mock_envs.VLLM_LOGGING_LEVEL = "DEBUG" # Enable debug mode
|
||||
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
|
||||
mock_get_forward_context.return_value = self.mock_forward_context
|
||||
self.mock_forward_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
|
||||
|
||||
# Mock torch.npu.NPUGraph
|
||||
mock_npu_graph = MagicMock()
|
||||
mock_torch.npu.NPUGraph.return_value = mock_npu_graph
|
||||
|
||||
# Mock torch.npu.graph context manager
|
||||
mock_graph_context = MagicMock()
|
||||
mock_torch.npu.graph.return_value = mock_graph_context
|
||||
mock_graph_context.__enter__ = Mock(return_value=None)
|
||||
mock_graph_context.__exit__ = Mock(return_value=None)
|
||||
|
||||
# Mock weak_ref_tensors
|
||||
mock_weak_ref_tensors.return_value = "weak_ref_output"
|
||||
|
||||
# Ensure torch.Tensor can be correctly identified by isinstance
|
||||
mock_torch.Tensor = torch.Tensor
|
||||
|
||||
# Create a mock tensor as the output of runnable
|
||||
mock_output_tensor = torch.tensor([4, 5, 6])
|
||||
self.mock_runnable.return_value = mock_output_tensor
|
||||
|
||||
wrapper = ACLGraphWrapper(
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# First call to capture the graph
|
||||
tensor = torch.tensor([1, 2, 3]) # Create tensor once
|
||||
_ = wrapper(tensor, "arg2")
|
||||
|
||||
# Second call with same tensor addresses should work
|
||||
_ = wrapper(tensor, "arg2") # Use the same tensor object
|
||||
|
||||
# Should not raise AssertionError
|
||||
self.assertTrue(True)
|
||||
|
||||
@patch('vllm_ascend.compilation.acl_graph.torch')
|
||||
@patch(
|
||||
'vllm_ascend.compilation.acl_graph.validate_cudagraph_capturing_enabled'
|
||||
)
|
||||
@patch('vllm_ascend.compilation.acl_graph.get_forward_context')
|
||||
@patch('vllm_ascend.compilation.acl_graph.current_platform')
|
||||
@patch('vllm_ascend.compilation.acl_graph.envs')
|
||||
@patch('vllm_ascend.compilation.acl_graph.weak_ref_tensors')
|
||||
def test_call_with_debug_mode_input_address_mismatch(
|
||||
self, mock_weak_ref_tensors, mock_envs, mock_current_platform,
|
||||
mock_get_forward_context,
|
||||
mock_validate_cudagraph_capturing_enabled, mock_torch):
|
||||
"""Test __call__ method with debug mode input address mismatch raises AssertionError"""
|
||||
mock_envs.VLLM_LOGGING_LEVEL = "DEBUG" # Enable debug mode
|
||||
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
|
||||
mock_get_forward_context.return_value = self.mock_forward_context
|
||||
self.mock_forward_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
|
||||
|
||||
# Mock torch.npu.NPUGraph
|
||||
mock_npu_graph = MagicMock()
|
||||
mock_torch.npu.NPUGraph.return_value = mock_npu_graph
|
||||
|
||||
# Mock torch.npu.graph context manager
|
||||
mock_graph_context = MagicMock()
|
||||
mock_torch.npu.graph.return_value = mock_graph_context
|
||||
mock_graph_context.__enter__ = Mock(return_value=None)
|
||||
mock_graph_context.__exit__ = Mock(return_value=None)
|
||||
|
||||
# Mock weak_ref_tensors
|
||||
mock_weak_ref_tensors.return_value = "weak_ref_output"
|
||||
|
||||
# Ensure torch.Tensor can be correctly identified by isinstance
|
||||
mock_torch.Tensor = torch.Tensor
|
||||
|
||||
# Create a mock tensor as the output of runnable
|
||||
mock_output_tensor = torch.tensor([4, 5, 6])
|
||||
self.mock_runnable.return_value = mock_output_tensor
|
||||
|
||||
wrapper = ACLGraphWrapper(
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# First call to capture the graph
|
||||
tensor1 = torch.tensor([1, 2, 3])
|
||||
_ = wrapper(tensor1, "arg2")
|
||||
|
||||
# Second call with different tensor addresses should raise AssertionError
|
||||
tensor2 = torch.tensor([4, 5,
|
||||
6]) # Different values, different address
|
||||
|
||||
with self.assertRaises(AssertionError) as context:
|
||||
wrapper(tensor2, "arg2")
|
||||
|
||||
self.assertIn("Input addresses for aclgraphs are different",
|
||||
str(context.exception))
|
||||
|
||||
@patch('vllm_ascend.compilation.acl_graph.torch')
|
||||
@patch(
|
||||
'vllm_ascend.compilation.acl_graph.validate_cudagraph_capturing_enabled'
|
||||
)
|
||||
@patch('vllm_ascend.compilation.acl_graph.get_forward_context')
|
||||
@patch('vllm_ascend.compilation.acl_graph.current_platform')
|
||||
@patch('vllm_ascend.compilation.acl_graph.envs')
|
||||
@patch('vllm_ascend.compilation.acl_graph.compilation_counter')
|
||||
@patch('vllm_ascend.compilation.acl_graph.weak_ref_tensors')
|
||||
@patch('vllm_ascend.compilation.acl_graph.patch')
|
||||
def test_call_capture_graph_with_gc_disable(
|
||||
self, mock_patch, mock_weak_ref_tensors, mock_compilation_counter,
|
||||
mock_envs, mock_current_platform, mock_get_forward_context,
|
||||
mock_validate_cudagraph_capturing_enabled, mock_torch):
|
||||
"""Test __call__ method captures graph with gc_disable option enabled"""
|
||||
mock_envs.VLLM_LOGGING_LEVEL = "INFO"
|
||||
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
|
||||
mock_get_forward_context.return_value = self.mock_forward_context
|
||||
self.mock_forward_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
|
||||
|
||||
# Enable gc_disable option
|
||||
self.mock_cudagraph_options.gc_disable = True
|
||||
# weak_ref_output is not enabled by default
|
||||
|
||||
# Mock torch.npu.NPUGraph
|
||||
mock_npu_graph = MagicMock()
|
||||
mock_torch.npu.NPUGraph.return_value = mock_npu_graph
|
||||
|
||||
# Mock torch.npu.graph context manager
|
||||
mock_graph_context = MagicMock()
|
||||
mock_torch.npu.graph.return_value = mock_graph_context
|
||||
mock_graph_context.__enter__ = Mock(return_value=None)
|
||||
mock_graph_context.__exit__ = Mock(return_value=None)
|
||||
|
||||
# Mock patch context manager
|
||||
mock_exit_stack = MagicMock()
|
||||
mock_patch.return_value = mock_exit_stack
|
||||
mock_exit_stack.enter_context = Mock()
|
||||
|
||||
# Mock weak_ref_tensors to simulate the actual behavior:
|
||||
# 1. First call (inside the graph context) should return "inner_output"
|
||||
# 2. Second call (for entry.output) should return "weak_ref_output"
|
||||
mock_weak_ref_tensors.side_effect = ["inner_output", "weak_ref_output"]
|
||||
|
||||
# Ensure torch.Tensor can be correctly identified by isinstance
|
||||
mock_torch.Tensor = torch.Tensor
|
||||
|
||||
# Set up the compilation counter mock
|
||||
mock_compilation_counter.num_cudagraph_captured = 0
|
||||
|
||||
wrapper = ACLGraphWrapper(
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# Create a real torch tensor for the test, not a mock
|
||||
test_tensor = torch.tensor([1, 2, 3])
|
||||
|
||||
# Call the wrapper
|
||||
result = wrapper(test_tensor, "arg2")
|
||||
|
||||
# Verify patch was called to disable gc
|
||||
self.assertTrue(mock_patch.called)
|
||||
|
||||
# Verify graph capture happened
|
||||
mock_validate_cudagraph_capturing_enabled.assert_called_once()
|
||||
mock_torch.npu.NPUGraph.assert_called_once()
|
||||
mock_torch.npu.graph.assert_called_once_with(mock_npu_graph,
|
||||
pool=self.mock_graph_pool)
|
||||
|
||||
# Should return the original output (not weak ref) since weak_ref_output is not enabled
|
||||
self.assertEqual(result, "test_output")
|
||||
|
||||
@patch('vllm_ascend.compilation.acl_graph.torch')
|
||||
@patch(
|
||||
'vllm_ascend.compilation.acl_graph.validate_cudagraph_capturing_enabled'
|
||||
)
|
||||
@patch('vllm_ascend.compilation.acl_graph.get_forward_context')
|
||||
@patch('vllm_ascend.compilation.acl_graph.current_platform')
|
||||
@patch('vllm_ascend.compilation.acl_graph.envs')
|
||||
@patch('vllm_ascend.compilation.acl_graph.compilation_counter')
|
||||
@patch('vllm_ascend.compilation.acl_graph.weak_ref_tensors')
|
||||
def test_call_capture_graph_with_weak_ref_output(
|
||||
self, mock_weak_ref_tensors, mock_compilation_counter, mock_envs,
|
||||
mock_current_platform, mock_get_forward_context,
|
||||
mock_validate_cudagraph_capturing_enabled, mock_torch):
|
||||
"""Test __call__ method captures graph with weak_ref_output option enabled"""
|
||||
mock_envs.VLLM_LOGGING_LEVEL = "INFO"
|
||||
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
|
||||
mock_get_forward_context.return_value = self.mock_forward_context
|
||||
self.mock_forward_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
|
||||
|
||||
# Enable weak_ref_output option
|
||||
self.mock_cudagraph_options.weak_ref_output = True
|
||||
|
||||
# Mock torch.npu.NPUGraph
|
||||
mock_npu_graph = MagicMock()
|
||||
mock_torch.npu.NPUGraph.return_value = mock_npu_graph
|
||||
|
||||
# Mock torch.npu.graph context manager
|
||||
mock_graph_context = MagicMock()
|
||||
mock_torch.npu.graph.return_value = mock_graph_context
|
||||
mock_graph_context.__enter__ = Mock(return_value=None)
|
||||
mock_graph_context.__exit__ = Mock(return_value=None)
|
||||
|
||||
# Mock weak_ref_tensors to simulate the actual behavior:
|
||||
# 1. First call (inside the graph context with weak_ref_output=True) should return "weak_ref_output"
|
||||
# 2. Second call (for entry.output) should return "weak_ref_output"
|
||||
mock_weak_ref_tensors.side_effect = [
|
||||
"weak_ref_output", "weak_ref_output"
|
||||
]
|
||||
|
||||
# Ensure torch.Tensor can be correctly identified by isinstance
|
||||
mock_torch.Tensor = torch.Tensor
|
||||
|
||||
# Set up the compilation counter mock
|
||||
mock_compilation_counter.num_cudagraph_captured = 0
|
||||
|
||||
wrapper = ACLGraphWrapper(
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# Create a real torch tensor for the test, not a mock
|
||||
test_tensor = torch.tensor([1, 2, 3])
|
||||
|
||||
# Call the wrapper
|
||||
result = wrapper(test_tensor, "arg2")
|
||||
|
||||
# Verify weak_ref_tensors was called twice (once for inner output, once for final output)
|
||||
self.assertEqual(mock_weak_ref_tensors.call_count, 2)
|
||||
|
||||
# Verify graph capture happened
|
||||
mock_validate_cudagraph_capturing_enabled.assert_called_once()
|
||||
mock_torch.npu.NPUGraph.assert_called_once()
|
||||
mock_torch.npu.graph.assert_called_once_with(mock_npu_graph,
|
||||
pool=self.mock_graph_pool)
|
||||
|
||||
# Should return the weak ref output when weak_ref_output option is enabled
|
||||
self.assertEqual(result, "weak_ref_output")
|
||||
|
||||
@patch('vllm_ascend.compilation.acl_graph.get_forward_context')
|
||||
@patch('vllm_ascend.compilation.acl_graph.current_platform')
|
||||
@patch('vllm_ascend.compilation.acl_graph.envs')
|
||||
@patch('vllm_ascend.compilation.acl_graph.logger')
|
||||
def test_call_capture_graph_with_debug_log(self, mock_logger, mock_envs,
|
||||
mock_current_platform,
|
||||
mock_get_forward_context):
|
||||
"""Test __call__ method captures graph with debug logging enabled"""
|
||||
mock_envs.VLLM_LOGGING_LEVEL = "INFO"
|
||||
mock_current_platform.get_global_graph_pool.return_value = self.mock_graph_pool
|
||||
mock_get_forward_context.return_value = self.mock_forward_context
|
||||
self.mock_forward_context.cudagraph_runtime_mode = CUDAGraphMode.FULL
|
||||
|
||||
# Enable debug logging
|
||||
self.mock_cudagraph_options.debug_log_enable = True
|
||||
# weak_ref_output is not enabled by default
|
||||
|
||||
# Mock torch
|
||||
with patch('vllm_ascend.compilation.acl_graph.torch') as mock_torch:
|
||||
# Mock torch.npu.NPUGraph
|
||||
mock_npu_graph = MagicMock()
|
||||
mock_torch.npu.NPUGraph.return_value = mock_npu_graph
|
||||
|
||||
# Mock torch.npu.graph context manager
|
||||
mock_graph_context = MagicMock()
|
||||
mock_torch.npu.graph.return_value = mock_graph_context
|
||||
mock_graph_context.__enter__ = Mock(return_value=None)
|
||||
mock_graph_context.__exit__ = Mock(return_value=None)
|
||||
|
||||
# Ensure torch.Tensor can be correctly identified by isinstance
|
||||
mock_torch.Tensor = torch.Tensor
|
||||
|
||||
# Mock weak_ref_tensors
|
||||
with patch('vllm_ascend.compilation.acl_graph.weak_ref_tensors'
|
||||
) as mock_weak_ref_tensors:
|
||||
# Mock weak_ref_tensors to simulate the actual behavior:
|
||||
# 1. First call (inside the graph context) should return "inner_output"
|
||||
# 2. Second call (for entry.output) should return "weak_ref_output"
|
||||
mock_weak_ref_tensors.side_effect = [
|
||||
"inner_output", "weak_ref_output"
|
||||
]
|
||||
|
||||
# Mock validate_cudagraph_capturing_enabled
|
||||
with patch(
|
||||
'vllm_ascend.compilation.acl_graph.validate_cudagraph_capturing_enabled'
|
||||
):
|
||||
wrapper = ACLGraphWrapper(
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# Create a real torch tensor for the test, not a mock
|
||||
test_tensor = torch.tensor([1, 2, 3])
|
||||
|
||||
# Call the wrapper
|
||||
_ = wrapper(test_tensor, "arg2")
|
||||
|
||||
# Verify debug log was called
|
||||
mock_logger.debug.assert_called_once()
|
||||
|
||||
def test_getattr_access_runnable_attributes(self):
|
||||
"""Test __getattr__ method accesses runnable attributes"""
|
||||
mock_runnable = MagicMock()
|
||||
mock_runnable.test_attr = "test_value"
|
||||
|
||||
wrapper = ACLGraphWrapper(
|
||||
runnable=mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# Should be able to access attributes of the runnable
|
||||
self.assertEqual(wrapper.test_attr, "test_value")
|
||||
|
||||
def test_getattr_attribute_not_exists(self):
|
||||
"""Test __getattr__ method raises AttributeError for non-existent attributes"""
|
||||
|
||||
# Create a simple object without any attributes
|
||||
class EmptyRunnable:
|
||||
pass
|
||||
|
||||
mock_runnable = EmptyRunnable()
|
||||
|
||||
wrapper = ACLGraphWrapper(
|
||||
runnable=mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
# Should raise AttributeError for non-existent attributes
|
||||
with self.assertRaises(AttributeError) as context:
|
||||
_ = wrapper.non_existent_attr
|
||||
|
||||
self.assertIn("Attribute non_existent_attr not exists",
|
||||
str(context.exception))
|
||||
|
||||
def test_unwrap_method(self):
|
||||
"""Test unwrap method returns the original runnable"""
|
||||
wrapper = ACLGraphWrapper(
|
||||
runnable=self.mock_runnable,
|
||||
vllm_config=self.mock_vllm_config,
|
||||
runtime_mode=CUDAGraphMode.FULL,
|
||||
graph_pool=self.mock_graph_pool,
|
||||
cudagraph_options=self.mock_cudagraph_options)
|
||||
|
||||
unwrapped = wrapper.unwrap()
|
||||
self.assertEqual(unwrapped, self.mock_runnable)
|
||||
@@ -27,7 +27,6 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
max_model_len=8192,
|
||||
is_multimodal_model=False,
|
||||
send_delta_data=False,
|
||||
scheduler_delay_factor=0,
|
||||
)
|
||||
|
||||
def test_initialize_from_config_with_default(self):
|
||||
@@ -36,7 +35,6 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
self.basic_scheduler_config, {})
|
||||
self.assertEqual(ascend_config.enable_chunked_prefill, False)
|
||||
self.assertEqual(ascend_config.policy, "fcfs")
|
||||
self.assertEqual(ascend_config.num_scheduler_steps, 1)
|
||||
self.assertEqual(ascend_config.scheduler_cls,
|
||||
"vllm_ascend.core.scheduler.AscendScheduler")
|
||||
self.assertEqual(ascend_config.max_num_encoder_input_tokens, 8192)
|
||||
@@ -49,19 +47,21 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
AscendSchedulerConfig(
|
||||
enable_chunked_prefill=False,
|
||||
policy="fcfs",
|
||||
num_scheduler_steps=1,
|
||||
scheduler_cls="vllm_ascend.core.scheduler.AscendScheduler",
|
||||
max_num_batched_tokens=2048,
|
||||
max_model_len=2048,
|
||||
max_long_partial_prefills=1,
|
||||
long_prefill_token_threshold=512,
|
||||
),
|
||||
)
|
||||
self.assertEqual(ascend_config.enable_chunked_prefill, False)
|
||||
self.assertEqual(ascend_config.policy, "fcfs")
|
||||
self.assertEqual(ascend_config.num_scheduler_steps, 1)
|
||||
self.assertEqual(ascend_config.scheduler_cls,
|
||||
"vllm_ascend.core.scheduler.AscendScheduler")
|
||||
self.assertEqual(ascend_config.max_num_batched_tokens, 2048)
|
||||
self.assertEqual(ascend_config.encoder_cache_size, 2048)
|
||||
self.assertEqual(ascend_config.max_long_partial_prefills, 1)
|
||||
self.assertEqual(ascend_config.long_prefill_token_threshold, 512)
|
||||
|
||||
def test_not_implemented_policy(self):
|
||||
with self.assertRaises(NotImplementedError) as context:
|
||||
@@ -78,28 +78,6 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
str(context.exception),
|
||||
)
|
||||
|
||||
def test_not_implemented_multimodal(self):
|
||||
with self.assertRaises(NotImplementedError) as context:
|
||||
AscendSchedulerConfig.initialize_from_config(
|
||||
SchedulerConfig(is_multimodal_model=True), {})
|
||||
self.assertIn("currently AscendScheduler only supports LLM models",
|
||||
str(context.exception))
|
||||
|
||||
def test_not_implemented_multi_step(self):
|
||||
with self.assertRaises(NotImplementedError) as context:
|
||||
AscendSchedulerConfig.initialize_from_config(
|
||||
self.basic_scheduler_config,
|
||||
AscendSchedulerConfig(
|
||||
num_scheduler_steps=2,
|
||||
max_num_batched_tokens=2048,
|
||||
max_model_len=2048,
|
||||
),
|
||||
)
|
||||
self.assertIn(
|
||||
"currently AscendScheduler doesn't support multi-step",
|
||||
str(context.exception),
|
||||
)
|
||||
|
||||
def test_not_implemented_send_delta_data(self):
|
||||
with self.assertRaises(NotImplementedError) as context:
|
||||
AscendSchedulerConfig.initialize_from_config(
|
||||
@@ -115,27 +93,17 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
str(context.exception),
|
||||
)
|
||||
|
||||
def test_not_implemented_delay_factor(self):
|
||||
with self.assertRaises(NotImplementedError) as context:
|
||||
AscendSchedulerConfig.initialize_from_config(
|
||||
self.basic_scheduler_config,
|
||||
AscendSchedulerConfig(
|
||||
delay_factor=1,
|
||||
max_num_batched_tokens=2048,
|
||||
max_model_len=2048,
|
||||
),
|
||||
)
|
||||
self.assertIn(
|
||||
"currently AscendScheduler doesn't support scheduler_delay_factor",
|
||||
str(context.exception),
|
||||
)
|
||||
|
||||
def test_no_override(self):
|
||||
ascend_config = AscendSchedulerConfig.initialize_from_config(
|
||||
self.basic_scheduler_config, {})
|
||||
self.assertEqual(ascend_config.max_num_encoder_input_tokens, 8192)
|
||||
self.assertEqual(ascend_config.encoder_cache_size, 8192)
|
||||
|
||||
def test_valid_config_with_multimodal(self):
|
||||
config = AscendSchedulerConfig.initialize_from_config(
|
||||
SchedulerConfig(is_multimodal_model=True), {})
|
||||
self.assertTrue(config.is_multimodal_model)
|
||||
|
||||
def test_valid_config_with_chunked_prefill(self):
|
||||
ascend_config = AscendSchedulerConfig.initialize_from_config(
|
||||
self.basic_scheduler_config,
|
||||
@@ -165,3 +133,16 @@ class TestAscendSchedulerConfig(TestBase):
|
||||
)
|
||||
self.assertIn("max_num_batched_tokens (2048)", str(context.exception))
|
||||
self.assertIn("max_model_len (4096)", str(context.exception))
|
||||
|
||||
def test_initialize_from_config_with_pd_transfer(self):
|
||||
ascend_config = AscendSchedulerConfig.initialize_from_config(
|
||||
self.basic_scheduler_config,
|
||||
AscendSchedulerConfig(
|
||||
enable_pd_transfer=True,
|
||||
decode_max_num_seqs=48,
|
||||
max_num_batched_tokens=4096,
|
||||
max_model_len=4096,
|
||||
),
|
||||
)
|
||||
self.assertEqual(ascend_config.enable_pd_transfer, True)
|
||||
self.assertEqual(ascend_config.decode_max_num_seqs, 48)
|
||||
|
||||
@@ -6,25 +6,21 @@ from unittest.mock import MagicMock, patch
|
||||
import torch
|
||||
from vllm.config import (CacheConfig, KVTransferConfig, ModelConfig,
|
||||
SchedulerConfig, SpeculativeConfig, VllmConfig)
|
||||
from vllm.multimodal.inputs import PlaceholderRange
|
||||
from vllm.multimodal.inputs import (MultiModalFeatureSpec,
|
||||
MultiModalKwargsItem, PlaceholderRange)
|
||||
from vllm.sampling_params import SamplingParams
|
||||
from vllm.utils import sha256
|
||||
from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
|
||||
init_none_hash)
|
||||
from vllm.v1.core.sched.output import SchedulerOutput
|
||||
from vllm.v1.kv_cache_interface import (FullAttentionSpec, KVCacheConfig,
|
||||
KVCacheGroupSpec)
|
||||
from vllm.v1.outputs import ModelRunnerOutput
|
||||
from vllm.v1.outputs import DraftTokenIds, ModelRunnerOutput
|
||||
from vllm.v1.request import Request, RequestStatus
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
|
||||
from tests.ut.base import TestBase
|
||||
from vllm_ascend.core.scheduler import AscendScheduler
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
|
||||
from vllm.v1.outputs import DraftTokenIds
|
||||
else:
|
||||
DraftTokenIds = None
|
||||
|
||||
EOS_TOKEN_ID = 50256
|
||||
MODEL = "Qwen3-0.6B"
|
||||
@@ -44,7 +40,7 @@ def create_requests(
|
||||
max_tokens: int = 16,
|
||||
stop_token_ids: Optional[list[int]] = None,
|
||||
block_size: int = 3,
|
||||
hash_fn=hash,
|
||||
hash_fn=sha256,
|
||||
):
|
||||
init_none_hash(hash_fn)
|
||||
prompt_logprobs = PROMPT_LOGPROBS
|
||||
@@ -54,25 +50,25 @@ def create_requests(
|
||||
prompt_logprobs=prompt_logprobs)
|
||||
requests = []
|
||||
for i in range(num_requests):
|
||||
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
|
||||
request = Request(request_id=f"{i}",
|
||||
prompt_token_ids=[i] * num_tokens,
|
||||
sampling_params=sampling_params,
|
||||
multi_modal_kwargs=None,
|
||||
multi_modal_placeholders=None,
|
||||
multi_modal_hashes=None,
|
||||
eos_token_id=EOS_TOKEN_ID,
|
||||
pooling_params=None,
|
||||
block_hasher=get_request_block_hasher(
|
||||
block_size, hash_fn))
|
||||
else:
|
||||
request = Request(request_id=f"{i}",
|
||||
prompt_token_ids=[i] * num_tokens,
|
||||
sampling_params=sampling_params,
|
||||
eos_token_id=EOS_TOKEN_ID,
|
||||
pooling_params=None,
|
||||
block_hasher=get_request_block_hasher(
|
||||
block_size, hash_fn))
|
||||
mm_features = []
|
||||
if mm_positions is not None:
|
||||
mm_position = mm_positions[i]
|
||||
for j, position in enumerate(mm_position):
|
||||
identifier = f"hash{i}_{j}"
|
||||
mm_feature = MultiModalFeatureSpec(
|
||||
data=MultiModalKwargsItem.dummy("dummy_m"),
|
||||
mm_position=position,
|
||||
identifier=identifier,
|
||||
modality="image")
|
||||
mm_features.append(mm_feature)
|
||||
request = Request(request_id=f"{i}",
|
||||
prompt_token_ids=[i] * num_tokens,
|
||||
sampling_params=sampling_params,
|
||||
eos_token_id=EOS_TOKEN_ID,
|
||||
pooling_params=None,
|
||||
mm_features=mm_features if mm_features else None,
|
||||
block_hasher=get_request_block_hasher(
|
||||
block_size, hash_fn))
|
||||
requests.append(request)
|
||||
return requests
|
||||
|
||||
@@ -85,25 +81,15 @@ def make_output(scheduler):
|
||||
}
|
||||
sampled_token_ids = [[1000]] * len(scheduler.running)
|
||||
logprobs = None
|
||||
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
|
||||
modelrunner_output = ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index=req_id_to_index,
|
||||
sampled_token_ids=sampled_token_ids,
|
||||
spec_token_ids=None,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[],
|
||||
)
|
||||
else:
|
||||
modelrunner_output = ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index=req_id_to_index,
|
||||
sampled_token_ids=sampled_token_ids,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[],
|
||||
)
|
||||
|
||||
modelrunner_output = ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index=req_id_to_index,
|
||||
sampled_token_ids=sampled_token_ids,
|
||||
logprobs=logprobs,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[],
|
||||
)
|
||||
return modelrunner_output
|
||||
|
||||
|
||||
@@ -113,7 +99,7 @@ class TestAscendScheduler(TestBase):
|
||||
@patch("vllm.config.VllmConfig.__post_init__", MagicMock())
|
||||
@patch('vllm.v1.core.sched.scheduler.compute_encoder_budget')
|
||||
def create_scheduler(self, mock_compute_encoder_budget):
|
||||
mock_compute_encoder_budget.return_value = [10, 20]
|
||||
mock_compute_encoder_budget.return_value = [100, 100]
|
||||
use_kv_connector = False
|
||||
block_size = 16
|
||||
|
||||
@@ -235,7 +221,7 @@ class TestAscendScheduler(TestBase):
|
||||
len(requests) - i - 1)
|
||||
|
||||
def test_schedule(self):
|
||||
'''Test scheduling.
|
||||
'''Test scheduling.
|
||||
Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs
|
||||
'''
|
||||
scheduler = self.create_scheduler()
|
||||
@@ -260,6 +246,60 @@ class TestAscendScheduler(TestBase):
|
||||
for i, request in enumerate(requests):
|
||||
self.assertEqual(scheduler.running[i], request)
|
||||
|
||||
def test_schedule_multimodal_requests(self):
|
||||
scheduler = self.create_scheduler()
|
||||
scheduler.scheduler_config.chunked_prefill_enabled = False
|
||||
mm_positions = [[PlaceholderRange(offset=i, length=10)]
|
||||
for i in range(10)]
|
||||
requests = create_requests(
|
||||
num_requests=10,
|
||||
mm_positions=mm_positions,
|
||||
)
|
||||
for request in requests:
|
||||
scheduler.add_request(request)
|
||||
|
||||
output = scheduler.schedule()
|
||||
self.assertEqual(len(output.scheduled_new_reqs), len(requests))
|
||||
self.assertEqual(output.scheduled_cached_reqs.num_reqs, 0)
|
||||
self.assertEqual(len(output.finished_req_ids), 0)
|
||||
for req_id, num_tokens in output.num_scheduled_tokens.items():
|
||||
assert num_tokens == len(requests[int(req_id)].prompt_token_ids)
|
||||
|
||||
# Verify all requests are scheduled.
|
||||
for req_id, num_tokens in output.num_scheduled_tokens.items():
|
||||
self.assertEqual(num_tokens,
|
||||
len(requests[int(req_id)].prompt_token_ids))
|
||||
self.assertEqual(len(output.scheduled_encoder_inputs), len(requests))
|
||||
for req_id, encoder_input in output.scheduled_encoder_inputs.items():
|
||||
assert len(encoder_input) == 1
|
||||
|
||||
# Verify requests moved from waiting to running
|
||||
self.assertEqual(len(scheduler.waiting), 0)
|
||||
self.assertEqual(len(scheduler.running), len(requests))
|
||||
for i, request in enumerate(requests):
|
||||
self.assertEqual(scheduler.running[i], request)
|
||||
|
||||
def test_concurrent_partial_prefills_schedule(self):
|
||||
'''Test concurrent partial prefills scheduling.
|
||||
total requests = 10, every request has 10 token.
|
||||
while set long_prefill_token_threshold = 1, scheduler can
|
||||
only schedule max_long_partial_prefills long request.
|
||||
'''
|
||||
scheduler = self.create_scheduler()
|
||||
scheduler.scheduler_config.chunked_prefill_enabled = False
|
||||
scheduler.scheduler_config.max_long_partial_prefills = 2
|
||||
scheduler.scheduler_config.long_prefill_token_threshold = 1
|
||||
requests = create_requests(num_requests=10, num_tokens=20)
|
||||
for request in requests:
|
||||
scheduler.add_request(request)
|
||||
|
||||
# Test initial scheduling
|
||||
output = scheduler.schedule()
|
||||
self.assertEqual(len(output.scheduled_new_reqs),
|
||||
scheduler.scheduler_config.max_long_partial_prefills)
|
||||
self.assertEqual(output.scheduled_cached_reqs.num_reqs, 0)
|
||||
self.assertEqual(len(output.finished_req_ids), 0)
|
||||
|
||||
def test_schedule_enable_prefix_caching(self):
|
||||
'''Test scheduling.
|
||||
Two cases: default APC/no prompt logprobs; APC=True + prompt logprobs
|
||||
@@ -304,69 +344,34 @@ class TestAscendScheduler(TestBase):
|
||||
scheduler.running.append(req)
|
||||
req.status = RequestStatus.RUNNING
|
||||
|
||||
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
|
||||
scheduler_output = SchedulerOutput(
|
||||
scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
num_scheduled_tokens={
|
||||
requests[0].request_id: 1,
|
||||
requests[1].request_id: 2
|
||||
},
|
||||
total_num_scheduled_tokens=3,
|
||||
scheduled_encoder_inputs={},
|
||||
scheduled_spec_decode_tokens={
|
||||
requests[0].request_id: [],
|
||||
requests[1].request_id: [10]
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_input_ids=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[EOS_TOKEN_ID], [
|
||||
10, 11
|
||||
]], # First request hits EOS, second continues
|
||||
spec_token_ids=None,
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
else:
|
||||
scheduler_output = SchedulerOutput(
|
||||
scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
num_scheduled_tokens={
|
||||
requests[0].request_id: 1,
|
||||
requests[1].request_id: 2
|
||||
},
|
||||
total_num_scheduled_tokens=3,
|
||||
scheduled_encoder_inputs={},
|
||||
scheduled_spec_decode_tokens={
|
||||
requests[0].request_id: [],
|
||||
requests[1].request_id: [10]
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[EOS_TOKEN_ID], [
|
||||
10, 11
|
||||
]], # First request hits EOS, second continues
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
num_scheduled_tokens={
|
||||
requests[0].request_id: 1,
|
||||
requests[1].request_id: 2
|
||||
},
|
||||
total_num_scheduled_tokens=3,
|
||||
scheduled_encoder_inputs={},
|
||||
scheduled_spec_decode_tokens={
|
||||
requests[0].request_id: [],
|
||||
requests[1].request_id: [10]
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[EOS_TOKEN_ID], [10, 11]
|
||||
], # First request hits EOS, second continues
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
|
||||
scheduler.update_from_output(scheduler_output, model_output)
|
||||
|
||||
@@ -391,67 +396,35 @@ class TestAscendScheduler(TestBase):
|
||||
scheduler.running.append(req)
|
||||
req.status = RequestStatus.RUNNING
|
||||
|
||||
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
|
||||
scheduler_output = SchedulerOutput(
|
||||
scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
num_scheduled_tokens={
|
||||
requests[0].request_id: 3,
|
||||
requests[1].request_id: 2
|
||||
},
|
||||
total_num_scheduled_tokens=5,
|
||||
scheduled_encoder_inputs={},
|
||||
scheduled_spec_decode_tokens={
|
||||
requests[0].request_id: [10, 42],
|
||||
requests[1].request_id: [13]
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_input_ids=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[10, 42, 12],
|
||||
[13, 14]], # First request hits stop token
|
||||
spec_token_ids=None,
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
else:
|
||||
scheduler_output = SchedulerOutput(
|
||||
scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
num_scheduled_tokens={
|
||||
requests[0].request_id: 3,
|
||||
requests[1].request_id: 2
|
||||
},
|
||||
total_num_scheduled_tokens=5,
|
||||
scheduled_encoder_inputs={},
|
||||
scheduled_spec_decode_tokens={
|
||||
requests[0].request_id: [10, 42],
|
||||
requests[1].request_id: [13]
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[10, 42, 12],
|
||||
[13, 14]], # First request hits stop token
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
num_scheduled_tokens={
|
||||
requests[0].request_id: 3,
|
||||
requests[1].request_id: 2
|
||||
},
|
||||
total_num_scheduled_tokens=5,
|
||||
scheduled_encoder_inputs={},
|
||||
scheduled_spec_decode_tokens={
|
||||
requests[0].request_id:
|
||||
[10, 42],
|
||||
requests[1].request_id: [13]
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[10, 42, 12],
|
||||
[13, 14]], # First request hits stop token
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
|
||||
scheduler.update_from_output(scheduler_output, model_output)
|
||||
|
||||
@@ -475,67 +448,35 @@ class TestAscendScheduler(TestBase):
|
||||
scheduler.running.append(req)
|
||||
req.status = RequestStatus.RUNNING
|
||||
|
||||
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
|
||||
scheduler_output = SchedulerOutput(
|
||||
scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
num_scheduled_tokens={
|
||||
requests[0].request_id: 3,
|
||||
requests[1].request_id: 1
|
||||
},
|
||||
total_num_scheduled_tokens=4,
|
||||
scheduled_encoder_inputs={},
|
||||
scheduled_spec_decode_tokens={
|
||||
requests[0].request_id: [10, 11],
|
||||
requests[1].request_id: []
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_input_ids=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[10, 11, 12],
|
||||
[13]], # First request exceeds max_tokens
|
||||
spec_token_ids=None,
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
else:
|
||||
scheduler_output = SchedulerOutput(
|
||||
scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
num_scheduled_tokens={
|
||||
requests[0].request_id: 3,
|
||||
requests[1].request_id: 1
|
||||
},
|
||||
total_num_scheduled_tokens=4,
|
||||
scheduled_encoder_inputs={},
|
||||
scheduled_spec_decode_tokens={
|
||||
requests[0].request_id: [10, 11],
|
||||
requests[1].request_id: []
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[10, 11, 12],
|
||||
[13]], # First request exceeds max_tokens
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
scheduler_output = SchedulerOutput(scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
num_scheduled_tokens={
|
||||
requests[0].request_id: 3,
|
||||
requests[1].request_id: 1
|
||||
},
|
||||
total_num_scheduled_tokens=4,
|
||||
scheduled_encoder_inputs={},
|
||||
scheduled_spec_decode_tokens={
|
||||
requests[0].request_id:
|
||||
[10, 11],
|
||||
requests[1].request_id: []
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[req.request_id for req in requests],
|
||||
req_id_to_index={
|
||||
req.request_id: i
|
||||
for i, req in enumerate(requests)
|
||||
},
|
||||
sampled_token_ids=[[10, 11, 12],
|
||||
[13]], # First request exceeds max_tokens
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
scheduler.update_from_output(scheduler_output, model_output)
|
||||
|
||||
# Verify first request stopped due to length
|
||||
@@ -556,52 +497,27 @@ class TestAscendScheduler(TestBase):
|
||||
scheduler.requests[requests[0].request_id] = requests[0]
|
||||
scheduler.running.append(requests[0])
|
||||
|
||||
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
|
||||
scheduler_output = SchedulerOutput(
|
||||
scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
num_scheduled_tokens={requests[0].request_id: 3},
|
||||
total_num_scheduled_tokens=3,
|
||||
scheduled_encoder_inputs={},
|
||||
scheduled_spec_decode_tokens={
|
||||
requests[0].request_id: [EOS_TOKEN_ID, 10]
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_input_ids=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[requests[0].request_id],
|
||||
req_id_to_index={requests[0].request_id: 0},
|
||||
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
|
||||
spec_token_ids=None,
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
|
||||
else:
|
||||
scheduler_output = SchedulerOutput(
|
||||
scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
num_scheduled_tokens={requests[0].request_id: 3},
|
||||
total_num_scheduled_tokens=3,
|
||||
scheduled_encoder_inputs={},
|
||||
scheduled_spec_decode_tokens={
|
||||
requests[0].request_id: [EOS_TOKEN_ID, 10]
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[requests[0].request_id],
|
||||
req_id_to_index={requests[0].request_id: 0},
|
||||
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
scheduler_output = SchedulerOutput(
|
||||
scheduled_new_reqs=[],
|
||||
scheduled_cached_reqs=[],
|
||||
num_scheduled_tokens={requests[0].request_id: 3},
|
||||
total_num_scheduled_tokens=3,
|
||||
scheduled_encoder_inputs={},
|
||||
scheduled_spec_decode_tokens={
|
||||
requests[0].request_id: [EOS_TOKEN_ID, 10]
|
||||
},
|
||||
num_common_prefix_blocks=0,
|
||||
finished_req_ids=set(),
|
||||
free_encoder_mm_hashes=[],
|
||||
structured_output_request_ids={},
|
||||
grammar_bitmask=None)
|
||||
model_output = ModelRunnerOutput(
|
||||
req_ids=[requests[0].request_id],
|
||||
req_id_to_index={requests[0].request_id: 0},
|
||||
sampled_token_ids=[[EOS_TOKEN_ID, 10, 11]],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
|
||||
scheduler.update_from_output(scheduler_output, model_output)
|
||||
|
||||
@@ -652,23 +568,13 @@ class TestAscendScheduler(TestBase):
|
||||
512)
|
||||
|
||||
# Model output of the first request.
|
||||
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=[requests[0].request_id],
|
||||
req_id_to_index={requests[0].request_id: 0},
|
||||
sampled_token_ids=[[0]],
|
||||
spec_token_ids=None,
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
else:
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=[requests[0].request_id],
|
||||
req_id_to_index={requests[0].request_id: 0},
|
||||
sampled_token_ids=[[0]],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=[requests[0].request_id],
|
||||
req_id_to_index={requests[0].request_id: 0},
|
||||
sampled_token_ids=[[0]],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
|
||||
scheduler.update_from_output(scheduler_output0,
|
||||
model_runner_output)
|
||||
@@ -678,23 +584,13 @@ class TestAscendScheduler(TestBase):
|
||||
# request is still running.
|
||||
scheduler.schedule()
|
||||
# Model output of the second request.
|
||||
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=[requests[1].request_id],
|
||||
req_id_to_index={requests[1].request_id: 0},
|
||||
sampled_token_ids=[[0]],
|
||||
spec_token_ids=None,
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
else:
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=[requests[1].request_id],
|
||||
req_id_to_index={requests[1].request_id: 0},
|
||||
sampled_token_ids=[[0]],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=[requests[1].request_id],
|
||||
req_id_to_index={requests[1].request_id: 0},
|
||||
sampled_token_ids=[[0]],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
|
||||
scheduler.update_from_output(scheduler_output1,
|
||||
model_runner_output)
|
||||
@@ -746,29 +642,19 @@ class TestAscendScheduler(TestBase):
|
||||
req_id = requests[i].request_id
|
||||
self.assertEqual(output.num_scheduled_tokens[req_id], 1)
|
||||
self.assertNotIn(req_id, output.scheduled_spec_decode_tokens)
|
||||
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index=req_to_index,
|
||||
sampled_token_ids=[[0] for _ in range(len(requests))],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
spec_token_ids=spec_tokens,
|
||||
pooler_output=[])
|
||||
else:
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index=req_to_index,
|
||||
sampled_token_ids=[[0] for _ in range(len(requests))],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
draft_token_ids = DraftTokenIds(req_ids, spec_tokens)
|
||||
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index=req_to_index,
|
||||
sampled_token_ids=[[0] for _ in range(len(requests))],
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
draft_token_ids = DraftTokenIds(req_ids, spec_tokens)
|
||||
|
||||
engine_core_outputs = scheduler.update_from_output(
|
||||
output, model_runner_output)
|
||||
if not (vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1")):
|
||||
scheduler.update_draft_token_ids(draft_token_ids)
|
||||
scheduler.update_draft_token_ids(draft_token_ids)
|
||||
|
||||
for i in range(len(requests)):
|
||||
running_req = scheduler.running[i]
|
||||
@@ -804,23 +690,14 @@ class TestAscendScheduler(TestBase):
|
||||
else:
|
||||
self.assertNotIn(req_id,
|
||||
output.scheduled_spec_decode_tokens)
|
||||
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index=req_to_index,
|
||||
sampled_token_ids=output_tokens,
|
||||
spec_token_ids=None,
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
else:
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index=req_to_index,
|
||||
sampled_token_ids=output_tokens,
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index=req_to_index,
|
||||
sampled_token_ids=output_tokens,
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[])
|
||||
|
||||
engine_core_outputs = scheduler.update_from_output(
|
||||
output, model_runner_output)
|
||||
@@ -896,3 +773,34 @@ class TestAscendScheduler(TestBase):
|
||||
|
||||
# Confirm no memory leak.
|
||||
self.assert_scheduler_empty(scheduler)
|
||||
|
||||
def test_scheduler_with_pd_transfer(self):
|
||||
scheduler = self.create_scheduler()
|
||||
scheduler.phase = "prefill"
|
||||
requests = create_requests(num_requests=32)
|
||||
for request in requests:
|
||||
scheduler.add_request(request)
|
||||
|
||||
# 1st iteration, move 16 requests from waiting to running for prefill
|
||||
scheduler_output = scheduler.schedule()
|
||||
model_runner_output = make_output(scheduler)
|
||||
scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||
first_iter_prefilled_req_num = len(scheduler.running)
|
||||
self.assertEqual(len(scheduler_output.scheduled_new_reqs),
|
||||
scheduler.max_num_running_reqs)
|
||||
self.assertEqual(scheduler_output.scheduled_cached_reqs.num_reqs, 0)
|
||||
self.assertEqual(len(scheduler_output.finished_req_ids), 0)
|
||||
|
||||
# 2nd iteration, move 16 prefilled requests to finished_prefill_reqs
|
||||
# and move 16 requests from waiting to running for prefill
|
||||
scheduler_output = scheduler.schedule()
|
||||
model_runner_output = make_output(scheduler)
|
||||
scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||
self.assertEqual(len(scheduler.finished_prefill_reqs),
|
||||
first_iter_prefilled_req_num)
|
||||
|
||||
# 3rd iteration, all requests prefilled, change scheduler phase to decode
|
||||
scheduler_output = scheduler.schedule()
|
||||
model_runner_output = make_output(scheduler)
|
||||
scheduler.update_from_output(scheduler_output, model_runner_output)
|
||||
self.assertEqual(scheduler.phase, "decode")
|
||||
|
||||
@@ -1,139 +0,0 @@
|
||||
#
|
||||
# Copyright (c) 2025 Huawei Technologies Co., Ltd. All Rights Reserved.
|
||||
# Copyright 2023 The vLLM team.
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# This file is a part of the vllm-ascend project.
|
||||
|
||||
import importlib
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from pytest_mock import MockerFixture
|
||||
|
||||
from tests.ut.base import PytestBase
|
||||
from vllm_ascend.distributed.tensor_parallel import (
|
||||
_gather_along_first_dim, _gather_along_last_dim,
|
||||
_reduce_scatter_along_first_dim, _reduce_scatter_along_last_dim,
|
||||
all_to_all_hp2sp, all_to_all_sp2hp)
|
||||
|
||||
|
||||
class TestDistributedCommunication(PytestBase):
|
||||
|
||||
@pytest.fixture(autouse=True)
|
||||
def context(self, mocker: MockerFixture):
|
||||
mocker.patch("torch.npu.current_device", return_value="cpu")
|
||||
mocker.patch("torch.distributed.get_world_size", return_value=4)
|
||||
|
||||
mocker.patch("torch.distributed.get_rank", return_value=0)
|
||||
|
||||
@pytest.mark.parametrize("world_size, test_tensor, expected",
|
||||
[(1, torch.randn(8, 16), (8, 16)),
|
||||
(4, torch.randn(8, 16), (32, 16))])
|
||||
def test_gather_along_first_dim(self, test_tensor, expected, world_size,
|
||||
mocker: MockerFixture):
|
||||
"""test _gather_along_first_dim"""
|
||||
mocker.patch("torch.distributed.get_world_size",
|
||||
return_value=world_size)
|
||||
|
||||
result = _gather_along_first_dim(test_tensor, mocker.MagicMock())
|
||||
|
||||
assert result.shape == expected
|
||||
|
||||
@pytest.mark.parametrize("test_tensor, output_split_sizes, expected", [
|
||||
(torch.randn(8, 16), [5, 10, 15, 2], (32, 16)),
|
||||
])
|
||||
def test_gather_along_first_dim_unequal_split(self, test_tensor, expected,
|
||||
output_split_sizes,
|
||||
mocker: MockerFixture):
|
||||
"""test _gather_along_first_dim"""
|
||||
|
||||
result = _gather_along_first_dim(test_tensor, mocker.MagicMock(),
|
||||
output_split_sizes)
|
||||
|
||||
assert result.shape == expected
|
||||
|
||||
@pytest.mark.parametrize("world_size, test_tensor, expected",
|
||||
[(1, torch.randn(8, 16, 32), (8, 16, 32)),
|
||||
(4, torch.randn(8, 16, 32), (8, 16, 32 * 4))])
|
||||
def test_gather_along_last_dim(self, test_tensor, expected, world_size,
|
||||
mocker: MockerFixture):
|
||||
"""test _gather_along_last_dim"""
|
||||
mocker.patch("torch.distributed.get_world_size",
|
||||
return_value=world_size)
|
||||
|
||||
result = _gather_along_last_dim(test_tensor, mocker.MagicMock())
|
||||
|
||||
assert result.shape == expected
|
||||
|
||||
@pytest.mark.parametrize("input_shape,expected_shape", [
|
||||
((32, 16), (8, 16)),
|
||||
((40, 10), (10, 10)),
|
||||
])
|
||||
def test_reduce_scatter_along_first_dim(self, input_shape, expected_shape,
|
||||
mocker: MockerFixture):
|
||||
input_tensor = torch.randn(*input_shape)
|
||||
result = _reduce_scatter_along_first_dim(input_tensor,
|
||||
mocker.MagicMock())
|
||||
assert result.shape == expected_shape
|
||||
|
||||
@pytest.mark.parametrize("input_shape,expected_shape", [
|
||||
((8, 16, 32), (8, 16, 8)),
|
||||
])
|
||||
def test_reduce_scatter_along_last_dim(self, input_shape, expected_shape,
|
||||
mocker: MockerFixture):
|
||||
input_tensor = torch.randn(*input_shape)
|
||||
result = _reduce_scatter_along_last_dim(input_tensor,
|
||||
mocker.MagicMock())
|
||||
assert result.shape == expected_shape
|
||||
|
||||
@pytest.mark.parametrize("func,input_shape,expected_shape", [
|
||||
("all_gather_last_dim_from_tensor_parallel_region", (8, 16, 32),
|
||||
(8, 16, 128)),
|
||||
("reduce_scatter_to_sequence_parallel_region", (32, 16), (8, 16)),
|
||||
("reduce_scatter_last_dim_to_tensor_parallel_region", (8, 16, 32),
|
||||
(8, 16, 8)),
|
||||
("gather_from_sequence_parallel_region", (8, 16), (32, 16)),
|
||||
])
|
||||
def test_wrapper_functions(self, func, input_shape, expected_shape,
|
||||
mocker: MockerFixture):
|
||||
"""test wrapper funcs"""
|
||||
mod = importlib.import_module(
|
||||
'vllm_ascend.distributed.tensor_parallel')
|
||||
globals = mod.__dict__
|
||||
test_func = globals[func]
|
||||
input_tensor = torch.randn(*input_shape)
|
||||
result = test_func(input_tensor, mocker.MagicMock())
|
||||
assert result.shape == expected_shape
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"input_shape,output_shape",
|
||||
[
|
||||
((8, 16), (32, 4)), # [num_tokens/TP, H] -> [num_tokens, H/TP]
|
||||
])
|
||||
def test_all_to_all_sp2hp(self, input_shape, output_shape,
|
||||
mocker: MockerFixture):
|
||||
input_tensor = torch.randn(*input_shape)
|
||||
result = all_to_all_sp2hp(input_tensor, mocker.MagicMock())
|
||||
assert result.shape == output_shape
|
||||
|
||||
@pytest.mark.parametrize(
|
||||
"input_shape,output_shape",
|
||||
[
|
||||
((32, 4), (8, 16)), # [num_tokens, H/TP] -> [num_tokens/TP, H]
|
||||
])
|
||||
def test_all_to_all_hp2sp(self, input_shape, output_shape,
|
||||
mocker: MockerFixture):
|
||||
input_tensor = torch.randn(*input_shape)
|
||||
result = all_to_all_hp2sp(input_tensor, mocker.MagicMock())
|
||||
assert result.shape == output_shape
|
||||
@@ -4,8 +4,8 @@ import pytest
|
||||
from vllm.config import ParallelConfig
|
||||
|
||||
from vllm_ascend.distributed.parallel_state import (
|
||||
_LMTP, _MC2, destroy_ascend_model_parallel, get_lmhead_tp_group,
|
||||
get_mc2_group, init_ascend_model_parallel)
|
||||
_LMTP, _MC2, _OTP, destroy_ascend_model_parallel, get_lmhead_tp_group,
|
||||
get_mc2_group, get_otp_group, init_ascend_model_parallel)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
@@ -29,16 +29,20 @@ def mock_distributed():
|
||||
def test_init_ascend_model_parallel(mock_distributed, parallel_config):
|
||||
mock_ascend_config = MagicMock()
|
||||
mock_ascend_config.lmhead_tensor_parallel_size = 2
|
||||
mock_ascend_config.oproj_tensor_parallel_size = 2
|
||||
with patch('vllm_ascend.distributed.parallel_state.model_parallel_initialized', return_value=False), \
|
||||
patch('vllm_ascend.distributed.parallel_state.init_model_parallel_group'), \
|
||||
patch('vllm_ascend.distributed.parallel_state.get_ascend_config', return_value=mock_ascend_config):
|
||||
init_ascend_model_parallel(parallel_config)
|
||||
|
||||
mc2_group = get_mc2_group()
|
||||
assert mc2_group is not None
|
||||
lmheadtp_group = get_lmhead_tp_group()
|
||||
otp_group = get_otp_group()
|
||||
assert mc2_group is not None
|
||||
assert otp_group is not None
|
||||
assert lmheadtp_group is not None
|
||||
|
||||
destroy_ascend_model_parallel()
|
||||
assert _MC2 is None
|
||||
assert _LMTP is None
|
||||
assert _OTP is None
|
||||
|
||||
73
tests/ut/eplb/adaptor/test_abstract_adaptor.py
Normal file
73
tests/ut/eplb/adaptor/test_abstract_adaptor.py
Normal file
@@ -0,0 +1,73 @@
|
||||
import pytest
|
||||
|
||||
from vllm_ascend.eplb.adaptor.abstract_adaptor import EplbAdaptor
|
||||
|
||||
|
||||
class DummyAdaptor(EplbAdaptor):
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
super().__init__(**kwargs)
|
||||
self.args = kwargs
|
||||
|
||||
def get_rank_expert_workload(self):
|
||||
return "workload"
|
||||
|
||||
def get_init_expert_map(self, num_moe_layers):
|
||||
return {"layers": num_moe_layers}
|
||||
|
||||
def do_update_expert_map(self, layer_id, updated_expert_map):
|
||||
return {"layer_id": layer_id, "map": updated_expert_map}
|
||||
|
||||
def do_update_expert_weight(self, layer_id, local_expert_to_replace,
|
||||
buffer_tensor_id):
|
||||
return {
|
||||
"layer_id": layer_id,
|
||||
"replace": local_expert_to_replace,
|
||||
"buffer": buffer_tensor_id,
|
||||
}
|
||||
|
||||
|
||||
def test_base_class_methods_raise():
|
||||
adaptor = EplbAdaptor()
|
||||
with pytest.raises(NotImplementedError):
|
||||
adaptor.get_rank_expert_workload()
|
||||
with pytest.raises(NotImplementedError):
|
||||
adaptor.get_init_expert_map(1)
|
||||
with pytest.raises(NotImplementedError):
|
||||
adaptor.do_update_expert_map(1, {})
|
||||
with pytest.raises(NotImplementedError):
|
||||
adaptor.do_update_expert_weight(1, "x", "y")
|
||||
|
||||
|
||||
def test_dummy_adaptor_init_and_args():
|
||||
adaptor = DummyAdaptor(test_arg=123)
|
||||
assert adaptor.args["test_arg"] == 123
|
||||
|
||||
|
||||
def test_get_rank_expert_workload():
|
||||
adaptor = DummyAdaptor()
|
||||
result = adaptor.get_rank_expert_workload()
|
||||
assert result == "workload"
|
||||
|
||||
|
||||
def test_get_init_expert_map():
|
||||
adaptor = DummyAdaptor()
|
||||
result = adaptor.get_init_expert_map(5)
|
||||
assert isinstance(result, dict)
|
||||
assert result["layers"] == 5
|
||||
|
||||
|
||||
def test_do_update_expert_map():
|
||||
adaptor = DummyAdaptor()
|
||||
updated = {"expert": 1}
|
||||
result = adaptor.do_update_expert_map(2, updated)
|
||||
assert result["layer_id"] == 2
|
||||
assert result["map"] == updated
|
||||
|
||||
|
||||
def test_do_update_expert_weight():
|
||||
adaptor = DummyAdaptor()
|
||||
result = adaptor.do_update_expert_weight(1, "expertA", "bufferX")
|
||||
assert result["layer_id"] == 1
|
||||
assert result["replace"] == "expertA"
|
||||
assert result["buffer"] == "bufferX"
|
||||
31
tests/ut/eplb/core/policy/test_policy_abstract.py
Normal file
31
tests/ut/eplb/core/policy/test_policy_abstract.py
Normal file
@@ -0,0 +1,31 @@
|
||||
# test_policy_abstract.py
|
||||
from vllm_ascend.eplb.core.policy.policy_abstract import (DynamicConfig,
|
||||
EplbPolicy)
|
||||
|
||||
|
||||
class DummyPolicy(EplbPolicy):
|
||||
|
||||
def rebalance_experts(self, current_expert_table, expert_workload):
|
||||
return 1, current_expert_table
|
||||
|
||||
|
||||
def test_dynamic_config_attributes():
|
||||
config = DynamicConfig()
|
||||
assert config.placement_policy is None
|
||||
assert config.max_transferred_expert_per_layer == 100
|
||||
assert config.ep_worldsize == 64
|
||||
assert config.num_die_per_host == 8
|
||||
|
||||
|
||||
def test_eplb_policy_init_and_method():
|
||||
config = DynamicConfig()
|
||||
policy = DummyPolicy(config)
|
||||
|
||||
assert policy.config == config
|
||||
|
||||
expert_table = [[0, 1, 2]]
|
||||
workload = [10]
|
||||
res, new_table = policy.rebalance_experts(expert_table, workload)
|
||||
|
||||
assert res == 1
|
||||
assert new_table == expert_table
|
||||
98
tests/ut/eplb/core/policy/test_policy_dynamic_ep.py
Normal file
98
tests/ut/eplb/core/policy/test_policy_dynamic_ep.py
Normal file
@@ -0,0 +1,98 @@
|
||||
from unittest.mock import patch
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from vllm_ascend.eplb.core.policy.policy_dynamic_ep import DynamicEplb
|
||||
|
||||
|
||||
class TestDynamicEplb:
|
||||
|
||||
def test_add_redundant_basic(self):
|
||||
current_expert_table = np.array([[[0, 1], [1, 0]]])
|
||||
expert_workload = np.array([[[2, 3], [4, 1]]])
|
||||
num_original_expert = 2
|
||||
result = DynamicEplb.add_redundant(current_expert_table,
|
||||
expert_workload,
|
||||
num_original_expert)
|
||||
expected = np.array([[2 + 1, 3 + 4]])
|
||||
assert np.array_equal(result, expected)
|
||||
|
||||
def test_get_redundant_num(self):
|
||||
counts = np.array([2, 1, 3])
|
||||
assert DynamicEplb.get_redundant_num(3, counts) == 3
|
||||
|
||||
def test_calculate_max_heat_per_layer(self):
|
||||
workload_table = np.array([[[1, 2], [3, 4]], [[2, 2], [1, 1]]])
|
||||
max_heat = DynamicEplb.calculate_max_heat_per_layer(workload_table, 2)
|
||||
assert max_heat == [7, 4]
|
||||
|
||||
def test_constraint_expert_local_exchange(self):
|
||||
current = [[[0, 1], [2, 3]]]
|
||||
global_dep = [[[1, 0], [3, 2]]]
|
||||
new_dep = DynamicEplb.constraint_expert_local_exchange(
|
||||
current, global_dep)
|
||||
assert new_dep == [[[0, 1], [2, 3]]]
|
||||
|
||||
def test_compute_balanced_pack_redundancy_normal(self):
|
||||
origin_weights = [(0, 10), (1, 20)]
|
||||
result, boxes = DynamicEplb.compute_balanced_pack_redundancy(
|
||||
origin_weights, 2, 1)
|
||||
assert isinstance(result, list) and len(result) == 2
|
||||
|
||||
def test_compute_balanced_pack_redundancy_card0(self):
|
||||
origin_weights = [(0, 10)]
|
||||
with pytest.raises(RuntimeError):
|
||||
DynamicEplb.compute_balanced_pack_redundancy(origin_weights, 0, 0)
|
||||
|
||||
def test_compute_balanced_pack_normal(self):
|
||||
origin_weights = np.array([(0, 10), (1, 20)], dtype=object)
|
||||
result, boxes = DynamicEplb.compute_balanced_pack(origin_weights, 2)
|
||||
assert isinstance(result, list) and len(result) == 2
|
||||
|
||||
def test_compute_balanced_pack_card0(self):
|
||||
origin_weights = np.array([(0, 10)], dtype=object)
|
||||
with pytest.raises(RuntimeError):
|
||||
DynamicEplb.compute_balanced_pack(origin_weights, 0)
|
||||
|
||||
def test_original_compute_balanced_pack_redundancy(self):
|
||||
origin_weights = [(0, 5), (1, 10)]
|
||||
result, boxes = DynamicEplb.original_compute_balanced_pack_redundancy(
|
||||
origin_weights, 2, 1)
|
||||
assert isinstance(result, list) and len(result) == 2
|
||||
|
||||
def test_rebalance_experts_normal(self):
|
||||
expert_table = np.array([[[0, 1], [1, 0]]])
|
||||
workload = np.array([[[2, 3], [4, 1]]])
|
||||
policy = DynamicEplb(config=None)
|
||||
change, priority, new_dep = policy.rebalance_experts(
|
||||
expert_table, workload)
|
||||
assert change in [0, 1]
|
||||
assert isinstance(priority, np.ndarray)
|
||||
assert isinstance(new_dep, list)
|
||||
assert np.array(new_dep).shape == expert_table.shape
|
||||
|
||||
def test_rebalance_experts_exceptions(self):
|
||||
policy = DynamicEplb(config=None)
|
||||
|
||||
# case1: num_original_expert != expert_num
|
||||
expert_table = np.array([[[0, 1], [1, 0]]])
|
||||
workload = np.array([[[2, 3], [4, 1]]])
|
||||
with patch.object(DynamicEplb,
|
||||
'add_redundant',
|
||||
return_value=np.array([[1, 2, 3]])):
|
||||
with pytest.raises(ValueError):
|
||||
policy.rebalance_experts(expert_table, workload)
|
||||
|
||||
# case2: num_npus <= 0
|
||||
expert_table_zero = np.array([[]]) # 1 layer, 0 NPU, 0 experts
|
||||
workload_zero = np.array([[]])
|
||||
with pytest.raises(ValueError):
|
||||
policy.rebalance_experts(expert_table_zero, workload_zero)
|
||||
|
||||
# case3: num_npus < num_redundancy_expert
|
||||
expert_table_small = np.array([[[0, 0]]]) # 1 layer, 1 NPU, 2 experts
|
||||
workload_small = np.array([[[1, 1]]])
|
||||
with patch.object(DynamicEplb, 'get_redundant_num', return_value=2):
|
||||
with pytest.raises(ValueError):
|
||||
policy.rebalance_experts(expert_table_small, workload_small)
|
||||
99
tests/ut/eplb/core/policy/test_policy_dynamic_ep_v2.py
Normal file
99
tests/ut/eplb/core/policy/test_policy_dynamic_ep_v2.py
Normal file
@@ -0,0 +1,99 @@
|
||||
from typing import Dict, Set
|
||||
|
||||
import numpy as np
|
||||
import pytest
|
||||
|
||||
from vllm_ascend.eplb.core.policy.policy_dynamic_ep_v2 import (DynamicConfig,
|
||||
DynamicEplbV2)
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def config():
|
||||
return DynamicConfig()
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def policy(config):
|
||||
return DynamicEplbV2(config)
|
||||
|
||||
|
||||
def test_safe_operations(policy):
|
||||
# safe_divide
|
||||
assert policy.safe_divide(10, 2) == 5
|
||||
assert policy.safe_divide(1, 0) == 0
|
||||
|
||||
# safe_exact_divide
|
||||
assert policy.safe_exact_divide(10, 3) == 3
|
||||
assert policy.safe_exact_divide(1, 0) == 0
|
||||
|
||||
# safe_mod
|
||||
assert policy.safe_mod(10, 3) == 1
|
||||
assert policy.safe_mod(1, 0) == 0
|
||||
|
||||
|
||||
def test_add_redundant():
|
||||
workload = np.array([[[1, 2], [3, 4]]])
|
||||
placement = np.array([[[0, 1], [0, 1]]])
|
||||
result = DynamicEplbV2.add_redundant(placement, workload, 2)
|
||||
assert result.shape == (1, 2)
|
||||
assert np.all(result[0] == [4, 6]) # 0:1+3, 1:2+4
|
||||
|
||||
|
||||
def test_get_redundant_num():
|
||||
counts = np.array([1, 2, 1])
|
||||
assert DynamicEplbV2.get_redundant_num(3, counts) == 1 # sum(counts-1)
|
||||
|
||||
|
||||
def test_calculate_max_heat_per_layer():
|
||||
workload = np.array([[[1, 2], [3, 4]], [[5, 6], [7, 8]]])
|
||||
result = DynamicEplbV2.calculate_max_heat_per_layer(workload, 2)
|
||||
assert result == [7, 15]
|
||||
|
||||
|
||||
def test_calculate_initial_imbalance(policy):
|
||||
deployment = np.array([[[0, 1], [0, 1]]])
|
||||
workloads = np.array([[1, 1]])
|
||||
result = policy.calculate_initial_imbalance(deployment, workloads)
|
||||
assert isinstance(result, list)
|
||||
assert len(result) == 1
|
||||
|
||||
|
||||
def test_compute_redundant_assignments(policy):
|
||||
base_experts = [(0, 10), (1, 5)]
|
||||
redundant, sorted_weights = policy.compute_redundant_assignments(
|
||||
base_experts, num_redundant_experts=2, num_experts=2)
|
||||
assert len(redundant) == 2
|
||||
assert len(sorted_weights) == 2
|
||||
|
||||
|
||||
def test_prepare_expert_list():
|
||||
base_experts = [(0, 10), (1, 5)]
|
||||
redundant_assignments = [[2], []]
|
||||
result = DynamicEplbV2.prepare_expert_list(base_experts,
|
||||
redundant_assignments, 1)
|
||||
assert isinstance(result, list)
|
||||
assert len(result) == 1
|
||||
|
||||
|
||||
def test_non_redundant_expert_information():
|
||||
origin_deployment = np.array([[0, 1]])
|
||||
updated_weights = [(0, 10), (1, 5)]
|
||||
rendun_pos: Dict[int, Set[int]] = {0: set()}
|
||||
assignments, weights, loads, counts = DynamicEplbV2.non_redundant_expert_information(
|
||||
origin_deployment, updated_weights, rendun_pos)
|
||||
assert assignments[0] == [0, 1]
|
||||
assert loads[0] == 15
|
||||
|
||||
|
||||
def test_recomputing_initial_weight(policy):
|
||||
layer_workloads = [10, 5]
|
||||
device_assignments = [[0, 1]]
|
||||
cur_layer_workload, num_all_experts = policy.recomputing_initial_weight(
|
||||
layer_workloads, device_assignments)
|
||||
assert cur_layer_workload[0] == 10
|
||||
assert num_all_experts[0] == 1
|
||||
|
||||
|
||||
def test_safe_divide_zero_edge_case(policy):
|
||||
assert policy.safe_divide(0, 1) == 0
|
||||
assert policy.safe_divide(0, 5) == 0
|
||||
23
tests/ut/eplb/core/policy/test_policy_factor.py
Normal file
23
tests/ut/eplb/core/policy/test_policy_factor.py
Normal file
@@ -0,0 +1,23 @@
|
||||
import pytest
|
||||
|
||||
from vllm_ascend.eplb.core.policy.policy_abstract import DynamicConfig
|
||||
from vllm_ascend.eplb.core.policy.policy_dynamic_ep import DynamicEplb
|
||||
from vllm_ascend.eplb.core.policy.policy_dynamic_ep_v2 import DynamicEplbV2
|
||||
from vllm_ascend.eplb.core.policy.policy_factory import PolicyFactory
|
||||
from vllm_ascend.eplb.core.policy.policy_random import RandomLoadBalance
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def dummy_config():
|
||||
return DynamicConfig()
|
||||
|
||||
|
||||
@pytest.mark.parametrize("policy_type, expected_class", [
|
||||
(0, RandomLoadBalance),
|
||||
(1, DynamicEplb),
|
||||
(2, DynamicEplbV2),
|
||||
(999, RandomLoadBalance),
|
||||
])
|
||||
def test_generate_policy(policy_type, expected_class, dummy_config):
|
||||
policy_instance = PolicyFactory.generate_policy(policy_type, dummy_config)
|
||||
assert isinstance(policy_instance, expected_class)
|
||||
122
tests/ut/eplb/core/test_eplb_device_transfer_loader.py
Normal file
122
tests/ut/eplb/core/test_eplb_device_transfer_loader.py
Normal file
@@ -0,0 +1,122 @@
|
||||
from typing import Any
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
|
||||
import vllm_ascend.eplb.core.eplb_device_transfer_loader as loader
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_adaptor():
|
||||
adaptor = MagicMock()
|
||||
|
||||
adaptor.expert_map_per_layer_cpu = {
|
||||
0: {
|
||||
10: torch.tensor(1),
|
||||
20: torch.tensor(0)
|
||||
}
|
||||
}
|
||||
|
||||
adaptor.expert_param_per_layer = {
|
||||
0: {
|
||||
0: [[torch.tensor([1.0])]],
|
||||
1: [[torch.tensor([2.0])]]
|
||||
}
|
||||
}
|
||||
|
||||
adaptor.buffer_tensor_list = [[[torch.tensor([3.0])],
|
||||
[torch.tensor([4.0])]]]
|
||||
return adaptor
|
||||
|
||||
|
||||
def test_generate_task_and_state_flow(mock_adaptor):
|
||||
loader_obj = loader.D2DExpertWeightLoader()
|
||||
loader_obj.set_adator(mock_adaptor)
|
||||
|
||||
with patch("torch.distributed.P2POp") as mock_p2p, \
|
||||
patch("torch.distributed.isend", return_value="isend_op"), \
|
||||
patch("torch.distributed.irecv", return_value="irecv_op"):
|
||||
|
||||
mock_p2p.side_effect = lambda op, tensor, rank: (op, tensor, rank)
|
||||
|
||||
loader_obj.state = loader.ExpertWeightUpdateState.READY
|
||||
loader_obj.generate_expert_d2d_transfer_task([(1, 10)], [(2, 20)],
|
||||
{20: torch.tensor(0)}, 0)
|
||||
assert loader_obj.comm_op_list is None
|
||||
loader_obj.state = loader.ExpertWeightUpdateState.WAITING
|
||||
|
||||
loader_obj.generate_expert_d2d_transfer_task([], [], {}, 0)
|
||||
assert loader_obj.comm_op_list is None
|
||||
|
||||
updated_map = {20: torch.tensor(0)}
|
||||
loader_obj.generate_expert_d2d_transfer_task([(1, 10)], [(2, 20)],
|
||||
updated_map, 0)
|
||||
assert loader_obj.state == loader.ExpertWeightUpdateState.READY
|
||||
assert loader_obj.comm_op_list
|
||||
assert loader_obj.recv_expert_list
|
||||
|
||||
|
||||
def test_asyn_transfer_and_update(mock_adaptor):
|
||||
loader_obj = loader.D2DExpertWeightLoader()
|
||||
loader_obj.set_adator(mock_adaptor)
|
||||
|
||||
loader_obj.comm_op_list = ["fake_op"]
|
||||
loader_obj.state = loader.ExpertWeightUpdateState.READY
|
||||
|
||||
reqs: list[MagicMock] = []
|
||||
|
||||
with patch("torch.distributed.batch_isend_irecv",
|
||||
return_value=[MagicMock(), MagicMock()]):
|
||||
loader_obj.asyn_expert_weight_transfer(reqs)
|
||||
|
||||
assert loader_obj.state == loader.ExpertWeightUpdateState.TRANSFERRING
|
||||
assert len(reqs) > 0
|
||||
|
||||
mock_req = MagicMock()
|
||||
mock_req.wait.return_value = None
|
||||
reqs = [mock_req]
|
||||
|
||||
loader_obj.recv_expert_list = [(0, 0)]
|
||||
loader_obj.updated_expert_map = {20: torch.tensor(0)}
|
||||
loader_obj.updated_log2phy_map = {"dummy": 1}
|
||||
loader_obj.layer_id = 0
|
||||
loader_obj.comm_op_list = ["op"]
|
||||
|
||||
loader_obj.update_expert_map_and_weight(reqs)
|
||||
|
||||
mock_adaptor.do_update_expert_map.assert_called_once()
|
||||
mock_adaptor.do_update_log2phy_map.assert_called_once()
|
||||
mock_adaptor.do_update_expert_weight.assert_called_once()
|
||||
|
||||
assert loader_obj.state == loader.ExpertWeightUpdateState.WAITING
|
||||
assert loader_obj.recv_expert_list == []
|
||||
|
||||
|
||||
def test_set_log2phy_map(mock_adaptor):
|
||||
loader_obj = loader.D2DExpertWeightLoader()
|
||||
loader_obj.set_adator(mock_adaptor)
|
||||
loader_obj.set_log2phy_map({"a": 1})
|
||||
assert loader_obj.updated_log2phy_map == {"a": 1}
|
||||
|
||||
|
||||
def test_invalid_state_asyn_update(mock_adaptor):
|
||||
loader_obj = loader.D2DExpertWeightLoader()
|
||||
loader_obj.set_adator(mock_adaptor)
|
||||
|
||||
loader_obj.state = loader.ExpertWeightUpdateState.WAITING
|
||||
reqs: list[Any] = []
|
||||
loader_obj.asyn_expert_weight_transfer(reqs)
|
||||
assert reqs == []
|
||||
|
||||
loader_obj.state = loader.ExpertWeightUpdateState.READY
|
||||
loader_obj.update_expert_map_and_weight([])
|
||||
|
||||
assert not mock_adaptor.do_update_expert_map.called
|
||||
|
||||
|
||||
def test_load_impl_not_implemented(mock_adaptor):
|
||||
loader_obj = loader.D2DExpertWeightLoader()
|
||||
loader_obj.set_adator(mock_adaptor)
|
||||
with pytest.raises(NotImplementedError):
|
||||
loader_obj.load_impl({}, {})
|
||||
79
tests/ut/eplb/core/test_eplb_utils.py
Normal file
79
tests/ut/eplb/core/test_eplb_utils.py
Normal file
@@ -0,0 +1,79 @@
|
||||
import random
|
||||
|
||||
import torch
|
||||
|
||||
from vllm_ascend.eplb.core import eplb_utils
|
||||
|
||||
|
||||
def test_determine_default_expert_map_single_world():
|
||||
count, expert_map = eplb_utils.determine_default_expert_map(
|
||||
global_expert_num=4,
|
||||
world_size=1,
|
||||
rank_id=0,
|
||||
global_redundant_expert_num=0)
|
||||
assert count == 4
|
||||
assert torch.equal(expert_map, torch.arange(4, dtype=torch.int32))
|
||||
|
||||
|
||||
def test_determine_default_expert_map_multiple_worlds_no_redundant():
|
||||
count, expert_map = eplb_utils.determine_default_expert_map(
|
||||
global_expert_num=8,
|
||||
world_size=2,
|
||||
rank_id=0,
|
||||
global_redundant_expert_num=0)
|
||||
|
||||
assert count == 4
|
||||
assert torch.all(expert_map[:4] >= 0)
|
||||
assert torch.all(expert_map[4:] == -1)
|
||||
|
||||
|
||||
def test_determine_default_expert_map_multiple_worlds_with_redundant():
|
||||
count, expert_map = eplb_utils.determine_default_expert_map(
|
||||
global_expert_num=5,
|
||||
world_size=2,
|
||||
rank_id=0,
|
||||
global_redundant_expert_num=1)
|
||||
|
||||
assert count == 3
|
||||
assert torch.all(expert_map[0:3] >= 0)
|
||||
|
||||
|
||||
def test_generate_log2phy_map_single_rank_holding():
|
||||
|
||||
expert_map = torch.tensor([[0, -1], [-1, 0]], dtype=torch.int32)
|
||||
log2phy_map = eplb_utils.generate_log2phy_map(expert_map)
|
||||
|
||||
assert torch.all(log2phy_map[:, 0] == log2phy_map[0, 0])
|
||||
assert torch.all(log2phy_map[:, 1] == log2phy_map[1, 1])
|
||||
|
||||
|
||||
def test_generate_log2phy_map_multiple_rank_holding(monkeypatch):
|
||||
|
||||
expert_map = torch.tensor([[0], [0]], dtype=torch.int32)
|
||||
|
||||
monkeypatch.setattr(random, "choice", lambda x: x[0])
|
||||
|
||||
log2phy_map = eplb_utils.generate_log2phy_map(expert_map)
|
||||
|
||||
assert log2phy_map.shape == (2, 1)
|
||||
assert (log2phy_map >= 0).all()
|
||||
|
||||
|
||||
def test_determine_default_log2phy_map_world_size_1():
|
||||
log2phy = eplb_utils.determine_default_log2phy_map(
|
||||
global_expert_num=3,
|
||||
world_size=1,
|
||||
rank_id=0,
|
||||
global_redundant_expert_num=0)
|
||||
assert log2phy.shape == (3, )
|
||||
assert (log2phy >= 0).all()
|
||||
|
||||
|
||||
def test_determine_default_log2phy_map_world_size_multiple():
|
||||
log2phy = eplb_utils.determine_default_log2phy_map(
|
||||
global_expert_num=6,
|
||||
world_size=2,
|
||||
rank_id=1,
|
||||
global_redundant_expert_num=1)
|
||||
assert log2phy.shape == (6, )
|
||||
assert (log2phy >= 0).all()
|
||||
@@ -7,6 +7,7 @@ import time
|
||||
import types
|
||||
import unittest
|
||||
from collections import defaultdict, deque
|
||||
from typing import OrderedDict
|
||||
from unittest.mock import MagicMock, patch
|
||||
|
||||
import msgspec
|
||||
@@ -34,7 +35,7 @@ class TestKVCacheTaskTrackerInit(unittest.TestCase):
|
||||
tracker = KVCacheTaskTracker()
|
||||
self.assertIsInstance(tracker.done_task_lock, type(threading.Lock()))
|
||||
self.assertIsInstance(tracker.finished_requests, set)
|
||||
self.assertIsInstance(tracker.delayed_free_requests, deque)
|
||||
self.assertIsInstance(tracker.delayed_free_requests, OrderedDict)
|
||||
|
||||
|
||||
class TestGetAndClearFinishedSingleRequests(unittest.TestCase):
|
||||
@@ -495,18 +496,42 @@ class TestKVCacheTaskTracker(unittest.TestCase):
|
||||
def test_update_done_task_count(self):
|
||||
self.assertEqual(len(self.tracker.finished_requests), 0)
|
||||
self.assertEqual(len(self.tracker.delayed_free_requests), 0)
|
||||
self.assertEqual(len(self.tracker.record_finished_requests), 0)
|
||||
|
||||
current_time = time.time()
|
||||
self.tracker.add_delayed_request("req_1", current_time)
|
||||
result = self.tracker.delayed_free_requests
|
||||
result_record = self.tracker.record_finished_requests
|
||||
self.assertEqual(len(result), 1)
|
||||
self.assertEqual(result[0], ("req_1", current_time))
|
||||
self.assertEqual(result["req_1"], current_time)
|
||||
self.assertEqual(len(result_record), 0)
|
||||
|
||||
self.tracker.update_done_task_count("req_1")
|
||||
result_finished = self.tracker.finished_requests
|
||||
result_delayed = self.tracker.delayed_free_requests
|
||||
result_record = self.tracker.record_finished_requests
|
||||
self.assertEqual(result_finished, {"req_1"})
|
||||
self.assertEqual(len(result_delayed), 0)
|
||||
self.assertEqual(len(result_record), 0)
|
||||
|
||||
self.tracker.update_done_task_count("req_2")
|
||||
result_finished = self.tracker.finished_requests
|
||||
result_delayed = self.tracker.delayed_free_requests
|
||||
result_record = self.tracker.record_finished_requests
|
||||
self.assertEqual(result_finished, {"req_1", "req_2"})
|
||||
self.assertEqual(len(result_delayed), 0)
|
||||
self.assertEqual(len(result_record), 1)
|
||||
self.assertEqual(result_record, {"req_2"})
|
||||
|
||||
def test_updtate_add_delayed_request(self) -> None:
|
||||
self.tracker.update_done_task_count("req2")
|
||||
result_start_record = self.tracker.record_finished_requests
|
||||
self.assertEqual(len(result_start_record), 1)
|
||||
self.tracker.add_delayed_request("req2", time.time())
|
||||
result_delayed = self.tracker.delayed_free_requests
|
||||
result_end_record = self.tracker.record_finished_requests
|
||||
self.assertEqual(len(result_delayed), 0)
|
||||
self.assertEqual(len(result_end_record), 0)
|
||||
|
||||
def test_retrieve_expired_requests(self):
|
||||
current_time = time.time()
|
||||
@@ -518,7 +543,7 @@ class TestKVCacheTaskTracker(unittest.TestCase):
|
||||
})
|
||||
result_delay = self.tracker.delayed_free_requests
|
||||
self.assertEqual(len(result_delay), 1)
|
||||
self.assertEqual(result_delay[0], ("req_2", current_time))
|
||||
self.assertIn("req_2", result_delay)
|
||||
|
||||
def test_duplicate_task_update(self):
|
||||
self.tracker.update_done_task_count("req1")
|
||||
@@ -961,6 +986,46 @@ class TestMooncakeConnectorWorker(unittest.TestCase):
|
||||
for p in self.patches:
|
||||
p.stop() # type: ignore
|
||||
|
||||
def test_worker_use_ascend_direct(self):
|
||||
test_case = [True, False]
|
||||
|
||||
for use_ascend_direct in test_case:
|
||||
with self.subTest(use_ascend_direct=use_ascend_direct):
|
||||
config = MagicMock()
|
||||
config.kv_transfer_config = MagicMock()
|
||||
config.kv_transfer_config.get_from_extra_config.side_effect = (
|
||||
lambda k, d: {
|
||||
"prefill": {
|
||||
"tp_size": 2,
|
||||
"dp_size": 1
|
||||
},
|
||||
"decode": {
|
||||
"tp_size": 2,
|
||||
"dp_size": 1
|
||||
},
|
||||
"use_ascend_direct": use_ascend_direct,
|
||||
}.get(k, d))
|
||||
|
||||
config.parallel_config = MagicMock()
|
||||
config.parallel_config.tensor_parallel_size = 2
|
||||
config.parallel_config.data_parallel_rank_local = 0
|
||||
config.parallel_config.data_parallel_size_local = 1
|
||||
config.kv_transfer_config.kv_port = 8000
|
||||
config.kv_transfer_config.kv_role = 'worker'
|
||||
|
||||
with patch(
|
||||
"vllm_ascend.distributed.mooncake_connector.get_tensor_model_parallel_rank",
|
||||
return_value=0):
|
||||
with patch(
|
||||
"vllm_ascend.distributed.mooncake_connector.get_tp_group",
|
||||
return_value=None):
|
||||
with patch(
|
||||
"vllm_ascend.distributed.mooncake_connector.get_ip",
|
||||
return_value="127.0.0.1"):
|
||||
worker = MooncakeConnectorWorker(
|
||||
config, self.engine_id)
|
||||
self.assertIsNotNone(worker)
|
||||
|
||||
def test_register_kv_caches_producer(self):
|
||||
worker = MooncakeConnectorWorker(self.vllm_config, self.engine_id)
|
||||
worker.register_kv_caches(self.kv_caches)
|
||||
|
||||
@@ -10,6 +10,7 @@ import torch
|
||||
from vllm import SamplingParams
|
||||
from vllm.config import (CacheConfig, DeviceConfig, KVTransferConfig,
|
||||
ModelConfig, SchedulerConfig, VllmConfig)
|
||||
from vllm.utils import sha256
|
||||
from vllm.v1.core.kv_cache_utils import (get_request_block_hasher,
|
||||
init_none_hash)
|
||||
from vllm.v1.core.sched.scheduler import Scheduler
|
||||
@@ -19,8 +20,6 @@ from vllm.v1.outputs import ModelRunnerOutput
|
||||
from vllm.v1.request import Request
|
||||
from vllm.v1.structured_output import StructuredOutputManager
|
||||
|
||||
from vllm_ascend.utils import vllm_version_is
|
||||
|
||||
EOS_TOKEN_ID = 50256
|
||||
os.environ["VLLM_USE_V1"] = "1"
|
||||
|
||||
@@ -131,10 +130,10 @@ def create_request(
|
||||
"""Make dummy request for testing."""
|
||||
global _none_hash_initialized
|
||||
if not _none_hash_initialized:
|
||||
init_none_hash(hash)
|
||||
init_none_hash(sha256)
|
||||
_none_hash_initialized = True
|
||||
|
||||
block_hasher = get_request_block_hasher(block_size, hash)
|
||||
block_hasher = get_request_block_hasher(block_size, sha256)
|
||||
|
||||
kv_transfer_params: Optional[dict[str, Any]] = None
|
||||
|
||||
@@ -160,27 +159,14 @@ def create_request(
|
||||
else:
|
||||
prompt_token_ids = [i * request_id for i in range(num_tokens)]
|
||||
|
||||
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
|
||||
req = Request(
|
||||
request_id=f"id-{request_id}",
|
||||
prompt_token_ids=prompt_token_ids,
|
||||
sampling_params=sampling_params,
|
||||
multi_modal_kwargs=None,
|
||||
multi_modal_placeholders=None,
|
||||
multi_modal_hashes=None,
|
||||
pooling_params=[],
|
||||
eos_token_id=EOS_TOKEN_ID,
|
||||
block_hasher=block_hasher,
|
||||
)
|
||||
else:
|
||||
req = Request(
|
||||
request_id=f"id-{request_id}",
|
||||
prompt_token_ids=prompt_token_ids,
|
||||
sampling_params=sampling_params,
|
||||
pooling_params=[],
|
||||
eos_token_id=EOS_TOKEN_ID,
|
||||
block_hasher=block_hasher,
|
||||
)
|
||||
req = Request(
|
||||
request_id=f"id-{request_id}",
|
||||
prompt_token_ids=prompt_token_ids,
|
||||
sampling_params=sampling_params,
|
||||
pooling_params=[],
|
||||
eos_token_id=EOS_TOKEN_ID,
|
||||
block_hasher=block_hasher,
|
||||
)
|
||||
req.kv_transfer_params = kv_transfer_params
|
||||
return req
|
||||
|
||||
@@ -208,26 +194,15 @@ def create_model_runner_output(
|
||||
kv_connector_output = KVConnectorOutput(finished_sending=finished_sending,
|
||||
finished_recving=finished_recving)
|
||||
extra_args = {"kv_connector_output": kv_connector_output}
|
||||
if vllm_version_is("0.10.1.1") or vllm_version_is("0.10.1"):
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index=req_id_to_index,
|
||||
sampled_token_ids=sampled_token_ids,
|
||||
spec_token_ids=None,
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[],
|
||||
**extra_args,
|
||||
)
|
||||
else:
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index=req_id_to_index,
|
||||
sampled_token_ids=sampled_token_ids,
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[],
|
||||
**extra_args,
|
||||
)
|
||||
|
||||
model_runner_output = ModelRunnerOutput(
|
||||
req_ids=req_ids,
|
||||
req_id_to_index=req_id_to_index,
|
||||
sampled_token_ids=sampled_token_ids,
|
||||
logprobs=None,
|
||||
prompt_logprobs_dict={},
|
||||
pooler_output=[],
|
||||
**extra_args,
|
||||
)
|
||||
|
||||
return model_runner_output
|
||||
|
||||
114
tests/ut/models/conftest.py
Normal file
114
tests/ut/models/conftest.py
Normal file
@@ -0,0 +1,114 @@
|
||||
from types import SimpleNamespace
|
||||
from unittest.mock import MagicMock, Mock, patch
|
||||
|
||||
import pytest
|
||||
import torch
|
||||
from transformers import PretrainedConfig
|
||||
from vllm.config import CacheConfig, EPLBConfig, ParallelConfig
|
||||
from vllm.distributed.parallel_state import GroupCoordinator
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def base_config():
|
||||
config = PretrainedConfig(
|
||||
hidden_size=128,
|
||||
num_attention_heads=8,
|
||||
num_hidden_layers=2,
|
||||
intermediate_size=256,
|
||||
hidden_act="silu",
|
||||
rms_norm_eps=1e-6,
|
||||
rope_theta=10000.0,
|
||||
max_position_embeddings=2048,
|
||||
n_routed_experts=4,
|
||||
n_shared_experts=1,
|
||||
moe_intermediate_size=256,
|
||||
num_experts_per_tok=2,
|
||||
routed_scaling_factor=1.0,
|
||||
first_k_dense_replace=0,
|
||||
moe_layer_freq=1,
|
||||
kv_lora_rank=16,
|
||||
qk_nope_head_dim=16,
|
||||
qk_rope_head_dim=16,
|
||||
v_head_dim=32,
|
||||
topk_method="noaux_tc",
|
||||
scoring_func="softmax",
|
||||
norm_topk_prob=True,
|
||||
n_group=1,
|
||||
topk_group=1,
|
||||
vocab_size=10000,
|
||||
)
|
||||
return config
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def vllm_config(base_config):
|
||||
model_config = SimpleNamespace(
|
||||
hf_config=base_config,
|
||||
tensor_parallel_size=1,
|
||||
dtype=torch.float32,
|
||||
use_mla=True,
|
||||
quant_config=None,
|
||||
max_model_len=2048,
|
||||
)
|
||||
parallel_config = MagicMock(spec=ParallelConfig)
|
||||
eplb_config = MagicMock(spec=EPLBConfig)
|
||||
eplb_config.num_redundant_experts = 0
|
||||
parallel_config.eplb_config = eplb_config
|
||||
|
||||
cache_config = CacheConfig()
|
||||
vllm_config = Mock()
|
||||
vllm_config.model_config = model_config
|
||||
vllm_config.cache_config = cache_config
|
||||
vllm_config.quant_config = None
|
||||
vllm_config.parallel_config = parallel_config
|
||||
return vllm_config
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_distributed():
|
||||
tp_group = Mock(spec=GroupCoordinator)
|
||||
tp_group.rank_in_group = 0
|
||||
tp_group.world_size = 1
|
||||
tp_group.device_group = Mock()
|
||||
|
||||
dp_group = Mock(spec=GroupCoordinator)
|
||||
dp_group.rank_in_group = 0
|
||||
dp_group.world_size = 1
|
||||
|
||||
ep_group = Mock(spec=GroupCoordinator)
|
||||
ep_group.rank_in_group = 0
|
||||
ep_group.world_size = 1
|
||||
ep_group.device_group = Mock()
|
||||
ep_group.device_group.rank.return_value = 0
|
||||
ep_group.device_group.size.return_value = 1
|
||||
|
||||
pp_group = Mock(spec=GroupCoordinator)
|
||||
pp_group.rank_in_group = 0
|
||||
pp_group.world_size = 1
|
||||
|
||||
mock_vllm_config = Mock()
|
||||
mock_vllm_config.scheduler_config = Mock(max_num_seqs=256)
|
||||
mock_vllm_config.model_config = Mock(max_model_len=2048, quant_config=None)
|
||||
|
||||
with patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_rank", return_value=0), \
|
||||
patch("vllm_ascend.models.deepseek_v2.get_tensor_model_parallel_world_size", return_value=1), \
|
||||
patch("vllm_ascend.models.deepseek_v2.get_tp_group", return_value=tp_group), \
|
||||
patch("vllm_ascend.models.deepseek_v2.get_pp_group", return_value=pp_group), \
|
||||
patch("vllm_ascend.models.deepseek_v2.get_pp_group",
|
||||
return_value=Mock(is_first_rank=False, is_last_rank=False)), \
|
||||
patch("vllm_ascend.ops.fused_moe.get_current_vllm_config", return_value=mock_vllm_config), \
|
||||
patch("vllm_ascend.ops.moe.token_dispatcher.torch.distributed.get_rank", return_value=0), \
|
||||
patch("vllm_ascend.ops.moe.token_dispatcher.get_ascend_soc_version", return_value=None), \
|
||||
patch.dict("vllm.distributed.parallel_state.__dict__", _TP=tp_group, _EP=ep_group, _DP=dp_group,
|
||||
_PP=pp_group), \
|
||||
patch.dict("vllm_ascend.distributed.parallel_state.__dict__", _MC2=ep_group), \
|
||||
patch("torch.npu.current_device", return_value=0):
|
||||
yield
|
||||
|
||||
|
||||
@pytest.fixture
|
||||
def mock_forward_context():
|
||||
forward_context = Mock(in_profile_run=False, with_prefill=False)
|
||||
with patch("vllm_ascend.models.deepseek_v2.get_forward_context",
|
||||
return_value=forward_context):
|
||||
yield
|
||||
@@ -13,10 +13,13 @@ from vllm_ascend.models.deepseek_mtp import (
|
||||
class TestCustomDeepSeekMultiTokenPredictorLayer(PytestBase):
|
||||
|
||||
@pytest.fixture
|
||||
def setup_mtp_layer(self, mocker: MockerFixture):
|
||||
def setup_mtp_layer(self, mocker: MockerFixture, vllm_config: VllmConfig,
|
||||
mock_distributed):
|
||||
config = PretrainedConfig(vocab_size=1000,
|
||||
hidden_size=768,
|
||||
rms_norm_eps=1e-5)
|
||||
mocker.patch("vllm_ascend.models.deepseek_mtp.get_current_vllm_config",
|
||||
return_value=vllm_config)
|
||||
mocker.patch(
|
||||
"vllm.model_executor.layers.vocab_parallel_embedding.VocabParallelEmbedding.__init__",
|
||||
return_value=None)
|
||||
@@ -29,15 +32,15 @@ class TestCustomDeepSeekMultiTokenPredictorLayer(PytestBase):
|
||||
"vllm_ascend.models.deepseek_mtp.CustomDeepSeekShareHead.__init__",
|
||||
return_value=None)
|
||||
mocker_deepseek_v2_decode_layer = mocker.patch(
|
||||
"vllm_ascend.models.deepseek_v2.CustomDeepseekV2DecoderLayer.__init__",
|
||||
"vllm.model_executor.models.deepseek_v2.DeepseekV2DecoderLayer.__init__",
|
||||
return_value=None)
|
||||
mocker.patch(
|
||||
"vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
|
||||
return_value=None)
|
||||
mocker.patch("vllm_ascend.utils.get_ascend_config",
|
||||
mocker.patch("vllm_ascend.models.deepseek_v2.get_ascend_config",
|
||||
return_value=mocker.Mock())
|
||||
|
||||
mtp_layer = CustomDeepSeekMultiTokenPredictorLayer(config, "", None)
|
||||
mtp_layer = CustomDeepSeekMultiTokenPredictorLayer(config, "0", None)
|
||||
mocker_deepseek_v2_decode_layer.assert_called_once()
|
||||
return mtp_layer
|
||||
|
||||
@@ -165,8 +168,6 @@ class TestCustomDeepSeekMTP(PytestBase):
|
||||
mocker.patch(
|
||||
"vllm_ascend.models.deepseek_mtp.CustomDeepSeekMultiTokenPredictorLayer.__call__",
|
||||
return_value=None)
|
||||
mocker.patch("vllm.model_executor.layers.sampler.get_sampler",
|
||||
return_value=None)
|
||||
mocker.patch(
|
||||
"vllm_ascend.ops.vocab_parallel_embedding.AscendVocabParallelEmbedding.__init__",
|
||||
return_value=None)
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user